diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile index 8cad660523ecc..9ce80a71eb950 100644 --- a/.devops/intel.Dockerfile +++ b/.devops/intel.Dockerfile @@ -49,19 +49,23 @@ COPY --from=build /app/full /app WORKDIR /app -RUN apt-get update \ - && apt-get install -y \ - git \ - python3 \ - python3-pip \ - && pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt \ - && apt autoremove -y \ - && apt clean -y \ - && rm -rf /tmp/* /var/tmp/* \ - && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ - && find /var/cache -type f -delete - +RUN apt-get update && \ + apt-get install -y \ + git \ + python3 \ + python3-pip \ + python3-venv && \ + python3 -m venv /opt/venv && \ + . /opt/venv/bin/activate && \ + pip install --upgrade pip setuptools wheel && \ + pip install -r requirements.txt && \ + apt autoremove -y && \ + apt clean -y && \ + rm -rf /tmp/* /var/tmp/* && \ + find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \ + find /var/cache -type f -delete + +ENV PATH="/opt/venv/bin:$PATH" ENTRYPOINT ["/app/tools.sh"] diff --git a/.github/labeler.yml b/.github/labeler.yml index 278032ef2e1a4..3c2f67707b024 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -86,3 +86,10 @@ nix: embedding: - changed-files: - any-glob-to-any-file: examples/embedding/ + +Ascend NPU: + - changed-files: + - any-glob-to-any-file: + - ggml/include/ggml-cann.h + - ggml/src/ggml-cann/** + - docs/backend/CANN.md diff --git a/.github/workflows/build-cmake-pkg.yml b/.github/workflows/build-cmake-pkg.yml new file mode 100644 index 0000000000000..fee2ab96bd0e8 --- /dev/null +++ b/.github/workflows/build-cmake-pkg.yml @@ -0,0 +1,51 @@ +name: Build relocatable cmake package +on: + workflow_dispatch: + workflow_call: + +jobs: + linux: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install dependencies + run: | + sudo apt update + sudo apt install -y build-essential tcl + + - name: Build + run: | + PREFIX="$(pwd)"/inst + cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \ + -DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release + cmake --build build --config Release + cmake --install build --prefix "$PREFIX" --config Release + + export LLAMA_CONFIG="$PREFIX"/lib/cmake/llama/llama-config.cmake + tclsh <<'EOF' + set build(commit) [string trim [exec git rev-parse --short HEAD]] + set build(number) [string trim [exec git rev-list --count HEAD]] + set build(version) "0.0.$build(number)" + + set llamaconfig [read [open "$env(LLAMA_CONFIG)" r]] + set checks [list "set\\(LLAMA_VERSION \\s+$build(version)\\)" \ + "set\\(LLAMA_BUILD_COMMIT\\s+$build(commit)\\)" \ + "set\\(LLAMA_BUILD_NUMBER\\s+$build(number)\\)"] + + puts -nonewline "Checking llama-config.cmake version... " + foreach check $checks { + if {![regexp -expanded -- $check $llamaconfig]} { + puts "\"$check\" failed!" + exit 1 + } + } + puts "success." + EOF + + cd examples/simple-cmake-pkg + cmake -S . 
-B build -DCMAKE_PREFIX_PATH="$PREFIX"/lib/cmake + cmake --build build diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-linux-cross.yml index 92dc41f9d729c..7cfc82ba4e277 100644 --- a/.github/workflows/build-linux-cross.yml +++ b/.github/workflows/build-linux-cross.yml @@ -231,3 +231,116 @@ jobs: -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH cmake --build build --config Release -j $(nproc) + + debian-13-loongarch64-cpu-cross: + runs-on: ubuntu-24.04 + container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671 + + steps: + - uses: actions/checkout@v4 + - name: Setup LoongArch + run: | + rm -f /etc/apt/sources.list.d/* + cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list + deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main + EOF + ( echo 'quiet "true";'; \ + echo 'APT::Get::Assume-Yes "true";'; \ + echo 'APT::Install-Recommends "false";'; \ + echo 'Acquire::Check-Valid-Until "false";'; \ + echo 'Acquire::Retries "5";'; \ + ) > /etc/apt/apt.conf.d/99snapshot-repos + + apt-get update + apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip + dpkg --add-architecture loong64 + + # Add arch-specific repositories for non-amd64 architectures + cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list + deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main + EOF + + apt-get update || true ;# Prevent failure due to missing URLs. + + apt-get install -y --no-install-recommends \ + build-essential \ + gcc-14-loongarch64-linux-gnu \ + g++-14-loongarch64-linux-gnu + + - name: Build + run: | + cmake -B build -DLLAMA_CURL=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_OPENMP=OFF \ + -DLLAMA_BUILD_EXAMPLES=ON \ + -DLLAMA_BUILD_TOOLS=ON \ + -DLLAMA_BUILD_TESTS=OFF \ + -DCMAKE_SYSTEM_NAME=Linux \ + -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \ + -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \ + -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \ + -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ + -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \ + -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ + -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ + -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH + + cmake --build build --config Release -j $(nproc) + + debian-13-loongarch64-vulkan-cross: + runs-on: ubuntu-24.04 + container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671 + + steps: + - uses: actions/checkout@v4 + - name: Setup LoongArch + run: | + rm -f /etc/apt/sources.list.d/* + cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list + deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main + EOF + ( echo 'quiet "true";'; \ + echo 'APT::Get::Assume-Yes "true";'; \ + echo 'APT::Install-Recommends "false";'; \ + echo 'Acquire::Check-Valid-Until "false";'; \ + echo 'Acquire::Retries "5";'; \ + ) > /etc/apt/apt.conf.d/99snapshot-repos + + apt-get update + apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip + dpkg --add-architecture loong64 + + # Add arch-specific repositories for non-amd64 architectures + cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list + deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main + EOF + + apt-get update || true ;# Prevent failure due to missing URLs. 
+ + apt-get install -y --no-install-recommends \ + build-essential \ + glslc \ + gcc-14-loongarch64-linux-gnu \ + g++-14-loongarch64-linux-gnu \ + libvulkan-dev:loong64 + + - name: Build + run: | + cmake -B build -DLLAMA_CURL=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_VULKAN=ON \ + -DGGML_OPENMP=OFF \ + -DLLAMA_BUILD_EXAMPLES=ON \ + -DLLAMA_BUILD_TOOLS=ON \ + -DLLAMA_BUILD_TESTS=OFF \ + -DCMAKE_SYSTEM_NAME=Linux \ + -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \ + -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \ + -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \ + -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ + -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \ + -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ + -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ + -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH + + cmake --build build --config Release -j $(nproc) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ee76d1799e6f4..4feccf21e9e3e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -5,10 +5,43 @@ on: push: branches: - master - paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp'] + paths: [ + '.github/workflows/build.yml', + '.github/workflows/build-linux-cross.yml', + '.github/workflows/build-cmake-pkg.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.cu', + '**/*.cuh', + '**/*.swift', + '**/*.m', + '**/*.metal', + '**/*.comp' + ] + pull_request: types: [opened, synchronize, reopened] - paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp'] + paths: [ + '.github/workflows/build.yml', + '.github/workflows/build-linux-cross.yml', + '.github/workflows/build-cmake-pkg.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.cu', + '**/*.cuh', + '**/*.swift', + '**/*.m', + '**/*.metal', + '**/*.comp' + ] concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} @@ -306,6 +339,7 @@ jobs: id: cmake_test run: | cd build + export GGML_VK_VISIBLE_DEVICES=0 # This is using llvmpipe and runs slower than other backends ctest -L main --verbose --timeout 3600 @@ -477,6 +511,9 @@ jobs: build-linux-cross: uses: ./.github/workflows/build-linux-cross.yml + build-cmake-pkg: + uses: ./.github/workflows/build-cmake-pkg.yml + macOS-latest-cmake-ios: runs-on: macos-latest @@ -682,17 +719,17 @@ jobs: env: OPENBLAS_VERSION: 0.3.23 SDE_VERSION: 9.33.0-2024-01-07 - VULKAN_VERSION: 1.4.309.0 + VULKAN_VERSION: 1.4.313.2 strategy: matrix: include: - - build: 'cpu-x64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF' + - build: 'cpu-x64 (static)' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF' - build: 'openblas-x64' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON 
-DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - build: 'vulkan-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON' + defines: '-DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON' - build: 'llvm-arm64' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' - build: 'llvm-arm64-opencl-adreno' @@ -735,7 +772,7 @@ jobs: id: get_vulkan if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }} run: | - curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe" + curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe" & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" @@ -777,6 +814,7 @@ jobs: cmake -S . -B build ${{ matrix.defines }} ` -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} + cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release - name: Add libopenblas.dll id: add_libopenblas_dll @@ -839,12 +877,12 @@ jobs: -DGGML_CUDA=ON cmake --build build - windows-2019-cmake-cuda: - runs-on: windows-2019 + windows-2022-cmake-cuda: + runs-on: windows-2022 strategy: matrix: - cuda: ['12.4', '11.7'] + cuda: ['12.4'] steps: - name: Clone @@ -878,7 +916,7 @@ jobs: env: CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | - call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 cmake -S . 
-B build -G "Ninja Multi-Config" ^ -DLLAMA_BUILD_SERVER=ON ^ -DGGML_NATIVE=OFF ^ diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 65ed244657e4f..64fff175e227b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -131,8 +131,9 @@ jobs: include: - build: 'x64' os: ubuntu-22.04 - - build: 'arm64' - os: ubuntu-22.04-arm + # GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm + # - build: 'arm64' + # os: ubuntu-22.04-arm runs-on: ${{ matrix.os }} @@ -159,6 +160,9 @@ jobs: id: cmake_build run: | cmake -B build \ + -DGGML_BACKEND_DL=ON \ + -DGGML_NATIVE=OFF \ + -DGGML_CPU_ALL_VARIANTS=ON \ -DLLAMA_FATAL_WARNINGS=ON \ ${{ env.CMAKE_ARGS }} cmake --build build --config Release -j $(nproc) @@ -207,6 +211,9 @@ jobs: id: cmake_build run: | cmake -B build \ + -DGGML_BACKEND_DL=ON \ + -DGGML_NATIVE=OFF \ + -DGGML_CPU_ALL_VARIANTS=ON \ -DGGML_VULKAN=ON \ ${{ env.CMAKE_ARGS }} cmake --build build --config Release -j $(nproc) @@ -295,7 +302,7 @@ jobs: env: OPENBLAS_VERSION: 0.3.23 - VULKAN_VERSION: 1.4.309.0 + VULKAN_VERSION: 1.4.313.2 strategy: matrix: @@ -325,7 +332,7 @@ jobs: id: get_vulkan if: ${{ matrix.backend == 'vulkan' }} run: | - curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe" + curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe" & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" @@ -373,11 +380,11 @@ jobs: name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip windows-cuda: - runs-on: windows-2019 + runs-on: windows-2022 strategy: matrix: - cuda: ['12.4', '11.7'] + cuda: ['12.4'] steps: - name: Clone @@ -405,7 +412,7 @@ jobs: id: cmake_build shell: cmd run: | - call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 cmake -S . 
-B build -G "Ninja Multi-Config" ^ -DGGML_BACKEND_DL=ON ^ -DGGML_NATIVE=OFF ^ diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 4baf6f6c755ee..f6da488576937 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -180,7 +180,7 @@ jobs: server-windows: - runs-on: windows-2019 + runs-on: windows-2022 steps: - name: Clone diff --git a/.gitignore b/.gitignore index f8ceb1560a1df..803a3b1d31d8a 100644 --- a/.gitignore +++ b/.gitignore @@ -146,3 +146,10 @@ poetry.toml # Local scripts /run-vim.sh /run-chat.sh + +HEXAGON_Tools/ +prebuilts/QNN_SDK/qairt/2.35.0.250530/ +prebuilts/QNN_SDK/v2.35.0.250530.zip +prebuilts/Hexagon_SDK/minimal-hexagon-sdk-6.2.0.1.xz + + diff --git a/CMakeLists.txt b/CMakeLists.txt index ac3e9090336d9..29e68ac82cbbd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,21 @@ set(CMAKE_WARN_UNUSED_CLI YES) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +if(CMAKE_SYSTEM_NAME STREQUAL "Android") + set(CMAKE_VERBOSE_MAKEFILE ON) + if(DEFINED HTP_ARCH_VERSION) + if (${HTP_ARCH_VERSION} STREQUAL "v75" OR ${HTP_ARCH_VERSION} STREQUAL "v79") + #works fine on Snapdragon 8Gen3&8Elite with 1.5x - 3x performance gains with the default ggml backend + set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -ffp-model=fast -fno-finite-math-only") + message("OPT_FLAG:${OPT_FLAG}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGGML_USE_HEXAGON -DGGML_USE_LLAMAFILE ${DEBUG_FLAG} ${OPT_FLAG}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_HEXAGON -DGGML_USE_LLAMAFILE ${DEBUG_FLAG} ${OPT_FLAG}") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DGGML_USE_HEXAGON -DGGML_USE_LLAMAFILE ${DEBUG_FLAG} ${OPT_FLAG}") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DGGML_USE_HEXAGON -DGGML_USE_LLAMAFILE ${DEBUG_FLAG} ${OPT_FLAG}") + endif() + endif() +endif() + if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") @@ -89,6 +104,14 @@ option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake) +if (NOT DEFINED LLAMA_BUILD_NUMBER) + set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER}) +endif() +if (NOT DEFINED LLAMA_BUILD_COMMIT) + set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT}) +endif() +set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER}) + # override ggml options set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) @@ -120,6 +143,7 @@ llama_option_depr(WARNING LLAMA_RPC GGML_RPC) llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) llama_option_depr(WARNING LLAMA_CANN GGML_CANN) +llama_option_depr(WARNING LLAMA_HEXAGON GGML_HEXAGON) if (NOT MSVC) if (LLAMA_SANITIZE_THREAD) @@ -155,10 +179,17 @@ if (LLAMA_USE_SYSTEM_GGML) endif() if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML) + set(GGML_BUILD_NUMBER ${LLAMA_BUILD_NUMBER}) + set(GGML_BUILD_COMMIT ${LLAMA_BUILD_COMMIT}) add_subdirectory(ggml) # ... 
otherwise assume ggml is added by a parent CMakeLists.txt endif() +if (MINGW) + # Target Windows 8 for PrefetchVirtualMemory + add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER}) +endif() + # # build the library # @@ -199,10 +230,6 @@ endif() include(GNUInstallDirs) include(CMakePackageConfigHelpers) -set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER}) -set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT}) -set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER}) - set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files") set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") diff --git a/Makefile b/Makefile index 958ad8f2fcc0a..ac442aec095d6 100644 --- a/Makefile +++ b/Makefile @@ -367,7 +367,7 @@ ifdef LLAMA_SERVER_SSL endif ifndef GGML_NO_CPU_AARCH64 - MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64 + MK_CPPFLAGS += -DGGML_USE_CPU_REPACK endif # warnings @@ -970,7 +970,7 @@ OBJ_GGML = \ $(DIR_GGML)/src/ggml-threading.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \ - $(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \ + $(DIR_GGML)/src/ggml-cpu/repack.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \ diff --git a/README.md b/README.md index 540c29a4f1847..90c7364dfcba0 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,10 @@ ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) +[![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases) [![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml) -[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) +[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ @@ -17,7 +18,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ## Hot topics - 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md) -- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9) - A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639 @@ -28,6 +28,30 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ---- +## Quick start + +Getting started with llama.cpp is straightforward. 
Here are several ways to install it on your machine: + +- Install `llama.cpp` using [brew, nix or winget](docs/install.md) +- Run with Docker - see our [Docker documentation](docs/docker.md) +- Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases) +- Build from source by cloning this repository - check out [our build guide](docs/build.md) + +Once installed, you'll need a model to work with. Head to the [Obtaining and quantizing models](#obtaining-and-quantizing-models) section to learn more. + +Example command: + +```sh +# Use a local model file +llama-cli -m my_model.gguf + +# Or download and run a model directly from Hugging Face +llama-cli -hf ggml-org/gemma-3-1b-it-GGUF + +# Launch OpenAI-compatible API server +llama-server -hf ggml-org/gemma-3-1b-it-GGUF +``` + ## Description The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide @@ -130,6 +154,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
Bindings +- Python: [ddh0/easy-llama](https://github.com/ddh0/easy-llama) - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) @@ -229,6 +254,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
+
+
 ## Supported backends

 | Backend | Target devices |
@@ -245,16 +271,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |

-## Building the project
-
-The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
-The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
-
-- Clone this repository and build locally, see [how to build](docs/build.md)
-- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
-- Use a Docker image, see [documentation for Docker](docs/docker.md)
-- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)
-
 ## Obtaining and quantizing models

 The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
@@ -262,7 +278,11 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
 - [Trending](https://huggingface.co/models?library=gguf&sort=trending)
 - [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)

-You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`.
+You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
+
+```sh
+llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
+```

 By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to downloading model checkpoints from ModelScope or other model sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.
diff --git a/ci/run.sh b/ci/run.sh
index b49a3a5f82357..e1b777c304eaf 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -39,14 +39,27 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`

-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=OFF"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
 fi

 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON"
+
+    if command -v nvidia-smi >/dev/null 2>&1; then
+        CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.')
+        if [[ -n "$CUDA_ARCH" && "$CUDA_ARCH" =~ ^[0-9]+$ ]]; then
+            CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH}"
+        else
+            echo "Warning: Using fallback CUDA architectures"
+            CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=61;70;75;80;86;89"
+        fi
+    else
+        echo "Error: nvidia-smi not found, cannot build with CUDA"
+        exit 1
+    fi
 fi

 if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -766,7 +779,7 @@ function gg_run_rerank_tiny {
     model_f16="${path_models}/ggml-model-f16.gguf"

     # for this model, the SEP token is "</s>"
-    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log

     # sample output
     # rerank score 0: 0.029
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 564af1448f95a..f43a630c900ff 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -7,8 +7,8 @@ llama_add_compile_flags()
 # Build info header
 #

-if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
-    set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
+if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
+    set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")

     # Is git submodule
     if(NOT IS_DIRECTORY "${GIT_DIR}")
@@ -18,36 +18,26 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
         if (SLASH_POS EQUAL 0)
             set(GIT_DIR "${REAL_GIT_DIR}")
         else()
-            set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
+            set(GIT_DIR "${PROJECT_SOURCE_DIR}/${REAL_GIT_DIR}")
         endif()
     endif()

     if(EXISTS "${GIT_DIR}/index")
-        set(GIT_INDEX "${GIT_DIR}/index")
+        # For build-info.cpp below
+        set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${GIT_DIR}/index")
     else()
         message(WARNING "Git index not found in git repository.")
-        set(GIT_INDEX "")
     endif()
 else()
     message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
-    set(GIT_INDEX "")
 endif()

-# Add a custom command to rebuild build-info.cpp when .git/index changes
-add_custom_command(
-    OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
-    COMMENT "Generating build details from Git"
-    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
-            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-            -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
-            -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
-    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
- DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX} - VERBATIM -) +set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in") +set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp") +configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE}) + set(TARGET build_info) -add_library(${TARGET} OBJECT build-info.cpp) +add_library(${TARGET} OBJECT ${OUTPUT_FILE}) if (BUILD_SHARED_LIBS) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() diff --git a/common/arg.cpp b/common/arg.cpp index cfa9878f90730..c4ad85c47b61b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -988,10 +988,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context params.tensor_buft_overrides.push_back({nullptr, nullptr}); } - if (params.reranking && params.embedding) { - throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both"); - } - if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) { throw std::runtime_error(string_format( "error: the supplied chat template is not supported: %s%s\n", @@ -2710,6 +2706,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.embd_sep = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(common_arg( + {"--cls-separator"}, "STRING", + "separator of classification sequences (default \\t) for example \"<#seq#>\"", + [](common_params & params, const std::string & value) { + params.cls_sep = value; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--host"}, "HOST", string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()), @@ -2747,9 +2750,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); add_opt(common_arg( {"--reranking", "--rerank"}, - string_format("enable reranking endpoint on server (default: %s)", params.reranking ? 
"enabled" : "disabled"), + string_format("enable reranking endpoint on server (default: %s)", "disabled"), [](common_params & params) { - params.reranking = true; + params.embedding = true; + params.pooling_type = LLAMA_POOLING_TYPE_RANK; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING")); add_opt(common_arg( @@ -2869,6 +2873,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "(default: deepseek)", [](common_params & params, const std::string & value) { /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; } + else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; } else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; } else { throw std::invalid_argument("invalid value"); } } @@ -3212,6 +3217,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.speculative.model.path = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT")); + add_opt(common_arg( + {"-ctkd", "--cache-type-k-draft"}, "TYPE", + string_format( + "KV cache data type for K for the draft model\n" + "allowed values: %s\n" + "(default: %s)", + get_all_kv_cache_types().c_str(), + ggml_type_name(params.speculative.cache_type_k) + ), + [](common_params & params, const std::string & value) { + params.speculative.cache_type_k = kv_cache_type_from_str(value); + } + ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT")); + add_opt(common_arg( + {"-ctvd", "--cache-type-v-draft"}, "TYPE", + string_format( + "KV cache data type for V for the draft model\n" + "allowed values: %s\n" + "(default: %s)", + get_all_kv_cache_types().c_str(), + ggml_type_name(params.speculative.cache_type_v) + ), + [](common_params & params, const std::string & value) { + params.speculative.cache_type_v = kv_cache_type_from_str(value); + } + ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT")); add_opt(common_arg( {"-mv", "--model-vocoder"}, "FNAME", diff --git a/common/build-info.cpp.in b/common/build-info.cpp.in index 0b945aa68fff3..aee9d7eafd681 100644 --- a/common/build-info.cpp.in +++ b/common/build-info.cpp.in @@ -1,4 +1,4 @@ -int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@; -char const *LLAMA_COMMIT = "@BUILD_COMMIT@"; +int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@; +char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@"; char const *LLAMA_COMPILER = "@BUILD_COMPILER@"; char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@"; diff --git a/common/chat-parser.cpp b/common/chat-parser.cpp index 65b664cb37da4..18a30e49aa578 100644 --- a/common/chat-parser.cpp +++ b/common/chat-parser.cpp @@ -49,6 +49,7 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std:: // LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str()); result_.tool_calls.emplace_back(tool_call); + return true; } bool common_chat_msg_parser::add_tool_call(const json & tool_call) { @@ -378,3 +379,7 @@ std::optional common_chat_msg_parse /* .is_partial = */ found_healing_marker, }; } + +void common_chat_msg_parser::clear_tools() { + result_.tool_calls.clear(); +} diff --git a/common/chat-parser.h b/common/chat-parser.h index 7ee355056b30a..0e64c341a50aa 100644 --- a/common/chat-parser.h +++ b/common/chat-parser.h @@ -115,4 +115,6 @@ class common_chat_msg_parser { const std::vector> & args_paths = {}, const std::vector> & content_paths = {} ); + + void clear_tools(); }; diff --git a/common/chat.cpp 
index f1ab4c85a913e..7d9aaeb12a190 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -82,10 +82,10 @@ json common_chat_msg::to_json_oaicompat() const

 std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg) {
     std::vector<common_chat_msg_diff> diffs;
-    // if (previous_msg.reasoning_content != current.reasoning_content) {
-    //     auto & diff = diffs.emplace_back();
-    //     diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, current.reasoning_content);
-    // }
+    if (previous_msg.reasoning_content != new_msg.reasoning_content) {
+        auto & diff = diffs.emplace_back();
+        diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, new_msg.reasoning_content);
+    }
     if (previous_msg.content != new_msg.content) {
         auto & diff = diffs.emplace_back();
         diff.content_delta = string_diff(previous_msg.content, new_msg.content);
@@ -385,9 +385,9 @@ json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & t
 template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
     json delta = json::object();
-    // if (!diff.reasoning_content_delta.empty()) {
-    //     delta["reasoning_content"] = msg.reasoning_content;
-    // }
+    if (!diff.reasoning_content_delta.empty()) {
+        delta["reasoning_content"] = diff.reasoning_content_delta;
+    }
     if (!diff.content_delta.empty()) {
         delta["content"] = diff.content_delta;
     }
@@ -598,6 +598,7 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
     switch (format) {
         case COMMON_REASONING_FORMAT_NONE: return "none";
         case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
+        case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
         default:
             throw std::runtime_error("Unknown reasoning format");
     }
@@ -1837,7 +1838,7 @@ static common_chat_params common_chat_templates_apply_legacy(
     if (res < 0) {
         // if the custom "tmpl" is not supported, we throw an error
         // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
-        throw std::runtime_error("this custom template is not supported");
+        throw std::runtime_error("this custom template is not supported, try using --jinja");
     }

     // if it turns out that our buffer is too small, we resize it
@@ -1920,7 +1921,9 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
     } catch (const common_chat_msg_partial_exception & ex) {
         LOG_DBG("Partial parse: %s\n", ex.what());
         if (!is_partial) {
-            throw std::runtime_error(ex.what());
+            builder.clear_tools();
+            builder.move_to(0);
+            common_chat_parse_content_only(builder);
         }
     }
     auto msg = builder.result();
diff --git a/common/chat.h b/common/chat.h
index f6b1d0ffcc989..9f59e6b08738d 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -70,7 +70,7 @@ struct common_chat_msg {
 };

 struct common_chat_msg_diff {
-    // std::string reasoning_content_delta;
+    std::string reasoning_content_delta;
     std::string content_delta;
     size_t tool_call_index = std::string::npos;
     common_chat_tool_call tool_call_delta;
diff --git a/common/cmake/build-info-gen-cpp.cmake b/common/cmake/build-info-gen-cpp.cmake
deleted file mode 100644
index fbc92b52cc4fe..0000000000000
--- a/common/cmake/build-info-gen-cpp.cmake
+++ /dev/null
@@ -1,24 +0,0 @@
-include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
-
-set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
-set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
-
-# Only write the build info if it changed
-if(EXISTS ${OUTPUT_FILE})
-    file(READ ${OUTPUT_FILE} CONTENTS)
-    string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
-    set(OLD_COMMIT ${CMAKE_MATCH_1})
-    string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
-    set(OLD_COMPILER ${CMAKE_MATCH_1})
-    string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
-    set(OLD_TARGET ${CMAKE_MATCH_1})
-    if (
-        NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR
-        NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
-        NOT OLD_TARGET STREQUAL BUILD_TARGET
-    )
-        configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
-    endif()
-else()
-    configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
-endif()
diff --git a/common/common.cpp b/common/common.cpp
index 4cc40ed8b37a4..e4e71ad13fb59 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -466,7 +466,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_

 std::string regex_escape(const std::string & s) {
     static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
-    return std::regex_replace(s, special_chars, "\\$0");
+    return std::regex_replace(s, special_chars, "\\$&");
 }

 std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
@@ -706,11 +706,17 @@ bool fs_validate_filename(const std::string & filename) {
         // disable C++17 deprecation warning for std::codecvt_utf8
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
+
     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;

 #if defined(__clang__)
 #    pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic pop
 #endif

     filename_utf32 = converter.from_bytes(filename);
@@ -767,6 +773,9 @@ bool fs_validate_filename(const std::string & filename) {
     return true;
 }

+#include
+
+
 // returns true if successful, false otherwise
 bool fs_create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32
@@ -784,9 +793,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
     // process path from front to back, procedurally creating directories
     while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
         const std::wstring subpath = wpath.substr(0, pos_slash);
-        const wchar_t * test = subpath.c_str();
-        const bool success = CreateDirectoryW(test, NULL);
+        pos_slash += 1;
+
+        // skip the drive letter, in some systems it can return an access denied error
+        if (subpath.length() == 2 && subpath[1] == ':') {
+            continue;
+        }
+
+        const bool success = CreateDirectoryW(subpath.c_str(), NULL);
+
         if (!success) {
             const DWORD error = GetLastError();

@@ -800,8 +816,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
                 return false;
             }
         }
-
-        pos_slash += 1;
     }

     return true;
@@ -897,34 +911,6 @@ struct common_init_result common_init_from_params(common_params & params) {

     const llama_vocab * vocab = llama_model_get_vocab(model);

-    if (params.reranking) {
-        bool ok = true;
-
-        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
-        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
-
-        if (!has_eos && !has_sep) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
-            ok = false;
-        } else if (!has_eos) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
-        } else if (!has_sep) {
-            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        if (!ok) {
-            llama_model_free(model);
-
-            return iparams;
-        }
-    }
-
     auto cparams = common_context_params_to_llama(params);

     llama_context * lctx = llama_init_from_model(model, cparams);
@@ -934,7 +920,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }

-    if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
+    if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
         LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
         params.ctx_shift = false;
     }
@@ -966,6 +952,35 @@ struct common_init_result common_init_from_params(common_params & params) {
         }
     }

+    if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
+        bool ok = true;
+
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+
+        if (!has_eos && !has_sep) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+            ok = false;
+        } else if (!has_eos) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
+        } else if (!has_sep) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        if (!ok) {
+            llama_free(lctx);
+            llama_model_free(model);
+
+            return iparams;
+        }
+    }
+
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
         llama_adapter_lora_ptr lora;
@@ -1041,7 +1056,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         if (llama_model_has_decoder(model)) {
             llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
         }
-        llama_kv_self_clear(lctx);
+        llama_memory_clear(llama_get_memory(lctx), true);
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
         llama_set_warmup(lctx, false);
@@ -1143,11 +1158,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full = params.swa_full;

-    if (params.reranking) {
-        cparams.embeddings = true;
-        cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
-    }
-
     cparams.type_k = params.cache_type_k;
     cparams.type_v = params.cache_type_v;

@@ -1280,6 +1290,9 @@ std::vector<llama_token> common_tokenize(
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
     n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens == std::numeric_limits<int32_t>::min()) {
+        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+    }
     if (n_tokens < 0) {
         result.resize(-n_tokens);
         int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
diff --git a/common/common.h b/common/common.h
index cee1e3039cf9e..e08a59eae7543 100644
--- a/common/common.h
+++ b/common/common.h
@@ -199,6 +199,9 @@ struct common_params_speculative {
     float p_split = 0.1f; // speculative decoding split probability
     float p_min = 0.75f; // minimum speculative decoding probability (greedy)

+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
+
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;

@@ -215,7 +218,8 @@ struct common_params_vocoder {

 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
-    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
+    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
+    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };

 struct common_params {
@@ -354,7 +358,7 @@ struct common_params {
     int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep = "\n"; // separator of embeddings
-    bool reranking = false; // enable reranking support on server
+    std::string cls_sep = "\t"; // separator of classification sequences

     // server params
     int32_t port = 8080; // server listens on this network port
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index d38a74f95c213..637891f50699c 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -41,49 +41,6 @@ static std::string build_repetition(const std::string & item_rule, int min_items
     return result;
 }

-/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
-class string_view {
-    const std::string & _str;
-    const size_t _start;
-    const size_t _end;
-public:
-    string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
-
-    size_t size() const {
-        return _end - _start;
-    }
-
-    size_t length() const {
-        return size();
-    }
-
-    operator std::string() const {
-        return str();
-    }
-
-    std::string str() const {
-        return _str.substr(_start, _end - _start);
-    }
-
-    string_view substr(size_t pos, size_t len = std::string::npos) const {
-        return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
-    }
-
-    char operator[](size_t pos) const {
-        auto index = _start + pos;
-        if (index >= _end) {
-            throw std::out_of_range("string_view index out of range");
-        }
-        return _str[_start + pos];
-    }
-
-    bool operator==(const string_view & other) const {
-        std::string this_str = *this;
-        std::string other_str = other;
-        return this_str == other_str;
-    }
-};
-
 static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
     auto has_min = min_value != std::numeric_limits<int>::min();
     auto has_max = max_value != std::numeric_limits<int>::max();
@@ -112,14 +69,14 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
         }
         out << "}";
     };
-    std::function<void(const string_view &, const string_view &)> uniform_range =
-        [&](const string_view & from, const string_view & to) {
+    std::function<void(const std::string_view &, const std::string_view &)> uniform_range =
+        [&](const std::string_view & from, const std::string_view & to) {
             size_t i = 0;
             while (i < from.length() && i < to.length() && from[i] == to[i]) {
                 i++;
             }
             if (i > 0) {
-                out << "\"" << from.substr(0, i).str() << "\"";
+                out << "\"" << from.substr(0, i) << "\"";
             }
             if (i < from.length() && i < to.length()) {
                 if (i > 0) {
diff --git a/common/speculative.cpp b/common/speculative.cpp
index ccad70fa9ed85..843bd1ddbdbd7 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -144,6 +144,8 @@ llama_tokens common_speculative_gen_draft(
     auto & smpl = spec->smpl;
     auto & prompt = spec->prompt;

+    auto * mem = llama_get_memory(ctx);
+
     int reuse_i = 0;
     int reuse_n = 0;

@@ -173,7 +175,7 @@ llama_tokens common_speculative_gen_draft(
     result.reserve(params.n_draft);

     if (reuse_n == 0) {
-        llama_kv_self_clear(ctx);
+        llama_memory_clear(mem, false);

         prompt.clear();
     } else {
@@ -192,14 +194,14 @@ llama_tokens common_speculative_gen_draft(
     }

     if (reuse_i > 0) {
-        llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
-        llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+        llama_memory_seq_rm (mem, 0, 0, reuse_i);
+        llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);

         prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
     }

     if (reuse_n < (int) prompt.size()) {
-        llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);
+        llama_memory_seq_rm (mem, 0, reuse_n, -1);

         prompt.erase(prompt.begin() + reuse_n, prompt.end());
     }
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index ab0f0e0ea087e..4f2339a02a13c 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -310,6 +310,8 @@ def prepare_tensors(self):
                     gguf.MODEL_TENSOR.POSNET_NORM2,
                     gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
                     gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
+                    gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF,
+                    gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
                 )
             )
             or not new_name.endswith(".weight")
@@ -320,7 +322,11 @@ def prepare_tensors(self):
                 self.match_model_tensor_name(new_name, key, bid)
                 for key in (
                     gguf.MODEL_TENSOR.TOKEN_EMBD,
+                    gguf.MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
                     gguf.MODEL_TENSOR.OUTPUT,
+                    gguf.MODEL_TENSOR.ALTUP_ROUTER,
+                    gguf.MODEL_TENSOR.LAUREL_L,
+                    gguf.MODEL_TENSOR.LAUREL_R,
                 )
             ):
                 if self.ftype in (
@@ -519,7 +525,7 @@ def prepare_metadata(self, vocab_only: bool):
     def set_gguf_parameters(self):
         self.gguf_writer.add_block_count(self.block_count)

-        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions"], optional=True)) is not None:
+        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True)) is not None:
             self.gguf_writer.add_context_length(n_ctx)
             logger.info(f"gguf: context length = {n_ctx}")

@@ -921,13 +927,16 @@ def
_create_vocab_sentencepiece(self): tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + vocab_size = self.find_hparam([ + "vocab_size_per_layer_input", # gemma3n + "vocab_size", + ], optional=True) or tokenizer.vocab_size() tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] scores: list[float] = [-10000.0] * vocab_size toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - for token_id in range(tokenizer.vocab_size()): + for token_id in range(vocab_size): piece = tokenizer.IdToPiece(token_id) text = piece.encode("utf-8") score = tokenizer.GetScore(token_id) @@ -942,6 +951,10 @@ def _create_vocab_sentencepiece(self): elif tokenizer.IsByte(token_id): toktype = SentencePieceTokenTypes.BYTE + if token_id >= vocab_size: + logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}') + break + tokens[token_id] = text scores[token_id] = score toktypes[token_id] = toktype @@ -1898,9 +1911,7 @@ def set_gguf_parameters(self): hparams = self.hparams self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - if "head_dim" in hparams: - rope_dim = hparams["head_dim"] - else: + if (rope_dim := hparams.get("head_dim")) is None: rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) @@ -1982,7 +1993,8 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): if rope_scaling.get("rope_type", '').lower() == "llama3": base = self.hparams.get("rope_theta", 10000.0) - dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + if (dim := self.hparams.get("head_dim")) is None: + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) factor = rope_scaling.get("factor", 8.0) @@ -2017,6 +2029,20 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") +@ModelBase.register("ArceeForCausalLM") +class ArceeModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.ARCEE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self._try_set_pooling_type() + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) + + @ModelBase.register( "LlavaForConditionalGeneration", # pixtral "Mistral3ForConditionalGeneration", # mistral small 3.1 @@ -2132,7 +2158,6 @@ def __init__(self, *args, **kwargs): def set_vocab(self): self._set_vocab_gpt2() - self.gguf_writer.add_add_bos_token(True) def set_gguf_parameters(self): super().set_gguf_parameters() @@ -2181,7 +2206,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name += ".weight" if "multi_modal_projector.linear_1" in name: # despite the name with number postfix, this is a single fully connected layer - return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)] + return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC] + '.weight', data_torch)] return [(self.map_tensor_name(name), data_torch)] return [] @@ -2304,9 +2329,7 @@ 
def set_gguf_parameters(self): hparams = self.hparams self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - if "head_dim" in hparams: - rope_dim = hparams["head_dim"] - else: + if (rope_dim := hparams.get("head_dim")) is None: rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) @@ -2346,7 +2369,8 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): if rope_scaling.get("rope_type", '').lower() == "llama3": base = self.hparams.get("rope_theta", 10000.0) - dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + if (dim := self.hparams.get("head_dim")) is None: + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) factor = rope_scaling.get("factor", 8.0) @@ -3664,9 +3688,7 @@ def set_gguf_parameters(self): hparams = self.hparams self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - if "head_dim" in hparams: - rope_dim = hparams["head_dim"] - else: + if (rope_dim := hparams.get("head_dim")) is None: rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) @@ -3709,8 +3731,7 @@ def set_gguf_parameters(self): self._try_set_pooling_type() if self.cls_out_labels: - key_name = gguf.Keys.Classifier.OUTPUT_LABELS.format(arch = gguf.MODEL_ARCH_NAMES[self.model_arch]) - self.gguf_writer.add_array(key_name, [v for k, v in sorted(self.cls_out_labels.items())]) + self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())]) def set_vocab(self): tokens, toktypes, tokpre = self.get_vocab_base() @@ -3814,7 +3835,7 @@ def _xlmroberta_set_vocab(self) -> None: remove_whitespaces = tokenizer.clean_up_tokenization_spaces precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"]) - vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size) + vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size) else: sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) @@ -3827,7 +3848,7 @@ def _xlmroberta_set_vocab(self) -> None: tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size()) tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] scores: list[float] = [-10000.0] * vocab_size @@ -3857,33 +3878,26 @@ def _xlmroberta_set_vocab(self) -> None: unk_token = tokenizer_config_json.get("unk_token") unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3)) - for token_id in range(vocab_size): + for token_id in range(tokenizer.vocab_size): piece = tokenizer._convert_id_to_token(token_id) - text = piece.encode("utf-8") - score = tokenizer_json["model"]["vocab"][token_id][1] - - toktype = SentencePieceTokenTypes.NORMAL - if token_id == unk_token_id: - toktype = SentencePieceTokenTypes.UNKNOWN - elif token_id in tokenizer.all_special_ids: - toktype = SentencePieceTokenTypes.CONTROL - elif token_id in added_vocab.values(): - toktype = SentencePieceTokenTypes.USER_DEFINED - # No reliable way to detect this, but jina doesn't have any - # elif 
tokenizer.IsByte(token_id): - # toktype = SentencePieceTokenTypes.BYTE - - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - if vocab_size > len(tokens): - pad_count = vocab_size - len(tokens) - logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") - for i in range(1, pad_count + 1): - tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.UNUSED) + if (piece := tokenizer._convert_id_to_token(token_id)) is not None: + text = piece.encode("utf-8") + score = tokenizer_json["model"]["vocab"][token_id][1] + + toktype = SentencePieceTokenTypes.NORMAL + if token_id == unk_token_id: + toktype = SentencePieceTokenTypes.UNKNOWN + elif token_id in tokenizer.all_special_ids: + toktype = SentencePieceTokenTypes.CONTROL + elif token_id in added_vocab.values(): + toktype = SentencePieceTokenTypes.USER_DEFINED + # No reliable way to detect this, but jina doesn't have any + # elif tokenizer.IsByte(token_id): + # toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype if isinstance(tokenizer, SentencePieceProcessor): # realign tokens (see HF tokenizer code) @@ -3896,6 +3910,12 @@ def _xlmroberta_set_vocab(self) -> None: SentencePieceTokenTypes.UNKNOWN, ] + toktypes[3:-1] + if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE: + # Add mask token missing from sentencepiece.bpe.model + tokens[250001] = b'' + scores[250001] = 0.0 + toktypes[250001] = SentencePieceTokenTypes.CONTROL + self.gguf_writer.add_tokenizer_model("t5") self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_token_list(tokens) @@ -3910,9 +3930,6 @@ def _xlmroberta_set_vocab(self) -> None: special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(True) - @ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification") class DistilBertModel(BertModel): @@ -3954,8 +3971,6 @@ def set_vocab(self): bpe_tok_path = self.dir_model / "tokenizer.json" if bpe_tok_path.exists(): self._set_vocab_gpt2() - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(True) # we need this to validate the size of the token_type embeddings # though currently we are passing all zeros to the token_type embeddings @@ -4061,6 +4076,34 @@ def _is_tokenizer_xlmroberta(self) -> bool: raise ValueError(f"unknown tokenizer: {toktyp}") +@ModelBase.register("NeoBERT", "NeoBERTLMHead", "NeoBERTForSequenceClassification") +class NeoBert(BertModel): + model_arch = gguf.MODEL_ARCH.NEO_BERT + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # NeoBERT uses 2/3 of the intermediate size as feed forward length + self.gguf_writer.add_feed_forward_length(int(2 * self.hparams["intermediate_size"] / 3)) + self.gguf_writer.add_rope_freq_base(10000.0) # default value for NeoBERT + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + f_rms_eps = self.hparams.get("norm_eps", 1e-6) # default value for NeoBERT + self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) + logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") + + self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS) # https://huggingface.co/chandar-lab/NeoBERT#how-to-use + + def modify_tensors(self, data_torch, name, bid): + if name.startswith("decoder."): + return [] + + if 
name.startswith("model."): + name = name[6:] + + return super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") class XLMRobertaModel(BertModel): model_arch = gguf.MODEL_ARCH.BERT @@ -4187,6 +4230,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") class Gemma3Model(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA3 + norm_shift = 1.0 # Gemma3RMSNorm adds 1.0 to the norm value def set_vocab(self): self._set_vocab_sentencepiece() @@ -4208,9 +4252,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_value_length(hparams.get("head_dim", 256)) self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers - # both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3 + # attn_logit_softcapping is removed in Gemma3 assert hparams.get("attn_logit_softcapping") is None - assert hparams.get("final_logit_softcapping") is None self.gguf_writer.add_sliding_window(hparams["sliding_window"]) self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4)) if hparams.get("rope_scaling") is not None: @@ -4222,7 +4265,7 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - if name.startswith("language_model."): + if "language_model." in name: name = name.replace("language_model.", "") elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ @@ -4237,8 +4280,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # ref code in Gemma3RMSNorm # output = output * (1.0 + self.weight.float()) + # note: this is not the case on gemma3n if name.endswith("norm.weight"): - data_torch = data_torch + 1 + data_torch = data_torch + self.norm_shift return [(self.map_tensor_name(name), data_torch)] @@ -4295,6 +4339,104 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] # skip other tensors +@ModelBase.register("Gemma3nForConditionalGeneration") +class Gemma3NModel(Gemma3Model): + model_arch = gguf.MODEL_ARCH.GEMMA3N + norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code + + _altup_proj: list[Tensor] = [] + _altup_unembd: list[Tensor] = [] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams["altup_num_inputs"] == 4, "Current conversion only supports 4 altup inputs" + self._altup_proj = [ + torch.Tensor(), # to be replaced + torch.Tensor(), # to be replaced + torch.Tensor(), # to be replaced + ] + self._altup_unembd = [ + torch.Tensor(), # to be replaced + torch.Tensor(), # to be replaced + torch.Tensor(), # to be replaced + ] + + def set_vocab(self): + with open(self.dir_model / "chat_template.jinja") as f: + # quick hack to make sure chat template is added + self.gguf_writer.add_chat_template(f.read()) + super().set_vocab() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"]) + self.gguf_writer.add_altup_num_inputs(self.hparams["altup_num_inputs"]) + self.gguf_writer.add_embedding_length_per_layer_input(self.hparams["hidden_size_per_layer_input"]) + self.gguf_writer.add_shared_kv_layers(self.hparams["num_kv_shared_layers"]) + + activation_sparsity_scale = [] + for s in 
self.hparams["activation_sparsity_pattern"]: + normal_dist = torch.distributions.normal.Normal(0, 1) + std_multiplier = normal_dist.icdf(torch.tensor(s, dtype=torch.float32)) + activation_sparsity_scale.append(std_multiplier.item()) + self.gguf_writer.add_activation_sparsity_scale(activation_sparsity_scale) + + sliding_window_pattern = [] + for t in self.hparams["layer_types"]: + sliding_window_pattern.append(t == "sliding_attention") + self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) + + def _stack_matrices(self, matrices: list[Tensor]) -> Tensor | None: + has_all = all(m.numel() > 0 for m in matrices) + if not has_all: + return None + else: + return torch.stack(matrices, dim=0) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.endswith("_scale"): + name = name + ".weight" + + # TODO: implement self.prediction_coefs.weight.clamp_(...) + + if "language_model." not in name: + return [] # skip non-language model tensors + + if "altup_unembed_projections" in name: + data_torch = data_torch.to(device="cpu") + if ".0." in name: + self._altup_unembd[0] = data_torch + elif ".1." in name: + self._altup_unembd[1] = data_torch + elif ".2." in name: + self._altup_unembd[2] = data_torch + else: + raise ValueError(f"Unknown name: {name}") + out = self._stack_matrices(self._altup_unembd) + if out is not None: + return [(self.map_tensor_name("model.altup_unembed_projections.weight"), out)] + else: + return [] + + if "altup_projections" in name: + data_torch = data_torch.to(device="cpu") + if ".0." in name: + self._altup_proj[0] = data_torch + elif ".1." in name: + self._altup_proj[1] = data_torch + elif ".2." in name: + self._altup_proj[2] = data_torch + else: + raise ValueError(f"Unknown name: {name}") + out = self._stack_matrices(self._altup_proj) + if out is not None: + return [(self.map_tensor_name("model.altup_projections.weight"), out)] + else: + return [] + + return super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("Starcoder2ForCausalLM") class StarCoder2Model(TextModel): model_arch = gguf.MODEL_ARCH.STARCODER2 @@ -4800,25 +4942,6 @@ def prepare_tensors(self): class JinaBertV2Model(BertModel): model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.intermediate_size = self.hparams["intermediate_size"] - - def get_tensors(self): - for name, data in super().get_tensors(): - if 'gated_layer' in name: - d1 = data[:self.intermediate_size, :] - name1 = name.replace('gated_layers', 'gated_layers_w') - name1 = name1.replace('up_gated_layer', 'gated_layers_v') - d2 = data[self.intermediate_size:, :] - name2 = name.replace('gated_layers', 'gated_layers_v') - name2 = name2.replace('up_gated_layer', 'gated_layers_w') - yield name1, d1 - yield name2, d2 - continue - - yield name, data - def set_vocab(self): tokenizer_class = 'BertTokenizer' with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: @@ -4831,16 +4954,6 @@ def set_vocab(self): self.gguf_writer.add_token_type_count(2) else: raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel') - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(True) - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # if name starts with "bert.", remove the prefix - # e.g. 
https://huggingface.co/jinaai/jina-reranker-v1-tiny-en - if name.startswith("bert."): - name = name[5:] - - return super().modify_tensors(data_torch, name, bid) @ModelBase.register("OpenELMForCausalLM") @@ -5082,9 +5195,7 @@ def set_vocab(self): def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams - if "head_dim" in hparams: - rope_dim = hparams["head_dim"] - else: + if (rope_dim := hparams.get("head_dim")) is None: rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) @@ -5288,6 +5399,34 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") +@ModelBase.register("Dots1ForCausalLM") +class Dots1Model(Qwen2MoeModel): + model_arch = gguf.MODEL_ARCH.DOTS1 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.hparams["num_experts"] = self.hparams["n_routed_experts"] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"]) + self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"]) + self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"]) + self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"]) + + if self.hparams["scoring_func"] == "noaux_tc": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + else: + raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}") + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + if name.endswith("e_score_correction_bias"): + name = name.replace("e_score_correction_bias", "e_score_correction.bias") + if "shared_experts" in name: + return [(self.map_tensor_name(name), data_torch)] + return super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("PLMForCausalLM") class PLMModel(TextModel): model_arch = gguf.MODEL_ARCH.PLM @@ -5416,9 +5555,6 @@ def set_vocab(self): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) - self.gguf_writer.add_add_bos_token(False) - self.gguf_writer.add_add_eos_token(True) - def set_gguf_parameters(self): if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: logger.warning("Couldn't find context length in config.json, assuming default value of 512") @@ -5556,9 +5692,6 @@ def set_vocab(self): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) - self.gguf_writer.add_add_bos_token(False) - self.gguf_writer.add_add_eos_token(True) - def set_gguf_parameters(self): if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: logger.warning("Couldn't find context length in config.json, assuming default value of 512") @@ -5946,7 +6079,8 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): if rope_scaling.get("rope_type", '').lower() == "llama3": base = self.hparams.get("rope_theta", 10000.0) - dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + if (dim := self.hparams.get("head_dim")) is None: + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) factor = rope_scaling.get("factor", 8.0) @@ -6058,7 +6192,8 @@ def set_vocab(self): def set_gguf_parameters(self): 
super().set_gguf_parameters() hparams = self.hparams - rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"] + if (rope_dim := hparams.get("head_dim")) is None: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) rope_scaling = self.hparams.get("rope_scaling") or {} @@ -6090,7 +6225,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") n_embd = self.hparams["hidden_size"] - head_dim = self.hparams.get("head_dim") or n_embd // n_head + if (head_dim := self.hparams.get("head_dim")) is None: + head_dim = n_embd // n_head output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) @@ -6351,8 +6487,8 @@ def parse_args() -> argparse.Namespace: help="model is executed on big endian machine", ) parser.add_argument( - "model", type=Path, - help="directory containing model file", + "model", type=str, + help="directory containing model file or huggingface repository ID (if --remote)", nargs="?", ) parser.add_argument( @@ -6455,18 +6591,20 @@ def main() -> None: else: logging.basicConfig(level=logging.INFO) - dir_model = args.model - if args.remote: + hf_repo_id = args.model from huggingface_hub import snapshot_download local_dir = snapshot_download( - repo_id=str(dir_model), + repo_id=hf_repo_id, allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"]) dir_model = Path(local_dir) logger.info(f"Downloaded config and tokenizer to {local_dir}") + else: + hf_repo_id = None + dir_model = Path(args.model) if not dir_model.is_dir(): - logger.error(f'Error: {args.model} is not a directory') + logger.error(f'Error: {dir_model} is not a directory') sys.exit(1) ftype_map: dict[str, gguf.LlamaFileType] = { @@ -6486,9 +6624,9 @@ def main() -> None: if args.outfile is not None: fname_out = args.outfile - elif args.remote: + elif hf_repo_id: # if remote, use the model ID as the output file name - fname_out = Path("./" + str(args.model).replace("/", "-") + "-{ftype}.gguf") + fname_out = Path("./" + hf_repo_id.replace("/", "-") + "-{ftype}.gguf") else: fname_out = dir_model @@ -6517,7 +6655,7 @@ def main() -> None: split_max_tensors=args.split_max_tensors, split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, small_first_shard=args.no_tensor_first_split, - remote_hf_model_id=str(args.model) if args.remote else None) + remote_hf_model_id=hf_repo_id) if args.vocab_only: logger.info("Exporting model vocab...") diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md index a5ba617ca7bab..2b001f09abe45 100755 --- a/docs/backend/CANN.md +++ b/docs/backend/CANN.md @@ -8,6 +8,7 @@ - [DataType Supports](#datatype-supports) - [Docker](#docker) - [Linux](#linux) + - [Environment variable setup](#environment-variable-setup) - [TODO](#todo) @@ -290,5 +291,24 @@ Authors from Peking University: Bizhao Shi (bshi@pku.edu.cn), Yuxin Yang (yxyang We would like to thank Tuo Dai, Shanni Li, and all of the project maintainers from Huawei Technologies Co., Ltd for their help during the code development and pull request. +## Environment variable setup + +### GGML_CANN_ASYNC_MODE + +Enables asynchronous operator submission. Disabled by default. + +### GGML_CANN_MEM_POOL + +Specifies the memory pool management strategy: + +- vmm: Utilizes a virtual memory manager pool. If hardware support for VMM is unavailable, falls back to the legacy (leg) memory pool. 
+
+- prio: Employs priority-queue-based memory pool management.
+- leg: Uses a fixed-size buffer pool.
+
+### GGML_CANN_DISABLE_BUF_POOL_CLEAN
+
+Disables automatic cleanup of the memory pool when set. This option is only effective when using the prio or leg memory pool strategies.
+
 ## TODO
 - Support more models and data types.
diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 249e73451e66b..6e9b88935da97 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -757,7 +757,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | Name | Value | Function |
 |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
 | GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
-| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
+| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimized features for Intel GPUs (recommended: set to 1 for Intel devices older than Gen 10) |
 | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
 | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support querying the free memory of the GPU via sycl::aspect::ext_intel_free_memory. Recommended when using --split-mode = layer |
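As a concrete illustration of the table above, here is a minimal sketch of launching an inference run with these variables set; the binary path, model path, and arguments are placeholder assumptions, not files shipped with this change:

```python
# Minimal sketch: run llama-cli with SYCL tuning variables set.
# The binary path, model path, and prompt below are placeholder assumptions.
import os
import subprocess

env = os.environ.copy()
env["GGML_SYCL_DISABLE_OPT"] = "1"  # per the table: recommended for Intel devices older than Gen 10
env["ZES_ENABLE_SYSMAN"] = "1"      # allow free-memory queries; useful with --split-mode layer

subprocess.run(
    ["./build/bin/llama-cli", "-m", "model.gguf", "--split-mode", "layer", "-p", "Hello"],
    env=env,
    check=True,
)
```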
diff --git a/docs/build-s390x.md b/docs/build-s390x.md
new file mode 100644
index 0000000000000..4c9ebb271cee2
--- /dev/null
+++ b/docs/build-s390x.md
@@ -0,0 +1,246 @@
+> [!IMPORTANT]
+> This build documentation is specific only to IBM Z & LinuxONE mainframes (s390x). You can find the build documentation for other architectures in [build.md](build.md).
+
+# Build llama.cpp locally (for s390x)
+
+The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](../include/llama.h).
+
+The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server.
+
+**To get the code:**
+
+```bash
+git clone https://github.com/ggml-org/llama.cpp
+cd llama.cpp
+```
+
+## CPU Build with BLAS
+
+Building llama.cpp with BLAS support is highly recommended, as it has been shown to provide performance improvements. Make sure to have OpenBLAS installed in your environment.
+
+```bash
+cmake -S . -B build \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DGGML_BLAS=ON \
+    -DGGML_BLAS_VENDOR=OpenBLAS
+
+cmake --build build --config Release -j $(nproc)
+```
+
+**Notes**:
+
+- For faster repeated compilation, install [ccache](https://ccache.dev/)
+- By default, VXE/VXE2 is enabled. To disable it (not recommended):
+
+  ```bash
+  cmake -S . -B build \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DGGML_BLAS=ON \
+      -DGGML_BLAS_VENDOR=OpenBLAS \
+      -DGGML_VXE=OFF
+
+  cmake --build build --config Release -j $(nproc)
+  ```
+
+- By default, NNPA is enabled when available. To disable it (not recommended):
+
+  ```bash
+  cmake -S . -B build \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DGGML_BLAS=ON \
+      -DGGML_BLAS_VENDOR=OpenBLAS \
+      -DGGML_NNPA=OFF
+
+  cmake --build build --config Release -j $(nproc)
+  ```
+
+- For debug builds:
+
+  ```bash
+  cmake -S . -B build \
+      -DCMAKE_BUILD_TYPE=Debug \
+      -DGGML_BLAS=ON \
+      -DGGML_BLAS_VENDOR=OpenBLAS
+  cmake --build build --config Debug -j $(nproc)
+  ```
+
+- For static builds, add `-DBUILD_SHARED_LIBS=OFF`:
+
+  ```bash
+  cmake -S . -B build \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DGGML_BLAS=ON \
+      -DGGML_BLAS_VENDOR=OpenBLAS \
+      -DBUILD_SHARED_LIBS=OFF
+
+  cmake --build build --config Release -j $(nproc)
+  ```
+
+## Getting GGUF Models
+
+All models need to be converted to Big-Endian. You can achieve this in one of three ways:
+
+1. **Use pre-converted models verified for use on IBM Z & LinuxONE (easiest)**
+
+   ![File Type - gguf](https://img.shields.io/badge/File_Type-gguf-fff)
+
+   You can find popular models pre-converted and verified at [s390x Ready Models](https://huggingface.co/collections/taronaeo/s390x-ready-models-672765393af438d0ccb72a08).
+
+   These models have already been converted from `safetensors` to `GGUF Big-Endian` and their respective tokenizers verified to run correctly on IBM z15 and later systems.
+
+2. **Convert safetensors model to GGUF Big-Endian directly (recommended)**
+
+   ![File Type - safetensors](https://img.shields.io/badge/File_Type-safetensors-da1e28)
+
+   The model you are trying to convert must be in `safetensors` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct)). Make sure you have downloaded the model repository for this case.
+
+   ```bash
+   python3 convert_hf_to_gguf.py \
+       --outfile model-name-be.f16.gguf \
+       --outtype f16 \
+       --bigendian \
+       model-directory/
+   ```
+
+   For example,
+
+   ```bash
+   python3 convert_hf_to_gguf.py \
+       --outfile granite-3.3-2b-instruct-be.f16.gguf \
+       --outtype f16 \
+       --bigendian \
+       granite-3.3-2b-instruct/
+   ```
+
+3. **Convert existing GGUF Little-Endian model to Big-Endian**
+
+   ![File Type - gguf](https://img.shields.io/badge/File_Type-gguf-fff)
+
+   The model you are trying to convert must be in `gguf` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct-GGUF)). Make sure you have downloaded the model file for this case.
+
+   ```bash
+   python3 gguf-py/gguf/scripts/gguf_convert_endian.py model-name.f16.gguf BIG
+   ```
+
+   For example,
+
+   ```bash
+   python3 gguf-py/gguf/scripts/gguf_convert_endian.py granite-3.3-2b-instruct-le.f16.gguf BIG
+   mv granite-3.3-2b-instruct-le.f16.gguf granite-3.3-2b-instruct-be.f16.gguf
+   ```
+
+   **Notes:**
+
+   - The GGUF endian conversion script may not support all data types at the moment and may fail for some models/quantizations. When that happens, please try manually converting the safetensors model to GGUF Big-Endian via Step 2.
+
+## IBM Accelerators
+
+### 1. SIMD Acceleration
+
+Only available on IBM z15 or later systems, with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp on older systems such as IBM z14/arch12; on such systems, the APIs still run but fall back to a scalar implementation.
+
+### 2. NNPA Vector Intrinsics Acceleration
+
+Only available on IBM z16 or later systems, with the `-DGGML_NNPA=ON` (turned on when available) compile flag. No hardware acceleration is possible with llama.cpp on older systems such as IBM z15/arch13; on such systems, the APIs still run but fall back to a scalar implementation.
+
+### 3. zDNN Accelerator
+
+_Only available on IBM z16 or later systems. No direction at the moment._
+
+### 4. Spyre Accelerator
+
+_No direction at the moment._
+
+## Performance Tuning
+
+### 1. Virtualization Setup
+
+It is strongly recommended to use only LPAR (Type-1) virtualization to get the most performance.
+
+Note: Type-2 virtualization is not supported at the moment; while you can get it running, the performance will not be the best.
+
+### 2. IFL (Core) Count
+
+It is recommended to allocate a minimum of 8 shared IFLs to the LPAR. Increasing the IFL count past 8 shared IFLs improves only Prompt Processing performance, not Token Generation.
+
+Note: IFL count does not equate to vCPU count.
+
+### 3. SMT vs NOSMT (Simultaneous Multithreading)
+
+It is strongly recommended to disable SMT via the kernel boot parameters, as it negatively affects performance. Please refer to your Linux distribution's guide on disabling SMT via kernel boot parameters.
+
+### 4. BLAS vs NOBLAS
+
+IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongly recommended to use BLAS.
+
+## Frequently Asked Questions (FAQ)
+
+1. I'm getting the following error message while trying to load a model: `gguf_init_from_file_impl: failed to load model: this GGUF file version 50331648 is extremely large, is there a mismatch between the host and model endianness?`
+
+   Answer: Please ensure that the model you have downloaded/converted is GGUFv3 Big-Endian. These models are usually denoted with the `-be` suffix, i.e., `granite-3.3-2b-instruct-be.F16.gguf`.
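As a quick check (an illustrative sketch, not a script shipped in this patch), the GGUF header stores the magic bytes followed by a 4-byte version, so the `50331648` in the message above is simply version 3 read with swapped bytes (3 << 24 = 50331648):

```python
# Illustrative sketch: report the byte order of a GGUF file from its header.
# Assumes the magic stays b"GGUF" and only numeric fields are byte-swapped.
import struct
import sys

def gguf_byteorder(path: str) -> str:
    with open(path, "rb") as f:
        magic = f.read(4)
        if magic != b"GGUF":
            raise ValueError(f"{path} is not a GGUF file (magic={magic!r})")
        (version,) = struct.unpack("<I", f.read(4))
    if version < 0xFFFF:
        return f"little-endian, version {version}"
    swapped = struct.unpack(">I", struct.pack("<I", version))[0]
    if swapped < 0xFFFF:
        return f"big-endian, version {swapped}"  # e.g. 50331648 -> 3
    return "unknown"

if __name__ == "__main__":
    print(gguf_byteorder(sys.argv[1]))
```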
+
+   You may refer to the [Getting GGUF Models](#getting-gguf-models) section to manually convert a `safetensors` model to `GGUF` Big-Endian.
+
+2. I'm getting extremely poor performance when running inference on a model.
+
+   Answer: Please refer to the [Appendix B: SIMD Support Matrix](#appendix-b-simd-support-matrix) to check if your model quantization is supported by SIMD acceleration.
+
+3. I'm building on IBM z17 and getting the following error message: `invalid switch -march=z17`
+
+   Answer: Please ensure that your GCC compiler is at least version 15.1.0 and that `binutils` is updated to the latest version. If this does not fix the problem, kindly open an issue.
+
+## Getting Help on IBM Z & LinuxONE
+
+1. **Bugs, Feature Requests**
+
+   Please file an issue in llama.cpp and ensure that the title contains "s390x".
+
+2. **Other Questions**
+
+   Please reach out directly to [aionz@us.ibm.com](mailto:aionz@us.ibm.com).
+
+## Appendix A: Hardware Support Matrix
+
+|         | Support | Minimum Compiler Version |
+| ------- | ------- | ------------------------ |
+| IBM z15 | ✅ |  |
+| IBM z16 | ✅ |  |
+| IBM z17 | ✅ | GCC 15.1.0 |
+
+- ✅ - supported and verified to run as intended
+- 🚫 - unsupported, we are unlikely to be able to provide support
+
+## Appendix B: SIMD Support Matrix
+
+|            | VX/VXE/VXE2 | NNPA | zDNN | Spyre |
+| ---------- | ----------- | ---- | ---- | ----- |
+| FP32       | ✅ | ✅ | ❓ | ❓ |
+| FP16       | ✅ | ✅ | ❓ | ❓ |
+| BF16       | 🚫 | 🚫 | ❓ | ❓ |
+| Q4_0       | ✅ | ✅ | ❓ | ❓ |
+| Q4_1       | ✅ | ✅ | ❓ | ❓ |
+| Q5_0       | 🚫 | 🚫 | ❓ | ❓ |
+| Q5_1       | 🚫 | 🚫 | ❓ | ❓ |
+| Q8_0       | ✅ | ✅ | ❓ | ❓ |
+| Q2_K       | 🚫 | 🚫 | ❓ | ❓ |
+| Q3_K       | ✅ | ✅ | ❓ | ❓ |
+| Q4_K       | ✅ | ✅ | ❓ | ❓ |
+| Q5_K       | ✅ | ✅ | ❓ | ❓ |
+| Q6_K       | ✅ | ✅ | ❓ | ❓ |
+| TQ1_0      | 🚫 | 🚫 | ❓ | ❓ |
+| TQ2_0      | 🚫 | 🚫 | ❓ | ❓ |
+| IQ2_XXS    | 🚫 | 🚫 | ❓ | ❓ |
+| IQ2_XS     | 🚫 | 🚫 | ❓ | ❓ |
+| IQ2_S      | 🚫 | 🚫 | ❓ | ❓ |
+| IQ3_XXS    | 🚫 | 🚫 | ❓ | ❓ |
+| IQ3_S      | 🚫 | 🚫 | ❓ | ❓ |
+| IQ1_S      | 🚫 | 🚫 | ❓ | ❓ |
+| IQ1_M      | 🚫 | 🚫 | ❓ | ❓ |
+| IQ4_NL     | ✅ | ✅ | ❓ | ❓ |
+| IQ4_XS     | ✅ | ✅ | ❓ | ❓ |
+| FP32->FP16 | 🚫 | ✅ | ❓ | ❓ |
+| FP16->FP32 | 🚫 | ✅ | ❓ | ❓ |
+
+- ✅ - acceleration available
+- 🚫 - acceleration unavailable, will still run using scalar implementation
+- ❓ - acceleration unknown, please contribute if you can test it yourself
diff --git a/docs/build.md b/docs/build.md
index 32717a793ffad..2e0b5d970c91a 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -1,5 +1,9 @@
 # Build llama.cpp locally
 
+The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](../include/llama.h).
+
+The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server.
+
 **To get the Code:**
 
 ```bash
@@ -553,6 +557,10 @@ ninja
 
 To read documentation for how to build on Android, [click here](./android.md)
 
+## IBM Z & LinuxONE
+
+To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md)
+
 ## Notes about GPU-accelerated backends
 
 The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.
diff --git a/docs/function-calling.md b/docs/function-calling.md
index fd3db9bd16a92..37eacaf3100c1 100644
--- a/docs/function-calling.md
+++ b/docs/function-calling.md
@@ -11,7 +11,7 @@ Function calling is supported for all models (see https://github.com/ggml-org/ll
   - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
   - Functionary v3.1 / v3.2
   - Hermes 2/3, Qwen 2.5
-  - Qwen 2.5 Coder (WIP: https://github.com/ggml-org/llama.cpp/pull/12034)
+  - Qwen 2.5 Coder
   - Mistral Nemo
   - Firefunction v2
   - Command R7B
diff --git a/docs/install.md b/docs/install.md
index 4971c18281cc9..7200bf9b7b91d 100644
--- a/docs/install.md
+++ b/docs/install.md
@@ -1,28 +1,42 @@
 # Install pre-built version of llama.cpp
 
-## Homebrew
+| Install via | Windows | Mac | Linux |
+|-------------|---------|-----|-------|
+| Winget | ✅ | | |
+| Homebrew | | ✅ | ✅ |
+| MacPorts | | ✅ | |
+| Nix | | ✅ | ✅ |
 
-On Mac and Linux, the homebrew package manager can be used via
+## Winget (Windows)
+
+```sh
+winget install llama.cpp
+```
+
+The package is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/issues/8188
+
+## Homebrew (Mac and Linux)
 
 ```sh
 brew install llama.cpp
 ```
+
 The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668
 
-## MacPorts
+## MacPorts (Mac)
 
 ```sh
 sudo port install llama.cpp
 ```
-see also: https://ports.macports.org/port/llama.cpp/details/
 
-## Nix
+See also: https://ports.macports.org/port/llama.cpp/details/
 
-On Mac and Linux, the Nix package manager can be used via
+## Nix (Mac and Linux)
 
 ```sh
 nix profile install nixpkgs#llama-cpp
 ```
+
 For flake enabled installs.
 
 Or
 
@@ -34,13 +48,3 @@ nix-env --file '<nixpkgs>' --install --attr llama-cpp
 
 For non-flake enabled installs.
 
 This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
-
-## Flox
-
-On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
-
-```sh
-flox install llama-cpp
-```
-
-Flox follows the nixpkgs build of llama.cpp.
diff --git a/docs/multimodal.md b/docs/multimodal.md index e849c2a0b8ba1..edbd081df7969 100644 --- a/docs/multimodal.md +++ b/docs/multimodal.md @@ -107,3 +107,7 @@ NOTE: some models may require large context window, for example: `-c 8192` (tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF (tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF ``` + +## Finding more models: + +GGUF models on Huggingface with vision capabilities can be found here: https://huggingface.co/models?pipeline_tag=image-text-to-text&sort=trending&search=gguf diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 514989e340e2c..fd90bbec5f751 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -116,7 +116,7 @@ if llama_decode(context, batch) != 0 { } for i in 1 ..< n_parallel { - llama_kv_self_seq_cp(context, 0, Int32(i), 0, batch.n_tokens) + llama_memory_seq_cp(llama_get_memory(context), 0, Int32(i), 0, batch.n_tokens) } if n_parallel > 1 { diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 71f700877a3b9..0ec2999a0c8e9 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -37,7 +37,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_self_clear(ctx); + llama_memory_clear(llama_get_memory(ctx), true); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); @@ -133,10 +133,36 @@ int main(int argc, char ** argv) { // max batch size const uint64_t n_batch = params.n_batch; + // get added sep and eos token, if any + const std::string added_sep_token = llama_vocab_get_add_sep(vocab) ? llama_vocab_get_text(vocab, llama_vocab_sep(vocab)) : ""; + const std::string added_eos_token = llama_vocab_get_add_eos(vocab) ? 
llama_vocab_get_text(vocab, llama_vocab_eos(vocab)) : "";
+
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = common_tokenize(ctx, prompt, true, true);
+        std::vector<int32_t> inp;
+
+        // split classification pairs and insert expected separator tokens
+        if (pooling_type == LLAMA_POOLING_TYPE_RANK && prompt.find(params.cls_sep) != std::string::npos) {
+            std::vector<std::string> pairs = split_lines(prompt, params.cls_sep);
+            std::string final_prompt;
+
+            for (size_t i = 0; i < pairs.size(); i++) {
+                final_prompt += pairs[i];
+                if (i != pairs.size() - 1) {
+                    if (!added_eos_token.empty()) {
+                        final_prompt += added_eos_token;
+                    }
+                    if (!added_sep_token.empty()) {
+                        final_prompt += added_sep_token;
+                    }
+                }
+            }
+
+            inp = common_tokenize(ctx, final_prompt, true, true);
+        } else {
+            inp = common_tokenize(ctx, prompt, true, true);
+        }
 
         if (inp.size() > n_batch) {
             LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);
@@ -145,11 +171,11 @@
         inputs.push_back(inp);
     }
 
-    // check if the last token is SEP
+    // check if the last token is SEP/EOS
     // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
     for (auto & inp : inputs) {
-        if (inp.empty() || inp.back() != llama_vocab_sep(vocab)) {
-            LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
+        if (inp.empty() || (inp.back() != llama_vocab_sep(vocab) && inp.back() != llama_vocab_eos(vocab))) {
+            LOG_WRN("%s: last token in the prompt is not SEP or EOS\n", __func__);
             LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
         }
     }
@@ -236,9 +262,24 @@ int main(int argc, char ** argv) {
                 LOG("\n");
             }
         }
     } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
+        const uint32_t n_cls_out = llama_model_n_cls_out(model);
+        std::vector<std::string> cls_out_labels;
+
+        for (uint32_t i = 0; i < n_cls_out; i++) {
+            const char * label = llama_model_cls_label(model, i);
+            const std::string label_i(label == nullptr ? "" : label);
+            cls_out_labels.emplace_back(label_i.empty() ?
std::to_string(i) : label_i); + } + for (int j = 0; j < n_embd_count; j++) { - // NOTE: if you change this log - update the tests in ci/run.sh - LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]); + for (uint32_t i = 0; i < n_cls_out; i++) { + // NOTE: if you change this log - update the tests in ci/run.sh + if (n_cls_out == 1) { + LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]); + } else { + LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd + i], cls_out_labels[i].c_str()); + } + } } } else { // print the first part of the embeddings or for a single prompt, the full embedding diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 539bc4d6027fb..bdab052c3390f 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -41,12 +41,11 @@ static std::vector> encode(llama_context * ctx, const std::ve // add input to batch (this increments n_tokens) for (int32_t j = 0; j < n_toks; j++) { - common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst); + common_batch_add(batch, inputs[j], j, { 0 }, true); } // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_self_clear(ctx); - llama_set_embeddings(ctx, true); + llama_memory_clear(llama_get_memory(ctx), true); llama_set_causal_attn(ctx, false); // run model @@ -102,8 +101,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std llama_token eos_token = llama_vocab_eos(vocab); - llama_kv_self_clear(ctx); - llama_set_embeddings(ctx, false); + llama_memory_clear(llama_get_memory(ctx), true); llama_set_causal_attn(ctx, true); llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1); @@ -166,6 +164,8 @@ int main(int argc, char * argv[]) { llama_model_params mparams = common_model_params_to_llama(params); llama_context_params cparams = common_context_params_to_llama(params); + cparams.embeddings = true; + llama_backend_init(); llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); @@ -213,6 +213,8 @@ int main(int argc, char * argv[]) { std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1); } + llama_set_embeddings(ctx, false); + // ### Generation ### // GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction { diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 9654cd53cf8d5..711ddc5d19587 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( } batch->logits[batch->n_tokens - 1] = true; - llama_kv_self_clear(context); + llama_memory_clear(llama_get_memory(context), false); const auto t_pp_start = ggml_time_us(); if (llama_decode(context, *batch) != 0) { @@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( LOGi("Benchmark text generation (tg)"); - llama_kv_self_clear(context); + llama_memory_clear(llama_get_memory(context), false); const auto t_tg_start = ggml_time_us(); for (i = 0; i < tg; i++) { @@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( const auto t_tg_end = ggml_time_us(); - llama_kv_self_clear(context); + llama_memory_clear(llama_get_memory(context), false); const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0; const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0; @@ 
-448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
 extern "C"
 JNIEXPORT void JNICALL
 Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
-    llama_kv_self_clear(reinterpret_cast<llama_context *>(context));
+    llama_memory_clear(llama_get_memory(reinterpret_cast<llama_context *>(context)), true);
 }
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
index f6e31abc93c09..dc2bafc88b175 100644
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -210,7 +210,7 @@ actor LlamaContext {
         }
         batch.logits[Int(batch.n_tokens) - 1] = 1 // true
 
-        llama_kv_self_clear(context)
+        llama_memory_clear(llama_get_memory(context), false)
 
         let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000;
 
@@ -223,7 +223,7 @@ actor LlamaContext {
 
         // bench text generation
 
-        llama_kv_self_clear(context)
+        llama_memory_clear(llama_get_memory(context), false)
 
         let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000;
 
@@ -242,7 +242,7 @@ actor LlamaContext {
 
         let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000;
 
-        llama_kv_self_clear(context)
+        llama_memory_clear(llama_get_memory(context), false)
 
         let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
         let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
 
@@ -292,7 +292,7 @@ actor LlamaContext {
     func clear() {
         tokens_list.removeAll()
        temporary_invalid_cchars.removeAll()
-        llama_kv_self_clear(context)
+        llama_memory_clear(llama_get_memory(context), true)
     }
 
     private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index 5f8620973f40e..1e26d8221b86b 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -60,6 +60,8 @@ int main(int argc, char ** argv) {
     llama_model * model = llama_init.model.get();
     llama_context * ctx = llama_init.context.get();
 
+    auto * mem = llama_get_memory(ctx);
+
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     // Tokenize the prompt
@@ -94,7 +96,7 @@
     llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
 
     for (int s = 1; s < W + G + 1; ++s) {
-        llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
+        llama_memory_seq_cp(mem, 0, s, -1, -1);
     }
 
     const auto t_enc_end = ggml_time_us();
 
@@ -427,17 +429,17 @@
             // KV cache management
             // if no verification token matched, we simply remove all cells from this batch -> no fragmentation
-            llama_kv_self_seq_rm(ctx, -1, n_past, -1);
+            llama_memory_seq_rm(mem, -1, n_past, -1);
 
             if (seq_id_best != 0) {
                 // if a verification token matched, we keep the best sequence and remove the rest
                 // this leads to some KV cache fragmentation
-                llama_kv_self_seq_keep(ctx, seq_id_best);
-                llama_kv_self_seq_cp (ctx, seq_id_best, 0, -1, -1);
-                llama_kv_self_seq_rm (ctx, seq_id_best, -1, -1);
+                llama_memory_seq_keep(mem, seq_id_best);
+                llama_memory_seq_cp (mem, seq_id_best, 0, -1, -1);
+                llama_memory_seq_rm (mem, seq_id_best, -1, -1);
 
                 for (int s = 1; s < W + G + 1; ++s) {
-                    llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
+                    llama_memory_seq_cp(mem, 0, s, -1, -1);
                 }
             }
         }
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index 2ee502939d554..2bfa26b55f0a6 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -181,7 +181,7 @@ int main(int argc, char ** argv){
 
         // KV cache management
         // clean the cache of draft tokens that weren't accepted
-        llama_kv_self_seq_rm(ctx, 0,
n_past, -1); + llama_memory_seq_rm(llama_get_memory(ctx), 0, n_past, -1); common_batch_clear(batch_tgt); common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index d7b269df0dea2..d53e089a4cbc2 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -158,7 +158,7 @@ int main(int argc, char ** argv) { common_params params; params.n_predict = 128; - params.n_junk = 0; + params.n_junk = 1; if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) { return 1; @@ -182,7 +182,7 @@ int main(int argc, char ** argv) { const bool is_sp_shared = params.is_pp_shared; // extra text to insert in each client's prompt in order to make it larger - const int32_t n_junk = params.n_junk; + const int32_t n_junk = std::max(1, params.n_junk); // init llama.cpp llama_backend_init(); @@ -194,6 +194,8 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); + auto * mem = llama_get_memory(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); // load the prompts from an external file if there are any @@ -259,7 +261,7 @@ int main(int argc, char ** argv) { // assign the system KV cache to all parallel sequences for (int32_t i = 1; i <= n_clients; ++i) { - llama_kv_self_seq_cp(ctx, 0, i, -1, -1); + llama_memory_seq_cp(mem, 0, i, -1, -1); } LOG_INF("\n"); @@ -286,9 +288,9 @@ int main(int argc, char ** argv) { if (batch.n_tokens == 0) { // all sequences have ended - clear the entire KV cache for (int i = 1; i <= n_clients; ++i) { - llama_kv_self_seq_rm(ctx, i, -1, -1); + llama_memory_seq_rm(mem, i, -1, -1); // but keep the system prompt - llama_kv_self_seq_cp(ctx, 0, i, -1, -1); + llama_memory_seq_cp(mem, 0, i, -1, -1); } LOG_INF("%s: clearing the KV cache\n", __func__); @@ -447,8 +449,8 @@ int main(int argc, char ** argv) { } // delete only the generated part of the sequence, i.e. 
keep the system prompt in the cache - llama_kv_self_seq_rm(ctx, client.id + 1, -1, -1); - llama_kv_self_seq_cp(ctx, 0, client.id + 1, -1, -1); + llama_memory_seq_rm(mem, client.id + 1, -1, -1); + llama_memory_seq_cp(mem, 0, client.id + 1, -1, -1); const auto t_main_end = ggml_time_us(); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 5ac881b45e268..8a4faa383bf32 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -126,6 +126,8 @@ int main(int argc, char ** argv) { int n_past = 0; + auto * mem = llama_get_memory(ctx); + // fill the KV cache for (int i = 0; i < n_ctx; i += n_batch) { if (i > 0 && n_grp > 1) { @@ -133,10 +135,10 @@ int main(int argc, char ** argv) { const int ib = i/n_batch - 1; const int bd = n_batch_grp*(n_grp - 1); - llama_kv_self_seq_add(ctx, 0, n_past - n_batch, n_past, ib*bd); - llama_kv_self_seq_div(ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + llama_memory_seq_add(mem, 0, n_past - n_batch, n_past, ib*bd); + llama_memory_seq_div(mem, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); - n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; + n_past = llama_memory_seq_pos_max(mem, 0) + 1; } common_batch_clear(batch); @@ -166,10 +168,10 @@ int main(int argc, char ** argv) { LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard); - llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_memory_seq_rm (mem, 0, n_keep , n_keep + n_discard); + llama_memory_seq_add(mem, 0, n_keep + n_discard, n_ctx, -n_discard); - n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; + n_past = llama_memory_seq_pos_max(mem, 0) + 1; common_batch_clear(batch); @@ -195,10 +197,10 @@ int main(int argc, char ** argv) { if (n_discard > 0) { LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); - llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + llama_memory_seq_rm (mem, 0, n_keep , n_keep + n_discard); + llama_memory_seq_add(mem, 0, n_keep + n_discard, n_ctx, -n_discard); - n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; + n_past = llama_memory_seq_pos_max(mem, 0) + 1; } } diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 754da1411bcc1..042e12c2bf83a 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -83,7 +83,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke static void batch_process(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_self_clear(ctx); + llama_memory_clear(llama_get_memory(ctx), false); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 760ebbbf08788..db79588f1a5a4 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -196,7 +196,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); // erase whole kv - llama_kv_self_clear(ctx3); + llama_memory_clear(llama_get_memory(ctx3), true); fprintf(stderr, "%s : kv cache cleared\n", __func__); // restore kv into seq 1 diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index 
6608d4bea05c8..cf1178043d8d1 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -98,7 +98,7 @@ int main(int argc, char ** argv) { auto generate = [&](const std::string & prompt) { std::string response; - const bool is_first = llama_kv_self_seq_pos_max(ctx, 0) == 0; + const bool is_first = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) == -1; // tokenize the prompt const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); @@ -113,7 +113,7 @@ int main(int argc, char ** argv) { while (true) { // check if we have enough space in the context to evaluate this batch int n_ctx = llama_n_ctx(ctx); - int n_ctx_used = llama_kv_self_seq_pos_max(ctx, 0); + int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0); if (n_ctx_used + batch.n_tokens > n_ctx) { printf("\033[0m\n"); fprintf(stderr, "context size exceeded\n"); diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index 0783ed4a4c43e..99196c9d047e4 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -217,7 +217,7 @@ int main(int argc, char ** argv) { { LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past); - llama_kv_self_seq_rm(ctx_tgt, 0, n_past, -1); + llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, n_past, -1); } if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) { diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 561c308830351..0adffdb006bcf 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -142,6 +142,8 @@ int main(int argc, char ** argv) { } } + auto * mem_tgt = llama_get_memory(ctx_tgt); + auto * mem_dft = llama_get_memory(ctx_dft); // Tokenize the prompt std::vector inp; @@ -420,14 +422,14 @@ int main(int argc, char ** argv) { { LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); - llama_kv_self_seq_keep(ctx_dft, s_keep); - llama_kv_self_seq_cp (ctx_dft, s_keep, 0, -1, -1); - llama_kv_self_seq_keep(ctx_dft, 0); + llama_memory_seq_keep(mem_dft, s_keep); + llama_memory_seq_cp (mem_dft, s_keep, 0, -1, -1); + llama_memory_seq_keep(mem_dft, 0); - llama_kv_self_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1); - llama_kv_self_seq_keep(ctx_tgt, s_keep); - llama_kv_self_seq_cp (ctx_tgt, s_keep, 0, -1, -1); - llama_kv_self_seq_keep(ctx_tgt, 0); + llama_memory_seq_rm (mem_tgt, s_keep, n_past_tgt, -1); + llama_memory_seq_keep(mem_tgt, s_keep); + llama_memory_seq_cp (mem_tgt, s_keep, 0, -1, -1); + llama_memory_seq_keep(mem_tgt, 0); } for (int s = 0; s < n_seq_dft; ++s) { @@ -444,7 +446,7 @@ int main(int argc, char ** argv) { common_batch_clear(batch_dft); common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); - llama_kv_self_seq_rm(ctx_dft, 0, n_past_dft, -1); + llama_memory_seq_rm(mem_dft, 0, n_past_dft, -1); // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); llama_decode(ctx_dft, batch_dft); @@ -503,8 +505,8 @@ int main(int argc, char ** argv) { if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) { LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur); - llama_kv_self_seq_rm(ctx_dft, n_seq_cur, -1, -1); - llama_kv_self_seq_cp(ctx_dft, s, n_seq_cur, -1, -1); + llama_memory_seq_rm(mem_dft, n_seq_cur, -1, -1); + llama_memory_seq_cp(mem_dft, s, n_seq_cur, -1, -1); // all previous tokens 
from this branch are now also part of the new branch for (int t = 0; t < batch_tgt.n_tokens; ++t) { @@ -585,9 +587,9 @@ int main(int argc, char ** argv) { // evaluate the target model on the drafted tokens { - llama_kv_self_seq_keep(ctx_tgt, 0); + llama_memory_seq_keep(mem_tgt, 0); for (int s = 1; s < n_seq_dft; ++s) { - llama_kv_self_seq_cp(ctx_tgt, 0, s, -1, -1); + llama_memory_seq_cp(mem_tgt, 0, s, -1, -1); } // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 3d01184a2ee6b..fdc76808ada6a 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -105,7 +105,7 @@ message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}") message(DEBUG "INS_ENB : ${INS_ENB}") option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF) -option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON) +option(GGML_CPU_REPACK "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON) option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF) option(GGML_SSE42 "ggml: enable SSE 4.2" ${INS_ENB}) option(GGML_AVX "ggml: enable AVX" ${INS_ENB}) @@ -131,13 +131,14 @@ option(GGML_RVV "ggml: enable rvv" ON) option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF) option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF) option(GGML_VXE "ggml: enable vxe" ON) +option(GGML_NNPA "ggml: enable nnpa" ON) option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF) set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM") set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC") -if (WIN32) +if (MINGW) set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version") endif() @@ -172,6 +173,7 @@ option(GGML_HIP "ggml: use HIP" option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF) option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON) option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF) +option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF) option(GGML_VULKAN "ggml: use Vulkan" OFF) option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF) option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF) @@ -205,6 +207,7 @@ option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON) set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING "gmml: OpenCL API version to target") +option(GGML_HEXAGON "ggml: use HEXAGON" OFF) # toolchain for vulkan-shaders-gen set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen") @@ -270,6 +273,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-rpc.h include/ggml-sycl.h include/ggml-vulkan.h + include/ggml-hexagon.h include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") @@ -367,6 +371,8 @@ if (MSVC) /wd4005 # Macro redefinition /wd4244 # Conversion from one type to another type, possible loss of data /wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data + /wd4305 # Conversion from 'type1' to 'type2', possible loss of data + /wd4566 # Conversion from 'char' to 'wchar_t', possible loss of data /wd4996 # Disable POSIX deprecation warnings /wd4702 # Unreachable code warnings ) @@ -386,4 +392,46 @@ if (MSVC) disable_msvc_warnings(ggml-cpu-skylakex) disable_msvc_warnings(ggml-cpu-icelake) disable_msvc_warnings(ggml-cpu-alderlake) + + 
if (GGML_BUILD_EXAMPLES)
+        disable_msvc_warnings(common-ggml)
+        disable_msvc_warnings(common)
+
+        disable_msvc_warnings(mnist-common)
+        disable_msvc_warnings(mnist-eval)
+        disable_msvc_warnings(mnist-train)
+
+        disable_msvc_warnings(gpt-2-ctx)
+        disable_msvc_warnings(gpt-2-alloc)
+        disable_msvc_warnings(gpt-2-backend)
+        disable_msvc_warnings(gpt-2-sched)
+        disable_msvc_warnings(gpt-2-quantize)
+        disable_msvc_warnings(gpt-2-batched)
+
+        disable_msvc_warnings(gpt-j)
+        disable_msvc_warnings(gpt-j-quantize)
+
+        disable_msvc_warnings(magika)
+        disable_msvc_warnings(yolov3-tiny)
+        disable_msvc_warnings(sam)
+
+        disable_msvc_warnings(simple-ctx)
+        disable_msvc_warnings(simple-backend)
+    endif()
+
+    if (GGML_BUILD_TESTS)
+        disable_msvc_warnings(test-mul-mat)
+        disable_msvc_warnings(test-arange)
+        disable_msvc_warnings(test-backend-ops)
+        disable_msvc_warnings(test-cont)
+        disable_msvc_warnings(test-conv-transpose)
+        disable_msvc_warnings(test-conv-transpose-1d)
+        disable_msvc_warnings(test-conv1d)
+        disable_msvc_warnings(test-conv2d)
+        disable_msvc_warnings(test-conv2d-dw)
+        disable_msvc_warnings(test-customop)
+        disable_msvc_warnings(test-dup)
+        disable_msvc_warnings(test-opt)
+        disable_msvc_warnings(test-pool)
+    endif ()
 endif()
diff --git a/ggml/cmake/common.cmake b/ggml/cmake/common.cmake
index bb1ec9b37a7f0..cb66388332040 100644
--- a/ggml/cmake/common.cmake
+++ b/ggml/cmake/common.cmake
@@ -36,8 +36,7 @@ function(ggml_get_system_arch)
             (NOT CMAKE_OSX_ARCHITECTURES AND
              NOT CMAKE_GENERATOR_PLATFORM_LWR AND
              CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
         set(GGML_SYSTEM_ARCH "x86" PARENT_SCOPE)
-    elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR
-            "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc|power")
         set(GGML_SYSTEM_ARCH "PowerPC" PARENT_SCOPE)
     elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
         set(GGML_SYSTEM_ARCH "loongarch64" PARENT_SCOPE)
diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index de77a875ec533..e3b79d09bb66f 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -101,6 +101,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_riscv_v     (void);
     GGML_BACKEND_API int ggml_cpu_has_vsx         (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe         (void);
+    GGML_BACKEND_API int ggml_cpu_has_nnpa        (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd   (void);
     GGML_BACKEND_API int ggml_cpu_has_llamafile   (void);
diff --git a/ggml/include/ggml-hexagon.h b/ggml/include/ggml-hexagon.h
new file mode 100644
index 0000000000000..fe9d4d8e588ba
--- /dev/null
+++ b/ggml/include/ggml-hexagon.h
@@ -0,0 +1,51 @@
+ /*
+ * Copyright (c) 2024-2025 The ggml authors
+ */
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define GGML_HEXAGON_MAX_DEVICES 4
+#define GGML_HEXAGON_BACKEND_NAME "hexagon"
+
+enum HEXAGONBackend {
+    HEXAGON_BACKEND_QNNCPU = 0,
+    HEXAGON_BACKEND_QNNGPU = 1,
+    HEXAGON_BACKEND_QNNNPU = 2,
+    HEXAGON_BACKEND_CDSP   = 3,
+    HEXAGON_BACKEND_GGML   = 4, // "fake" HEXAGON backend, used to compare performance between the HEXAGON backend and the ggml backend
+};
+
+// 0: general approach through QNN: offload ggml ops to QNN (QNNCPU, QNNGPU, QNNNPU)
+// 1: special approach through QNN-SINGLEGRAPH: map an entire ggml cgraph to a single QNN graph
+// 2: general approach through Hexagon cDSP: offload ggml ops to the Hexagon cDSP directly
+enum hwaccel_approach_type {
+    HWACCEL_QNN            = 0,
+    HWACCEL_QNN_SINGLEGRAPH= 1,
+    HWACCEL_CDSP           = 2,
+};
+
+GGML_BACKEND_API ggml_backend_t
ggml_backend_hexagon_init(size_t dev_num, const char * qnn_lib_path); + +GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend); + +GGML_BACKEND_API int ggml_backend_hexagon_get_device_count(void); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void); + +GGML_BACKEND_API const char * ggml_backend_hexagon_get_devname(size_t dev_num); + +GGML_BACKEND_API void ggml_backend_hexagon_set_cfg(int new_hexagon_backend, int new_hwaccel_approach); + +GGML_BACKEND_API int ggml_backend_hexagon_get_mulmat_algotype(void); + +GGML_BACKEND_API void ggml_backend_hexagon_set_mulmat_algotype(int new_mulmat_algotype); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 2226aadcff893..9c4e24023b5ad 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -489,6 +489,7 @@ extern "C" { GGML_OP_UPSCALE, // nearest interpolate GGML_OP_PAD, GGML_OP_PAD_REFLECT_1D, + GGML_OP_ROLL, GGML_OP_ARANGE, GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_ARGSORT, @@ -1801,6 +1802,17 @@ extern "C" { int p0, int p1); + // Move tensor elements by an offset given for each dimension. Elements that + // are shifted beyond the last position are wrapped around to the beginning. + GGML_API struct ggml_tensor * ggml_roll( + struct ggml_context * ctx, + struct ggml_tensor * a, + int shift0, + int shift1, + int shift2, + int shift3); + + // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 // timesteps: [N,] // return: [N, dim] @@ -2095,9 +2107,6 @@ extern "C" { GGML_API struct ggml_tensor * ggml_graph_get_grad (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node); GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node); - GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); - GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); - // print info and performance information for the graph GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph); diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 5681ecddba782..8af27a1f753a6 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -125,7 +125,6 @@ if (NOT MSVC) endif() if (MINGW) - # Target Windows 8 for PrefetchVirtualMemory add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER}) endif() @@ -196,6 +195,7 @@ add_library(ggml-base ../include/ggml-opt.h ../include/gguf.h ggml.c + ggml.cpp ggml-alloc.c ggml-backend.cpp ggml-opt.cpp @@ -212,6 +212,7 @@ endif() add_library(ggml ggml-backend-reg.cpp) +add_library(ggml::ggml ALIAS ggml) target_link_libraries(ggml PUBLIC ggml-base) @@ -226,6 +227,7 @@ function(ggml_add_backend_library backend) set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL) add_dependencies(ggml ${backend}) + install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR}) else() add_library(${backend} ${ARGN}) target_link_libraries(ggml PUBLIC ${backend}) @@ -268,17 +270,27 @@ endfunction() function(ggml_add_cpu_backend_variant tag_name) set(GGML_CPU_TAG_NAME ${tag_name}) # other: OPENMP LLAMAFILE CPU_HBM - foreach (feat NATIVE - SSE42 - AVX AVX2 BMI2 AVX_VNNI FMA F16C - AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 - AMX_TILE AMX_INT8 AMX_BF16) - set(GGML_${feat} OFF) - endforeach() - - foreach (feat ${ARGN}) - 
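The ggml_roll() operator added to ggml.h above wraps shifted elements around to the beginning, in the spirit of np.roll. A worked sketch of the semantics, assuming a 1-D tensor and the positive-shift convention the comment implies:

    // with a 1-D tensor holding [a, b, c, d]:
    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * r = ggml_roll(ctx, t, 1, 0, 0, 0);  // shift0 = 1 -> [d, a, b, c]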
set(GGML_${feat} ON) - endforeach() + if (GGML_SYSTEM_ARCH STREQUAL "x86") + foreach (feat NATIVE + SSE42 + AVX AVX2 BMI2 AVX_VNNI FMA F16C + AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 + AMX_TILE AMX_INT8 AMX_BF16) + set(GGML_${feat} OFF) + endforeach() + + foreach (feat ${ARGN}) + set(GGML_${feat} ON) + endforeach() + elseif (GGML_SYSTEM_ARCH STREQUAL "ARM") + foreach (feat ${ARGN}) + set(GGML_INTERNAL_${feat} ON) + endforeach() + elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC") + foreach (feat ${ARGN}) + set(GGML_INTERNAL_${feat} ON) + endforeach() + endif() ggml_add_cpu_backend_variant_impl(${tag_name}) endfunction() @@ -288,6 +300,8 @@ ggml_add_backend(CPU) if (GGML_CPU_ALL_VARIANTS) if (NOT GGML_BACKEND_DL) message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL") + elseif (GGML_CPU_ARM_ARCH) + message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS") endif() if (GGML_SYSTEM_ARCH STREQUAL "x86") ggml_add_cpu_backend_variant(x64) @@ -301,8 +315,47 @@ if (GGML_CPU_ALL_VARIANTS) # MSVC doesn't support AMX ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) endif() + elseif(GGML_SYSTEM_ARCH STREQUAL "ARM") + if (CMAKE_SYSTEM_NAME MATCHES "Linux") + # Many of these features are optional so we build versions with popular + # combinations and name the backends based on the version they were + # first released with + ggml_add_cpu_backend_variant(armv8.0_1) + ggml_add_cpu_backend_variant(armv8.2_1 DOTPROD) + ggml_add_cpu_backend_variant(armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC) + ggml_add_cpu_backend_variant(armv8.2_3 DOTPROD FP16_VECTOR_ARITHMETIC SVE) + ggml_add_cpu_backend_variant(armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8) + ggml_add_cpu_backend_variant(armv8.6_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2) + ggml_add_cpu_backend_variant(armv9.2_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SME) + ggml_add_cpu_backend_variant(armv9.2_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2 SME) + elseif (CMAKE_SYSTEM_NAME MATCHES "Android") + # Android-specific backends with SoC-compatible feature sets + ggml_add_cpu_backend_variant(android_armv8.0_1) + ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD) + ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC) + ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8) + elseif (APPLE) + ggml_add_cpu_backend_variant(apple_m1 DOTPROD) + ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8) + ggml_add_cpu_backend_variant(apple_m4 DOTPROD MATMUL_INT8 NOSVE SME) + else() + message(FATAL_ERROR "Unsupported ARM target OS: ${CMAKE_SYSTEM_NAME}") + endif() + elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC") + if (CMAKE_SYSTEM_NAME MATCHES "Linux") + ggml_add_cpu_backend_variant(power0) + ggml_add_cpu_backend_variant(power7_1 POWER7) + ggml_add_cpu_backend_variant(power7_2 POWER7 VSX) + ggml_add_cpu_backend_variant(power8_1 POWER8) + ggml_add_cpu_backend_variant(power8_2 POWER8 VSX) + ggml_add_cpu_backend_variant(power9 POWER9 VSX) + ggml_add_cpu_backend_variant(power10 POWER10 VSX) + ggml_add_cpu_backend_variant(power11 POWER11 VSX) + else() + message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}") + endif() else() - message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported on ${GGML_SYSTEM_ARCH}") + message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}") endif() elseif 
(GGML_CPU)
    ggml_add_cpu_backend_variant_impl("")
@@ -319,6 +372,7 @@ ggml_add_backend(RPC)
 ggml_add_backend(SYCL)
 ggml_add_backend(Vulkan)
 ggml_add_backend(OpenCL)
+ggml_add_backend(HEXAGON)

 foreach (target ggml-base ggml)
     target_include_directories(${target} PUBLIC $ $)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 405d8e31514b5..0a39ef7945888 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -65,10 +65,17 @@
 #include "ggml-kompute.h"
 #endif

+#ifdef GGML_USE_HEXAGON
+#include "ggml-hexagon.h"
+#endif
+
 // disable C++17 deprecation warning for std::codecvt_utf8
 #if defined(__clang__)
 # pragma clang diagnostic push
 # pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif

 namespace fs = std::filesystem;
@@ -91,6 +98,8 @@ static std::string path_str(const fs::path & path) {

 #if defined(__clang__)
 # pragma clang diagnostic pop
+#elif defined(__GNUC__)
+# pragma GCC diagnostic pop
 #endif

 #ifdef _WIN32
@@ -187,6 +196,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_KOMPUTE
         register_backend(ggml_backend_kompute_reg());
 #endif
+#ifdef GGML_USE_HEXAGON
+        register_backend(ggml_backend_hexagon_reg());
+#endif
 #ifdef GGML_USE_CPU
         register_backend(ggml_backend_cpu_reg());
 #endif
@@ -577,6 +589,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
     ggml_backend_load_best("vulkan", silent, dir_path);
     ggml_backend_load_best("opencl", silent, dir_path);
     ggml_backend_load_best("musa", silent, dir_path);
+    ggml_backend_load_best("hexagon", silent, dir_path);
     ggml_backend_load_best("cpu", silent, dir_path);
     // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
     const char * backend_path = std::getenv("GGML_BACKEND_PATH");
diff --git a/ggml/src/ggml-blas/CMakeLists.txt b/ggml/src/ggml-blas/CMakeLists.txt
index 0bf3c05d93a89..76064c3fd1fe8 100644
--- a/ggml/src/ggml-blas/CMakeLists.txt
+++ b/ggml/src/ggml-blas/CMakeLists.txt
@@ -81,7 +81,7 @@ if (BLAS_FOUND)
     target_link_libraries     (ggml-blas PRIVATE ${BLAS_LIBRARIES})
     target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
 else()
-    message(ERROR "BLAS not found, please refer to "
-                  "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
-                  " to set correct GGML_BLAS_VENDOR")
+    message(FATAL_ERROR "BLAS not found, please refer to "
+                        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
+                        " to set correct GGML_BLAS_VENDOR")
 endif()
diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h
index 7ef80a4793314..ba2cef0c25fb2 100755
--- a/ggml/src/ggml-cann/common.h
+++ b/ggml/src/ggml-cann/common.h
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+#include <optional>

 #include "../include/ggml-cann.h"
 #include "../include/ggml.h"
@@ -103,6 +104,9 @@ const ggml_cann_device_info& ggml_cann_info();
 void ggml_cann_set_device(int32_t device);
 int32_t ggml_cann_get_device();

+std::optional<std::string> get_env(const std::string& name);
+bool parse_bool(const std::string& value);
+
 /**
  * @brief Abstract base class for memory pools used by CANN.
 */
@@ -354,7 +358,8 @@ struct ggml_backend_cann_context {
         : device(device), name("CANN" + std::to_string(device)), task_queue(1024, device) {
         ggml_cann_set_device(device);
         description = aclrtGetSocName();
-        async_mode = (getenv("GGML_CANN_ASYNC_MODE") != nullptr);
+
+        async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
         GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
                       device, async_mode ? "ON" : "OFF");
     }
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index c0ea26002196f..d1a0ad374d691 100755
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -31,6 +31,8 @@
 #include
 #include
 #include
+#include <optional>
+#include <unordered_set>

 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"
@@ -93,6 +95,26 @@ int32_t ggml_cann_get_device() {
     return id;
 }

+/**
+ * @brief Get the value of the specified environment variable (name).
+ *        If the variable is set, returns its value lowercased as a std::string.
+ */
+std::optional<std::string> get_env(const std::string& name) {
+    const char* val = std::getenv(name.c_str());
+    if (!val) return std::nullopt;
+    std::string res = std::string(val);
+    std::transform(res.begin(), res.end(), res.begin(), ::tolower);
+    return res;
+}
+
+/**
+ * @brief Check whether an environment variable value represents a truthy setting.
+ */
+bool parse_bool(const std::string& value) {
+    std::unordered_set<std::string> valid_values = {"on", "1", "yes", "y", "enable", "true"};
+    return valid_values.find(value) != valid_values.end();
+}
+
 /**
  * @brief Initialize the CANN device information.
  *
@@ -214,7 +236,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
      * @param device The device ID to associate with this buffer pool.
      */
     explicit ggml_cann_pool_buf_prio(int device) : device(device) {
-        disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
+        disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
     }
@@ -410,7 +432,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
      * @param device The device ID to associate with this buffer pool.
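The helpers defined above centralize boolean env-var handling: get_env() lowercases the raw value and parse_bool() accepts a fixed truthy set. A minimal sketch of the resulting behaviour:

    // with GGML_CANN_ASYNC_MODE=ON  : get_env() yields "on",  parse_bool() -> true
    // with GGML_CANN_ASYNC_MODE=off : get_env() yields "off", parse_bool() -> false
    // with the variable unset       : value_or("") kicks in,  parse_bool() -> false
    const bool async = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));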
 */
    explicit ggml_cann_pool_buf(int device) : device(device) {
-        disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
+        disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
     }
@@ -731,16 +753,18 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
  */
 std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
     int device) {
-    bool disable_vmm = (getenv("GGML_CANN_DISABLE_VMM_POOL") != nullptr);
-    if (!disable_vmm && ggml_cann_info().devices[device].vmm) {
-        GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
-        return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
-    }
-    bool enable_buf_prio = (getenv("GGML_CANN_ENABLE_BUF_PRIO_POOL") != nullptr);
-    if (enable_buf_prio) {
+    std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
+
+    if (mem_pool_type == "prio") {
         GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
         return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
     }
+
+    if (ggml_cann_info().devices[device].vmm && mem_pool_type != "leg") {
+        GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
+        return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
+    }
+
     GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device);
     return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf(device));
 }
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index 086c822d73a89..fbb04426abe7e 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -1074,6 +1074,10 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
     0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
 GGML_TABLE_END()

+GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
+    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
+GGML_TABLE_END()
+
 #define NGRID_IQ1S 2048
 #define IQ1S_DELTA 0.125f
 #define IQ1M_DELTA 0.125f
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index b3237eeadd22b..671fad4d228d4 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -1,3 +1,17 @@
+function(ggml_add_cpu_backend_features cpu_name arch)
+    # The feature detection code is compiled as a separate target so that
+    # it can be built without the architecture flags
+    # Since multiple variants of the CPU backend may be included in the same
+    # build, using set_source_files_properties() to set the arch flags is not possible
+    set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
+    add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
+    target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . ..
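    # Summary of the rewritten new_pool_for_device() above, which collapses the two old
    # on/off variables into a single GGML_CANN_MEM_POOL selector ("leg" is the literal
    # string the code checks, presumably short for the legacy buffer pool):
    #   prio          -> buffer pool with priority queue
    #   leg           -> plain buffer pool, even when VMM is available
    #   unset / other -> VMM pool if the device supports it, else plain buffer pool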
../include) + target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN}) + target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED) + set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_link_libraries(${cpu_name} PRIVATE ${GGML_CPU_FEATS_NAME}) +endfunction() + function(ggml_add_cpu_backend_variant_impl tag_name) if (tag_name) set(GGML_CPU_NAME ggml-cpu-${tag_name}) @@ -10,14 +24,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name) list (APPEND GGML_CPU_SOURCES ggml-cpu/ggml-cpu.c ggml-cpu/ggml-cpu.cpp - ggml-cpu/ggml-cpu-aarch64.cpp - ggml-cpu/ggml-cpu-aarch64.h - ggml-cpu/ggml-cpu-hbm.cpp - ggml-cpu/ggml-cpu-hbm.h - ggml-cpu/ggml-cpu-quants.c - ggml-cpu/ggml-cpu-quants.h - ggml-cpu/ggml-cpu-traits.cpp - ggml-cpu/ggml-cpu-traits.h + ggml-cpu/repack.cpp + ggml-cpu/repack.h + ggml-cpu/hbm.cpp + ggml-cpu/hbm.h + ggml-cpu/quants.c + ggml-cpu/quants.h + ggml-cpu/traits.cpp + ggml-cpu/traits.h ggml-cpu/amx/amx.cpp ggml-cpu/amx/amx.h ggml-cpu/amx/mmq.cpp @@ -84,6 +98,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name) if (GGML_SYSTEM_ARCH STREQUAL "ARM") message(STATUS "ARM detected") + list(APPEND GGML_CPU_SOURCES + ggml-cpu/arch/arm/quants.c + ggml-cpu/arch/arm/repack.cpp + ) + if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang") message(FATAL_ERROR "MSVC is not supported for ARM, use clang") else() @@ -138,6 +157,49 @@ function(ggml_add_cpu_backend_variant_impl tag_name) else() if (GGML_CPU_ARM_ARCH) list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH}) + elseif(GGML_CPU_ALL_VARIANTS) + # Begin with the lowest baseline + set(ARM_MCPU "armv8-a") + set(ARCH_TAGS "") + set(ARCH_DEFINITIONS "") + + # When a feature is selected, bump the MCPU to the first + # version that supported it + if (GGML_INTERNAL_DOTPROD) + set(ARM_MCPU "armv8.2-a") + set(ARCH_TAGS "${ARCH_TAGS}+dotprod") + list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD) + endif() + if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC) + set(ARM_MCPU "armv8.2-a") + set(ARCH_TAGS "${ARCH_TAGS}+fp16") + list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC) + endif() + if (GGML_INTERNAL_SVE) + set(ARM_MCPU "armv8.2-a") + set(ARCH_TAGS "${ARCH_TAGS}+sve") + list(APPEND ARCH_DEFINITIONS GGML_USE_SVE) + endif() + if (GGML_INTERNAL_MATMUL_INT8) + set(ARM_MCPU "armv8.6-a") + set(ARCH_TAGS "${ARCH_TAGS}+i8mm") + list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8) + endif() + if (GGML_INTERNAL_SVE2) + set(ARM_MCPU "armv8.6-a") + set(ARCH_TAGS "${ARCH_TAGS}+sve2") + list(APPEND ARCH_DEFINITIONS GGML_USE_SVE2) + endif() + if (GGML_INTERNAL_NOSVE) + set(ARCH_TAGS "${ARCH_TAGS}+nosve") + endif() + if (GGML_INTERNAL_SME) + set(ARM_MCPU "armv9.2-a") + set(ARCH_TAGS "${ARCH_TAGS}+sme") + list(APPEND ARCH_DEFINITIONS GGML_USE_SME) + endif() + list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}") + ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS}) endif() endif() @@ -167,6 +229,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() elseif (GGML_SYSTEM_ARCH STREQUAL "x86") message(STATUS "x86 detected") + list(APPEND GGML_CPU_SOURCES + ggml-cpu/arch/x86/quants.c + ggml-cpu/arch/x86/repack.cpp + ) + if (MSVC) # instruction set detection for MSVC only if (GGML_NATIVE) @@ -296,21 +363,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name) # the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using 
GGML_CPU_ALL_VARIANTS") endif() - - # The feature detection code is compiled as a separate target so that - # it can be built without the architecture flags - # Since multiple variants of the CPU backend may be included in the same - # build, using set_source_files_properties() to set the arch flags is not possible - set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats) - add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp) - target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include) - target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS}) - target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED) - set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME}) + ggml_add_cpu_backend_features(${GGML_CPU_NAME} x86 ${ARCH_DEFINITIONS}) endif() elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC") message(STATUS "PowerPC detected") + list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/powerpc/quants.c) if (GGML_NATIVE) if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") file(READ "/proc/cpuinfo" POWER10_M) @@ -318,7 +375,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name) execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M) endif() - string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}") + string(TOUPPER "${POWER10_M}" POWER10_M_UPPER) + string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M_UPPER}") string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}") if (EXTRACTED_NUMBER GREATER_EQUAL 10) @@ -330,6 +388,27 @@ function(ggml_add_cpu_backend_variant_impl tag_name) else() list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64) endif() + elseif(GGML_CPU_ALL_VARIANTS) + # Begin with the lowest baseline + set(ARCH_DEFINITIONS "") + + # When a feature is selected, bump the MCPU to the first + # version that supported it + foreach(PVER RANGE 7 11) + if(DEFINED GGML_INTERNAL_POWER${PVER}) + set(POWERPC_MCPU "power${PVER}") + list(APPEND ARCH_DEFINITIONS GGML_USE_POWER${PVER}) + endif() + endforeach() + if (GGML_INTERNAL_VSX) + list(APPEND ARCH_DEFINITIONS GGML_USE_VSX) + list(APPEND ARCH_FLAGS -mvsx) + endif() + + if (DEFINED POWERPC_MCPU) + list(APPEND ARCH_FLAGS -mcpu=${POWERPC_MCPU}) + endif() + ggml_add_cpu_backend_features(${GGML_CPU_NAME} powerpc ${ARCH_DEFINITIONS}) else() if (GGML_CPU_POWERPC_CPUTYPE) list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE}) @@ -337,6 +416,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() elseif (GGML_SYSTEM_ARCH STREQUAL "loongarch64") message(STATUS "loongarch64 detected") + list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/loongarch/quants.c) + list(APPEND ARCH_FLAGS -march=loongarch64) if (GGML_LASX) list(APPEND ARCH_FLAGS -mlasx) @@ -346,6 +427,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64") message(STATUS "riscv64 detected") + list(APPEND GGML_CPU_SOURCES + ggml-cpu/arch/riscv/quants.c + ggml-cpu/arch/riscv/repack.cpp + ) if (GGML_RVV) if (GGML_XTHEADVECTOR) list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d) @@ -357,11 +442,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() elseif (GGML_SYSTEM_ARCH STREQUAL "s390x") message(STATUS "s390x detected") + list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c) file(READ "/proc/cpuinfo" CPUINFO_CONTENTS) 
string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS}) # TODO: Separation to determine activation of VX/VXE/VXE2 if (${S390X_M} MATCHES "8561|8562") + set(GGML_NNPA OFF) message(STATUS "z15 target") list(APPEND ARCH_FLAGS -march=z15) elseif (${S390X_M} MATCHES "3931") @@ -378,14 +465,25 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() if (GGML_VXE) + message(STATUS "VX/VXE/VXE2 enabled") list(APPEND ARCH_FLAGS -mvx -mzvector) + list(APPEND ARCH_DEFINITIONS GGML_VXE) + endif() + + if (GGML_NNPA) + message(STATUS "NNPA enabled") + list(APPEND ARCH_DEFINITIONS GGML_NNPA) endif() + elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm") + message(STATUS "Wasm detected") + list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c) else() - message(STATUS "Unknown architecture") + message(WARNING "Unknown CPU architecture. Falling back to generic implementations.") + list(APPEND ARCH_FLAGS -DGGML_CPU_GENERIC) endif() - if (GGML_CPU_AARCH64) - target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64) + if (GGML_CPU_REPACK) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_REPACK) endif() if (GGML_CPU_KLEIDIAI) @@ -396,9 +494,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name) # Fetch KleidiAI sources: include(FetchContent) - set(KLEIDIAI_COMMIT_TAG "v1.6.0") + set(KLEIDIAI_COMMIT_TAG "v1.9.0") set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz") - set(KLEIDIAI_ARCHIVE_MD5 "75b4ad68f25ab673dcc01065e5a0b05f") + set(KLEIDIAI_ARCHIVE_MD5 "2a8e1bb55d201557553545536489a017") if (POLICY CMP0135) cmake_policy(SET CMP0135 NEW) diff --git a/ggml/src/ggml-cpu/amx/amx.cpp b/ggml/src/ggml-cpu/amx/amx.cpp index 0f067137df006..258857b00754a 100644 --- a/ggml/src/ggml-cpu/amx/amx.cpp +++ b/ggml/src/ggml-cpu/amx/amx.cpp @@ -5,7 +5,7 @@ #include "ggml-backend.h" #include "ggml-impl.h" #include "ggml-cpu.h" -#include "ggml-cpu-traits.h" +#include "traits.h" #if defined(__gnu_linux__) #include diff --git a/ggml/src/ggml-cpu/amx/mmq.cpp b/ggml/src/ggml-cpu/amx/mmq.cpp index 0ea91596bc7e2..47c61b88164b8 100644 --- a/ggml/src/ggml-cpu/amx/mmq.cpp +++ b/ggml/src/ggml-cpu/amx/mmq.cpp @@ -8,7 +8,8 @@ #include "mmq.h" #include "ggml-impl.h" #include "ggml-cpu-impl.h" -#include "ggml-cpu-quants.h" +#include "simd-mappings.h" +#include "quants.h" #include "ggml-quants.h" #include #include @@ -453,7 +454,7 @@ void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_ // Quantize these floats const float iscale = 127.f / amax; - y[i].d = GGML_FP32_TO_FP16(1 / iscale); + y[i].d = GGML_CPU_FP32_TO_FP16(1 / iscale); const float id = ( amax != 0.0f ) ? 
iscale : 0.f; const __m512 vscale = _mm512_set1_ps(id); @@ -1090,7 +1091,7 @@ struct acc_C { const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset))); for (int m = 0; m < nr; ++m) { - const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d)); + const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d)); const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); __m512 vsum; @@ -1113,8 +1114,8 @@ struct acc_C { const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(ggml_half)))); for (int m = 0; m < nr; ++m) { - const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d)); - const __m512 vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].s)); + const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d)); + const __m512 vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].s)); const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); __m512 vsum; @@ -1137,7 +1138,7 @@ struct acc_C { const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset))); for (int m = 0; m < nr; ++m) { - const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d)); + const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d)); const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); __m512 vsum; @@ -1437,7 +1438,7 @@ struct tinygemm_kernel_vnni for (int k = 0; k < 8; ++k) { va[k] = _mm512_set1_epi32(a_ptr[k]); } - vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d)); - vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].s)); + vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d)); + vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].s)); } // load b @@ -1571,7 +1572,7 @@ struct tinygemm_kernel_vnni +#elif defined(__APPLE__) +#include +#endif + +#if !defined(HWCAP2_I8MM) +#define HWCAP2_I8MM (1 << 13) +#endif + +#if !defined(HWCAP2_SME) +#define HWCAP2_SME (1 << 23) +#endif + +struct aarch64_features { + // has_neon not needed, aarch64 has NEON guaranteed + bool has_dotprod = false; + bool has_fp16_va = false; + bool has_sve = false; + bool has_sve2 = false; + bool has_i8mm = false; + bool has_sme = false; + + aarch64_features() { +#if defined(__linux__) + uint32_t hwcap = getauxval(AT_HWCAP); + uint32_t hwcap2 = getauxval(AT_HWCAP2); + + has_dotprod = !!(hwcap & HWCAP_ASIMDDP); + has_fp16_va = !!(hwcap & HWCAP_FPHP); + has_sve = !!(hwcap & HWCAP_SVE); + has_sve2 = !!(hwcap2 & HWCAP2_SVE2); + has_i8mm = !!(hwcap2 & HWCAP2_I8MM); + has_sme = !!(hwcap2 & HWCAP2_SME); +#elif defined(__APPLE__) + int oldp = 0; + size_t size = sizeof(oldp); + + if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) == 0) { + has_dotprod = static_cast(oldp); + } + + if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) == 0) { + has_i8mm = static_cast(oldp); + } + + if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) == 0) { + has_sme = static_cast(oldp); + } + + // Apple apparently does not implement SVE yet +#endif + } +}; + +static int ggml_backend_cpu_aarch64_score() { + int score = 1; + aarch64_features af; + +#ifdef GGML_USE_DOTPROD + if (!af.has_dotprod) { return 0; } + score += 1<<1; +#endif +#ifdef GGML_USE_FP16_VECTOR_ARITHMETIC + if (!af.has_fp16_va) { return 0; } + score += 1<<2; +#endif +#ifdef GGML_USE_SVE + if (!af.has_sve) { return 0; } + score += 1<<3; +#endif +#ifdef 
GGML_USE_MATMUL_INT8 + if (!af.has_i8mm) { return 0; } + score += 1<<4; +#endif +#ifdef GGML_USE_SVE2 + if (!af.has_sve2) { return 0; } + score += 1<<5; +#endif +#ifdef GGML_USE_SME + if (!af.has_sme) { return 0; } + score += 1<<6; +#endif + + return score; +} + +GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_aarch64_score) + +# endif // defined(__aarch64__) diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c new file mode 100644 index 0000000000000..3e2d3d03d67ec --- /dev/null +++ b/ggml/src/ggml-cpu/arch/arm/quants.c @@ -0,0 +1,4114 @@ +#define GGML_COMMON_IMPL_C +#include "ggml-common.h" +#include "ggml-quants.h" +#include "ggml-impl.h" +#include "ggml-cpu.h" +#include "simd-mappings.h" + +#include "../../quants.h" +#include "../../ggml-cpu-impl.h" + +#include +#include +#include +#include +#include // for qsort +#include // for GGML_ASSERT + +#define GROUP_MAX_EPS 1e-15f +#define GROUP_MAX_EPS_IQ3_XXS 1e-8f +#define GROUP_MAX_EPS_IQ2_S 1e-8f +#define GROUP_MAX_EPS_IQ1_M 1e-7f +#define GROUP_MAX_EPS_IQ1_S 1e-12f + +#define UNUSED GGML_UNUSED + +#if defined(__ARM_NEON) +#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s +#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) +#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) +#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s) +#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s) +#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s) +#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s) +#define B8(c,s ) B7(c,s, c), B7(c,s, s) + +// precomputed tables for expanding 8bits to 8 bytes: +static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 +static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 +#endif + +void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
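The scoring scheme in cpu-feats.cpp above works in two parts: every feature the variant was compiled with must be present at runtime (otherwise the variant scores 0 and is rejected), and each present feature adds a power-of-two weight so that more specialized variants outrank generic ones. A worked example, assuming a hypothetical CPU with dotprod, fp16, sve and i8mm but no sve2/sme:

    // armv8.0_1 (no features)                     -> 1
    // armv8.2_3 (DOTPROD, FP16, SVE)              -> 1 + 2 + 4 + 8      = 15
    // armv8.6_1 (DOTPROD, FP16, SVE, MATMUL_INT8) -> 1 + 2 + 4 + 8 + 16 = 31  <- best, gets loaded
    // armv8.6_2 (... + SVE2)                      -> 0 (SVE2 missing, variant rejected)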
1.0f/d : 0.0f; + + y[i].d = GGML_CPU_FP32_TO_FP16(d); + + for (int j = 0; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + } + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_0_ref(x, y, k); +#endif +} + +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + block_q8_1 * GGML_RESTRICT y = vy; +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_CPU_FP32_TO_FP16(d); + + int32x4_t accv = vdupq_n_s32(0); + + for (int j = 0; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + + accv = vaddq_s32(accv, vi); + } + + y[i].s = GGML_CPU_FP32_TO_FP16(d * vaddvq_s32(accv)); + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_1_ref(x, y, k); +#endif +} + +// placeholder implementation for Apple targets +void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { + quantize_row_q8_K_ref(x, y, k); +} + +//===================================== Dot products ================================= + +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); +#if defined(__ARM_FEATURE_MATMUL_INT8) + assert((nrc == 2) || (nrc == 1)); +#else + assert(nrc == 1); +#endif + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + const block_q4_0 * GGML_RESTRICT vx0 = vx; + const block_q4_0 * GGML_RESTRICT vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx); + const block_q8_0 * GGML_RESTRICT vy0 = vy; + const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); + + float32x4_t sumv0 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i++) { + const block_q4_0 * GGML_RESTRICT b_x0 = &vx0[i]; + const block_q4_0 * GGML_RESTRICT b_x1 = &vx1[i]; + const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i]; + const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vld1q_u8(b_x0->qs); + const uint8x16_t v0_1 = vld1q_u8(b_x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const 
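A scalar sketch of what the NEON quantizers above compute per 32-element block (mirroring quantize_row_q8_0_ref; the helper name and <math.h>/<stdint.h> dependencies are assumptions of this sketch):

    static void q8_0_block_sketch(const float * x, int8_t * qs, float * d_out) {
        float amax = 0.0f;                                     // max |x| over the block
        for (int j = 0; j < 32; ++j) amax = fmaxf(amax, fabsf(x[j]));
        const float d  = amax / 127.0f;                        // block scale, stored as fp16
        const float id = d ? 1.0f / d : 0.0f;
        for (int j = 0; j < 32; ++j) {
            qs[j] = (int8_t) roundf(x[j] * id);                // NEON uses vcvtnq (ties-to-even)
        }
        *d_out = d;
    }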
int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // sub 8 + const int8x16_t x0_l = vsubq_s8(v0_0l, s8b); + const int8x16_t x0_h = vsubq_s8(v0_0h, s8b); + const int8x16_t x1_l = vsubq_s8(v0_1l, s8b); + const int8x16_t x1_h = vsubq_s8(v0_1h, s8b); + + // load y + const int8x16_t y0_l = vld1q_s8(b_y0->qs); + const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); + const int8x16_t y1_l = vld1q_s8(b_y1->qs); + const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); + + float32_t _scale[4] = { + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d) + }; + float32x4_t scale = vld1q_f32(_scale); + + int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + + int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + + int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + + int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + + sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), + l1, r1)), l2, r2)), l3, r3))), scale); + } + + float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2); + float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); + + vst1_f32(s, vget_low_f32 (sumv2)); + vst1_f32(s + bs, vget_high_f32(sumv2)); + + return; + } +#endif + + int ib = 0; + float sumf = 0; + +#if defined(__ARM_FEATURE_SVE) + svfloat32_t sumv0 = svdup_n_f32(0.0f); + svfloat32_t sumv1 = svdup_n_f32(0.0f); + + const int vector_length = ggml_cpu_get_sve_cnt()*8; + + // VLA Implementation using switch case + switch (vector_length) { + case 128: + { + // predicate for activating higher lanes for 4 float32 elements + const svbool_t ph4 = svptrue_pat_b32(SV_VL4); + + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + // load x + const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); + const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); + + // 4-bit -> 8-bit + const svint8_t qx0l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx0r, 0x0F)); + const svint8_t qx0h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx0r, 0x04)); + const svint8_t qx1l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx1r, 0x0F)); + const svint8_t qx1h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx1r, 0x04)); + + // sub 8 + const svint8_t qx0ls = svsub_n_s8_x(svptrue_b8(), qx0h, 8); + const svint8_t qx0hs = svsub_n_s8_x(svptrue_b8(), qx0l, 8); + const svint8_t qx1ls = svsub_n_s8_x(svptrue_b8(), qx1h, 8); + const svint8_t qx1hs = svsub_n_s8_x(svptrue_b8(), qx1l, 8); + + // load y + const svint8_t qy0h = 
svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy0l = svld1_s8(svptrue_b8(), y0->qs + 16); + const svint8_t qy1h = svld1_s8(svptrue_b8(), y1->qs); + const svint8_t qy1l = svld1_s8(svptrue_b8(), y1->qs + 16); + + // dot product + sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4, + svdot_s32(svdup_n_s32(0), qx0ls, qy0l), + svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4, + svdot_s32(svdup_n_s32(0), qx1ls, qy1l), + svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + } break; + case 256: + { + // predicate for activating higher lanes for 16 int8 elements + const svbool_t ph16 = svptrue_pat_b8(SV_VL16); + // predicate for activating lower lanes for 16 int8 elements + const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16); + + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + // load x + const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); + const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); + + // 4-bit -> 8-bit + const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04)); + const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04)); + + // sub 8 + const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8); + const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8); + + // load y + const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + + // dot product + sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + } break; + case 512: + { + // predicate for activating higher lanes for 32 int8 elements + const svbool_t ph32 = svptrue_pat_b8(SV_VL32); + + // predicate for activating higher lanes for 16 int8 elements + const svbool_t ph16 = svptrue_pat_b8(SV_VL16); + // predicate for activating lower lanes for 16 int8 elements from first 32 int8 activated lanes + const svbool_t pl16 = svnot_b_z(ph32, ph16); + + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + // load x + const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs); + const svuint8_t qx1r = svld1rq_u8(ph32, x1->qs); + + // 4-bit -> 8-bit + const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04)); + const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04)); + + // sub 8 + const svint8_t qx0s = svsub_n_s8_x(ph32, qx0, 8); + const svint8_t qx1s = svsub_n_s8_x(ph32, qx1, 8); + + // load y + const svint8_t qy0 = svld1_s8(ph32, y0->qs); + const svint8_t qy1 = svld1_s8(ph32, y1->qs); + + // dot product + 
sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32, + svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32, + svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1)); + } break; + default: + assert(false && "Unsupported vector length"); + break; + } + +#elif defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // sub 8 + const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); + const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); + const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); + const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + + // dot product into int32x4_t + const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); + const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); + } + + sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F) - 8; + const int v1 = (x[ib].qs[j] >> 4) - 8; + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); + } + + *s = sumf; +} + +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); +#if defined(__ARM_FEATURE_MATMUL_INT8) + assert((nrc == 2) || (nrc == 1)); +#else + assert(nrc == 1); +#endif + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + +#if defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + const block_q4_1 * GGML_RESTRICT vx0 = vx; + const block_q4_1 * GGML_RESTRICT vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx); + const block_q8_1 * GGML_RESTRICT vy0 = vy; + const block_q8_1 * GGML_RESTRICT vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by); + + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t summs0 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i++) { 
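The scalar tail of ggml_vec_dot_q4_0_q8_0 above makes the q4_0 layout explicit: each block stores an fp16 scale d plus 32 weights packed into 16 bytes, where byte j carries element j in its low nibble and element j+16 in its high nibble, both biased by 8:

    const int v0 = (qs[j] & 0x0F) - 8;   // low nibble  -> element j,      range [-8, 7]
    const int v1 = (qs[j] >>   4) - 8;   // high nibble -> element j + 16
    // dequantized weight: w = GGML_CPU_FP16_TO_FP32(d) * v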
+ const block_q4_1 * GGML_RESTRICT b_x0 = &vx0[i]; + const block_q4_1 * GGML_RESTRICT b_x1 = &vx1[i]; + const block_q8_1 * GGML_RESTRICT b_y0 = &vy0[i]; + const block_q8_1 * GGML_RESTRICT b_y1 = &vy1[i]; + + float32_t summs_t[4] = { + GGML_CPU_FP16_TO_FP32(b_x0->m) * GGML_CPU_FP16_TO_FP32(b_y0->s), + GGML_CPU_FP16_TO_FP32(b_x1->m) * GGML_CPU_FP16_TO_FP32(b_y0->s), + GGML_CPU_FP16_TO_FP32(b_x0->m) * GGML_CPU_FP16_TO_FP32(b_y1->s), + GGML_CPU_FP16_TO_FP32(b_x1->m) * GGML_CPU_FP16_TO_FP32(b_y1->s) + }; + summs0 = vaddq_f32(summs0, vld1q_f32(summs_t)); + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + const uint8x16_t v0_0 = vld1q_u8(b_x0->qs); + const uint8x16_t v0_1 = vld1q_u8(b_x1->qs); + + // 4-bit -> 8-bit + const int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // load y + const int8x16_t y0_l = vld1q_s8(b_y0->qs); + const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); + const int8x16_t y1_l = vld1q_s8(b_y1->qs); + const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); + + // mmla into int32x4_t + float32_t _scale[4] = { + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d) + }; + float32x4_t scale = vld1q_f32(_scale); + + int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + + int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + + int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + + int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), + l1, r1)), l2, r2)), l3, r3))), scale); + } + + float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2); + float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); + + sumv2 = vaddq_f32(sumv2, summs0); + + vst1_f32(s, vget_low_f32 (sumv2)); + vst1_f32(s + bs, vget_high_f32(sumv2)); + + return; + } +#endif + + int ib = 0; + float sumf = 0; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + float summs = 0; + + for (; ib + 1 < nb; ib += 2) { + const block_q4_1 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q4_1 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1]; + + summs += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s) + GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s); + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = 
vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + + // dot product into int32x4_t + const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); + const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); + } + + sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; + +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F); + const int v1 = (x[ib].qs[j] >> 4); + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + uint32_t qh0; + uint32_t qh1; + + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + for (; ib + 1 < nb; ib += 2) { + const block_q5_0 * GGML_RESTRICT x0 = &x[ib]; + const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + // extract the 5th bit via lookup table ((!b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_1[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_1[(qh1 >> 24) ]; + + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); + const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) + const int8x16_t v0_0lf = 
vsubq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); + } + + sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16); + const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16); + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; + } + + *s = sumf; +} + +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_1); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + float summs0 = 0.0f; + float summs1 = 0.0f; + + uint32_t qh0; + uint32_t qh1; + + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + for (; ib + 1 < nb; ib += 2) { + const block_q5_1 * GGML_RESTRICT x0 = &x[ib]; + const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; + const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s); + summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s); + + // extract the 5th bit via lookup table ((b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_0[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_0[(qh1 >> 24) ]; + + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); + const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = 
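Why the q5_0 path above can fuse the high bit and the -16 bias into a single vsubq_s8: with b the fifth bit of an element, (nibble | b<<4) - 16 = nibble - ((!b) << 4), which is exactly the value the table_b2b_1 expansion feeds to the subtraction. The scalar tail spells out the same bit plumbing per element j:

    const uint8_t xh_0 = ((qh & (1u << (j +  0))) >> (j +  0)) << 4;  // qh bit j      -> bit 4
    const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));       // qh bit j + 16 -> bit 4
    const int32_t v0   = (int8_t)(((qs[j] & 0x0F) | xh_0) - 16);      // 5-bit value in [-16, 15]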
vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // add high bit + const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); + } + + sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1; + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); +#if defined(__ARM_FEATURE_MATMUL_INT8) + assert((nrc == 2) || (nrc == 1)); +#else + assert(nrc == 1); +#endif + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q8_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + const block_q8_0 * GGML_RESTRICT vx0 = vx; + const block_q8_0 * GGML_RESTRICT vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx); + const block_q8_0 * GGML_RESTRICT vy0 = vy; + const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); + + float32x4_t sumv0 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i++) { + const block_q8_0 * GGML_RESTRICT b_x0 = &vx0[i]; + const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i]; + + const block_q8_0 * GGML_RESTRICT b_x1 = &vx1[i]; + const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i]; + + const int8x16_t x0_l = vld1q_s8(b_x0->qs); + const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16); + const int8x16_t x1_l = vld1q_s8(b_x1->qs); + const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16); + + // load y + const int8x16_t y0_l = vld1q_s8(b_y0->qs); + const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); + const int8x16_t y1_l = vld1q_s8(b_y1->qs); + const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); + + float32_t _scale[4] = { + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d) + }; + float32x4_t 
scale = vld1q_f32(_scale); + + int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + + int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + + int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + + int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + + sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), + l1, r1)), l2, r2)), l3, r3))), scale); + } + + float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2); + float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); + + vst1_f32(s, vget_low_f32 (sumv2)); + vst1_f32(s + bs, vget_high_f32(sumv2)); + + return; + } +#endif + + int ib = 0; + float sumf = 0; + +#if defined(__ARM_FEATURE_SVE) + svfloat32_t sumv0 = svdup_n_f32(0.0f); + svfloat32_t sumv1 = svdup_n_f32(0.0f); + + const int vector_length = ggml_cpu_get_sve_cnt()*8; + + // VLA implementation for SVE + switch (vector_length) { + case 128: + { + // predicate for activating lanes for 16 Int8 elements + const svbool_t ph16 = svptrue_pat_b8 (SV_VL16); + const svbool_t pl16 = svptrue_pat_b32(SV_VL4); + + for (; ib + 1 < nb; ib += 2) { + const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + // load x + const svint8_t qx0_0 = svld1_s8(ph16, x0->qs); + const svint8_t qx0_1 = svld1_s8(ph16, x0->qs+16); + const svint8_t qx1_0 = svld1_s8(ph16, x1->qs); + const svint8_t qx1_1 = svld1_s8(ph16, x1->qs+16); + + // load y + const svint8_t qy0_0 = svld1_s8(ph16, y0->qs); + const svint8_t qy0_1 = svld1_s8(ph16, y0->qs+16); + const svint8_t qy1_0 = svld1_s8(ph16, y1->qs); + const svint8_t qy1_1 = svld1_s8(ph16, y1->qs+16); + + sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16, + svdot_s32(svdup_n_s32(0), qx0_0, qy0_0), + svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16, + svdot_s32(svdup_n_s32(0), qx1_0, qy1_0), + svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1)); + } break; + case 256: + { + for (; ib + 1 < nb; ib += 2) { + const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + // load x + const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); + const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs); + + // load y + const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + + sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx0, qy0)), 
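The `nrc == 2` path above leans on the i8mm SMMLA instruction: `vmmlaq_s32` treats each 16-byte operand as a 2x8 int8 matrix and accumulates the 2x2 product into the four int32 lanes, which is why the rows of x0/x1 and y0/y1 are first interleaved 64 bits at a time with `vzip1q_s64`/`vzip2q_s64`. A scalar model of one such step (`smmla_2x8` is a hypothetical helper, not the intrinsic):

```c
#include <stdint.h>

// Hypothetical scalar model of vmmlaq_s32(acc, a, b): 'a' and 'b' are
// 2x8 int8 matrices (row-major) and the 2x2 product a * b^T is
// accumulated into acc's four int32 lanes.
static void smmla_2x8(int32_t acc[4], const int8_t a[16], const int8_t b[16]) {
    for (int r = 0; r < 2; ++r) {
        for (int c = 0; c < 2; ++c) {
            int32_t sum = 0;
            for (int k = 0; k < 8; ++k) {
                sum += (int32_t) a[r*8 + k] * (int32_t) b[c*8 + k];
            }
            acc[r*2 + c] += sum;
        }
    }
}
```

One instruction therefore yields all four row-column dot products over an 8-element slice, and the `_scale` vector is laid out in the same 2x2 order so a single `vmlaq_f32` applies the right `d` product to each lane.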
GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + } break; + case 512: + { + // predicate for activating high 256 bit + const svbool_t ph32 = svptrue_pat_b8(SV_VL32); + // predicate for activating low 256 bit + const svbool_t pl32 = svnot_b_z(svptrue_b8(), ph32); + + // predicate for activating high lanes for 8 float32 elements + const svbool_t ph8 = svptrue_pat_b32(SV_VL8); + // predicate for activating low lanes for 8 float32 elements + const svbool_t pl8 = svnot_b_z(svptrue_b32(), ph8); + + svfloat32_t sumv00 = svdup_n_f32(0.0f); + + for (; ib + 1 < nb; ib += 2) { + const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits + // and add them to make one 64 element vector + // load x + const svint8_t qx_32 = svld1_s8(ph32, x0->qs); + svint8_t qx_64 = svld1_s8(pl32, x0->qs + 2); + + qx_64 = svadd_s8_x(svptrue_b8(), qx_32, qx_64); + + // load y + const svint8_t qy_32 = svld1_s8(ph32, y0->qs); + svint8_t qy_64 = svld1_s8(pl32, y0->qs + 2); + + qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64); + + // scale creation + const float32_t deq1 = GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d); + const float32_t deq2 = GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d); + + // duplicate deq1 in first half of vector and deq2 in second half of vector + const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2); + + const svfloat32_t sumvt = svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx_64, qy_64)); + + sumv00 = svmla_f32_m(svptrue_b32(), sumv00, sumvt, temp); + } + + sumf = svaddv_f32(svptrue_b32(), sumv00); + break; + } + default: + assert(false && "Unsupported vector length"); + break; + } +#elif defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + for (; ib + 1 < nb; ib += 2) { + const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + const int8x16_t x0_0 = vld1q_s8(x0->qs); + const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); + const int8x16_t x1_0 = vld1q_s8(x1->qs); + const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); + + // load y + const int8x16_t y0_0 = vld1q_s8(y0->qs); + const int8x16_t y0_1 = vld1q_s8(y0->qs + 16); + const int8x16_t y1_0 = vld1q_s8(y1->qs); + const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), + ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), + ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); + } + + sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#endif + for (; ib < nb; ++ib) { + int sumi = 0; + + for (int j = 0; j < qk; j++) { + sumi += x[ib].qs[j]*y[ib].qs[j]; + } + + sumf += 
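The SVE path is written vector-length agnostic and dispatches on the width the hardware reports. Note the 512-bit case above: a `block_q8_0` is 34 bytes (a 2-byte `d` plus 32 quants), so the predicated load from `x0->qs + 2` lands the upper 32 lanes exactly on `x1->qs`, and the `svadd` of the two zeroing loads merges both blocks' quants into one 64-element register. A sketch of the width query (assumes an SVE-capable toolchain and target):

```c
#include <arm_sve.h>  // requires an SVE-enabled compiler/target

// svcntb() reports the SVE register width in bytes at run time; the
// kernel above multiplies by 8 and switches on 128/256/512 bits.
static int sve_vector_bits(void) {
    return (int) svcntb() * 8;
}
```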
sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); + } + + *s = sumf; +} + +void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_tq1_0 * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + float sumf = 0.0f; + + uint8_t k_shift[16] = {1, 1, 1, 1, 3, 3, 3, 3, 9, 9, 9, 9, 27, 27, 27, 27}; + + const uint8x16_t shift = vld1q_u8(k_shift); + + for (int i = 0; i < nb; ++i) { +#if defined(__ARM_FEATURE_DOTPROD) + int32x4_t sumi0 = vdupq_n_s32(0); + int32x4_t sumi1 = vdupq_n_s32(0); +#else + int16x8_t sumi0 = vdupq_n_s16(0); + int16x8_t sumi1 = vdupq_n_s16(0); +#endif + + // first 32 bytes of 5 elements + { + uint8x16_t qx0 = vld1q_u8(x[i].qs + 0); + uint8x16_t qx1 = vld1q_u8(x[i].qs + 16); + uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(3)); + uint8x16_t qx3 = vmulq_u8(qx1, vdupq_n_u8(3)); + uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(9)); + uint8x16_t qx5 = vmulq_u8(qx1, vdupq_n_u8(9)); + uint8x16_t qx6 = vmulq_u8(qx0, vdupq_n_u8(27)); + uint8x16_t qx7 = vmulq_u8(qx1, vdupq_n_u8(27)); + uint8x16_t qx8 = vmulq_u8(qx0, vdupq_n_u8(81)); + uint8x16_t qx9 = vmulq_u8(qx1, vdupq_n_u8(81)); + + // multiply by 3 and keep the 2 bits above 8 bits + int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6)); + int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6)); + int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6)); + int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6)); + int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6)); + int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6)); + int8x16_t sqx6 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx6, vshrq_n_u8(qx6, 1)), 6)); + int8x16_t sqx7 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx7, vshrq_n_u8(qx7, 1)), 6)); + int8x16_t sqx8 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx8, vshrq_n_u8(qx8, 1)), 6)); + int8x16_t sqx9 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx9, vshrq_n_u8(qx9, 1)), 6)); + + const int8x16_t qy0 = vld1q_s8(y[i].qs + 0); + const int8x16_t qy1 = vld1q_s8(y[i].qs + 16); + const int8x16_t qy2 = vld1q_s8(y[i].qs + 32); + const int8x16_t qy3 = vld1q_s8(y[i].qs + 48); + const int8x16_t qy4 = vld1q_s8(y[i].qs + 64); + const int8x16_t qy5 = vld1q_s8(y[i].qs + 80); + const int8x16_t qy6 = vld1q_s8(y[i].qs + 96); + const int8x16_t qy7 = vld1q_s8(y[i].qs + 112); + const int8x16_t qy8 = vld1q_s8(y[i].qs + 128); + const int8x16_t qy9 = vld1q_s8(y[i].qs + 144); + +#if defined(__ARM_FEATURE_DOTPROD) + sumi0 = vdotq_s32(sumi0, sqx0, qy0); + sumi1 = vdotq_s32(sumi1, sqx1, qy1); + sumi0 = vdotq_s32(sumi0, sqx2, qy2); + sumi1 = vdotq_s32(sumi1, sqx3, qy3); + sumi0 = vdotq_s32(sumi0, sqx4, qy4); + sumi1 = vdotq_s32(sumi1, sqx5, qy5); + sumi0 = vdotq_s32(sumi0, sqx6, qy6); + sumi1 = vdotq_s32(sumi1, sqx7, qy7); + sumi0 = vdotq_s32(sumi0, sqx8, qy8); + sumi1 = vdotq_s32(sumi1, sqx9, qy9); +#else + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1)); + sumi0 = vmlal_s8(sumi0, 
vget_low_s8(sqx2), vget_low_s8(qy2)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx6), vget_high_s8(qy6)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx7), vget_high_s8(qy7)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx8), vget_low_s8(qy8)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx8), vget_high_s8(qy8)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx9), vget_low_s8(qy9)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx9), vget_high_s8(qy9)); +#endif + } + + // last 16 bytes of 5-element, along with the 4 bytes of 4 elements + { + uint8x16_t qx0 = vld1q_u8(x[i].qs + 32); + uint8x16_t qx1 = vmulq_u8(qx0, vdupq_n_u8(3)); + uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(9)); + uint8x16_t qx3 = vmulq_u8(qx0, vdupq_n_u8(27)); + uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(81)); + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned + uint8x16_t qx5 = vreinterpretq_u8_u32(vdupq_n_u32(qh)); + qx5 = vmulq_u8(qx5, shift); + + // multiply by 3 and keep the 2 bits above 8 bits + int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6)); + int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6)); + int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6)); + int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6)); + int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6)); + int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6)); + + const int8x16_t qy0 = vld1q_s8(y[i].qs + 160); + const int8x16_t qy1 = vld1q_s8(y[i].qs + 176); + const int8x16_t qy2 = vld1q_s8(y[i].qs + 192); + const int8x16_t qy3 = vld1q_s8(y[i].qs + 208); + const int8x16_t qy4 = vld1q_s8(y[i].qs + 224); + const int8x16_t qy5 = vld1q_s8(y[i].qs + 240); + +#if defined(__ARM_FEATURE_DOTPROD) + sumi0 = vdotq_s32(sumi0, sqx0, qy0); + sumi1 = vdotq_s32(sumi1, sqx1, qy1); + sumi0 = vdotq_s32(sumi0, sqx2, qy2); + sumi1 = vdotq_s32(sumi1, sqx3, qy3); + sumi0 = vdotq_s32(sumi0, sqx4, qy4); + sumi1 = vdotq_s32(sumi1, sqx5, qy5); +#else + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5)); +#endif + } + + const int16x8_t ysum0 = vld1q_s16(y[i].bsums); + const int16x8_t ysum1 = 
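Both the NEON blocks above and the scalar fallback below rely on TQ1_0's fixed-point base-3 packing: five trits go into one byte as `ceil(v*256/243)`, multiplying by `3^l` rotates digit `l` to the top, and `(q*3) >> 8` extracts it. The NEON code computes that last step without overflow as `vshrq_n_u8(vhaddq_u8(q, q >> 1), 6)`, i.e. `((q + q/2)/2) >> 6 == (3*q) >> 8`. A self-contained worked example of the round trip (a sketch; the packing formula is inferred from the decode above):

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    static const uint16_t pow3[5] = {1, 3, 9, 27, 81};
    const uint8_t trits[5] = {2, 1, 0, 1, 2};          // digits to pack, each in {0,1,2}

    uint16_t v = 0;
    for (int l = 0; l < 5; ++l) v = (uint16_t)(v*3 + trits[l]); // base-3 value, 0..242
    const uint8_t q = (uint8_t)((v*256 + 242) / 243);           // ceil(v*256/243)

    for (int l = 0; l < 5; ++l) {
        const uint8_t shifted = (uint8_t)(q * pow3[l]);         // digit l moves to the top
        printf("digit %d = %u\n", l, (unsigned)(((uint16_t)shifted * 3) >> 8));
    }
    return 0;
}
```

Running this should print the digits 2 1 0 1 2 back out; the ceiling in the scale step is what makes the truncating extraction exact.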
vld1q_s16(y[i].bsums + 8); + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + +#if defined(__ARM_FEATURE_DOTPROD) + sumi0 = vaddq_s32(sumi0, sumi1); + sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1))); + + sumf += d * (float) vaddvq_s32(sumi0); +#else + sumi0 = vaddq_s16(sumi0, sumi1); + sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1)); + + sumf += d * (float) vaddlvq_s16(sumi0); +#endif + } + + *s = sumf; + +#else + const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243}; + + float sumf = 0.0f; + + for (int i = 0; i < nb; ++i) { + int sum = 0; + + for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) { + for (size_t l = 0; l < 5; ++l) { + for (size_t m = 0; m < 32; ++m) { + uint8_t q = x[i].qs[j + m] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[j*5 + l*32 + m]; + } + } + } + for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) { + for (size_t l = 0; l < 5; ++l) { + for (size_t m = 0; m < 16; ++m) { + uint8_t q = x[i].qs[j + m] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[j*5 + l*16 + m]; + } + } + } + + for (size_t l = 0; l < 4; ++l) { + for (size_t j = 0; j < sizeof(x->qh); ++j) { + uint8_t q = x[i].qh[j] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j]; + } + } + + sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d); + } + + *s = sumf; +#endif +} + +void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_tq2_0 * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + float sumf = 0.0f; + + const uint8x16_t m3 = vdupq_n_u8(3); + + for (int i = 0; i < nb; ++i) { +#if defined(__ARM_FEATURE_DOTPROD) + int32x4_t sumi0 = vdupq_n_s32(0); + int32x4_t sumi1 = vdupq_n_s32(0); +#else + int16x8_t sumi0 = vdupq_n_s16(0); + int16x8_t sumi1 = vdupq_n_s16(0); +#endif + + for (size_t j = 0; j < sizeof(x->qs); j += 32) { + uint8x16_t qx0 = vld1q_u8(x[i].qs + j); + uint8x16_t qx1 = vld1q_u8(x[i].qs + j + 16); + uint8x16_t qx2 = vshrq_n_u8(qx0, 2); + uint8x16_t qx3 = vshrq_n_u8(qx1, 2); + uint8x16_t qx4 = vshrq_n_u8(qx0, 4); + uint8x16_t qx5 = vshrq_n_u8(qx1, 4); + uint8x16_t qx6 = vshrq_n_u8(qx0, 6); + uint8x16_t qx7 = vshrq_n_u8(qx1, 6); + + int8x16_t sqx0 = vreinterpretq_s8_u8(vandq_u8(qx0, m3)); + int8x16_t sqx1 = vreinterpretq_s8_u8(vandq_u8(qx1, m3)); + int8x16_t sqx2 = vreinterpretq_s8_u8(vandq_u8(qx2, m3)); + int8x16_t sqx3 = vreinterpretq_s8_u8(vandq_u8(qx3, m3)); + int8x16_t sqx4 = vreinterpretq_s8_u8(vandq_u8(qx4, m3)); + int8x16_t sqx5 = vreinterpretq_s8_u8(vandq_u8(qx5, m3)); + int8x16_t sqx6 = vreinterpretq_s8_u8(vandq_u8(qx6, m3)); + int8x16_t sqx7 = vreinterpretq_s8_u8(vandq_u8(qx7, m3)); + + const int8x16_t qy0 = vld1q_s8(y[i].qs + j*4 + 0); + const int8x16_t qy1 = vld1q_s8(y[i].qs + j*4 + 16); + const int8x16_t qy2 = vld1q_s8(y[i].qs + j*4 + 32); + const int8x16_t qy3 = vld1q_s8(y[i].qs + j*4 + 48); + const int8x16_t qy4 = vld1q_s8(y[i].qs + j*4 + 64); + const int8x16_t qy5 = vld1q_s8(y[i].qs + j*4 + 80); + const int8x16_t qy6 = vld1q_s8(y[i].qs + j*4 + 96); + const int8x16_t qy7 = vld1q_s8(y[i].qs + j*4 + 112); + +#if defined(__ARM_FEATURE_DOTPROD) + sumi0 = vdotq_s32(sumi0, sqx0, qy0); + sumi1 = vdotq_s32(sumi1, 
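The `vsubq` of the `bsums` above and the `(xi - 1)` in the scalar code are the same correction seen from two sides: the stored codes are the signed values plus one, so `sum(code*q8) = sum(value*q8) + sum(q8)`, and subtracting the precomputed per-16-element sums from `y->bsums` recovers the signed dot product. A scalar sketch (hypothetical helper):

```c
#include <stdint.h>

// Recover a signed ternary dot product from unsigned codes (code = value + 1)
// using the q8_K per-16-element sums in bsums.
static int32_t signed_dot_from_codes(const uint8_t codes[256], const int8_t q8[256],
                                     const int16_t bsums[16]) {
    int32_t dot = 0, bias = 0;
    for (int k = 0; k < 256; ++k) dot  += codes[k] * q8[k];
    for (int j = 0; j <  16; ++j) bias += bsums[j];
    return dot - bias; // == sum((codes[k] - 1) * q8[k])
}
```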
sqx1, qy1); + sumi0 = vdotq_s32(sumi0, sqx2, qy2); + sumi1 = vdotq_s32(sumi1, sqx3, qy3); + sumi0 = vdotq_s32(sumi0, sqx4, qy4); + sumi1 = vdotq_s32(sumi1, sqx5, qy5); + sumi0 = vdotq_s32(sumi0, sqx6, qy6); + sumi1 = vdotq_s32(sumi1, sqx7, qy7); +#else + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx6), vget_high_s8(qy6)); + sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7)); + sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx7), vget_high_s8(qy7)); +#endif + } + + const int16x8_t ysum0 = vld1q_s16(y[i].bsums); + const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8); + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + +#if defined(__ARM_FEATURE_DOTPROD) + sumi0 = vaddq_s32(sumi0, sumi1); + sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1))); + + sumf += d * (float) vaddvq_s32(sumi0); +#else + sumi0 = vaddq_s16(sumi0, sumi1); + sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1)); + + sumf += d * (float) vaddlvq_s16(sumi0); +#endif + } + + *s = sumf; + +#else + float sumf = 0.0f; + + for (int i = 0; i < nb; ++i) { + int32_t sumi = 0; + + for (size_t j = 0; j < sizeof(x->qs); j += 32) { + for (size_t l = 0; l < 4; ++l) { + for (size_t k = 0; k < 32; ++k) { + sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1); + } + } + } + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + + sumf += (float) sumi * d; + } + + *s = sumf; +#endif +} + +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q2_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#ifdef __ARM_FEATURE_SVE + const int vector_length = svcntb()*8; + const svuint8_t m3s = svdup_n_u8(0x3); + const svuint32_t m4s = svdup_n_u32(0xF); + const svint32_t vzero_sv = svdup_n_s32(0); + svfloat32_t acc_sum = svdup_n_f32(0); + svbool_t pred_s32 = svptrue_pat_b32(SV_VL4); + + switch (vector_length) { + case 128: + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + svfloat32_t d_broad = svdup_n_f32((float32_t)d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin); + + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8_sv = y[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + + svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc); + const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4)); + + mins_and_scales_sve = 
svld1ub_u32(svptrue_b32(), sc+4); + const svint32_t mins_sv_2 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4)); + + svint32_t q8sums_sv_1 = svld1sh_s32(svptrue_b32(), y[i].bsums); + svint32_t q8sums_sv_2 = svld1sh_s32(svptrue_b32(), y[i].bsums+4); + + const svint32_t s0 = svadd_s32_x(svptrue_b32(), svmul_s32_x(svptrue_b32(), mins_sv_1, q8sums_sv_1), svmul_s32_x(svptrue_b32(), mins_sv_2, q8sums_sv_2)); + + mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+8); + const svint32_t mins_sv_3 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4)); + + mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+12); + const svint32_t mins_sv_4 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4)); + + q8sums_sv_1 = svld1sh_s32(svptrue_b32(), y[i].bsums+8); + q8sums_sv_2 = svld1sh_s32(svptrue_b32(), y[i].bsums+12); + + svint32_t s1 = svadd_s32_x(svptrue_b32(), svmul_s32_x(svptrue_b32(), mins_sv_3, q8sums_sv_1), svmul_s32_x(svptrue_b32(), mins_sv_4, q8sums_sv_2)); + + svfloat32_t temp = svcvt_f32_s32_x(svptrue_b32(), svadd_s32_x(svptrue_b32(), s0, s1)); + + acc_sum = svmla_f32_m(svptrue_b32(), acc_sum, temp, dmin_broad); + + svint32_t sumi1 = svdup_n_s32(0); + + { + const svuint8_t q2bits_1 = svld1_u8(svptrue_b8(), q2); + svint8_t q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_1, m3s)); + svint8_t q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc), m4s)); + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 0)); + + const svuint8_t q2bits_3 = svld1_u8(svptrue_b8(), q2+16); + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_3, m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 1)); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 2), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 2)); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 2), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 3)); + + + const svint32_t scales_sv_1 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+4), m4s)); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 4), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 0)); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 4), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 1)); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 6), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, 
q8bytes_sv), svdup_lane_s32(scales_sv_1, 2)); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 6), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 3)); + + //------------------------------- + + q2 += 32; + const svint32_t scales_sv_2 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+8), m4s)); + const svuint8_t q2bits_2 = svld1_u8(svptrue_b8(), q2); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_2, m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 0)); + + const svuint8_t q2bits_4 = svld1_u8(svptrue_b8(), q2+16); + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_4, m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 1)); + + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 2), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 2)); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 2), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 3)); + + + const svint32_t scales_sv_3 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+12), m4s)); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 4), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 0)); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 4), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 1)); + + + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 6), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 2)); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 6), m3s)); + q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 3)); + } + acc_sum = svmla_f32_m(svptrue_b32(), acc_sum, svcvt_f32_s32_x(svptrue_b32(), sumi1), d_broad); + } + *s = svaddv_f32(svptrue_b32(), acc_sum); + break; + + case 256: + case 512: + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + svfloat32_t d_broad = svdup_n_f32((float32_t)d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin); + + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t 
* GGML_RESTRICT q8_sv = y[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + + const svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc); sc += 8; + const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, m4s)); + const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, 4)); + svint32_t q8sums_sv_1 = svld1sh_s32(svptrue_pat_b32(SV_VL8), y[i].bsums); + + const svuint32_t mins_and_scales_sve_1 = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc); + const svint32_t scales_sv_1 = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve_1, m4s)); + const svint32_t mins_sv_2 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_pat_b32(SV_VL8), mins_and_scales_sve_1, 4)); + + svint32_t q8sums_sv_2 = svld1sh_s32(svptrue_pat_b32(SV_VL8), y[i].bsums+8); + + svfloat32_t temp = svcvt_f32_s32_x(svptrue_pat_b32(SV_VL8), svadd_s32_x(svptrue_pat_b32(SV_VL8), svmul_s32_x(svptrue_pat_b32(SV_VL8), mins_sv_1, q8sums_sv_1), svmul_s32_x(svptrue_pat_b32(SV_VL8), mins_sv_2, q8sums_sv_2))); + + acc_sum = svmla_f32_m(svptrue_pat_b32(SV_VL8), acc_sum, temp, dmin_broad); + + svint32_t sumi1 = svdup_n_s32(0); + + { + const svuint8_t q2bits_1 = svld1_u8(svptrue_pat_b8(SV_VL32), q2); + svint8_t q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q2bits_1, m3s)); + svint8_t q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + + svint32_t scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv, 0), svdup_lane_s32(scales_sv, 1)); + sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 2), m3s)); + q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + + svint32_t scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv, 2), svdup_lane_s32(scales_sv, 3)); + sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(svdup_n_s32(0), q2bytes_sv, q8bytes_sv), scale_2); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 4), m3s)); + q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + + scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv, 4), svdup_lane_s32(scales_sv, 5)); + sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 6), m3s)); + q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + + scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv, 6), svdup_lane_s32(scales_sv, 7)); + sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2); + + q2 += 32; + + const svuint8_t q2bits_2 = svld1_u8(svptrue_pat_b8(SV_VL32), q2); + q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q2bits_2, m3s)); + q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + + scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 0), svdup_lane_s32(scales_sv_1, 1)); + sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 2), m3s)); + q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); 
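In the 256-bit q2_K case above, one `svdot_s32` over 32 quants spans two 16-quant sub-blocks, so the eight int32 partial sums need two different scales: `svsel` with the 4-lane predicate `pred_s32` splices `scales[2j]` into the low half and `scales[2j+1]` into the high half before the multiply-accumulate. A scalar model (sketch):

```c
#include <stdint.h>

// Scalar model of the svsel()-based scale pairing above: the first four
// int32 lanes take scale 'a', the last four take scale 'b'.
static void mla_paired_scales(int32_t acc[8], const int32_t dots[8], int32_t a, int32_t b) {
    for (int l = 0; l < 8; ++l) {
        acc[l] += dots[l] * (l < 4 ? a : b);
    }
}
```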
q8_sv += 32; + + scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 2), svdup_lane_s32(scales_sv_1, 3)); + sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 4), m3s)); + q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + + scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 4), svdup_lane_s32(scales_sv_1, 5)); + sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1); + + q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 6), m3s)); + q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + + scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 6), svdup_lane_s32(scales_sv_1, 7)); + sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2); + } + acc_sum = svmla_f32_m(svptrue_pat_b32(SV_VL8), acc_sum, svcvt_f32_s32_x(svptrue_pat_b32(SV_VL8), sumi1), d_broad); + } + *s = svaddv_f32(svptrue_pat_b32(SV_VL8), acc_sum); + break; + + default: + assert(false && "Unsupported vector length"); + break; + } + +#elif __ARM_NEON + const uint8x16_t m3 = vdupq_n_u8(0x3); + const uint8x16_t m4 = vdupq_n_u8(0xF); + + const int32x4_t vzero = vdupq_n_s32(0); + + ggml_int8x16x2_t q2bytes; + uint8_t aux[16]; + + float sum = 0; + + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + + const uint8x16_t mins_and_scales = vld1q_u8(sc); + const uint8x16_t scales = vandq_u8(mins_and_scales, m4); + vst1q_u8(aux, scales); + + const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4); + const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); + const ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}}; + const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])), + vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0]))); + const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])), + vmull_s16(vget_high_s16(mins16.val[1]), vget_high_s16(q8sums.val[1]))); + sum += dmin * vaddvq_s32(vaddq_s32(s0, s1)); + + int isum = 0; + int is = 0; + +// We use this macro instead of a function call because for some reason +// the code runs 2-3% slower, even if the function is declared inline +#define MULTIPLY_ACCUM_WITH_SCALE(index)\ + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\ + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)]; + +#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\ + q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\ + q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[0], (shift)), m3));\ + q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\ + MULTIPLY_ACCUM_WITH_SCALE((index)); + + for (int j = 0; j < QK_K/128; ++j) { + const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32; + + ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32; + q2bytes.val[0] = 
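Macro plumbing aside, every q2_K path computes the same per-super-block reduction: each `scales` byte carries a 4-bit scale in the low nibble and a 4-bit min in the high nibble, and the min term folds in through `bsums`. A compact scalar sketch of the reduction (hypothetical helper; quants assumed already unpacked to 0..3):

```c
#include <stdint.h>

// One q2_K super-block: scale in the low nibble, min in the high nibble of
// each scales[] byte; bsums[] holds the q8 side's per-16-element sums.
static float q2k_superblock_dot(const uint8_t scales[16], const uint8_t q2u[256],
                                const int8_t q8[256], const int16_t bsums[16],
                                float d, float dmin) {
    int32_t isum = 0, summs = 0;
    for (int j = 0; j < 16; ++j) {
        int32_t dotj = 0;
        for (int l = 0; l < 16; ++l) {
            dotj += q2u[16*j + l] * q8[16*j + l];
        }
        isum  += (scales[j] & 0xF) * dotj;     // per-sub-block scale
        summs += (scales[j] >>  4) * bsums[j]; // per-sub-block min correction
    }
    return d * isum - dmin * summs;
}
```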
vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3)); + q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3)); + + MULTIPLY_ACCUM_WITH_SCALE(0); + + SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2); + SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4); + SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6); + + is += 8; + } + + sum += d * isum; + } + + *s = sum; + +#else + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + int summs = 0; + for (int j = 0; j < 16; ++j) { + summs += y[i].bsums[j] * (sc[j] >> 4); + } + + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + int isum = 0; + int is = 0; + int d; + for (int k = 0; k < QK_K/128; ++k) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + d = sc[is++] & 0xF; + int isuml = 0; + for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + d = sc[is++] & 0xF; + isuml = 0; + for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + shift += 2; + q8 += 32; + } + q2 += 32; + } + sumf += dall * isum - dmin * summs; + } + *s = sumf; +#endif +} + +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + const block_q3_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_FEATURE_SVE) + + uint32_t aux[3]; + uint32_t utmp[4]; + + const int8_t m32 = 32; + const int vector_length = svcntb()*8; + const svuint8_t m3b_sv = svdup_n_u8(0x3); + const svint32_t vzero_sv = svdup_n_s32(0); + + const svuint8_t m0_sv = svdup_n_u8(1); + const svuint8_t m1_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 1); + const svuint8_t m2_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 2); + const svuint8_t m3_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 3); + + float sum = 0; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT q3_sv = x[i].qs; + const uint8_t * GGML_RESTRICT qh_sv = x[i].hmask; + const int8_t * GGML_RESTRICT q8_sv = y[i].qs; + + // Set up scales + memcpy(aux, x[i].scales, 12); + utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); + utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); + utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); + utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); + + int8_t * scale = (int8_t *)utmp; + + for (int j = 0; j < 16; ++j) scale[j] -= m32; + + switch (vector_length) { + case 128: + { + svuint8_t qhbits_sv_1 = svld1_u8(svptrue_b8(), qh_sv); + svuint8_t qhbits_sv_2 = svld1_u8(svptrue_b8(), qh_sv+16); + svuint8_t q3h_sv; + + svint32_t sumi1_1 = svdup_n_s32(0); + svint8_t q3bytes_sv; + + for (int j = 0; j < QK_K/128; ++j) { + + const svuint8_t q3bits_sv = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16; + const svuint8_t q3bits_sv_1 = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16; + svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_1), 2); + q3bytes_sv = svsub_s8_x(svptrue_b8(), 
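The `kmask1`/`kmask2` shuffle above unpacks 16 six-bit q3_K scales from 12 bytes: the low 4 bits come from the nibbles of `scales[0..7]`, the top 2 bits from `scales[8..11]`, all stored with a bias of 32. An equivalent per-index decode (sketch, derived from the word operations above):

```c
#include <stdint.h>

// Hypothetical per-index form of the q3_K scale decode: 16 six-bit scales
// packed in 12 bytes, biased by 32.
static int8_t q3k_scale(const uint8_t sc[12], int j) { // j in 0..15
    const uint8_t lo = (j < 8) ? (uint8_t)(sc[j] & 0xF) : (uint8_t)(sc[j - 8] >> 4);
    const uint8_t hi = (uint8_t)((sc[8 + (j % 4)] >> (2 * (j / 4))) & 3);
    return (int8_t)((lo | (hi << 4)) - 32);
}
```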
svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0])); + + q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_2), 2); + q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv_1, m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1])); + + q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_1), 1); + q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2])); + + q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_2), 1); + q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3])); + + + scale += 4; + q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_1); + q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0])); + + q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_2); + q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1])); + + + q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; + + q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_1), 1); + q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2])); + + q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_2), 1); + q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3])); + + if (j == 0) { + qhbits_sv_1 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_1, 4); + qhbits_sv_2 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_2, 4); + } + + scale += 4; + } + + sum += d * (svaddv_s32(svptrue_b32(), sumi1_1)); + } break; + case 256: + case 512: + { + svuint8_t qhbits_sv = 
svld1_u8(svptrue_pat_b8(SV_VL32), qh_sv); + svuint8_t q3h_sv; + + svint32_t sumi1_1 = svdup_n_s32(0); + svint8_t q3bytes_sv; + + for (int j = 0; j < QK_K/128; ++j) { + + const svuint8_t q3bits_sv = svld1_u8(svptrue_pat_b8(SV_VL32), q3_sv); q3_sv += 32; + svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + + q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m0_sv, qhbits_sv), 2); + q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + + svint32_t scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1])); + sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1); + + q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m1_sv, qhbits_sv), 1); + q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3])); + sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1); + + scale += 4; + q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; + + q3h_sv = svbic_u8_x(svptrue_pat_b8(SV_VL32), m2_sv, qhbits_sv); + q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1])); + sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1); + + q3h_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m3_sv, qhbits_sv), 1); + q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); + + scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3])); + sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1); + + if (j == 0) { + qhbits_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), qhbits_sv, 4); + } + + scale += 4; + } + + sum += d * (svaddv_s32(svptrue_pat_b32(SV_VL8), sumi1_1)); + } break; + default: + assert(false && "Unsupported vector length"); + break; + } + } + *s = sum; + +#elif __ARM_NEON + + uint32_t aux[3]; + uint32_t utmp[4]; + + const uint8x16_t m3b = vdupq_n_u8(0x3); + const int32x4_t vzero = vdupq_n_s32(0); + + const uint8x16_t m0 = vdupq_n_u8(1); + const uint8x16_t m1 = vshlq_n_u8(m0, 1); + const uint8x16_t m2 = vshlq_n_u8(m0, 2); + const uint8x16_t m3 = vshlq_n_u8(m0, 3); + const int8_t m32 = 32; + + ggml_int8x16x4_t q3bytes; + + float sum = 0; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].hmask; + const int8_t * 
GGML_RESTRICT q8 = y[i].qs; + + ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); + + ggml_uint8x16x4_t q3h; + + int32_t isum = 0; + + // Set up scales + memcpy(aux, x[i].scales, 12); + utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); + utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); + utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); + utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); + + int8_t * scale = (int8_t *)utmp; + for (int j = 0; j < 16; ++j) scale[j] -= m32; + + for (int j = 0; j < QK_K/128; ++j) { + + const ggml_uint8x16x2_t q3bits = ggml_vld1q_u8_x2(q3); q3 += 32; + const ggml_int8x16x4_t q8bytes_1 = ggml_vld1q_s8_x4(q8); q8 += 64; + const ggml_int8x16x4_t q8bytes_2 = ggml_vld1q_s8_x4(q8); q8 += 64; + + q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2); + q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2); + q3h.val[2] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[0]), 1); + q3h.val[3] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[1]), 1); + + q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[0], m3b)), vreinterpretq_s8_u8(q3h.val[0])); + q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[1], m3b)), vreinterpretq_s8_u8(q3h.val[1])); + q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2])); + q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3])); + + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3]; + + scale += 4; + + q3h.val[0] = vbicq_u8(m2, qhbits.val[0]); + q3h.val[1] = vbicq_u8(m2, qhbits.val[1]); + q3h.val[2] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[0]), 1); + q3h.val[3] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[1]), 1); + + q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 4), m3b)), vreinterpretq_s8_u8(q3h.val[0])); + q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 4), m3b)), vreinterpretq_s8_u8(q3h.val[1])); + q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2])); + q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3])); + + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2]; + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3]; + + scale += 4; + + if (j == 0) { + qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 4); + qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 4); + } + + } + sum += d * isum; + + } + + *s = sum; + +#else + // scalar version + // This function is written like this so the compiler can manage to vectorize most of it + // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the + // manually vectorized version above. Every other version I tried would run at least 4 times slower. 
+ // The ideal situation would be if we could just write the code once, and the compiler would + // automatically produce the best possible set of machine instructions, instead of us having to manually + // write vectorized versions for AVX, ARM_NEON, etc. + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + uint32_t auxs[4]; + const int8_t * scales = (const int8_t*)auxs; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + q3 += 32; + } + a = aux8; + + memcpy(auxs, x[i].scales, 12); + uint32_t tmp = auxs[2]; + auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; + +#endif + +} + +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); +#ifdef __ARM_FEATURE_MATMUL_INT8 + assert((nrc == 2) || (nrc == 1)); +#else + assert(nrc == 1); +#endif + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + const block_q4_K * GGML_RESTRICT x0 = x; + const block_q4_K * GGML_RESTRICT x1 = (const block_q4_K *) ((const uint8_t *)vx + bx); + const block_q8_K * GGML_RESTRICT y0 = y; + const block_q8_K * GGML_RESTRICT y1 = (const block_q8_K *) ((const uint8_t *)vy + by); + + const uint8x16_t m4b = vdupq_n_u8(0x0f); + + float32x4_t vfsum = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; ++i, ++x0, ++x1, ++y0, ++y1) { + const uint8_t * GGML_RESTRICT qx0 = x0->qs; + const uint8_t * GGML_RESTRICT qx1 = x1->qs; + const int8_t * GGML_RESTRICT qy0 = y0->qs; + const int8_t * GGML_RESTRICT qy1 = y1->qs; + + // decode scales and mins + int8_t x0_scales[8], x1_scales[8]; + int16x8_t 
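All q3_K paths above reconstruct a quant from two low bits in `qs` plus a third bit in `hmask`, where a set mask bit means "do not subtract 4"; hence the `vbicq_u8` on the masks in the NEON code and the `hm[l] & m ? 0 : 4` in the scalar version. A per-element sketch (hypothetical helper, indexing inferred from the scalar loop above):

```c
#include <stdint.h>

// q3_K quant reconstruction for element l in 0..255: qs[64] holds the 2-bit
// fields, hmask[32] the third bit; a set hmask bit selects the upper half.
static int8_t q3k_quant(const uint8_t qs[64], const uint8_t hmask[32], int l) {
    const int blk = l / 128;          // which 128-element half
    const int g   = (l % 128) / 32;   // which 2-bit field within a byte
    const int pos = l % 32;           // byte index within the half
    const int lo  = (qs[32*blk + pos] >> (2*g)) & 3;
    const int hi  = (hmask[pos] >> (4*blk + g)) & 1;
    return (int8_t)(lo - (hi ? 0 : 4));
}
```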
x0_mins, x1_mins; + { + uint32_t scales_mins[3]; + memcpy(scales_mins, x0->scales, 12); + const uint32_t mins_0_3 = scales_mins[1] & kmask1; + const uint32_t mins_4_7 = ((scales_mins[2] >> 4) & kmask2) | (((scales_mins[1] >> 6) & kmask3) << 4); + const uint32x2_t mins = {mins_0_3, mins_4_7}; + x0_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins))); + uint32_t scales[2]; + scales[0] = scales_mins[0] & kmask1; // scales 0~3 + scales[1] = (scales_mins[2] & kmask2) | (((scales_mins[0] >> 6) & kmask3) << 4); // scales 4~7 + memcpy(x0_scales, scales, 8); + } + { + uint32_t scales_mins[3]; + memcpy(scales_mins, x1->scales, 12); + const uint32_t mins_0_3 = scales_mins[1] & kmask1; + const uint32_t mins_4_7 = ((scales_mins[2] >> 4) & kmask2) | (((scales_mins[1] >> 6) & kmask3) << 4); + const uint32x2_t mins = {mins_0_3, mins_4_7}; + x1_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins))); + uint32_t scales[2]; + scales[0] = scales_mins[0] & kmask1; // scales 0~3 + scales[1] = (scales_mins[2] & kmask2) | (((scales_mins[0] >> 6) & kmask3) << 4); // scales 4~7 + memcpy(x1_scales, scales, 8); + } + + int32x4_t visum = {0}; + + // process 64 data points per iteration, totally 256 data points + for (int j = 0; j < QK_K / 64; ++j, qx0 += 32, qx1 += 32, qy0 += 64, qy1 += 64) { + const int8x16x4_t vy0 = vld1q_s8_x4(qy0); + const int8x16x4_t vy1 = vld1q_s8_x4(qy1); + + int8x16_t vx0[4], vx1[4]; + { + const uint8x16x2_t vv = vld1q_u8_x2(qx0); + vx0[0] = vreinterpretq_s8_u8(vandq_u8(vv.val[0], m4b)); + vx0[1] = vreinterpretq_s8_u8(vandq_u8(vv.val[1], m4b)); + vx0[2] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[0], 4)); + vx0[3] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[1], 4)); + } + { + const uint8x16x2_t vv = vld1q_u8_x2(qx1); + vx1[0] = vreinterpretq_s8_u8(vandq_u8(vv.val[0], m4b)); + vx1[1] = vreinterpretq_s8_u8(vandq_u8(vv.val[1], m4b)); + vx1[2] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[0], 4)); + vx1[3] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[1], 4)); + } + + // process 32 data points (share same block scale) per iteration + for (int k = 0; k < 2; ++k) { + const int blk = j * 2 + k; + const int32x4_t block_scale = { + x0_scales[blk], + x0_scales[blk], + x1_scales[blk], + x1_scales[blk], + }; + + int32x4_t vr = {0}; + for (int l = 0; l < 2; ++l) { + const int idx = k * 2 + l; + const int64x2_t vx0_s64 = vreinterpretq_s64_s8(vx0[idx]); + const int64x2_t vx1_s64 = vreinterpretq_s64_s8(vx1[idx]); + const int64x2_t vy0_s64 = vreinterpretq_s64_s8(vy0.val[idx]); + const int64x2_t vy1_s64 = vreinterpretq_s64_s8(vy1.val[idx]); + const int8x16_t vx_l = vreinterpretq_s8_s64(vzip1q_s64(vx0_s64, vx1_s64)); + const int8x16_t vx_h = vreinterpretq_s8_s64(vzip2q_s64(vx0_s64, vx1_s64)); + const int8x16_t vy_l = vreinterpretq_s8_s64(vzip1q_s64(vy0_s64, vy1_s64)); + const int8x16_t vy_h = vreinterpretq_s8_s64(vzip2q_s64(vy0_s64, vy1_s64)); + vr = vmmlaq_s32(vr, vx_l, vy_l); + vr = vmmlaq_s32(vr, vx_h, vy_h); + } + // apply block scale, will NOT overflow + // block_scale * sum_256(int4*int8) <= 2^(8+8+4+8) = 28 bits + visum = vmlaq_s32(visum, vr, block_scale); + } + } + + // adjust bias, apply superblock scale + { + int32_t bias[4]; + // no obvious uplift from sve sdot-16, just use neon mul add + const int16x8_t y0_sums = vpaddq_s16(vld1q_s16(y0->bsums), vld1q_s16(y0->bsums+8)); + const int16x8_t y1_sums = vpaddq_s16(vld1q_s16(y1->bsums), vld1q_s16(y1->bsums+8)); + bias[0] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y0_sums), vget_low_s16(x0_mins)), + vmull_s16(vget_high_s16(y0_sums), 
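The 12-byte q4_K `scales` field packs eight 6-bit scales and eight 6-bit mins; the `kmask1/2/3` word operations above are a vectorizable form of the per-index decode sketched below (hypothetical helper, consistent with the masks above):

```c
#include <stdint.h>

// Per-index q4_K scale/min decode: eight 6-bit scales and eight 6-bit mins
// packed into 12 bytes.
static void q4k_scale_min(const uint8_t sc[12], int j, uint8_t * d, uint8_t * m) {
    if (j < 4) {
        *d = sc[j]     & 63;
        *m = sc[j + 4] & 63;
    } else {
        *d = (uint8_t)((sc[j + 4] & 0xF) | ((sc[j - 4] >> 6) << 4));
        *m = (uint8_t)((sc[j + 4] >>  4) | ((sc[j    ] >> 6) << 4));
    }
}
```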
vget_high_s16(x0_mins)))); + bias[1] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x0_mins)), + vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x0_mins)))); + bias[2] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y0_sums), vget_low_s16(x1_mins)), + vmull_s16(vget_high_s16(y0_sums), vget_high_s16(x1_mins)))); + bias[3] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x1_mins)), + vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x1_mins)))); + const float32x4_t dmins = { + GGML_CPU_FP16_TO_FP32(x0->dmin) * y0->d, + GGML_CPU_FP16_TO_FP32(x0->dmin) * y1->d, + GGML_CPU_FP16_TO_FP32(x1->dmin) * y0->d, + GGML_CPU_FP16_TO_FP32(x1->dmin) * y1->d, + }; + vfsum = vmlsq_f32(vfsum, vcvtq_f32_s32(vld1q_s32(bias)), dmins); + + const float32x4_t superblock_scale = { + GGML_CPU_FP16_TO_FP32(x0->d) * y0->d, + GGML_CPU_FP16_TO_FP32(x0->d) * y1->d, + GGML_CPU_FP16_TO_FP32(x1->d) * y0->d, + GGML_CPU_FP16_TO_FP32(x1->d) * y1->d, + }; + vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale); + } + } + + // vfsum = ABCD -> ACBD + // AC -> s, BD -> (s+bs) + vfsum = vzip1q_f32(vfsum, vextq_f32(vfsum, vfsum, 2)); + vst1_f32(s, vget_low_f32 (vfsum)); + vst1_f32(s + bs, vget_high_f32(vfsum)); + + return; + } +#endif + +#ifdef __ARM_FEATURE_SVE + float sumf = 0; + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); + + memcpy(utmp, x[i].scales, K_SCALE_SIZE); + + uint32x2_t mins8 = { 0 }; + mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0); + mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1); + + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[0] &= kmask1; + + const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8))); + const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)), + vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins))); + sumf -= dmin * vaddvq_s32(prod); + + const uint8_t * scales = (const uint8_t *)utmp; + + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const int vector_length = ggml_cpu_get_sve_cnt()*8; + const svuint8_t m4b = svdup_n_u8(0xf); + const svint32_t mzero = svdup_n_s32(0); + svint32_t sumi1 = svdup_n_s32(0); + svint32_t sumi1_1 = svdup_n_s32(0); + svint32_t sumi1_2 = svdup_n_s32(0); + svint32_t sumi2 = svdup_n_s32(0); + svint32_t sumi2_1 = svdup_n_s32(0); + svint32_t sumi2_2 = svdup_n_s32(0); + switch (vector_length) { + case 128: + { + for (int j = 0; j < QK_K/64; ++j) { + svint8_t q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4), m4b)); + svint8_t q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16; + sumi1_1 = svmla_n_s32_x(svptrue_b32(), sumi1_1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]); + q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4+16), m4b)); + q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16; + sumi1_2 = svmla_n_s32_x(svptrue_b32(), sumi1_2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]); + + q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4), 4)); + q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16; + sumi2_1 = svmla_n_s32_x(svptrue_b32(), sumi2_1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]); + q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_b8(), 
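After the `nrc == 2` loop, `vfsum` holds the four products `{x0.y0, x0.y1, x1.y0, x1.y1}`; the `vextq_f32`/`vzip1q_f32` pair reorders this ABCD to ACBD so one two-lane store writes the y0 column at `s` and the other writes the y1 column at `s + bs`. A scalar equivalent of the store (sketch):

```c
#include <stddef.h>

// Scalar equivalent of the ABCD -> ACBD shuffle and split store above;
// v = {x0.y0, x0.y1, x1.y0, x1.y1}.
static void store_2x2(float * s, size_t bs, const float v[4]) {
    s[0]      = v[0];   // x0 . y0
    s[1]      = v[2];   // x1 . y0
    s[bs + 0] = v[1];   // x0 . y1
    s[bs + 1] = v[3];   // x1 . y1
}
```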
svld1_u8(svptrue_b8(), q4+16), 4)); + q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16; + sumi2_2 = svmla_n_s32_x(svptrue_b32(), sumi2_2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]); + q4 += 32; + } + sumi1 = svadd_s32_x(svptrue_b32(), sumi1_1, sumi1_2); + sumi2 = svadd_s32_x(svptrue_b32(), sumi2_1, sumi2_2); + sumf += d * (svaddv_s32(svptrue_b32(), svadd_s32_x(svptrue_b32(), sumi1, sumi2))); + } break; + case 256: + case 512: + { + for (int j = 0; j < QK_K/64; ++j) { + const svuint8_t q4bits = svld1_u8(svptrue_pat_b8(SV_VL32), q4); q4 += 32; + svint8_t q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_pat_b8(SV_VL32), q4bits, m4b)); + svint8_t q8bytes = svld1_s8(svptrue_pat_b8(SV_VL32), q8); q8 += 32; + sumi1 = svmla_n_s32_x(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]); + + q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q4bits, 4)); + q8bytes = svld1_s8(svptrue_pat_b8(SV_VL32), q8); q8 += 32; + sumi2 = svmla_n_s32_x(svptrue_pat_b32(SV_VL8), sumi2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]); + } + sumf += d * (svaddv_s32(svptrue_pat_b32(SV_VL8), svadd_s32_x(svptrue_pat_b32(SV_VL8), sumi1, sumi2))); + } break; + default: + assert(false && "Unsupported vector length"); + break; + } + } + *s = sumf; +#elif defined __ARM_NEON + const uint8x16_t m4b = vdupq_n_u8(0xf); + const int32x4_t mzero = vdupq_n_s32(0); + + ggml_int8x16x2_t q4bytes; + ggml_int8x16x2_t q8bytes; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); + + memcpy(utmp, x[i].scales, 12); + + uint32x2_t mins8 = { 0 }; + mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0); + mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1); + + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[0] &= kmask1; + + const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8))); + const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)), + vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins))); + sumf -= dmin * vaddvq_s32(prod); + + const uint8_t * scales = (const uint8_t *)utmp; + + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + int32_t sumi1 = 0; + int32_t sumi2 = 0; + + for (int j = 0; j < QK_K/64; ++j) { + const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32; + + q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32; + q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); + q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); + + const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); + sumi1 += vaddvq_s32(p1) * scales[2*j+0]; + + q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32; + q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); + q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); + + const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); + + sumi2 += vaddvq_s32(p2) * scales[2*j+1]; + } + + sumf += d * (sumi1 + sumi2); + + } + + *s = sumf; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + 
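+    // Portable fallback: expand each 256-value superblock to plain int8 in aux8,
+    // accumulate q8*aux8 products across 8 int32 lanes (aux32), and only touch
+    // floating point once per superblock when folding in d and dmin.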
int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + a += 32; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + a += 32; q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + + +#ifdef __ARM_NEON + const uint8x16_t m4b = vdupq_n_u8(0xf); + const uint8x16_t mone = vdupq_n_u8(1); + const uint8x16_t mtwo = vdupq_n_u8(2); + const int32x4_t mzero = vdupq_n_s32(0); + + ggml_int8x16x4_t q5bytes; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + const uint8x8_t mins8 = vld1_u8((const uint8_t*)utmp + 8); + const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(mins8)); + const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)), + vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins))); + int32_t sumi_mins = vaddvq_s32(prod); + + const uint8_t * scales = (const uint8_t *)utmp; + + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + ggml_uint8x16x2_t qhbits = 
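+        // q5_K scalar view: q5 = (qs & 0xf) | (((qh >> plane) & 1) << 4). mone/mtwo
+        // select two consecutive qh bit planes per 64-value step and vshlq places
+        // them at bit 4; qhbits is shifted right by 2 below to expose the next pair.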
ggml_vld1q_u8_x2(qh); + + ggml_uint8x16x4_t q5h; + + int32_t sumi = 0; + + for (int j = 0; j < QK_K/64; ++j) { + + const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5); q5 += 32; + const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64; + + q5h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4); + q5h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4); + q5h.val[2] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[0]), 3); + q5h.val[3] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[1]), 3); + qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 2); + qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 2); + + q5bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.val[0], m4b), q5h.val[0])); + q5bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.val[1], m4b), q5h.val[1])); + q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2])); + q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3])); + + sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++; + sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++; + } + + sumf += d * sumi - dmin * sumi_mins; + } + + *s = sumf; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 
16 : 0); + a += 32; m <<= 1; + q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); +#ifdef __ARM_FEATURE_MATMUL_INT8 + assert((nrc == 2) || (nrc == 1)); +#else + assert(nrc == 1); +#endif + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + const block_q6_K * GGML_RESTRICT x0 = x; + const block_q6_K * GGML_RESTRICT x1 = (const block_q6_K *) ((const uint8_t *)vx + bx); + const block_q8_K * GGML_RESTRICT y0 = y; + const block_q8_K * GGML_RESTRICT y1 = (const block_q8_K *) ((const uint8_t *)vy + by); + + float32x4_t vfsum = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; ++i, ++x0, ++x1, ++y0, ++y1) { + const uint8_t * GGML_RESTRICT ql0 = x0->ql; + const uint8_t * GGML_RESTRICT ql1 = x1->ql; + const uint8_t * GGML_RESTRICT qh0 = x0->qh; + const uint8_t * GGML_RESTRICT qh1 = x1->qh; + const int8_t * GGML_RESTRICT qy0 = y0->qs; + const int8_t * GGML_RESTRICT qy1 = y1->qs; + + const uint8x16_t mone = vdupq_n_u8(0x30); + const uint8x16_t m4b = vdupq_n_u8(0x0f); + + int32x4_t visum = vdupq_n_s32(0); + + // process 8 blocks per iteration, totally 16 blocks + for (int j = 0; j < 2; ++j, qh0 += 32, ql0 += 64, qh1 += 32, ql1 += 64) { + int8x16_t vx0[8], vx1[8]; + + // de-quantize vx0[8] + { + const uint8x16x2_t qh_bits = vld1q_u8_x2(qh0); + const uint8x16x4_t ql_bits = vld1q_u8_x4(ql0); + + uint8x16_t q6h_0 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 4)); + uint8x16_t q6h_1 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 4)); + uint8x16_t q6h_2 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 2)); + uint8x16_t q6h_3 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 2)); + + vx0[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[0], m4b), q6h_0)); + vx0[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[1], m4b), q6h_1)); + vx0[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[2], m4b), q6h_2)); + vx0[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[3], m4b), q6h_3)); + + q6h_0 = vandq_u8(mone, qh_bits.val[0]); + q6h_1 = vandq_u8(mone, 
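+                    // q6_K scalar view: q6 = (ql & 0xf) | (((qh >> 2k) & 3) << 4), with the
+                    // real value being q6 - 32. mone == 0x30 keeps the two high bits after
+                    // qh is shifted into place; the -32 offset is not applied here but is
+                    // folded into the bias (32 * scales.bsums) subtracted after accumulation.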
qh_bits.val[1]); + q6h_2 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[0], 2)); + q6h_3 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[1], 2)); + + vx0[4] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[0], 4), q6h_0)); + vx0[5] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[1], 4), q6h_1)); + vx0[6] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[2], 4), q6h_2)); + vx0[7] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[3], 4), q6h_3)); + } + + // de-quantize vx1[8] + { + const uint8x16x2_t qh_bits = vld1q_u8_x2(qh1); + const uint8x16x4_t ql_bits = vld1q_u8_x4(ql1); + + uint8x16_t q6h_0 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 4)); + uint8x16_t q6h_1 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 4)); + uint8x16_t q6h_2 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 2)); + uint8x16_t q6h_3 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 2)); + + vx1[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[0], m4b), q6h_0)); + vx1[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[1], m4b), q6h_1)); + vx1[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[2], m4b), q6h_2)); + vx1[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[3], m4b), q6h_3)); + + q6h_0 = vandq_u8(mone, qh_bits.val[0]); + q6h_1 = vandq_u8(mone, qh_bits.val[1]); + q6h_2 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[0], 2)); + q6h_3 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[1], 2)); + + vx1[4] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[0], 4), q6h_0)); + vx1[5] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[1], 4), q6h_1)); + vx1[6] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[2], 4), q6h_2)); + vx1[7] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[3], 4), q6h_3)); + } + + // process 16 elements (one block with same scale) per iteration + // - vx = concat(ql, qh) - 32 + // - r1,r2,r3,r4 = smmla(vx, vy) + for (int k = 0; k < 8; ++k) { + const int blk = j * 8 + k; + + const int8x16_t vy0 = vld1q_s8(qy0); + const int8x16_t vy1 = vld1q_s8(qy1); + qy0 += 16; + qy1 += 16; + + const int32x4_t block_scale = { + x0->scales[blk], + x0->scales[blk], + x1->scales[blk], + x1->scales[blk], + }; + + // calculate four results at once with outer product + const int8x16_t vx_l = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(vx0[k]), vreinterpretq_s64_s8(vx1[k]))); + const int8x16_t vx_h = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(vx0[k]), vreinterpretq_s64_s8(vx1[k]))); + const int8x16_t vy_l = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(vy0), vreinterpretq_s64_s8(vy1))); + const int8x16_t vy_h = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(vy0), vreinterpretq_s64_s8(vy1))); + int32x4_t vr = vdupq_n_s32(0); + vr = vmmlaq_s32(vr, vx_l, vy_l); + vr = vmmlaq_s32(vr, vx_h, vy_h); + + // apply block scale, will NOT overflow + // block_scale * sum_256(int6*int8) <= 2^(8+8+6+8) = 30 bits + visum = vmlaq_s32(visum, vr, block_scale); + } + } + + // adjust bias, apply superblock scale + { + int32_t bias[4]; +#ifdef __ARM_FEATURE_SVE + const svbool_t pg16_8 = svptrue_pat_b16(SV_VL8); + const svbool_t pg8_8 = svptrue_pat_b8(SV_VL8); + const svint16_t y0_q8sums_0 = svld1_s16(pg16_8, y0->bsums); + const svint16_t y0_q8sums_1 = svld1_s16(pg16_8, y0->bsums + 8); + const svint16_t y1_q8sums_0 = svld1_s16(pg16_8, y1->bsums); + const svint16_t y1_q8sums_1 = svld1_s16(pg16_8, y1->bsums + 8); + const svint16_t x0_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x0->scales)); + const svint16_t x0_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x0->scales + 8)); + const svint16_t x1_q6scales_0 = 
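+                // svdot_s64 accumulates four int16*int16 products into each 64-bit lane,
+                // so a single dot plus a horizontal add yields sum_b(bsum_b * scale_b)
+                // for 8 sub-blocks at a time; NEON has no 16-bit dot product, hence the
+                // widen-and-multiply fallback in the #else branch below.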
svunpklo_s16(svld1_s8(pg8_8, x1->scales)); + const svint16_t x1_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x1->scales + 8)); + const svint64_t zero = svdup_n_s64(0); + bias[0] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x0_q6scales_0), + svdot_s64(zero, y0_q8sums_1, x0_q6scales_1))); + bias[1] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x0_q6scales_0), + svdot_s64(zero, y1_q8sums_1, x0_q6scales_1))); + bias[2] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x1_q6scales_0), + svdot_s64(zero, y0_q8sums_1, x1_q6scales_1))); + bias[3] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x1_q6scales_0), + svdot_s64(zero, y1_q8sums_1, x1_q6scales_1))); +#else + // NEON doesn't support int16 dot product, fallback to separated mul and add + const int16x8x2_t q8sums0 = vld1q_s16_x2(y0->bsums); + const int16x8x2_t q8sums1 = vld1q_s16_x2(y1->bsums); + + int8x16_t scales_s8 = vld1q_s8(x0->scales); + const int16x8x2_t q6scales0 = {{vmovl_s8(vget_low_s8(scales_s8)), vmovl_s8(vget_high_s8(scales_s8))}}; + scales_s8 = vld1q_s8(x1->scales); + const int16x8x2_t q6scales1 = {{vmovl_s8(vget_low_s8(scales_s8)), vmovl_s8(vget_high_s8(scales_s8))}}; + + int32x4_t prod; + prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[0]), vget_low_s16 (q6scales0.val[0])), + vmull_s16(vget_high_s16(q8sums0.val[0]), vget_high_s16(q6scales0.val[0]))), + vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[1]), vget_low_s16 (q6scales0.val[1])), + vmull_s16(vget_high_s16(q8sums0.val[1]), vget_high_s16(q6scales0.val[1])))); + bias[0] = vaddvq_s32(prod); + prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[0]), vget_low_s16 (q6scales0.val[0])), + vmull_s16(vget_high_s16(q8sums1.val[0]), vget_high_s16(q6scales0.val[0]))), + vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[1]), vget_low_s16 (q6scales0.val[1])), + vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales0.val[1])))); + bias[1] = vaddvq_s32(prod); + prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[0]), vget_low_s16 (q6scales1.val[0])), + vmull_s16(vget_high_s16(q8sums0.val[0]), vget_high_s16(q6scales1.val[0]))), + vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[1]), vget_low_s16 (q6scales1.val[1])), + vmull_s16(vget_high_s16(q8sums0.val[1]), vget_high_s16(q6scales1.val[1])))); + bias[2] = vaddvq_s32(prod); + prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[0]), vget_low_s16 (q6scales1.val[0])), + vmull_s16(vget_high_s16(q8sums1.val[0]), vget_high_s16(q6scales1.val[0]))), + vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[1]), vget_low_s16 (q6scales1.val[1])), + vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales1.val[1])))); + bias[3] = vaddvq_s32(prod); + +#endif + const int32x4_t vibias = vmulq_n_s32(vld1q_s32(bias), 32); + + const float32x4_t superblock_scale = { + GGML_CPU_FP16_TO_FP32(x0->d) * y0->d, + GGML_CPU_FP16_TO_FP32(x0->d) * y1->d, + GGML_CPU_FP16_TO_FP32(x1->d) * y0->d, + GGML_CPU_FP16_TO_FP32(x1->d) * y1->d, + }; + + visum = vsubq_s32(visum, vibias); + vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale); + } + } + + // vfsum = ABCD -> ACBD + // AC -> s, BD -> (s+bs) + vfsum = vzip1q_f32(vfsum, vextq_f32(vfsum, vfsum, 2)); + vst1_f32(s, vget_low_f32 (vfsum)); + vst1_f32(s + bs, vget_high_f32(vfsum)); + + return; + } +#endif + +#ifdef __ARM_FEATURE_SVE + const int vector_length = ggml_cpu_get_sve_cnt()*8; + float sum = 0; + svuint8_t m4b = 
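+    // vector_length is the SVE register width in bits; the 128-bit case mirrors the
+    // NEON data layout with 16-byte predicated loads, while the 256/512-bit cases
+    // consume a whole 32-byte row per load and carry per-lane scale vectors instead
+    // of scalar multipliers.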
svdup_n_u8(0xf); + svint32_t vzero = svdup_n_s32(0); + svuint8_t mone = svdup_n_u8(0x30); + svint8_t q6bytes_1, q6bytes_2, q6bytes_3, q6bytes_4; + svuint8_t q6h_1, q6h_2, q6h_3, q6h_4; + + for (int i = 0; i < nb; ++i) { + const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const int8_t * GGML_RESTRICT scale = x[i].scales; + + const svbool_t pg16_8 = svptrue_pat_b16(SV_VL8); + const svint16_t q8sums_1 = svld1_s16(pg16_8, y[i].bsums); + const svint16_t q8sums_2 = svld1_s16(pg16_8, y[i].bsums + 8); + const svint16_t q6scales_1 = svunpklo_s16(svld1_s8(svptrue_pat_b8(SV_VL8), scale)); + const svint16_t q6scales_2 = svunpklo_s16(svld1_s8(svptrue_pat_b8(SV_VL8), scale + 8)); + const svint64_t prod = svdup_n_s64(0); + int32_t isum_mins = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(prod, q8sums_1, q6scales_1), + svdot_s64(prod, q8sums_2, q6scales_2))); + int32_t isum = 0; + + switch (vector_length) { + case 128: + { + const svbool_t pg32_4 = svptrue_pat_b32(SV_VL4); + const svbool_t pg8_16 = svptrue_pat_b8(SV_VL16); + svint32_t isum_tmp = svdup_n_s32(0); + for (int j = 0; j < QK_K/128; ++j) { + svuint8_t qhbits_1 = svld1_u8(pg8_16, qh); + svuint8_t qhbits_2 = svld1_u8(pg8_16, qh+16); + qh += 32; + svuint8_t q6bits_1 = svld1_u8(pg8_16, q6); + svuint8_t q6bits_2 = svld1_u8(pg8_16, q6+16); + svuint8_t q6bits_3 = svld1_u8(pg8_16, q6+32); + svuint8_t q6bits_4 = svld1_u8(pg8_16, q6+48); + q6 += 64; + svint8_t q8bytes_1 = svld1_s8(pg8_16, q8); + svint8_t q8bytes_2 = svld1_s8(pg8_16, q8+16); + svint8_t q8bytes_3 = svld1_s8(pg8_16, q8+32); + svint8_t q8bytes_4 = svld1_s8(pg8_16, q8+48); + q8 += 64; + + q6h_1 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_1, 4)); + q6h_2 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_2, 4)); + q6h_3 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_1, 2)); + q6h_4 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_2, 2)); + q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_1, m4b), q6h_1)); + q6bytes_2 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_2, m4b), q6h_2)); + q6bytes_3 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_3, m4b), q6h_3)); + q6bytes_4 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_4, m4b), q6h_4)); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_1, q8bytes_1), scale[0]); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_2, q8bytes_2), scale[1]); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_3, q8bytes_3), scale[2]); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_4, q8bytes_4), scale[3]); + + scale += 4; + q8bytes_1 = svld1_s8(pg8_16, q8); + q8bytes_2 = svld1_s8(pg8_16, q8+16); + q8bytes_3 = svld1_s8(pg8_16, q8+32); + q8bytes_4 = svld1_s8(pg8_16, q8+48); + q8 += 64; + + q6h_1 = svand_u8_x(pg16_8, mone, qhbits_1); + q6h_2 = svand_u8_x(pg16_8, mone, qhbits_2); + q6h_3 = svand_u8_x(pg16_8, mone, svlsr_n_u8_x(pg16_8, qhbits_1, 2)); + q6h_4 = svand_u8_x(pg16_8, mone, svlsr_n_u8_x(pg16_8, qhbits_2, 2)); + q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_1, 4), q6h_1)); + q6bytes_2 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_2, 4), q6h_2)); + q6bytes_3 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_3, 4), q6h_3)); + q6bytes_4 = 
svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_4, 4), q6h_4)); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_1, q8bytes_1), scale[0]); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_2, q8bytes_2), scale[1]); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_3, q8bytes_3), scale[2]); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_4, q8bytes_4), scale[3]); + scale += 4; + } + isum += svaddv_s32(pg32_4, isum_tmp); + sum += d_all * y[i].d * (isum - 32 * isum_mins); + } + break; + case 256: + case 512: + { + const svbool_t pg8_2 = svptrue_pat_b8(SV_VL2); + const svbool_t pg32_8 = svptrue_pat_b32(SV_VL8); + const svbool_t pg8_32 = svptrue_pat_b8(SV_VL32); + svint32_t isum_tmp = svdup_n_s32(0); + for (int j = 0; j < QK_K/128; j++) { + svuint8_t qhbits_1 = svld1_u8(pg8_32, qh); + qh += 32; + svuint8_t q6bits_1 = svld1_u8(pg8_32, q6); + svuint8_t q6bits_2 = svld1_u8(pg8_32, q6+32); + q6 += 64; + svint8_t q8bytes_1 = svld1_s8(pg8_32, q8); + svint8_t q8bytes_2 = svld1_s8(pg8_32, q8+32); + svint8_t q8bytes_3 = svld1_s8(pg8_32, q8+64); + svint8_t q8bytes_4 = svld1_s8(pg8_32, q8+96); + q8 += 128; + q6h_1 = svand_u8_x(pg8_32, mone, svlsl_n_u8_x(pg8_32, qhbits_1, 4)); + q6h_2 = svand_u8_x(pg8_32, mone, svlsl_n_u8_x(pg8_32, qhbits_1, 2)); + q6h_3 = svand_u8_x(pg8_32, mone, qhbits_1); + q6h_4 = svand_u8_x(pg8_32, mone, svlsr_n_u8_x(pg8_32, qhbits_1, 2)); + q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svand_u8_x(pg8_32, q6bits_1, m4b), q6h_1)); + q6bytes_2 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svand_u8_x(pg8_32, q6bits_2, m4b), q6h_2)); + q6bytes_3 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svlsr_n_u8_x(pg8_32, q6bits_1, 4), q6h_3)); + q6bytes_4 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svlsr_n_u8_x(pg8_32, q6bits_2, 4), q6h_4)); + + svint8_t scale_lane_1_tmp = svld1_s8(pg8_2, scale); + scale_lane_1_tmp= svzip1_s8(scale_lane_1_tmp, scale_lane_1_tmp); + scale_lane_1_tmp= svzip1_s8(scale_lane_1_tmp, scale_lane_1_tmp); + svint8_t scale_lane_2_tmp = svld1_s8(pg8_2, scale+2); + scale_lane_2_tmp = svzip1_s8(scale_lane_2_tmp, scale_lane_2_tmp); + scale_lane_2_tmp = svzip1_s8(scale_lane_2_tmp, scale_lane_2_tmp); + svint8_t scale_lane_3_tmp = svld1_s8(pg8_2, scale+4); + scale_lane_3_tmp = svzip1_s8(scale_lane_3_tmp, scale_lane_3_tmp); + scale_lane_3_tmp = svzip1_s8(scale_lane_3_tmp, scale_lane_3_tmp); + svint8_t scale_lane_4_tmp = svld1_s8(pg8_2, scale+6); + scale_lane_4_tmp = svzip1_s8(scale_lane_4_tmp, scale_lane_4_tmp); + scale_lane_4_tmp = svzip1_s8(scale_lane_4_tmp, scale_lane_4_tmp); + svint32_t scale_lane_1 = svunpklo_s32(svunpklo_s16(scale_lane_1_tmp)); + svint32_t scale_lane_2 = svunpklo_s32(svunpklo_s16(scale_lane_2_tmp)); + svint32_t scale_lane_3 = svunpklo_s32(svunpklo_s16(scale_lane_3_tmp)); + svint32_t scale_lane_4 = svunpklo_s32(svunpklo_s16(scale_lane_4_tmp)); + + isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_1, q8bytes_1), scale_lane_1); + isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_2, q8bytes_2), scale_lane_2); + isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_3, q8bytes_3), scale_lane_3); + isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_4, q8bytes_4), scale_lane_4); + scale += 8; + } + isum += svaddv_s32(pg32_8, isum_tmp); + sum += d_all * y[i].d * (isum - 32 * isum_mins); + } + break; + default: + assert(false && "Unsupported vector length"); + break; + } + } + + *s = sum; + +#elif __ARM_NEON + float sum = 
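+    // The commented-out m32s lines below are the alternative that subtracts the
+    // q6_K +32 offset from every dequantized byte; this version keeps the biased
+    // values and removes the offset once per superblock via isum_mins
+    // (sum += d_all * y[i].d * (isum - 32 * isum_mins)), saving a vsubq_s8 per 16 values.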
0; + + const uint8x16_t m4b = vdupq_n_u8(0xF); + const int32x4_t vzero = vdupq_n_s32(0); + //const int8x16_t m32s = vdupq_n_s8(32); + + const uint8x16_t mone = vdupq_n_u8(3); + + ggml_int8x16x4_t q6bytes; + ggml_uint8x16x4_t q6h; + + for (int i = 0; i < nb; ++i) { + + const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const int8_t * GGML_RESTRICT scale = x[i].scales; + + const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); + const int8x16_t scales = vld1q_s8(scale); + const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}}; + + const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])), + vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))), + vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[1]), vget_low_s16 (q6scales.val[1])), + vmull_s16(vget_high_s16(q8sums.val[1]), vget_high_s16(q6scales.val[1])))); + int32_t isum_mins = vaddvq_s32(prod); + + int32_t isum = 0; + + for (int j = 0; j < QK_K/128; ++j) { + + ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); qh += 32; + ggml_uint8x16x4_t q6bits = ggml_vld1q_u8_x4(q6); q6 += 64; + ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64; + + q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4); + q6h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4); + uint8x16_t shifted = vshrq_n_u8(qhbits.val[0], 2); + q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + shifted = vshrq_n_u8(qhbits.val[1], 2); + q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + + //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])), m32s); + //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])), m32s); + //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2])), m32s); + //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3])), m32s); + q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])); + q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])); + q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2])); + q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3])); + + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; + + scale += 4; + + q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64; + + shifted = vshrq_n_u8(qhbits.val[0], 4); + q6h.val[0] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + shifted = vshrq_n_u8(qhbits.val[1], 4); + q6h.val[1] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + shifted = vshrq_n_u8(qhbits.val[0], 6); + q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + shifted = vshrq_n_u8(qhbits.val[1], 6); + q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + + //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0])), m32s); + //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1])), m32s); + //q6bytes.val[2] = 
vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2])), m32s); + //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3])), m32s); + q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0])); + q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1])); + q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2])); + q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3])); + + isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + + vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; + scale += 4; + } + //sum += isum * d_all * y[i].d; + sum += d_all * y[i].d * (isum - 32 * isum_mins); + + } + *s = sum; +#else + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + } + a += 128; + q4 += 64; + qh += 32; + } + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/16; ++j) { + int scale = x[i].scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +#if defined (__ARM_NEON) +static const int8_t keven_signs_q2xs[1024] = { + 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, + 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, + 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1, + 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1, + 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, + 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1, + 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1, + 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, + 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, + 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, 
-1, 1, 1, + 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, + 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, + 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1, + 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1, + 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1, + 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1, + 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, + 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, + 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1, + 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, + 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1, + 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1, + 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1, + 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, + 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1, + 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1, + 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1, + 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, + 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1, + 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, + 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, + 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, +}; +#endif + +void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[4]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + ggml_int8x16x4_t q2u; + ggml_int8x16x4_t q2s; + ggml_int8x16x4_t q8b; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + float sumf1 = 0, sumf2 = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; + memcpy(aux32, q2, 
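+            // Block layout: aux32[0] holds four byte indices into iq2xxs_grid (eight
+            // int8 magnitudes per entry) and aux32[1] holds four 7-bit indices into the
+            // even-parity sign table plus a 4-bit scale in bits 28..31. Note that
+            // (0.5f + s) * 0.25f == 0.125f * (2*s + 1), matching the scalar ls below.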
4*sizeof(uint32_t)); q2 += 8; + q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 0])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 1]))); + q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 2])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 3]))); + q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 8])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 9]))); + q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[10])), vld1_s8((const void *)(iq2xxs_grid + aux8[11]))); + q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127)))); + q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127)))); + q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 7) & 127)))); + q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 21) & 127)))); + q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]); + q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]); + q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]); + q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]); + const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]), q2u.val[1], q8b.val[1]); + const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]), q2u.val[3], q8b.val[3]); + sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[1] >> 28)); + sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[3] >> 28)); + } + sumf += d*(sumf1 + sumf2); + } + *s = 0.25f * sumf; + +#else + + uint32_t aux32[2]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(aux32, q2, 2*sizeof(uint32_t)); + q2 += 4; + const uint32_t ls = 2*(aux32[1] >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); + const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? 
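+                    // ksigns_iq2xs[i] is i with bit 7 chosen so the popcount is even, and
+                    // kmask_iq2xs[j] == (1 << j): bit j of the sign byte negates lane j.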
-1 : 1); + } + q8 += 8; + } + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} + +void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + ggml_int8x16x4_t q2u; + ggml_int8x16x4_t q2s; + ggml_int8x16x4_t q8b; + + int32x4x4_t scales32; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const uint8x8_t scales8 = vld1_u8(x[i].scales); + const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf)); + const uint8x8_t scales_h = vshr_n_u8(scales8, 4); + uint8x16_t scales = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h)); + scales = vaddq_u8(vshlq_n_u8(scales, 1), vdupq_n_u8(1)); + const uint16x8_t scales1 = vmovl_u8(vget_low_u8(scales)); + const uint16x8_t scales2 = vmovl_u8(vget_high_u8(scales)); + scales32.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales1))); + scales32.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales1))); + scales32.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales2))); + scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2))); + int32x4_t sumi = vdupq_n_s32(0); + for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; + q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511)))); + q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511)))); + q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511)))); + q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[6] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[7] & 511)))); + q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[0] >> 9))), vld1_s8((const void *)(signs64 + (q2[1] >> 9)))); + q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[2] >> 9))), vld1_s8((const void *)(signs64 + (q2[3] >> 9)))); + q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[4] >> 9))), vld1_s8((const void *)(signs64 + (q2[5] >> 9)))); + q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[6] >> 9))), vld1_s8((const void *)(signs64 + (q2[7] >> 9)))); + q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]); + q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]); + q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]); + q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]); + const int32x4_t p1 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]); + const int32x4_t p2 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[1], q8b.val[1]); + const int32x4_t p3 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]); + const int32x4_t p4 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[3], q8b.val[3]); + const int32x4_t p = vpaddq_s32(vpaddq_s32(p1, p2), vpaddq_s32(p3, p4)); + sumi = vmlaq_s32(sumi, p, scales32.val[ib64]); + q2 += 8; + } + sumf += d*vaddvq_s32(sumi); + } + *s = 0.125f * sumf; + +#else + + float sumf = 0.f; + for 
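+    // iq2_xs stores one 16-bit word per 8 values: bits 0..8 index the 512-entry grid
+    // (q2[l] & 511) and bits 9..15 index the sign table (q2[l] >> 9); the per-32
+    // scales move out to x[i].scales as two 4-bit fields per byte, ls = 2*s + 1 as before.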
(int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1; + const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls1; + sumi = 0; + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls2; + q2 += 4; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} + +void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; + + const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1); + const uint8x16_t mask2 = vld1q_u8(k_mask2); + const uint8x16_t m1 = vdupq_n_u8(1); + const int32x4_t vzero = vdupq_n_s32(0); + + uint8x16x2_t vs; + ggml_int8x16x4_t q2s; + ggml_int8x16x4_t q8b; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + int sumi1 = 0, sumi2 = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; + q2s.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[0] | ((qh[ib32+0] << 8) & 0x300)))), + vld1_s8((const int8_t *)(iq2s_grid + (qs[1] | ((qh[ib32+0] << 6) & 0x300))))); + q2s.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[2] | ((qh[ib32+0] << 4) & 0x300)))), + vld1_s8((const int8_t *)(iq2s_grid + (qs[3] | ((qh[ib32+0] << 2) & 0x300))))); + q2s.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[4] | ((qh[ib32+1] << 8) & 0x300)))), + vld1_s8((const int8_t *)(iq2s_grid + (qs[5] | ((qh[ib32+1] << 6) & 0x300))))); + q2s.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[6] | ((qh[ib32+1] << 4) & 0x300)))), + vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300))))); + qs += 8; + + vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16))); + vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); + vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], 
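+            // Sign expansion without precomputed +/-1 rows: the 32 sign bits are
+            // broadcast to every lane, tbl (mask1) replicates source byte j across lanes
+            // 8j..8j+7, AND with the per-lane bit in mask2 isolates one bit per lane, and
+            // vceqq turns "bit set" into 0xFF. OR with 1 then maps {0x00, 0xFF} to
+            // {+1, -1} as int8 multipliers.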
mask1.val[0]), mask2); + vs.val[0] = vceqq_u8(vs.val[0], mask2); + vs.val[1] = vceqq_u8(vs.val[1], mask2); + + q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]); + q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]); + + vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16))); + vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); + vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); + vs.val[0] = vceqq_u8(vs.val[0], mask2); + vs.val[1] = vceqq_u8(vs.val[1], mask2); + + signs += 4; + + q2s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[2]); + q2s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[3]); + + const int32x4_t p1 = ggml_vdotq_s32(vzero, q2s.val[0], q8b.val[0]); + const int32x4_t p2 = ggml_vdotq_s32(vzero, q2s.val[1], q8b.val[1]); + const int32x4_t p3 = ggml_vdotq_s32(vzero, q2s.val[2], q8b.val[2]); + const int32x4_t p4 = ggml_vdotq_s32(vzero, q2s.val[3], q8b.val[3]); + + sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32+0] & 0xf)); + sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32+0] >> 4)); + sumi1 += vaddvq_s32(p3) * (1 + 2*(x[i].scales[ib32+1] & 0xf)); + sumi2 += vaddvq_s32(p4) * (1 + 2*(x[i].scales[ib32+1] >> 4)); + } + sumf += d*(sumi1 + sumi2); + } + + *s = 0.125f * sumf; + +#else + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint8_t * signs = qs + QK_K/8; + + int bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf); + int ls2 = 1 + 2*(x[i].scales[ib32] >> 4); + int sumi1 = 0, sumi2 = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); + for (int j = 0; j < 8; ++j) { + sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); + for (int j = 0; j < 8; ++j) { + sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? 
-1 : 1); + } + q8 += 8; + } + bsum += ls1 * sumi1 + ls2 * sumi2; + qs += 4; + signs += 4; + } + + sumf += d * bsum; + } + + *s = 0.125f * sumf; + +#endif + +} + +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[2]; + + ggml_int8x16x4_t q3s; + ggml_int8x16x4_t q8b; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + float sumf1 = 0, sumf2 = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; + memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t); + const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]); + const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]); + const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]); + const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]); + q3 += 16; + q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127)))); + q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127)))); + q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127)))); + q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127)))); + q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0)); + q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1)); + q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2)); + q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3)); + const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]); + const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]); + sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28)); + sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28)); + } + sumf += d*(sumf1 + sumf2); + } + *s = 0.5f * sumf; + +#else + + uint32_t aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t); + const uint32_t ls = 2*(aux32 >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t 
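+                // iq3_xxs spends one byte per 4 values: a row of iq3xxs_grid holds four
+                // int8 magnitudes, so each 7-bit sign word covers the grid1/grid2 pair of
+                // rows (8 values). As in iq2_xxs, the NEON side's 0.5f*(0.5f + s) equals
+                // the 0.25f*(2*s + 1) computed here.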
*)(iq3xxs_grid + q3[2*l+0]); + const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]); + const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127]; + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + q3 += 8; + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.25f * sumf; +#endif +} + +void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + + typedef union { + uint16x8_t vec_index; + uint16_t index[8]; + } vec_index_t; + + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; + + static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1}; + + const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1); + const uint8x16_t mask2 = vld1q_u8(k_mask2); + + const int16x8_t hshift = vld1q_s16(k_shift); + const uint16x8_t m256 = vdupq_n_u16(256); + const uint8x16_t m1 = vdupq_n_u8(1); + + uint8x16x2_t vs; + ggml_int8x16x4_t q3s; + ggml_int8x16x4_t q8b; + vec_index_t idx; + + uint32_t scales32[2]; + const uint8_t * scales8 = (const uint8_t *)scales32; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + memcpy(scales32, x[i].scales, 4); + scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101; + scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101; + + int sumi1 = 0, sumi2 = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; + + const uint8x16_t idx_l = vld1q_u8(qs); qs += 16; + idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256)); + const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]], + iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]); + const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]], + iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]); + idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256)); + const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]], + iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]); + const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]], + iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]); + + + vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16))); + vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); + vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); + vs.val[0] = 
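+                // scales8 reads precomputed (2*s + 1) bytes: the two SWAR lines after the
+                // memcpy spread the packed nibbles of x[i].scales so that
+                //   scales8[k]     = 2*(scales[k] & 0xf) + 1   (k = 0..3, low nibbles)
+                //   scales8[k + 4] = 2*(scales[k] >> 4) + 1    (high nibbles)
+                // and the hshift/m256 pair above builds the 9th grid-index bit per lane:
+                //   idx[l] = qs[l] | (((qh >> l) & 1) << 8).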
vorrq_u8(vceqq_u8(vs.val[0], mask2), m1); + vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1); + + q3s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_0)); + q3s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_1)); + + vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16))); + vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); + vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); + vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1); + vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1); + + signs += 4; + + q3s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_2)); + q3s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_3)); + + const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]); + const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]); + + sumi1 += vaddvq_s32(p1) * scales8[ib32/2+0]; + sumi2 += vaddvq_s32(p2) * scales8[ib32/2+4]; + } + sumf += d*(sumi1 + sumi2); + } + *s = sumf; + +#else + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint8_t * GGML_RESTRICT signs = x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1; + const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls1; + sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? 
-1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls2; + } + sumf += d * bsum; + } + *s = sumf; +#endif +} + +void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq1_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __ARM_NEON + + ggml_int8x16x4_t q1b; + ggml_int8x16x4_t q8b; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; + + int sumi1 = 0, sumi2 = 0, sumi3 = 0; + + for (int ib = 0; ib < QK_K/32; ib += 2) { + + q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[ib+0] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[ib+0] << 5) & 0x700))))); + q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[ib+0] << 2) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[ib+0] >> 1) & 0x700))))); + q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[ib+1] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[ib+1] << 5) & 0x700))))); + q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[ib+1] << 2) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[ib+1] >> 1) & 0x700))))); + qs += 8; + + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; + + const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[0], q8b.val[0]), q1b.val[1], q8b.val[1]); + const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[2], q8b.val[2]), q1b.val[3], q8b.val[3]); + + const int ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; + const int ls2 = 2*((qh[ib+1] >> 12) & 7) + 1; + sumi1 += vaddvq_s32(p1) * ls1; + sumi2 += vaddvq_s32(p2) * ls2; + sumi3 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * ls1 * (qh[ib+0] & 0x8000 ? -1 : 1) + + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * ls2 * (qh[ib+1] & 0x8000 ? -1 : 1); + + } + + sumf += y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3); + } + + *s = sumf; + +#else + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; + + int sumi = 0, sumi1 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + const int ls = 2*((qh[ib] >> 12) & 7) + 1; + const int delta = qh[ib] & 0x8000 ? 
-1 : 1; + int lsum = 0; + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8))); + for (int j = 0; j < 8; ++j) { + lsum += q8[j] * grid[j]; + } + q8 += 8; + } + sumi += ls * lsum; + sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]); + qs += 4; + } + + sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); + } + + *s = sumf; + +#endif +} + +void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq1_m * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + iq1m_scale_t scale; + +#if defined __ARM_NEON + const int32x4_t mask = vdupq_n_s32(0x7); + const int32x4_t mone = vdupq_n_s32(1); + const int32x4_t mzero = vdupq_n_s32(0); + + ggml_int8x16x4_t deltas; + deltas.val[0] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(+1)); + deltas.val[1] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(+1)); + deltas.val[2] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(-1)); + deltas.val[3] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(-1)); + + ggml_int8x16x4_t q1b; + ggml_int8x16x4_t q8b; + + uint32_t aux32; + const uint8_t * aux8 = (const uint8_t *)&aux32; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint16_t * sc = (const uint16_t *)x[i].scales; + + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + + int32x4_t sumi1 = mzero; + int32x4_t sumi2 = mzero; + + for (int ib = 0; ib < QK_K/32; ib += 2) { + + q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[0] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[0] << 4) & 0x700))))); + q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[1] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[1] << 4) & 0x700))))); + q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[2] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[2] << 4) & 0x700))))); + q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[3] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[3] << 4) & 0x700))))); + + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; + + const int32x4_t p1 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(mzero, q1b.val[1], q8b.val[1])); + const int32x4_t p2 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(mzero, q1b.val[3], q8b.val[3])); + const int32x4_t p12 = vpaddq_s32(p1, p2); + + const uint32_t * qh32 = (const uint32_t *)qh; // we are 4-byte aligned, so we can do that + aux32 = ((qh32[0] >> 3) & 0x01010101) | ((qh32[0] >> 6) & 0x02020202); + + const int32x4_t p3 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[0]], q8b.val[0]), ggml_vdotq_s32(mzero, deltas.val[aux8[1]], q8b.val[1])); + const int32x4_t p4 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[2]], q8b.val[2]), ggml_vdotq_s32(mzero, deltas.val[aux8[3]], q8b.val[3])); + const int32x4_t p34 = vpaddq_s32(p3, p4); + + int32x4_t scales_4 = ggml_vld1q_u32(sc[ib/2] >> 0, sc[ib/2] >> 3, sc[ib/2] >> 6, sc[ib/2] >> 9); + + scales_4 = vaddq_s32(vshlq_n_s32(vandq_s32(scales_4, mask), 1), mone); + + 
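+            // at this point scales_4 holds the four odd 3-bit sub-block scales (2*s + 1),
+            // matching the scalar path below; they weight both the grid dot products (p12)
+            // and the per-group delta sums (p34)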
sumi1 = vmlaq_s32(sumi1, scales_4, p12); + sumi2 = vmlaq_s32(sumi2, scales_4, p34); + + qs += 8; qh += 4; + + } + + sumf += y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2)); + } + + *s = sumf; + +#else + + int sum1[2], sum2[2], delta[4]; + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint16_t * sc = (const uint16_t *)x[i].scales; + + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + + int sumi1 = 0, sumi2 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + delta[0] = qh[0] & 0x08 ? -1 : 1; + delta[1] = qh[0] & 0x80 ? -1 : 1; + delta[2] = qh[1] & 0x08 ? -1 : 1; + delta[3] = qh[1] & 0x80 ? -1 : 1; + sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0; + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700))); + int lsum1 = 0, lsum2 = 0; + for (int j = 0; j < 8; ++j) { + lsum1 += q8[j] * grid[j]; + lsum2 += q8[j]; + } + q8 += 8; + sum1[l/2] += lsum1; + sum2[l/2] += lsum2*delta[l]; + } + + const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1; + const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1; + + sumi1 += sum1[0] * ls1 + sum1[1] * ls2; + sumi2 += sum2[0] * ls1 + sum2[1] * ls2; + qs += 4; + qh += 2; + } + + sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); + } + + *s = sumf; + +#endif +} + +void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK4_NL == 0); + static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); + + const block_iq4_nl * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + const int nb = n / QK4_NL; + + int ib = 0; + float sumf = 0; + +#if defined __ARM_NEON + const int8x16_t values = vld1q_s8(kvalues_iq4nl); + const uint8x16_t m4b = vdupq_n_u8(0x0f); + uint8x16x2_t q4bits; + int8x16x4_t q4b; + int8x16x4_t q8b; + int32x4_t prod_1, prod_2; + + for (; ib + 1 < nb; ib += 2) { + + q4bits.val[0] = vld1q_u8(x[ib + 0].qs); + q4bits.val[1] = vld1q_u8(x[ib + 1].qs); + q8b.val[0] = vld1q_s8(y[ib + 0].qs); + q8b.val[1] = vld1q_s8(y[ib + 0].qs + 16); + q8b.val[2] = vld1q_s8(y[ib + 1].qs); + q8b.val[3] = vld1q_s8(y[ib + 1].qs + 16); + + q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b)); + q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4)); + q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b)); + q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4)); + + prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]); + prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]); + + sumf += + GGML_CPU_FP16_TO_FP32(x[ib+0].d) * GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) + + GGML_CPU_FP16_TO_FP32(x[ib+1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2); + } + +#endif + for (; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; + sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4]; + } + sumf 
+= d * (sumi1 + sumi2);
+    }
+    *s = sumf;
+}
+
+void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK_K == 0);
+
+    const block_iq4_xs * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined __ARM_NEON
+    const int8x16_t values = vld1q_s8(kvalues_iq4nl);
+    const uint8x16_t m4b = vdupq_n_u8(0x0f);
+    ggml_uint8x16x2_t q4bits;
+    ggml_int8x16x4_t q4b;
+    ggml_int8x16x4_t q8b;
+    int32x4_t prod_1, prod_2;
+
+    float sumf = 0;
+
+    for (int ibl = 0; ibl < nb; ++ibl) {
+
+        const int8_t * q8 = y[ibl].qs;
+        const uint8_t * q4 = x[ibl].qs;
+        uint16_t h = x[ibl].scales_h;
+
+        int sumi1 = 0, sumi2 = 0;
+        for (int ib = 0; ib < QK_K/64; ++ib) {
+
+            q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
+            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+
+            q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
+            q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
+            q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
+            q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
+
+            prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
+            prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
+
+            int ls1 = ((x[ibl].scales_l[ib] & 0xf) | ((h << 4) & 0x30)) - 32;
+            int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32;
+            h >>= 4;
+            sumi1 += vaddvq_s32(prod_1) * ls1;
+            sumi2 += vaddvq_s32(prod_2) * ls2;
+
+        }
+
+        sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
+    }
+
+    *s = sumf;
+
+#else
+    float sumf = 0;
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
+        uint16_t h = x[ibl].scales_h;
+        const uint8_t * qs = x[ibl].qs;
+        const int8_t * q8 = y[ibl].qs;
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
+            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
+            h >>= 4;
+            const float d1 = d4d8*(ls1 - 32);
+            const float d2 = d4d8*(ls2 - 32);
+            int sumi1 = 0, sumi2 = 0;
+            for (int j = 0; j < 16; ++j) {
+                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
+                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
+            }
+            sumf += d1 * (sumi1 + sumi2);
+            qs += 16;
+            q8 += 32;
+            sumi1 = sumi2 = 0;
+            for (int j = 0; j < 16; ++j) {
+                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
+                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
+            }
+            sumf += d2 * (sumi1 + sumi2);
+            qs += 16;
+            q8 += 32;
+        }
+    }
+    *s = sumf;
+#endif
+}
+
diff --git a/ggml/src/ggml-cpu/arch/arm/repack.cpp b/ggml/src/ggml-cpu/arch/arm/repack.cpp
new file mode 100644
index 0000000000000..2f8bc9e251735
--- /dev/null
+++ b/ggml/src/ggml-cpu/arch/arm/repack.cpp
@@ -0,0 +1,2163 @@
+#define GGML_COMMON_IMPL_CPP
+#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
+#include "ggml-backend-impl.h"
+
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+#include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
+#include "traits.h"
+
+#include <cmath>
+#include <cstring>
+#include <cassert>
+#include <cstdlib> // for qsort
+#include <cstdio>  // for GGML_ASSERT
+
+#define GGML_CPU_CLANG_WORKAROUND
+#include "../../repack.h"
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+#endif
+
+#define UNUSED GGML_UNUSED
+
+void ggml_quantize_mat_q8_0_4x4(const float * 
GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; + +#if defined(__ARM_NEON) + float32x4_t srcv[4][8]; + float id[4]; + + for (int i = 0; i < nb; i++) { + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int row_iter = 0; row_iter < 4; row_iter++) { + for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]); + + for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]); + for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]); + for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ? 1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); + } + + for (int j = 0; j < 8; j++) { + float32x4_t v = vmulq_n_f32(srcv[0][j], id[0]); + int32x4_t vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 3] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[1][j], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 4] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 5] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 6] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 7] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[2][j], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 8] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 9] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 10] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 11] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[3][j], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[16 * j + 12] = vgetq_lane_s32(vi, 0); + y[i].qs[16 * j + 13] = vgetq_lane_s32(vi, 1); + y[i].qs[16 * j + 14] = vgetq_lane_s32(vi, 2); + y[i].qs[16 * j + 15] = vgetq_lane_s32(vi, 3); + } + } +#else + // scalar + const int blck_size_interleave = 4; + float srcv[4][QK8_0]; + float id[4]; + + for (int i = 0; i < nb; i++) { + for (int row_iter = 0; row_iter < 4; row_iter++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j]; + amax = MAX(amax, fabsf(srcv[row_iter][j])); + } + + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ? 
1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); + } + + for (int j = 0; j < QK8_0 * 4; j++) { + int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; + int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; + src_offset += (j % blck_size_interleave); + + float x0 = srcv[src_id][src_offset] * id[src_id]; + y[i].qs[j] = roundf(x0); + } + } +#endif +} + +void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; + +#if defined(__ARM_NEON) + float32x4_t srcv[4][8]; + float id[4]; + + for (int i = 0; i < nb; i++) { + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int row_iter = 0; row_iter < 4; row_iter++) { + for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]); + + for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]); + for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]); + for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ? 1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); + } + + for (int j = 0; j < 4; j++) { + float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]); + int32x4_t vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 3] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 4] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 5] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 6] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 7] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[1][2 * j], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 8] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 9] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 10] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 11] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 12] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 13] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 14] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 15] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[2][2 * j], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 16] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 17] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 18] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 19] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[2][2 * j + 1], id[2]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 20] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 21] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 22] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 23] = vgetq_lane_s32(vi, 3); + + v = vmulq_n_f32(srcv[3][2 * j], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 24] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 25] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 26] = vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 27] = vgetq_lane_s32(vi, 3); + v = vmulq_n_f32(srcv[3][2 * j + 1], id[3]); + vi = vcvtnq_s32_f32(v); + y[i].qs[32 * j + 28] = vgetq_lane_s32(vi, 0); + y[i].qs[32 * j + 29] = vgetq_lane_s32(vi, 1); + y[i].qs[32 * j + 30] = 
vgetq_lane_s32(vi, 2); + y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3); + } + } + +#else + // scalar + const int blck_size_interleave = 8; + float srcv[4][QK8_0]; + float id[4]; + + for (int i = 0; i < nb; i++) { + for (int row_iter = 0; row_iter < 4; row_iter++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j]; + amax = MAX(amax, fabsf(srcv[row_iter][j])); + } + + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ? 1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); + } + + for (int j = 0; j < QK8_0 * 4; j++) { + int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; + int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; + src_offset += (j % blck_size_interleave); + + float x0 = srcv[src_id][src_offset] * id[src_id]; + y[i].qs[j] = roundf(x0); + } + } +#endif +} + +void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx; + + for (int c = 0; c < nc; c += ncols_interleaved) { + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + float32x4_t acc = vdupq_n_f32(0); + for (int b = 0; b < nb; b++) { + int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs); + int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16); + int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32); + int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48); + float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d); + + int8x16_t a0 = vld1q_s8(a_ptr->qs); + int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2); + float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d); + + int32x4_t ret = vdupq_n_s32(0); + + ret = vdotq_laneq_s32(ret, b0 << 4, a0, 0); + ret = vdotq_laneq_s32(ret, b1 << 4, a0, 1); + ret = vdotq_laneq_s32(ret, b2 << 4, a0, 2); + ret = vdotq_laneq_s32(ret, b3 << 4, a0, 3); + + ret = vdotq_laneq_s32(ret, b0 & 0xf0U, a1, 0); + ret = vdotq_laneq_s32(ret, b1 & 0xf0U, a1, 1); + ret = vdotq_laneq_s32(ret, b2 & 0xf0U, a1, 2); + ret = vdotq_laneq_s32(ret, b3 & 0xf0U, a1, 3); + + acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4), + vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd))); + a_ptr++; + b_ptr++; + } + vst1q_f32(s, acc); + s += ncols_interleaved; + } + return; +#endif // #if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + float sumf[4]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } +} + +void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx; + + for (int c = 0; c < nc; c += ncols_interleaved) { + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + float32x4_t acc = vdupq_n_f32(0); + for (int b = 0; b < nb; b++) { + int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs); + int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16); + int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32); + int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48); + float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d); + + int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs); + int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1); + int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2); + int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3); + float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d); + + int32x4_t ret0 = vdupq_n_s32(0); + int32x4_t ret1 = vdupq_n_s32(0); + + ret0 = vdotq_s32(ret0, b0 << 4, a0); + ret1 = vdotq_s32(ret1, b1 << 4, a0); + ret0 = vdotq_s32(ret0, b2 << 4, a1); + ret1 = vdotq_s32(ret1, b3 << 4, a1); + + ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2); + ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2); + ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3); + ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3); + + int32x4_t ret = vpaddq_s32(ret0, ret1); + + acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4), + vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd))); + a_ptr++; + b_ptr++; + } + vst1q_f32(s, acc); + s += ncols_interleaved; + } + return; +#endif // #if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + float sumf[4]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } +} + +void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) +#if defined(__ARM_FEATURE_SVE) + if (ggml_cpu_get_sve_cnt() == QK8_0) { + const void * b_ptr = vx; + const void * a_ptr = vy; + float * res_ptr = s; + + __asm__ __volatile__( + "ptrue p0.b\n" + "add %x[b_ptr], %x[b_ptr], #0x10\n" + "1:" // Column loop + "add x22, %x[a_ptr], #0x2\n" + "mov z31.b, #0x0\n" + "mov x21, %x[nb]\n" + "2:" // Block loop + "ld1b { z30.b }, p0/Z, [%x[b_ptr]]\n" + "ld1b { z29.b }, p0/Z, [%x[b_ptr], #1, MUL VL]\n" + "mov z28.s, #0x0\n" + "mov z27.s, #0x0\n" + "ld1rd { z26.d }, p0/Z, [x22]\n" + "ld1b { z25.b }, p0/Z, [%x[b_ptr], #2, MUL VL]\n" + "sub x20, x22, #0x2\n" + "sub x21, x21, #0x1\n" + "ld1b { z24.b }, p0/Z, [%x[b_ptr], #3, MUL VL]\n" + "ld1rd { z23.d }, p0/Z, [x22, #8]\n" + "lsl z22.b, z30.b, #0x4\n" + "lsl z16.b, z29.b, #0x4\n" + "and z30.b, z30.b, #0xf0\n" + "and z29.b, z29.b, #0xf0\n" + "ld1rd { z21.d }, p0/Z, [x22, #16]\n" + "ld1rd { z20.d }, p0/Z, [x22, #24]\n" + "lsl z19.b, z25.b, #0x4\n" + "and z25.b, z25.b, #0xf0\n" + "ld1rh { z17.h }, p0/Z, [x20]\n" + "ld1h { z18.s }, p0/Z, [%x[b_ptr], #-1, MUL VL]\n" + "sdot z28.s, z22.b, z26.b\n" + "sdot z27.s, z16.b, z26.b\n" + "lsl z16.b, z24.b, #0x4\n" + "add x22, x22, #0x22\n" + "and z24.b, z24.b, #0xf0\n" + "add %x[b_ptr], %x[b_ptr], #0x90\n" + "fcvt z17.s, p0/m, z17.h\n" + "fcvt z18.s, p0/m, z18.h\n" + "sdot z28.s, z19.b, z23.b\n" + "sdot z27.s, z16.b, z23.b\n" + "fmul z18.s, z18.s, z17.s\n" + "sdot z28.s, z30.b, z21.b\n" + "sdot z27.s, z29.b, z21.b\n" + "sdot z28.s, z25.b, z20.b\n" + "sdot z27.s, z24.b, z20.b\n" + "uzp1 z17.s, z28.s, z27.s\n" + "uzp2 z16.s, z28.s, z27.s\n" + "add z17.s, z17.s, z16.s\n" + "asr z17.s, z17.s, #0x4\n" + "scvtf z17.s, p0/m, z17.s\n" + "fmla z31.s, p0/M, z17.s, z18.s\n" + "cbnz x21, 2b\n" + "sub %x[nc], %x[nc], #0x8\n" + "st1w { z31.s }, p0, [%x[res_ptr]]\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "cbnz %x[nc], 1b\n" + : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc) + 
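+            // "+&r" marks read-write, early-clobber operands: the asm advances b_ptr and
+            // res_ptr and counts nc down in place, while a_ptr and nb below are read-only inputs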
: [a_ptr] "r" (a_ptr), [nb] "r" (nb) + : "memory", "p0", "x20", "x21", "x22", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); + return; + } +#endif // #if defined(__ARM_FEATURE_SVE) + +#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) + { + float sumf[8]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } + } +} + +void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl); + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + float * res_ptr = s; + + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); + + float32x4_t sumf = vdupq_n_f32(0); + for (int l = 0; l < nb; l++) { + uint8x16_t b_0 = vld1q_u8(b_ptr[l].qs + 0); + uint8x16_t b_1 = vld1q_u8(b_ptr[l].qs + 16); + uint8x16_t b_2 = vld1q_u8(b_ptr[l].qs + 32); + uint8x16_t b_3 = vld1q_u8(b_ptr[l].qs + 48); + + int8x16_t b_0_hi = vqtbl1q_s8(kvalues, b_0 >> 4); + int8x16_t b_0_lo = vqtbl1q_s8(kvalues, b_0 & 0x0F); + int8x16_t b_1_hi = vqtbl1q_s8(kvalues, b_1 >> 4); + int8x16_t b_1_lo = vqtbl1q_s8(kvalues, b_1 & 0x0F); + int8x16_t b_2_hi = vqtbl1q_s8(kvalues, b_2 >> 4); + int8x16_t b_2_lo = vqtbl1q_s8(kvalues, b_2 & 0x0F); + int8x16_t b_3_hi = vqtbl1q_s8(kvalues, b_3 >> 4); + int8x16_t b_3_lo = vqtbl1q_s8(kvalues, b_3 & 0x0F); + + int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 0); + int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16); + + int32x4_t sumi = vdupq_n_s32(0); + sumi = vdotq_laneq_s32(sumi, b_0_lo, a_0, 0); + sumi = vdotq_laneq_s32(sumi, b_0_hi, a_1, 0); + sumi = vdotq_laneq_s32(sumi, b_1_lo, a_0, 1); + sumi = vdotq_laneq_s32(sumi, b_1_hi, a_1, 1); + sumi = vdotq_laneq_s32(sumi, b_2_lo, a_0, 2); + sumi = vdotq_laneq_s32(sumi, b_2_hi, a_1, 2); + sumi = vdotq_laneq_s32(sumi, b_3_lo, a_0, 3); + sumi = vdotq_laneq_s32(sumi, b_3_hi, a_1, 3); + + float32x4_t a_d = vcvt_f32_f16(vld1_dup_f16((const float16_t *)&a_ptr[l].d)); + float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d)); + float32x4_t d = a_d * b_d; + + sumf = vmlaq_f32(sumf, d, vcvtq_f32_s32(sumi)); + } + + vst1q_f32(res_ptr + x * 4, sumf); + } + return; +#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) + { + float sumf[4]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F]; + const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])); + } + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } + } +} + +void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + const void * b_ptr = vx; + const void * a_ptr = vy; + float * res_ptr = s; + size_t res_stride = bs * sizeof(float); + + __asm__ __volatile__( + "mov x10, %x[nr]\n" + "mov x9, #0x88\n" + "cmp x10, #0x10\n" + "mul x9, %x[nb], x9\n" + "blt 4f\n" + "1:" // Row loop + "add x28, %x[b_ptr], #0x8\n" + "mov x27, %x[nc]\n" + "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" + "2:" // Column loop + "add x25, %x[a_ptr], #0x8\n" + "movi v15.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "mov x24, %x[nb]\n" + "add x23, x25, x9\n" + "movi v18.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "add x22, x23, x9\n" + "movi v11.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "add x21, x22, x9\n" + "movi v23.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v7.16b, #0x0\n" + "movi v0.16b, #0x0\n" + "movi v4.16b, #0x0\n" + "movi v5.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v8.16b, #0x0\n" + "movi v1.16b, #0x0\n" + "3:" // Block loop + "ldr q3, [x28, #0x0]\n" + "ldr q31, [x25, #0x0]\n" + "movi v28.16b, #0x4\n" + "movi v10.4s, #0x0\n" + "ldr q22, [x28, #0x10]\n" + "ldr q6, [x25, #0x10]\n" + "movi v29.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "ldr q27, [x28, #0x20]\n" + "ldr q30, [x28, #0x30]\n" + "movi v20.4s, #0x0\n" + "movi v24.16b, #0xf0\n" + "ldr d2, [x25, #-0x8]\n" + "ldr d26, [x23, #-0x8]\n" + "sshl v12.16b, v3.16b, v28.16b\n" + "sub x20, x28, #0x8\n" + "ldr d17, [x20, #0x0]\n" + "and v3.16b, v3.16b, v24.16b\n" + "subs x24, x24, #0x1\n" + "add x28, x28, #0x48\n" + ".inst 0x4f9fe18a // sdot v10.4s, v12.16b, v31.4b[0]\n" + ".inst 0x4fbfe19d // sdot v29.4s, v12.16b, v31.4b[1]\n" + ".inst 0x4f9fe989 // sdot v9.4s, v12.16b, v31.4b[2]\n" + ".inst 0x4fbfe994 // sdot v20.4s, v12.16b, v31.4b[3]\n" + "sshl v31.16b, v22.16b, v28.16b\n" + "and v22.16b, v22.16b, v24.16b\n" + "fcvtl v17.4s, v17.4h\n" + "fcvtl v2.4s, v2.4h\n" + "fcvtl v26.4s, v26.4h\n" + ".inst 0x4f86e3ea // sdot v10.4s, v31.16b, v6.4b[0]\n" + ".inst 0x4fa6e3fd // sdot v29.4s, v31.16b, v6.4b[1]\n" + ".inst 0x4f86ebe9 // sdot v9.4s, v31.16b, v6.4b[2]\n" + ".inst 0x4fa6ebf4 // sdot v20.4s, v31.16b, v6.4b[3]\n" + "sshl v6.16b, v27.16b, v28.16b\n" + "sshl v28.16b, v30.16b, v28.16b\n" + "and v27.16b, v27.16b, v24.16b\n" + "and v30.16b, v30.16b, v24.16b\n" + "ldr q24, [x25, #0x20]\n" + ".inst 0x4f98e0ca // sdot v10.4s, v6.16b, v24.4b[0]\n" + ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n" + ".inst 0x4f98e8c9 // sdot v9.4s, v6.16b, v24.4b[2]\n" + ".inst 0x4fb8e8d4 // sdot v20.4s, v6.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x30]\n" + ".inst 0x4f98e38a // sdot v10.4s, v28.16b, v24.4b[0]\n" + ".inst 0x4fb8e39d // sdot v29.4s, v28.16b, v24.4b[1]\n" + ".inst 0x4f98eb89 // sdot v9.4s, v28.16b, v24.4b[2]\n" + ".inst 0x4fb8eb94 // sdot v20.4s, v28.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x40]\n" + ".inst 0x4f98e06a // sdot v10.4s, v3.16b, v24.4b[0]\n" + ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n" + ".inst 0x4f98e869 // sdot v9.4s, v3.16b, v24.4b[2]\n" + ".inst 0x4fb8e874 // sdot v20.4s, v3.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x50]\n" + ".inst 0x4f98e2ca // sdot v10.4s, v22.16b, v24.4b[0]\n" + ".inst 0x4fb8e2dd // sdot v29.4s, v22.16b, v24.4b[1]\n" + ".inst 0x4f98eac9 // sdot v9.4s, v22.16b, v24.4b[2]\n" + ".inst 0x4fb8ead4 // sdot v20.4s, v22.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x60]\n" + ".inst 0x4f98e36a // sdot v10.4s, v27.16b, v24.4b[0]\n" + ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n" + ".inst 0x4f98eb69 // sdot v9.4s, v27.16b, v24.4b[2]\n" + 
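+            // each 16-byte q8 load carries 4 bytes from each of the four interleaved rows
+            // (see ggml_quantize_mat_q8_0_4x4 above), so the lane-indexed sdots keep one
+            // accumulator per output row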
".inst 0x4fb8eb74 // sdot v20.4s, v27.16b, v24.4b[3]\n" + "ldr q24, [x25, #0x70]\n" + "add x25, x25, #0x88\n" + ".inst 0x4f98e3ca // sdot v10.4s, v30.16b, v24.4b[0]\n" + ".inst 0x4fb8e3dd // sdot v29.4s, v30.16b, v24.4b[1]\n" + ".inst 0x4f98ebc9 // sdot v9.4s, v30.16b, v24.4b[2]\n" + ".inst 0x4fb8ebd4 // sdot v20.4s, v30.16b, v24.4b[3]\n" + "fmul v24.4s, v17.4s, v2.s[0]\n" + "scvtf v10.4s, v10.4s, #0x4\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "scvtf v9.4s, v9.4s, #0x4\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v15.4s, v10.4s, v24.4s\n" + "ldr q24, [x23, #0x0]\n" + "fmul v10.4s, v17.4s, v2.s[1]\n" + "fmla v19.4s, v29.4s, v10.4s\n" + "ldr q10, [x23, #0x10]\n" + "fmul v29.4s, v17.4s, v2.s[2]\n" + "fmul v2.4s, v17.4s, v2.s[3]\n" + "fmla v18.4s, v9.4s, v29.4s\n" + "movi v9.4s, #0x0\n" + "movi v29.4s, #0x0\n" + ".inst 0x4f98e189 // sdot v9.4s, v12.16b, v24.4b[0]\n" + ".inst 0x4fb8e19d // sdot v29.4s, v12.16b, v24.4b[1]\n" + "fmla v14.4s, v20.4s, v2.4s\n" + "movi v20.4s, #0x0\n" + "movi v2.4s, #0x0\n" + ".inst 0x4f98e994 // sdot v20.4s, v12.16b, v24.4b[2]\n" + ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n" + "ldr q24, [x23, #0x20]\n" + ".inst 0x4f8ae3e9 // sdot v9.4s, v31.16b, v10.4b[0]\n" + ".inst 0x4faae3fd // sdot v29.4s, v31.16b, v10.4b[1]\n" + ".inst 0x4f8aebf4 // sdot v20.4s, v31.16b, v10.4b[2]\n" + ".inst 0x4faaebe2 // sdot v2.4s, v31.16b, v10.4b[3]\n" + "ldr q10, [x23, #0x30]\n" + ".inst 0x4f98e0c9 // sdot v9.4s, v6.16b, v24.4b[0]\n" + ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n" + ".inst 0x4f98e8d4 // sdot v20.4s, v6.16b, v24.4b[2]\n" + ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n" + "ldr q24, [x23, #0x40]\n" + ".inst 0x4f8ae389 // sdot v9.4s, v28.16b, v10.4b[0]\n" + ".inst 0x4faae39d // sdot v29.4s, v28.16b, v10.4b[1]\n" + ".inst 0x4f8aeb94 // sdot v20.4s, v28.16b, v10.4b[2]\n" + ".inst 0x4faaeb82 // sdot v2.4s, v28.16b, v10.4b[3]\n" + "ldr q10, [x23, #0x50]\n" + ".inst 0x4f98e069 // sdot v9.4s, v3.16b, v24.4b[0]\n" + ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n" + ".inst 0x4f98e874 // sdot v20.4s, v3.16b, v24.4b[2]\n" + ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n" + "ldr q24, [x23, #0x60]\n" + ".inst 0x4f8ae2c9 // sdot v9.4s, v22.16b, v10.4b[0]\n" + ".inst 0x4faae2dd // sdot v29.4s, v22.16b, v10.4b[1]\n" + ".inst 0x4f8aead4 // sdot v20.4s, v22.16b, v10.4b[2]\n" + ".inst 0x4faaeac2 // sdot v2.4s, v22.16b, v10.4b[3]\n" + "ldr q10, [x23, #0x70]\n" + "add x23, x23, #0x88\n" + ".inst 0x4f98e369 // sdot v9.4s, v27.16b, v24.4b[0]\n" + ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n" + ".inst 0x4f98eb74 // sdot v20.4s, v27.16b, v24.4b[2]\n" + ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x0]\n" + ".inst 0x4f8ae3c9 // sdot v9.4s, v30.16b, v10.4b[0]\n" + ".inst 0x4faae3dd // sdot v29.4s, v30.16b, v10.4b[1]\n" + ".inst 0x4f8aebd4 // sdot v20.4s, v30.16b, v10.4b[2]\n" + ".inst 0x4faaebc2 // sdot v2.4s, v30.16b, v10.4b[3]\n" + "fmul v10.4s, v17.4s, v26.s[0]\n" + "scvtf v9.4s, v9.4s, #0x4\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "scvtf v2.4s, v2.4s, #0x4\n" + "fmla v11.4s, v9.4s, v10.4s\n" + "ldr q9, [x22, #0x10]\n" + "fmul v10.4s, v17.4s, v26.s[1]\n" + "fmla v13.4s, v29.4s, v10.4s\n" + "ldr d29, [x22, #-0x8]\n" + "fmul v10.4s, v17.4s, v26.s[2]\n" + "fmul v26.4s, v17.4s, v26.s[3]\n" + "fcvtl v29.4s, v29.4h\n" + "fmla v23.4s, v20.4s, v10.4s\n" + "movi v20.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "fmla v16.4s, v2.4s, v26.4s\n" + "movi v26.4s, #0x0\n" + "movi v2.4s, #0x0\n" + ".inst 0x4f98e194 // 
sdot v20.4s, v12.16b, v24.4b[0]\n" + ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n" + ".inst 0x4f98e99a // sdot v26.4s, v12.16b, v24.4b[2]\n" + ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x20]\n" + ".inst 0x4f89e3f4 // sdot v20.4s, v31.16b, v9.4b[0]\n" + ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n" + ".inst 0x4f89ebfa // sdot v26.4s, v31.16b, v9.4b[2]\n" + ".inst 0x4fa9ebe2 // sdot v2.4s, v31.16b, v9.4b[3]\n" + "ldr q9, [x22, #0x30]\n" + ".inst 0x4f98e0d4 // sdot v20.4s, v6.16b, v24.4b[0]\n" + ".inst 0x4fb8e0ca // sdot v10.4s, v6.16b, v24.4b[1]\n" + ".inst 0x4f98e8da // sdot v26.4s, v6.16b, v24.4b[2]\n" + ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x40]\n" + ".inst 0x4f89e394 // sdot v20.4s, v28.16b, v9.4b[0]\n" + ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n" + ".inst 0x4f89eb9a // sdot v26.4s, v28.16b, v9.4b[2]\n" + ".inst 0x4fa9eb82 // sdot v2.4s, v28.16b, v9.4b[3]\n" + "ldr q9, [x22, #0x50]\n" + ".inst 0x4f98e074 // sdot v20.4s, v3.16b, v24.4b[0]\n" + ".inst 0x4fb8e06a // sdot v10.4s, v3.16b, v24.4b[1]\n" + ".inst 0x4f98e87a // sdot v26.4s, v3.16b, v24.4b[2]\n" + ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x60]\n" + ".inst 0x4f89e2d4 // sdot v20.4s, v22.16b, v9.4b[0]\n" + ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n" + ".inst 0x4f89eada // sdot v26.4s, v22.16b, v9.4b[2]\n" + ".inst 0x4fa9eac2 // sdot v2.4s, v22.16b, v9.4b[3]\n" + "ldr q9, [x22, #0x70]\n" + "add x22, x22, #0x88\n" + ".inst 0x4f98e374 // sdot v20.4s, v27.16b, v24.4b[0]\n" + ".inst 0x4fb8e36a // sdot v10.4s, v27.16b, v24.4b[1]\n" + ".inst 0x4f98eb7a // sdot v26.4s, v27.16b, v24.4b[2]\n" + ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n" + "ldr q24, [x21, #0x0]\n" + ".inst 0x4f89e3d4 // sdot v20.4s, v30.16b, v9.4b[0]\n" + ".inst 0x4fa9e3ca // sdot v10.4s, v30.16b, v9.4b[1]\n" + ".inst 0x4f89ebda // sdot v26.4s, v30.16b, v9.4b[2]\n" + ".inst 0x4fa9ebc2 // sdot v2.4s, v30.16b, v9.4b[3]\n" + "fmul v9.4s, v17.4s, v29.s[0]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "scvtf v10.4s, v10.4s, #0x4\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "scvtf v2.4s, v2.4s, #0x4\n" + "fmla v25.4s, v20.4s, v9.4s\n" + "ldr q9, [x21, #0x10]\n" + "fmul v20.4s, v17.4s, v29.s[1]\n" + "fmla v7.4s, v10.4s, v20.4s\n" + "ldr d20, [x21, #-0x8]\n" + "fmul v10.4s, v17.4s, v29.s[2]\n" + "fmul v29.4s, v17.4s, v29.s[3]\n" + "fcvtl v20.4s, v20.4h\n" + "fmla v0.4s, v26.4s, v10.4s\n" + "movi v26.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "fmla v4.4s, v2.4s, v29.4s\n" + "movi v2.4s, #0x0\n" + "movi v29.4s, #0x0\n" + ".inst 0x4f98e19a // sdot v26.4s, v12.16b, v24.4b[0]\n" + ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n" + ".inst 0x4f98e982 // sdot v2.4s, v12.16b, v24.4b[2]\n" + ".inst 0x4fb8e99d // sdot v29.4s, v12.16b, v24.4b[3]\n" + "ldr q12, [x21, #0x20]\n" + "fmul v24.4s, v17.4s, v20.s[0]\n" + ".inst 0x4f89e3fa // sdot v26.4s, v31.16b, v9.4b[0]\n" + ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n" + ".inst 0x4f89ebe2 // sdot v2.4s, v31.16b, v9.4b[2]\n" + ".inst 0x4fa9ebfd // sdot v29.4s, v31.16b, v9.4b[3]\n" + "ldr q9, [x21, #0x30]\n" + "fmul v31.4s, v17.4s, v20.s[1]\n" + ".inst 0x4f8ce0da // sdot v26.4s, v6.16b, v12.4b[0]\n" + ".inst 0x4face0ca // sdot v10.4s, v6.16b, v12.4b[1]\n" + ".inst 0x4f8ce8c2 // sdot v2.4s, v6.16b, v12.4b[2]\n" + ".inst 0x4face8dd // sdot v29.4s, v6.16b, v12.4b[3]\n" + "ldr q12, [x21, #0x40]\n" + "fmul v6.4s, v17.4s, v20.s[2]\n" + "fmul v20.4s, v17.4s, v20.s[3]\n" + ".inst 0x4f89e39a // sdot v26.4s, 
v28.16b, v9.4b[0]\n" + ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n" + ".inst 0x4f89eb82 // sdot v2.4s, v28.16b, v9.4b[2]\n" + ".inst 0x4fa9eb9d // sdot v29.4s, v28.16b, v9.4b[3]\n" + "ldr q9, [x21, #0x50]\n" + ".inst 0x4f8ce07a // sdot v26.4s, v3.16b, v12.4b[0]\n" + ".inst 0x4face06a // sdot v10.4s, v3.16b, v12.4b[1]\n" + ".inst 0x4f8ce862 // sdot v2.4s, v3.16b, v12.4b[2]\n" + ".inst 0x4face87d // sdot v29.4s, v3.16b, v12.4b[3]\n" + "ldr q12, [x21, #0x60]\n" + ".inst 0x4f89e2da // sdot v26.4s, v22.16b, v9.4b[0]\n" + ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n" + ".inst 0x4f89eac2 // sdot v2.4s, v22.16b, v9.4b[2]\n" + ".inst 0x4fa9eadd // sdot v29.4s, v22.16b, v9.4b[3]\n" + "ldr q17, [x21, #0x70]\n" + "add x21, x21, #0x88\n" + ".inst 0x4f8ce37a // sdot v26.4s, v27.16b, v12.4b[0]\n" + ".inst 0x4face36a // sdot v10.4s, v27.16b, v12.4b[1]\n" + ".inst 0x4f8ceb62 // sdot v2.4s, v27.16b, v12.4b[2]\n" + ".inst 0x4faceb7d // sdot v29.4s, v27.16b, v12.4b[3]\n" + ".inst 0x4f91e3da // sdot v26.4s, v30.16b, v17.4b[0]\n" + ".inst 0x4fb1e3ca // sdot v10.4s, v30.16b, v17.4b[1]\n" + ".inst 0x4f91ebc2 // sdot v2.4s, v30.16b, v17.4b[2]\n" + ".inst 0x4fb1ebdd // sdot v29.4s, v30.16b, v17.4b[3]\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "scvtf v10.4s, v10.4s, #0x4\n" + "fmla v5.4s, v26.4s, v24.4s\n" + "scvtf v2.4s, v2.4s, #0x4\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "fmla v21.4s, v10.4s, v31.4s\n" + "fmla v8.4s, v2.4s, v6.4s\n" + "fmla v1.4s, v29.4s, v20.4s\n" + "bgt 3b\n" + "mov x20, %x[res_ptr]\n" + "subs x27, x27, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "str q15, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q19, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q18, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q14, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q11, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q13, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q23, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q16, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q25, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q7, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q0, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q4, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q5, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q21, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q8, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q1, [x20, #0x0]\n" + "bne 2b\n" + "mov x20, #0x4\n" + "sub x10, x10, #0x10\n" + "cmp x10, #0x10\n" + "mov %x[res_ptr], x26\n" + "madd %x[a_ptr], x20, x9, %x[a_ptr]\n" + "bge 1b\n" + "4:" // Row loop skip + "cbz x10, 9f\n" + "5:" // Row tail: Row loop + "add x24, %x[b_ptr], #0x8\n" + "mov x23, %x[nc]\n" + "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" + "6:" // Row tail: Column loop + "movi v15.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "add x25, %x[a_ptr], #0x8\n" + "mov x21, %x[nb]\n" + "movi v18.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "7:" // Row tail: Block loop + "ldr q7, [x24, #0x0]\n" + "ldr q5, [x25, #0x0]\n" + "movi v9.16b, #0x4\n" + "movi v4.4s, #0x0\n" + "ldr q3, [x24, #0x10]\n" + "ldr q2, [x25, #0x10]\n" + "movi v1.4s, #0x0\n" + "movi v0.4s, #0x0\n" + "ldr q13, [x24, #0x20]\n" + "ldr q31, [x25, #0x20]\n" + "movi v30.4s, #0x0\n" + "movi v29.16b, #0xf0\n" + "ldr q28, [x24, #0x30]\n" + "ldr q27, [x25, #0x30]\n" + "sshl v20.16b, v7.16b, v9.16b\n" + "sub x20, x24, #0x8\n" + "ldr q26, [x25, #0x40]\n" + "ldr q25, [x25, 
#0x50]\n" + "sshl v17.16b, v3.16b, v9.16b\n" + "and v7.16b, v7.16b, v29.16b\n" + "ldr q24, [x25, #0x60]\n" + "ldr q16, [x25, #0x70]\n" + "sshl v22.16b, v13.16b, v9.16b\n" + "and v3.16b, v3.16b, v29.16b\n" + "ldr d21, [x20, #0x0]\n" + "ldr d12, [x25, #-0x8]\n" + ".inst 0x4f85e284 // sdot v4.4s, v20.16b, v5.4b[0]\n" + ".inst 0x4fa5e281 // sdot v1.4s, v20.16b, v5.4b[1]\n" + ".inst 0x4f85ea80 // sdot v0.4s, v20.16b, v5.4b[2]\n" + ".inst 0x4fa5ea9e // sdot v30.4s, v20.16b, v5.4b[3]\n" + "sshl v9.16b, v28.16b, v9.16b\n" + "subs x21, x21, #0x1\n" + "and v13.16b, v13.16b, v29.16b\n" + "and v28.16b, v28.16b, v29.16b\n" + "add x25, x25, #0x88\n" + "add x24, x24, #0x48\n" + "fcvtl v21.4s, v21.4h\n" + "fcvtl v12.4s, v12.4h\n" + ".inst 0x4f82e224 // sdot v4.4s, v17.16b, v2.4b[0]\n" + ".inst 0x4fa2e221 // sdot v1.4s, v17.16b, v2.4b[1]\n" + ".inst 0x4f82ea20 // sdot v0.4s, v17.16b, v2.4b[2]\n" + ".inst 0x4fa2ea3e // sdot v30.4s, v17.16b, v2.4b[3]\n" + "fmul v11.4s, v21.4s, v12.s[0]\n" + "fmul v23.4s, v21.4s, v12.s[1]\n" + "fmul v17.4s, v21.4s, v12.s[2]\n" + ".inst 0x4f9fe2c4 // sdot v4.4s, v22.16b, v31.4b[0]\n" + "fmul v6.4s, v21.4s, v12.s[3]\n" + ".inst 0x4fbfe2c1 // sdot v1.4s, v22.16b, v31.4b[1]\n" + ".inst 0x4f9feac0 // sdot v0.4s, v22.16b, v31.4b[2]\n" + ".inst 0x4fbfeade // sdot v30.4s, v22.16b, v31.4b[3]\n" + ".inst 0x4f9be124 // sdot v4.4s, v9.16b, v27.4b[0]\n" + ".inst 0x4fbbe121 // sdot v1.4s, v9.16b, v27.4b[1]\n" + ".inst 0x4f9be920 // sdot v0.4s, v9.16b, v27.4b[2]\n" + ".inst 0x4fbbe93e // sdot v30.4s, v9.16b, v27.4b[3]\n" + ".inst 0x4f9ae0e4 // sdot v4.4s, v7.16b, v26.4b[0]\n" + ".inst 0x4fbae0e1 // sdot v1.4s, v7.16b, v26.4b[1]\n" + ".inst 0x4f9ae8e0 // sdot v0.4s, v7.16b, v26.4b[2]\n" + ".inst 0x4fbae8fe // sdot v30.4s, v7.16b, v26.4b[3]\n" + ".inst 0x4f99e064 // sdot v4.4s, v3.16b, v25.4b[0]\n" + ".inst 0x4fb9e061 // sdot v1.4s, v3.16b, v25.4b[1]\n" + ".inst 0x4f99e860 // sdot v0.4s, v3.16b, v25.4b[2]\n" + ".inst 0x4fb9e87e // sdot v30.4s, v3.16b, v25.4b[3]\n" + ".inst 0x4f98e1a4 // sdot v4.4s, v13.16b, v24.4b[0]\n" + ".inst 0x4fb8e1a1 // sdot v1.4s, v13.16b, v24.4b[1]\n" + ".inst 0x4f98e9a0 // sdot v0.4s, v13.16b, v24.4b[2]\n" + ".inst 0x4fb8e9be // sdot v30.4s, v13.16b, v24.4b[3]\n" + ".inst 0x4f90e384 // sdot v4.4s, v28.16b, v16.4b[0]\n" + ".inst 0x4fb0e381 // sdot v1.4s, v28.16b, v16.4b[1]\n" + ".inst 0x4f90eb80 // sdot v0.4s, v28.16b, v16.4b[2]\n" + ".inst 0x4fb0eb9e // sdot v30.4s, v28.16b, v16.4b[3]\n" + "scvtf v4.4s, v4.4s, #0x4\n" + "scvtf v1.4s, v1.4s, #0x4\n" + "scvtf v0.4s, v0.4s, #0x4\n" + "fmla v15.4s, v4.4s, v11.4s\n" + "scvtf v30.4s, v30.4s, #0x4\n" + "fmla v19.4s, v1.4s, v23.4s\n" + "fmla v18.4s, v0.4s, v17.4s\n" + "fmla v14.4s, v30.4s, v6.4s\n" + "bgt 7b\n" + "mov x20, %x[res_ptr]\n" + "cmp x10, #0x1\n" + "str q15, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x2\n" + "str q19, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x3\n" + "str q18, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "str q14, [x20, #0x0]\n" + "8:" // Row tail: Accumulator store skip + "subs x23, x23, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "bne 6b\n" + "subs x10, x10, #0x4\n" + "add %x[a_ptr], %x[a_ptr], x9\n" + "mov %x[res_ptr], x22\n" + "bgt 5b\n" + "9:" // Row tail: Row loop skip + : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) + : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", 
"v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); + return; +#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) + { + float sumf[4][4]; + int sumi; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; + } + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } + } +} + +void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) + const void * b_ptr = vx; + const void * a_ptr = vy; + float * res_ptr = s; + size_t res_stride = bs * sizeof(float); + + __asm__ __volatile__( + "mov x10, %x[nr]\n" + "mov x9, #0x88\n" + "cmp x10, #0x10\n" + "mul x9, %x[nb], x9\n" + "blt 4f\n" + "1:" // Row loop + "add x28, %x[b_ptr], #0x8\n" + "mov x27, %x[nc]\n" + "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" + "2:" // Column loop + "add x25, %x[a_ptr], #0x8\n" + "movi v2.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "mov x24, %x[nb]\n" + "add x23, x25, x9\n" + "movi v12.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "add x22, x23, x9\n" + "movi v11.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "add x21, x22, x9\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v5.16b, #0x0\n" + "movi v7.16b, #0x0\n" + "movi v4.16b, #0x0\n" + "movi v6.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "3:" // Block loop + "ldr q21, [x28, #0x0]\n" + "ldr q16, [x28, #0x10]\n" + "movi v1.16b, #0x4\n" + "movi v19.4s, #0x0\n" + "ldr q27, [x25, #0x0]\n" + "ldr q15, [x25, #0x10]\n" + "movi v26.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "ldr q29, [x28, #0x20]\n" + "ldr q3, [x28, #0x30]\n" + "movi v17.4s, #0x0\n" + "movi v0.16b, #0xf0\n" + "ldr d20, [x25, #-0x8]\n" + "ldr d9, [x23, #-0x8]\n" + "sshl v8.16b, v21.16b, v1.16b\n" + "sshl v31.16b, v16.16b, v1.16b\n" + "and v21.16b, v21.16b, v0.16b\n" + "and v16.16b, v16.16b, v0.16b\n" + "sub x20, x28, #0x8\n" + "subs x24, x24, #0x1\n" + "add x28, x28, #0x48\n" + ".inst 0x4e88a773 // smmla v19.4s, v27.16b, v8.16b\n" + ".inst 0x4e9fa77a // smmla v26.4s, v27.16b, v31.16b\n" + "ldr q27, [x25, #0x20]\n" + ".inst 0x4e88a5f2 // smmla v18.4s, v15.16b, v8.16b\n" + ".inst 0x4e9fa5f1 // smmla v17.4s, v15.16b, v31.16b\n" + "sshl v15.16b, v29.16b, v1.16b\n" + "sshl v1.16b, v3.16b, v1.16b\n" + "and v29.16b, v29.16b, v0.16b\n" + "and v3.16b, v3.16b, v0.16b\n" + "ldr q0, [x25, #0x30]\n" + "fcvtl v20.4s, v20.4h\n" + ".inst 0x4e8fa773 // smmla v19.4s, v27.16b, v15.16b\n" + "fcvtl v9.4s, v9.4h\n" + ".inst 0x4e81a77a // smmla v26.4s, v27.16b, v1.16b\n" + "ldr q27, [x25, #0x40]\n" + ".inst 0x4e8fa412 // smmla v18.4s, v0.16b, v15.16b\n" + ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n" + "ldr q0, [x25, #0x50]\n" + ".inst 0x4e95a773 // smmla v19.4s, v27.16b, v21.16b\n" + ".inst 0x4e90a77a // smmla v26.4s, v27.16b, v16.16b\n" + "ldr q27, [x25, #0x60]\n" + ".inst 0x4e95a412 // smmla v18.4s, v0.16b, v21.16b\n" + ".inst 0x4e90a411 // smmla v17.4s, v0.16b, v16.16b\n" + "ldr q0, [x25, #0x70]\n" + "add x25, x25, #0x88\n" + ".inst 0x4e9da773 // smmla v19.4s, v27.16b, v29.16b\n" + ".inst 0x4e83a77a // smmla v26.4s, v27.16b, v3.16b\n" + "ldr d27, [x20, #0x0]\n" + ".inst 0x4e9da412 // smmla v18.4s, v0.16b, v29.16b\n" + ".inst 0x4e83a411 // smmla v17.4s, v0.16b, v3.16b\n" + "fcvtl v27.4s, v27.4h\n" + "uzp1 v0.2d, v19.2d, v26.2d\n" + "uzp2 v26.2d, v19.2d, v26.2d\n" + "fmul v19.4s, v27.4s, v20.s[0]\n" + "scvtf v0.4s, v0.4s, #0x4\n" + "scvtf v26.4s, v26.4s, #0x4\n" + "fmla v2.4s, v0.4s, v19.4s\n" + "ldr q19, [x23, #0x0]\n" + "uzp1 v0.2d, v18.2d, v17.2d\n" + "uzp2 v18.2d, v18.2d, v17.2d\n" + "fmul v17.4s, v27.4s, v20.s[1]\n" + "scvtf v0.4s, v0.4s, #0x4\n" + "scvtf v18.4s, v18.4s, #0x4\n" + "fmla v10.4s, v26.4s, v17.4s\n" + "ldr q17, [x23, #0x10]\n" + "fmul v26.4s, v27.4s, v20.s[2]\n" + "fmul v20.4s, v27.4s, v20.s[3]\n" + "fmla v12.4s, v0.4s, v26.4s\n" + "ldr d0, [x22, #-0x8]\n" + "ldr d26, 
[x21, #-0x8]\n" + "fcvtl v0.4s, v0.4h\n" + "fmla v28.4s, v18.4s, v20.4s\n" + "movi v20.4s, #0x0\n" + "movi v18.4s, #0x0\n" + ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n" + ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" + "ldr q19, [x23, #0x20]\n" + "fcvtl v26.4s, v26.4h\n" + ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" + ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" + "ldr q19, [x23, #0x40]\n" + ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" + ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" + "ldr q19, [x23, #0x60]\n" + ".inst 0x4e9da674 // smmla v20.4s, v19.16b, v29.16b\n" + ".inst 0x4e83a672 // smmla v18.4s, v19.16b, v3.16b\n" + "uzp1 v19.2d, v20.2d, v18.2d\n" + "scvtf v19.4s, v19.4s, #0x4\n" + "uzp2 v20.2d, v20.2d, v18.2d\n" + "fmul v18.4s, v27.4s, v9.s[0]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v11.4s, v19.4s, v18.4s\n" + "ldr q18, [x22, #0x0]\n" + "fmul v19.4s, v27.4s, v9.s[1]\n" + "fmla v13.4s, v20.4s, v19.4s\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + ".inst 0x4e88a633 // smmla v19.4s, v17.16b, v8.16b\n" + ".inst 0x4e9fa634 // smmla v20.4s, v17.16b, v31.16b\n" + "ldr q17, [x23, #0x30]\n" + ".inst 0x4e8fa633 // smmla v19.4s, v17.16b, v15.16b\n" + ".inst 0x4e81a634 // smmla v20.4s, v17.16b, v1.16b\n" + "ldr q17, [x23, #0x50]\n" + ".inst 0x4e95a633 // smmla v19.4s, v17.16b, v21.16b\n" + ".inst 0x4e90a634 // smmla v20.4s, v17.16b, v16.16b\n" + "ldr q17, [x23, #0x70]\n" + "add x23, x23, #0x88\n" + ".inst 0x4e9da633 // smmla v19.4s, v17.16b, v29.16b\n" + ".inst 0x4e83a634 // smmla v20.4s, v17.16b, v3.16b\n" + "uzp1 v17.2d, v19.2d, v20.2d\n" + "scvtf v17.4s, v17.4s, #0x4\n" + "uzp2 v20.2d, v19.2d, v20.2d\n" + "fmul v19.4s, v27.4s, v9.s[2]\n" + "fmul v9.4s, v27.4s, v9.s[3]\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v22.4s, v17.4s, v19.4s\n" + "ldr q17, [x22, #0x10]\n" + "movi v19.4s, #0x0\n" + ".inst 0x4e88a653 // smmla v19.4s, v18.16b, v8.16b\n" + "fmla v23.4s, v20.4s, v9.4s\n" + "movi v20.4s, #0x0\n" + "movi v9.4s, #0x0\n" + ".inst 0x4e9fa654 // smmla v20.4s, v18.16b, v31.16b\n" + "ldr q18, [x22, #0x20]\n" + ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" + ".inst 0x4e8fa653 // smmla v19.4s, v18.16b, v15.16b\n" + ".inst 0x4e81a654 // smmla v20.4s, v18.16b, v1.16b\n" + "ldr q18, [x22, #0x40]\n" + ".inst 0x4e95a653 // smmla v19.4s, v18.16b, v21.16b\n" + ".inst 0x4e90a654 // smmla v20.4s, v18.16b, v16.16b\n" + "ldr q18, [x22, #0x60]\n" + ".inst 0x4e9da653 // smmla v19.4s, v18.16b, v29.16b\n" + ".inst 0x4e83a654 // smmla v20.4s, v18.16b, v3.16b\n" + "movi v18.4s, #0x0\n" + ".inst 0x4e9fa632 // smmla v18.4s, v17.16b, v31.16b\n" + "ldr q17, [x22, #0x30]\n" + ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" + ".inst 0x4e81a632 // smmla v18.4s, v17.16b, v1.16b\n" + "ldr q17, [x22, #0x50]\n" + ".inst 0x4e95a629 // smmla v9.4s, v17.16b, v21.16b\n" + ".inst 0x4e90a632 // smmla v18.4s, v17.16b, v16.16b\n" + "ldr q17, [x22, #0x70]\n" + "add x22, x22, #0x88\n" + ".inst 0x4e9da629 // smmla v9.4s, v17.16b, v29.16b\n" + ".inst 0x4e83a632 // smmla v18.4s, v17.16b, v3.16b\n" + "uzp1 v17.2d, v19.2d, v20.2d\n" + "uzp2 v20.2d, v19.2d, v20.2d\n" + "fmul v19.4s, v27.4s, v0.s[0]\n" + "scvtf v17.4s, v17.4s, #0x4\n" + "scvtf v20.4s, v20.4s, #0x4\n" + "fmla v25.4s, v17.4s, v19.4s\n" + "ldr q19, [x21, #0x0]\n" + "fmul v17.4s, v27.4s, v0.s[1]\n" + "fmla v5.4s, v20.4s, v17.4s\n" + "ldr q17, [x21, #0x10]\n" + "uzp1 v20.2d, v9.2d, v18.2d\n" + "uzp2 v9.2d, v9.2d, v18.2d\n" + "fmul v18.4s, v27.4s, v0.s[2]\n" + "fmul v0.4s, v27.4s, v0.s[3]\n" + "scvtf 
v20.4s, v20.4s, #0x4\n" + "scvtf v9.4s, v9.4s, #0x4\n" + "fmla v7.4s, v20.4s, v18.4s\n" + "movi v20.4s, #0x0\n" + "movi v18.4s, #0x0\n" + ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n" + ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" + "ldr q19, [x21, #0x20]\n" + "fmla v4.4s, v9.4s, v0.4s\n" + "movi v9.4s, #0x0\n" + "movi v0.4s, #0x0\n" + ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" + "fmul v8.4s, v27.4s, v26.s[0]\n" + ".inst 0x4e9fa620 // smmla v0.4s, v17.16b, v31.16b\n" + "ldr q17, [x21, #0x30]\n" + ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" + "fmul v31.4s, v27.4s, v26.s[1]\n" + ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" + "ldr q19, [x21, #0x40]\n" + ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" + "fmul v15.4s, v27.4s, v26.s[2]\n" + "fmul v27.4s, v27.4s, v26.s[3]\n" + ".inst 0x4e81a620 // smmla v0.4s, v17.16b, v1.16b\n" + "ldr q1, [x21, #0x50]\n" + ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" + ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" + "ldr q26, [x21, #0x60]\n" + ".inst 0x4e95a429 // smmla v9.4s, v1.16b, v21.16b\n" + ".inst 0x4e90a420 // smmla v0.4s, v1.16b, v16.16b\n" + "ldr q21, [x21, #0x70]\n" + "add x21, x21, #0x88\n" + ".inst 0x4e9da754 // smmla v20.4s, v26.16b, v29.16b\n" + ".inst 0x4e83a752 // smmla v18.4s, v26.16b, v3.16b\n" + ".inst 0x4e9da6a9 // smmla v9.4s, v21.16b, v29.16b\n" + ".inst 0x4e83a6a0 // smmla v0.4s, v21.16b, v3.16b\n" + "uzp1 v29.2d, v20.2d, v18.2d\n" + "uzp2 v21.2d, v20.2d, v18.2d\n" + "scvtf v29.4s, v29.4s, #0x4\n" + "uzp1 v18.2d, v9.2d, v0.2d\n" + "uzp2 v16.2d, v9.2d, v0.2d\n" + "scvtf v21.4s, v21.4s, #0x4\n" + "fmla v6.4s, v29.4s, v8.4s\n" + "scvtf v18.4s, v18.4s, #0x4\n" + "scvtf v16.4s, v16.4s, #0x4\n" + "fmla v30.4s, v21.4s, v31.4s\n" + "fmla v24.4s, v18.4s, v15.4s\n" + "fmla v14.4s, v16.4s, v27.4s\n" + "bgt 3b\n" + "mov x20, %x[res_ptr]\n" + "subs x27, x27, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "str q2, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q10, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q12, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q28, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q11, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q13, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q22, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q23, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q25, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q5, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q7, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q4, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q6, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q30, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q24, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "str q14, [x20, #0x0]\n" + "bne 2b\n" + "mov x20, #0x4\n" + "sub x10, x10, #0x10\n" + "cmp x10, #0x10\n" + "mov %x[res_ptr], x26\n" + "madd %x[a_ptr], x20, x9, %x[a_ptr]\n" + "bge 1b\n" + "4:" // Row loop skip + "cbz x10, 9f\n" + "5:" // Row tail: Row loop + "add x24, %x[b_ptr], #0x8\n" + "mov x23, %x[nc]\n" + "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" + "6:" // Row tail: Column loop + "movi v2.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "add x25, %x[a_ptr], #0x8\n" + "mov x21, %x[nb]\n" + "movi v12.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "7:" // Row tail: Block loop + "ldr q6, [x24, #0x0]\n" + "ldr q5, [x24, #0x10]\n" + "movi v17.16b, #0x4\n" + "movi v8.4s, #0x0\n" 
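+        // (illustrative note) the two 4-bit weights per byte are expanded as
+        // lo = b << 4 (sshl by the 0x4 splat) and hi = b & 0xF0 (and with the
+        // 0xF0 splat), so both values come out pre-multiplied by 16; the later
+        // "scvtf ..., #0x4" converts the int32 accumulators with 4 fractional
+        // bits, i.e. divides by 16, cancelling that factor exactly as the
+        // ">> 4" does in the scalar fallback paths.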
+ "ldr q4, [x25, #0x0]\n" + "ldr q13, [x25, #0x10]\n" + "movi v27.4s, #0x0\n" + "movi v0.4s, #0x0\n" + "ldr q31, [x24, #0x20]\n" + "ldr q14, [x24, #0x30]\n" + "movi v29.4s, #0x0\n" + "movi v22.16b, #0xf0\n" + "ldr q11, [x25, #0x20]\n" + "ldr q23, [x25, #0x30]\n" + "sshl v21.16b, v6.16b, v17.16b\n" + "sshl v16.16b, v5.16b, v17.16b\n" + "ldr q20, [x25, #0x40]\n" + "ldr q26, [x25, #0x50]\n" + "and v6.16b, v6.16b, v22.16b\n" + "and v5.16b, v5.16b, v22.16b\n" + "ldr q25, [x25, #0x60]\n" + "ldr q3, [x25, #0x70]\n" + "sshl v19.16b, v31.16b, v17.16b\n" + "sshl v18.16b, v14.16b, v17.16b\n" + "ldr d17, [x25, #-0x8]\n" + ".inst 0x4e95a488 // smmla v8.4s, v4.16b, v21.16b\n" + ".inst 0x4e90a49b // smmla v27.4s, v4.16b, v16.16b\n" + "and v31.16b, v31.16b, v22.16b\n" + ".inst 0x4e95a5a0 // smmla v0.4s, v13.16b, v21.16b\n" + ".inst 0x4e90a5bd // smmla v29.4s, v13.16b, v16.16b\n" + "and v14.16b, v14.16b, v22.16b\n" + "sub x20, x24, #0x8\n" + "ldr d16, [x20, #0x0]\n" + "subs x21, x21, #0x1\n" + "add x25, x25, #0x88\n" + "fcvtl v17.4s, v17.4h\n" + "add x24, x24, #0x48\n" + ".inst 0x4e93a568 // smmla v8.4s, v11.16b, v19.16b\n" + ".inst 0x4e92a57b // smmla v27.4s, v11.16b, v18.16b\n" + ".inst 0x4e93a6e0 // smmla v0.4s, v23.16b, v19.16b\n" + ".inst 0x4e92a6fd // smmla v29.4s, v23.16b, v18.16b\n" + "fcvtl v16.4s, v16.4h\n" + ".inst 0x4e86a688 // smmla v8.4s, v20.16b, v6.16b\n" + ".inst 0x4e85a69b // smmla v27.4s, v20.16b, v5.16b\n" + "fmul v23.4s, v16.4s, v17.s[0]\n" + "fmul v21.4s, v16.4s, v17.s[1]\n" + "fmul v1.4s, v16.4s, v17.s[2]\n" + "fmul v20.4s, v16.4s, v17.s[3]\n" + ".inst 0x4e86a740 // smmla v0.4s, v26.16b, v6.16b\n" + ".inst 0x4e85a75d // smmla v29.4s, v26.16b, v5.16b\n" + ".inst 0x4e9fa728 // smmla v8.4s, v25.16b, v31.16b\n" + ".inst 0x4e8ea73b // smmla v27.4s, v25.16b, v14.16b\n" + ".inst 0x4e9fa460 // smmla v0.4s, v3.16b, v31.16b\n" + ".inst 0x4e8ea47d // smmla v29.4s, v3.16b, v14.16b\n" + "uzp1 v19.2d, v8.2d, v27.2d\n" + "uzp2 v18.2d, v8.2d, v27.2d\n" + "scvtf v19.4s, v19.4s, #0x4\n" + "uzp1 v17.2d, v0.2d, v29.2d\n" + "uzp2 v16.2d, v0.2d, v29.2d\n" + "scvtf v18.4s, v18.4s, #0x4\n" + "fmla v2.4s, v19.4s, v23.4s\n" + "scvtf v17.4s, v17.4s, #0x4\n" + "scvtf v16.4s, v16.4s, #0x4\n" + "fmla v10.4s, v18.4s, v21.4s\n" + "fmla v12.4s, v17.4s, v1.4s\n" + "fmla v28.4s, v16.4s, v20.4s\n" + "bgt 7b\n" + "mov x20, %x[res_ptr]\n" + "cmp x10, #0x1\n" + "str q2, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x2\n" + "str q10, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x10, #0x3\n" + "str q12, [x20, #0x0]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "str q28, [x20, #0x0]\n" + "8:" // Row tail: Accumulator store skip + "subs x23, x23, #0x4\n" + "add %x[res_ptr], %x[res_ptr], #0x10\n" + "bne 6b\n" + "subs x10, x10, #0x4\n" + "add %x[a_ptr], %x[a_ptr], x9\n" + "mov %x[res_ptr], x22\n" + "bgt 5b\n" + "9:" // Row tail: Row loop skip + : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) + : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); + return; +#endif // #if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) + float sumf[4][4]; + int sumi; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; + } + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } +} + +void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) + if (ggml_cpu_get_sve_cnt() == QK8_0) { + const void * b_ptr = vx; + const void * a_ptr = vy; + float * res_ptr = s; + size_t res_stride = bs * sizeof(float); + + __asm__ __volatile__( + "mov x20, #0x4\n" + "mov x13, %x[nr]\n" + "mov z28.s, #-0x4\n" + "mov x12, #0x88\n" + "ptrue p1.b\n" + "whilelt p0.s, XZR, x20\n" + "cmp x13, #0x10\n" + "mul x12, %x[nb], x12\n" + "blt 4f\n" + "1:" // Row loop + "add x11, %x[b_ptr], #0x10\n" + "mov x10, %x[nc]\n" + "add x9, %x[res_ptr], %x[res_stride], LSL #4\n" + "2:" // Column loop + "add x28, %x[a_ptr], #0x8\n" + "mov z24.b, #0x0\n" + "mov z15.b, #0x0\n" + "mov x27, %x[nb]\n" + "add x26, x28, x12\n" + "mov z12.b, #0x0\n" + "mov z0.b, #0x0\n" + "add x25, x26, x12\n" + "mov z13.b, #0x0\n" + "mov z1.b, #0x0\n" + "add x24, x25, x12\n" + "mov z20.b, #0x0\n" + "mov z25.b, #0x0\n" + "mov z11.b, #0x0\n" + "mov z16.b, #0x0\n" + "mov z19.b, #0x0\n" + "mov z26.b, #0x0\n" + "mov z8.b, #0x0\n" + "mov z29.b, #0x0\n" + "mov z27.b, #0x0\n" + "mov z10.b, #0x0\n" + "3:" // Block loop + "ld1b { z30.b }, p1/Z, [x11]\n" + "ld1b { z21.b }, p1/Z, [x11, #1, MUL VL]\n" + "mov z18.s, #0x0\n" + "mov z7.s, #0x0\n" + "ld1rqb { z3.b }, p1/Z, [x28]\n" + "ld1rqb { z5.b }, p1/Z, [x28, #16]\n" + "mov z9.s, #0x0\n" + "mov z22.s, #0x0\n" + "ld1b { z4.b }, p1/Z, [x11, #2, MUL VL]\n" + "ld1b { z17.b }, p1/Z, [x11, #3, MUL VL]\n" + "sub x20, x11, #0x10\n" + "sub x23, x28, #0x8\n" + "lsl z31.b, z30.b, #0x4\n" + "lsl z6.b, z21.b, #0x4\n" + "ld1h { z23.s }, p1/Z, [x20]\n" + "sub x22, x26, #0x8\n" + "and z30.b, z30.b, #0xf0\n" + "and z21.b, z21.b, #0xf0\n" + "sub x21, x25, #0x8\n" + "sub x20, x24, #0x8\n" + "lsl z14.b, z4.b, #0x4\n" + "lsl z2.b, z17.b, #0x4\n" + "subs x27, x27, #0x1\n" + "add x11, x11, #0x90\n" + ".inst 0x451f9872 // smmla z18.s, z3.b, z31.b\n" + ".inst 0x45069867 // smmla z7.s, z3.b, z6.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #32]\n" + "and z4.b, z4.b, #0xf0\n" + ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" + ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #48]\n" + "and z17.b, z17.b, #0xf0\n" + "fcvt z23.s, p1/m, z23.h\n" + ".inst 0x450e9872 // smmla z18.s, z3.b, z14.b\n" + ".inst 0x45029867 // smmla z7.s, z3.b, z2.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #64]\n" + ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" + ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #80]\n" + "fscale z23.s, p1/m, z23.s, z28.s\n" + ".inst 0x451e9872 // smmla z18.s, z3.b, z30.b\n" + ".inst 0x45159867 // smmla z7.s, z3.b, z21.b\n" + "ld1rqb { z3.b }, p1/Z, [x28, #96]\n" + ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" + ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x28, #112]\n" + "add x28, x28, #0x88\n" + ".inst 0x45049872 // smmla z18.s, z3.b, z4.b\n" + ".inst 0x45119867 // smmla z7.s, z3.b, z17.b\n" + "ld1h { z3.s }, p0/Z, [x23]\n" + ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" + ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" + "fcvt z3.s, p1/m, z3.h\n" + "uzp1 z5.d, z18.d, z7.d\n" + "uzp2 z18.d, z18.d, z7.d\n" + "mov z3.q, z3.q[0]\n" + "uzp1 z7.d, z9.d, z22.d\n" + "uzp2 z22.d, z9.d, z22.d\n" + "fmul z9.s, z23.s, z3.s[0]\n" + "scvtf z5.s, p1/m, z5.s\n" + "scvtf z18.s, p1/m, z18.s\n" + "scvtf z7.s, p1/m, z7.s\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z24.s, p1/M, z5.s, z9.s\n" + "ld1rqb { z5.b }, p1/Z, [x26]\n" + "fmul z9.s, z23.s, z3.s[1]\n" + "fmla z15.s, p1/M, z18.s, z9.s\n" + "ld1rqb { 
z18.b }, p1/Z, [x26, #16]\n" + "fmul z9.s, z23.s, z3.s[2]\n" + "fmul z3.s, z23.s, z3.s[3]\n" + "fmla z12.s, p1/M, z7.s, z9.s\n" + "mov z9.s, #0x0\n" + "ld1h { z7.s }, p0/Z, [x22]\n" + ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" + "fmla z0.s, p1/M, z22.s, z3.s\n" + "mov z22.s, #0x0\n" + "ld1h { z3.s }, p0/Z, [x21]\n" + ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #32]\n" + "fcvt z7.s, p1/m, z7.h\n" + "fcvt z3.s, p1/m, z3.h\n" + ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" + ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #64]\n" + "mov z7.q, z7.q[0]\n" + "mov z3.q, z3.q[0]\n" + ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" + ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x26, #96]\n" + ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" + ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" + "uzp1 z5.d, z9.d, z22.d\n" + "scvtf z5.s, p1/m, z5.s\n" + "uzp2 z22.d, z9.d, z22.d\n" + "fmul z9.s, z23.s, z7.s[0]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z13.s, p1/M, z5.s, z9.s\n" + "ld1rqb { z9.b }, p1/Z, [x25]\n" + "fmul z5.s, z23.s, z7.s[1]\n" + "fmla z1.s, p1/M, z22.s, z5.s\n" + "mov z5.s, #0x0\n" + "mov z22.s, #0x0\n" + ".inst 0x451f9a45 // smmla z5.s, z18.b, z31.b\n" + ".inst 0x45069a56 // smmla z22.s, z18.b, z6.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #48]\n" + ".inst 0x450e9a45 // smmla z5.s, z18.b, z14.b\n" + ".inst 0x45029a56 // smmla z22.s, z18.b, z2.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #80]\n" + ".inst 0x451e9a45 // smmla z5.s, z18.b, z30.b\n" + ".inst 0x45159a56 // smmla z22.s, z18.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x26, #112]\n" + "add x26, x26, #0x88\n" + ".inst 0x45049a45 // smmla z5.s, z18.b, z4.b\n" + ".inst 0x45119a56 // smmla z22.s, z18.b, z17.b\n" + "uzp1 z18.d, z5.d, z22.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp2 z22.d, z5.d, z22.d\n" + "fmul z5.s, z23.s, z7.s[2]\n" + "fmul z7.s, z23.s, z7.s[3]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z20.s, p1/M, z18.s, z5.s\n" + "ld1rqb { z18.b }, p1/Z, [x25, #16]\n" + "ld1h { z5.s }, p0/Z, [x20]\n" + "fcvt z5.s, p1/m, z5.h\n" + "fmla z25.s, p1/M, z22.s, z7.s\n" + "mov z22.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9936 // smmla z22.s, z9.b, z31.b\n" + ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #32]\n" + "mov z5.q, z5.q[0]\n" + ".inst 0x450e9936 // smmla z22.s, z9.b, z14.b\n" + ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #64]\n" + ".inst 0x451e9936 // smmla z22.s, z9.b, z30.b\n" + ".inst 0x45159927 // smmla z7.s, z9.b, z21.b\n" + "ld1rqb { z9.b }, p1/Z, [x25, #96]\n" + ".inst 0x45049936 // smmla z22.s, z9.b, z4.b\n" + ".inst 0x45119927 // smmla z7.s, z9.b, z17.b\n" + "uzp1 z9.d, z22.d, z7.d\n" + "scvtf z9.s, p1/m, z9.s\n" + "uzp2 z22.d, z22.d, z7.d\n" + "fmul z7.s, z23.s, z3.s[0]\n" + "scvtf z22.s, p1/m, z22.s\n" + "fmla z11.s, p1/M, z9.s, z7.s\n" + "ld1rqb { z9.b }, p1/Z, [x24]\n" + "fmul z7.s, z23.s, z3.s[1]\n" + "fmla z16.s, p1/M, z22.s, z7.s\n" + "mov z22.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9a56 // smmla z22.s, z18.b, z31.b\n" + ".inst 0x45069a47 // smmla z7.s, z18.b, z6.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #48]\n" + ".inst 0x450e9a56 // smmla z22.s, z18.b, z14.b\n" + ".inst 0x45029a47 // smmla z7.s, z18.b, z2.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #80]\n" + ".inst 0x451e9a56 // smmla z22.s, z18.b, z30.b\n" + ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x25, #112]\n" + "add x25, x25, #0x88\n" + ".inst 0x45049a56 // smmla 
z22.s, z18.b, z4.b\n" + ".inst 0x45119a47 // smmla z7.s, z18.b, z17.b\n" + "uzp1 z18.d, z22.d, z7.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp2 z7.d, z22.d, z7.d\n" + "fmul z22.s, z23.s, z3.s[2]\n" + "fmul z3.s, z23.s, z3.s[3]\n" + "scvtf z7.s, p1/m, z7.s\n" + "fmla z19.s, p1/M, z18.s, z22.s\n" + "ld1rqb { z18.b }, p1/Z, [x24, #16]\n" + "fmul z22.s, z23.s, z5.s[0]\n" + "fmla z26.s, p1/M, z7.s, z3.s\n" + "mov z3.s, #0x0\n" + "mov z7.s, #0x0\n" + ".inst 0x451f9923 // smmla z3.s, z9.b, z31.b\n" + ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" + "ld1rqb { z9.b }, p1/Z, [x24, #32]\n" + ".inst 0x450e9923 // smmla z3.s, z9.b, z14.b\n" + ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" + "mov z9.s, #0x0\n" + ".inst 0x451f9a49 // smmla z9.s, z18.b, z31.b\n" + "mov z31.s, #0x0\n" + ".inst 0x45069a5f // smmla z31.s, z18.b, z6.b\n" + "ld1rqb { z6.b }, p1/Z, [x24, #48]\n" + "ld1rqb { z18.b }, p1/Z, [x24, #64]\n" + ".inst 0x450e98c9 // smmla z9.s, z6.b, z14.b\n" + "fmul z14.s, z23.s, z5.s[1]\n" + ".inst 0x450298df // smmla z31.s, z6.b, z2.b\n" + "ld1rqb { z6.b }, p1/Z, [x24, #80]\n" + "fmul z2.s, z23.s, z5.s[2]\n" + "fmul z23.s, z23.s, z5.s[3]\n" + ".inst 0x451e9a43 // smmla z3.s, z18.b, z30.b\n" + ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" + "ld1rqb { z5.b }, p1/Z, [x24, #96]\n" + ".inst 0x451e98c9 // smmla z9.s, z6.b, z30.b\n" + ".inst 0x451598df // smmla z31.s, z6.b, z21.b\n" + "ld1rqb { z18.b }, p1/Z, [x24, #112]\n" + "add x24, x24, #0x88\n" + ".inst 0x450498a3 // smmla z3.s, z5.b, z4.b\n" + ".inst 0x451198a7 // smmla z7.s, z5.b, z17.b\n" + ".inst 0x45049a49 // smmla z9.s, z18.b, z4.b\n" + ".inst 0x45119a5f // smmla z31.s, z18.b, z17.b\n" + "uzp1 z18.d, z3.d, z7.d\n" + "uzp2 z5.d, z3.d, z7.d\n" + "scvtf z18.s, p1/m, z18.s\n" + "uzp1 z6.d, z9.d, z31.d\n" + "uzp2 z9.d, z9.d, z31.d\n" + "scvtf z5.s, p1/m, z5.s\n" + "fmla z8.s, p1/M, z18.s, z22.s\n" + "scvtf z6.s, p1/m, z6.s\n" + "scvtf z9.s, p1/m, z9.s\n" + "fmla z29.s, p1/M, z5.s, z14.s\n" + "fmla z27.s, p1/M, z6.s, z2.s\n" + "fmla z10.s, p1/M, z9.s, z23.s\n" + "bgt 3b\n" + "mov x20, %x[res_ptr]\n" + "subs x10, x10, #0x8\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "st1w { z24.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z15.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z12.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z0.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z13.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z1.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z20.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z25.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z11.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z16.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z19.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z26.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z8.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z29.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z27.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "st1w { z10.s }, p1, [x20]\n" + "bne 2b\n" + "mov x20, #0x4\n" + "sub x13, x13, #0x10\n" + "cmp x13, #0x10\n" + "mov %x[res_ptr], x9\n" + "madd %x[a_ptr], x20, x12, %x[a_ptr]\n" + "bge 1b\n" + "4:" // Row loop skip + "cbz x13, 9f\n" + "5:" // Row tail: Row loop + "add x25, %x[b_ptr], #0x10\n" + "mov x24, %x[nc]\n" + "add x23, %x[res_ptr], %x[res_stride], LSL #2\n" + "6:" // Row tail: Column loop + "mov 
z24.b, #0x0\n" + "mov z15.b, #0x0\n" + "add x28, %x[a_ptr], #0x8\n" + "mov x22, %x[nb]\n" + "mov z12.b, #0x0\n" + "mov z0.b, #0x0\n" + "7:" // Row tail: Block loop + "ld1b { z3.b }, p1/Z, [x25]\n" + "ld1b { z6.b }, p1/Z, [x25, #1, MUL VL]\n" + "mov z2.s, #0x0\n" + "mov z25.s, #0x0\n" + "ld1rqb { z26.b }, p1/Z, [x28]\n" + "ld1rqb { z21.b }, p1/Z, [x28, #16]\n" + "mov z27.s, #0x0\n" + "mov z19.s, #0x0\n" + "ld1b { z29.b }, p1/Z, [x25, #2, MUL VL]\n" + "ld1b { z16.b }, p1/Z, [x25, #3, MUL VL]\n" + "sub x21, x25, #0x10\n" + "sub x20, x28, #0x8\n" + "lsl z20.b, z3.b, #0x4\n" + "lsl z4.b, z6.b, #0x4\n" + "ld1rqb { z10.b }, p1/Z, [x28, #32]\n" + "ld1rqb { z23.b }, p1/Z, [x28, #48]\n" + "and z3.b, z3.b, #0xf0\n" + "and z6.b, z6.b, #0xf0\n" + "ld1rqb { z11.b }, p1/Z, [x28, #64]\n" + "ld1rqb { z7.b }, p1/Z, [x28, #80]\n" + "lsl z8.b, z29.b, #0x4\n" + "lsl z14.b, z16.b, #0x4\n" + "ld1rqb { z18.b }, p1/Z, [x28, #96]\n" + "ld1rqb { z30.b }, p1/Z, [x28, #112]\n" + ".inst 0x45149b42 // smmla z2.s, z26.b, z20.b\n" + ".inst 0x45049b59 // smmla z25.s, z26.b, z4.b\n" + "and z29.b, z29.b, #0xf0\n" + "ld1h { z17.s }, p1/Z, [x21]\n" + ".inst 0x45149abb // smmla z27.s, z21.b, z20.b\n" + ".inst 0x45049ab3 // smmla z19.s, z21.b, z4.b\n" + "and z16.b, z16.b, #0xf0\n" + "ld1h { z4.s }, p0/Z, [x20]\n" + "subs x22, x22, #0x1\n" + "add x28, x28, #0x88\n" + "fcvt z17.s, p1/m, z17.h\n" + "add x25, x25, #0x90\n" + ".inst 0x45089942 // smmla z2.s, z10.b, z8.b\n" + ".inst 0x450e9959 // smmla z25.s, z10.b, z14.b\n" + "fcvt z4.s, p1/m, z4.h\n" + ".inst 0x45089afb // smmla z27.s, z23.b, z8.b\n" + ".inst 0x450e9af3 // smmla z19.s, z23.b, z14.b\n" + "fscale z17.s, p1/m, z17.s, z28.s\n" + "mov z4.q, z4.q[0]\n" + ".inst 0x45039962 // smmla z2.s, z11.b, z3.b\n" + ".inst 0x45069979 // smmla z25.s, z11.b, z6.b\n" + "fmul z23.s, z17.s, z4.s[0]\n" + "fmul z9.s, z17.s, z4.s[1]\n" + "fmul z21.s, z17.s, z4.s[2]\n" + "fmul z4.s, z17.s, z4.s[3]\n" + ".inst 0x450398fb // smmla z27.s, z7.b, z3.b\n" + ".inst 0x450698f3 // smmla z19.s, z7.b, z6.b\n" + ".inst 0x451d9a42 // smmla z2.s, z18.b, z29.b\n" + ".inst 0x45109a59 // smmla z25.s, z18.b, z16.b\n" + ".inst 0x451d9bdb // smmla z27.s, z30.b, z29.b\n" + ".inst 0x45109bd3 // smmla z19.s, z30.b, z16.b\n" + "uzp1 z31.d, z2.d, z25.d\n" + "uzp2 z13.d, z2.d, z25.d\n" + "scvtf z31.s, p1/m, z31.s\n" + "uzp1 z17.d, z27.d, z19.d\n" + "uzp2 z18.d, z27.d, z19.d\n" + "scvtf z13.s, p1/m, z13.s\n" + "fmla z24.s, p1/M, z31.s, z23.s\n" + "scvtf z17.s, p1/m, z17.s\n" + "scvtf z18.s, p1/m, z18.s\n" + "fmla z15.s, p1/M, z13.s, z9.s\n" + "fmla z12.s, p1/M, z17.s, z21.s\n" + "fmla z0.s, p1/M, z18.s, z4.s\n" + "bgt 7b\n" + "mov x20, %x[res_ptr]\n" + "cmp x13, #0x1\n" + "st1w { z24.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x13, #0x2\n" + "st1w { z15.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "cmp x13, #0x3\n" + "st1w { z12.s }, p1, [x20]\n" + "add x20, x20, %x[res_stride]\n" + "ble 8f\n" + "st1w { z0.s }, p1, [x20]\n" + "8:" // Row tail: Accumulator store skip + "subs x24, x24, #0x8\n" + "add %x[res_ptr], %x[res_ptr], #0x20\n" + "bne 6b\n" + "subs x13, x13, #0x4\n" + "add %x[a_ptr], %x[a_ptr], x12\n" + "mov %x[res_ptr], x23\n" + "bgt 5b\n" + "9:" // Row tail: Row loop skip + : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) + : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) + : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", 
"z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" + ); + return; + } +#endif // #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) + +#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) + float sumf[4][8]; + int sumi; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; + } + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } +} + +void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl); + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); + + float32x4_t sumf[4]; + for (int m = 0; m < 4; m++) { + sumf[m] = vdupq_n_f32(0); + } + + for (int l = 0; l < nb; l++) { + float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *)a_ptr[l].d)); + float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d)); + + int32x4_t sumi_0 = vdupq_n_s32(0); + int32x4_t sumi_1 = vdupq_n_s32(0); + int32x4_t sumi_2 = vdupq_n_s32(0); + int32x4_t sumi_3 = vdupq_n_s32(0); + + for (int k = 0; k < 4; k++) { + int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 16 * k + 0); + int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16 * k + 64); + + uint8x16_t b = vld1q_u8(b_ptr[l].qs + 16 * k); + int8x16_t b_hi = vqtbl1q_s8(kvalues, b >> 4); + int8x16_t b_lo = vqtbl1q_s8(kvalues, b & 0xF); + + sumi_0 = vdotq_laneq_s32(sumi_0, b_lo, a_0, 0); + sumi_1 = vdotq_laneq_s32(sumi_1, b_lo, a_0, 1); + sumi_2 = vdotq_laneq_s32(sumi_2, b_lo, a_0, 2); + sumi_3 = vdotq_laneq_s32(sumi_3, b_lo, a_0, 3); + sumi_0 = vdotq_laneq_s32(sumi_0, b_hi, a_1, 0); + sumi_1 = vdotq_laneq_s32(sumi_1, b_hi, a_1, 1); + sumi_2 = vdotq_laneq_s32(sumi_2, b_hi, a_1, 2); + sumi_3 = vdotq_laneq_s32(sumi_3, b_hi, a_1, 3); + } + + sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0)); + sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1)); + sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2)); + sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3)); + } + + for (int m = 0; m < 4; m++) { + vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]); + } + } + } + return; +#endif // #if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    {
+        float sumf[4][4];
+        int sumi;
+
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+                }
+                for (int l = 0; l < nb; l++) {
+                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                        for (int m = 0; m < 4; m++) {
+                            for (int j = 0; j < ncols_interleaved; j++) {
+                                sumi = 0;
+                                for (int i = 0; i < blocklen; ++i) {
+                                    const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                                    const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                             (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
+                                }
+                                sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
+                            }
+                        }
+                    }
+                }
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++)
+                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+                }
+            }
+        }
+    }
+}
diff --git a/ggml/src/ggml-cpu/arch/loongarch/quants.c b/ggml/src/ggml-cpu/arch/loongarch/quants.c
new file mode 100644
index 0000000000000..9e33fb3228633
--- /dev/null
+++ b/ggml/src/ggml-cpu/arch/loongarch/quants.c
@@ -0,0 +1,2639 @@
+#define GGML_COMMON_IMPL_C
+#include "ggml-common.h"
+#include "ggml-quants.h"
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+#include "simd-mappings.h"
+
+#include "../../quants.h"
+#include "../../ggml-cpu-impl.h"
+
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+#include <stdlib.h> // for qsort
+#include <stdio.h>  // for GGML_ASSERT
+
+#define GROUP_MAX_EPS 1e-15f
+#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
+#define GROUP_MAX_EPS_IQ2_S 1e-8f
+#define GROUP_MAX_EPS_IQ1_M 1e-7f
+#define GROUP_MAX_EPS_IQ1_S 1e-12f
+
+#define UNUSED GGML_UNUSED
+
+#if defined(__loongarch_sx)
+
+static __m128i lsx_packs_w(__m128i a, __m128i b) {
+    __m128i tmp, tmp1;
+    tmp = __lsx_vsat_w(a, 15);
+    tmp1 = __lsx_vsat_w(b, 15);
+    return __lsx_vpickev_h(tmp1, tmp);
+}
+
+static __m128i lsx_packs_h(__m128i a, __m128i b) {
+    __m128i tmp, tmp1;
+    tmp = __lsx_vsat_h(a, 7);
+    tmp1 = __lsx_vsat_h(b, 7);
+    return __lsx_vpickev_b(tmp1, tmp);
+}
+
+static __m128i lsx_packus_h(__m128i a, __m128i b) {
+    __m128i tmp, tmp1;
+    tmp = __lsx_vsat_hu(a, 7);
+    tmp1 = __lsx_vsat_hu(b, 7);
+    return __lsx_vpickev_b(tmp1, tmp);
+}
+
+static __m128i lsx_maddubs_h(__m128i a, __m128i b) {
+    __m128i tmp1, tmp2;
+    tmp1 = __lsx_vmulwev_h_b(a, b);
+    tmp2 = __lsx_vmulwod_h_b(a, b);
+    return __lsx_vsadd_h(tmp1, tmp2);
+}
+
+static __m128i lsx_madd_h(__m128i a, __m128i b) {
+    __m128i tmp1, tmp2;
+    tmp1 = __lsx_vmulwev_w_h(a, b);
+    tmp2 = __lsx_vmulwod_w_h(a, b);
+    return __lsx_vadd_w(tmp1, tmp2);
+}
+
+static __m128i lsx_set_w(int32_t a, int32_t b, int32_t c, int32_t d) {
+    v4i32 __ret = {d, c, b, a};
+    return (__m128i)__ret;
+}
+
+static __m128i lsx_shuffle_b(__m128i a, __m128i b) {
+    __m128i mask_f, zero, tmp0, tmp2, mask;
+    int f = 0x8f;
+    mask_f = __lsx_vreplgr2vr_b(f);
+    zero = __lsx_vldi(0);
+    tmp0 = __lsx_vand_v(b, mask_f); // get mask with low 4 bit and sign bits
+    tmp0 = __lsx_vori_b(tmp0, 0x10); // make each mask or with 0x10 prepare for positive
+    mask = __lsx_vsle_b(zero, tmp0); // if mask >= 0, set mask
+    tmp2 = __lsx_vand_v(tmp0, mask); // maskout the in2 < ones
+    return
__lsx_vshuf_b(a, zero, tmp2); +} + +static __m128i lsx_hadd_h(__m128i a, __m128i b) { + __m128i tmp1 = __lsx_vpickev_h(b, a); + __m128i tmp2 = __lsx_vpickod_h(b, a); + return __lsx_vadd_h(tmp1, tmp2); +} + +static __m128i lsx_hadd_w(__m128i a, __m128i b) { + __m128i tmp1 = __lsx_vpickev_w(b, a); + __m128i tmp2 = __lsx_vpickod_w(b, a); + return __lsx_vadd_w(tmp1, tmp2); +} + +static __m128 lsx_hadd_s(__m128 a, __m128 b) { + __m128 tmp1 = (__m128)__lsx_vpickev_w((__m128i)b, (__m128i)a); + __m128 tmp2 = (__m128)__lsx_vpickod_w((__m128i)b, (__m128i)a); + + return __lsx_vfadd_s(tmp1, tmp2); +} + +static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) { + __m128 res_0 =lsx_hadd_s(a, b); + __m128 res_1 =lsx_hadd_s(c, d); + __m128 res =lsx_hadd_s(res_0, res_1); + res =lsx_hadd_s(res, res); + res =lsx_hadd_s(res, res); + + return ((v4f32)res)[0]; +} +#endif + +#if defined(__loongarch_asx) + +#ifdef __clang__ +#define VREGS_PREFIX "$vr" +#define XREGS_PREFIX "$xr" +#else // GCC +#define VREGS_PREFIX "$f" +#define XREGS_PREFIX "$f" +#endif +#define __ALL_REGS "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31" +// Convert __m128i to __m256i +static inline __m256i ____m256i(__m128i in) { + __m256i out = __lasx_xvldi(0); + __asm__ volatile ( + ".irp i," __ALL_REGS "\n\t" + " .ifc %[out], " XREGS_PREFIX"\\i \n\t" + " .irp j," __ALL_REGS "\n\t" + " .ifc %[in], " VREGS_PREFIX "\\j \n\t" + " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t" + " .endif \n\t" + " .endr \n\t" + " .endif \n\t" + ".endr \n\t" + : [out] "+f" (out) : [in] "f" (in) + ); + return out; +} +// Convert two __m128i to __m256i +static inline __m256i lasx_set_q(__m128i inhi, __m128i inlo) { + __m256i out; + __asm__ volatile ( + ".irp i," __ALL_REGS "\n\t" + " .ifc %[hi], " VREGS_PREFIX "\\i \n\t" + " .irp j," __ALL_REGS "\n\t" + " .ifc %[lo], " VREGS_PREFIX "\\j \n\t" + " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t" + " .endif \n\t" + " .endr \n\t" + " .endif \n\t" + ".endr \n\t" + ".ifnc %[out], %[hi] \n\t" + ".irp i," __ALL_REGS "\n\t" + " .ifc %[out], " XREGS_PREFIX "\\i \n\t" + " .irp j," __ALL_REGS "\n\t" + " .ifc %[hi], " VREGS_PREFIX "\\j \n\t" + " xvori.b $xr\\i, $xr\\j, 0 \n\t" + " .endif \n\t" + " .endr \n\t" + " .endif \n\t" + ".endr \n\t" + ".endif \n\t" + : [out] "=f" (out), [hi] "+f" (inhi) + : [lo] "f" (inlo) + ); + return out; +} +// Convert __m256i low part to __m128i +static inline __m128i lasx_extracti128_lo(__m256i in) { + __m128i out; + __asm__ volatile ( + ".ifnc %[out], %[in] \n\t" + ".irp i," __ALL_REGS "\n\t" + " .ifc %[out], " VREGS_PREFIX "\\i \n\t" + " .irp j," __ALL_REGS "\n\t" + " .ifc %[in], " XREGS_PREFIX "\\j \n\t" + " vori.b $vr\\i, $vr\\j, 0 \n\t" + " .endif \n\t" + " .endr \n\t" + " .endif \n\t" + ".endr \n\t" + ".endif \n\t" + : [out] "=f" (out) : [in] "f" (in) + ); + return out; +} +// Convert __m256i high part to __m128i +static inline __m128i lasx_extracti128_hi(__m256i in) { + __m128i out; + __asm__ volatile ( + ".irp i," __ALL_REGS "\n\t" + " .ifc %[out], " VREGS_PREFIX "\\i \n\t" + " .irp j," __ALL_REGS "\n\t" + " .ifc %[in], " XREGS_PREFIX "\\j \n\t" + " xvpermi.q $xr\\i, $xr\\j, 0x11 \n\t" + " .endif \n\t" + " .endr \n\t" + " .endif \n\t" + ".endr \n\t" + : [out] "=f" (out) : [in] "f" (in) + ); + return out; +} + +static __m256i lasx_set_w(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) { + v8i32 __ret = {e0, e1, e2, e3, e4, e5, e6, e7}; + return (__m256i)__ret; +} + +static __m256i lasx_set_d(int64_t a, int64_t b, 
int64_t c, int64_t d) { + v4i64 __ret = {d, c, b, a}; + return (__m256i)__ret; +} + +static __m256i lasx_insertf128( __m128i x, __m128i y) { + return lasx_set_q(x, y); +} + +static __m256i lasx_shuffle_b(__m256i a, __m256i b) { + __m256i mask_f, zero, tmp0, tmp2, mask; + int f = 0x8f; + mask_f = __lasx_xvreplgr2vr_b(f); + zero = __lasx_xvldi(0); + tmp0 = __lasx_xvand_v(b, mask_f); // get mask with low 4 bit and sign bits + tmp0 = __lasx_xvori_b(tmp0, 0x10); // make each mask or with 0x10 prepare for positive + mask = __lasx_xvsle_b(zero, tmp0); // if mask >= 0, set mask + tmp2 = __lasx_xvand_v(tmp0, mask); // maskout the in2 < ones + return __lasx_xvshuf_b(a, zero, tmp2); +} + +static __m256i lasx_extu8_16(__m128i a) { + return __lasx_vext2xv_hu_bu(____m256i(a)); +} + +static __m256i lasx_ext8_16(__m128i a) { + return __lasx_vext2xv_h_b(____m256i(a)); +} + +static __m256i lasx_ext16_32(__m128i a) { + return __lasx_vext2xv_w_h(____m256i(a)); +} + +static __m128i lasx_extracti128( __m256i a, int pos) { + __m128i ret; + if( pos == 0) + { + ret = lasx_extracti128_lo(a); + } else { + ret = lasx_extracti128_hi(a); + } + return ret; +} + +static __m128 lasx_extractf128( __m256 a, int pos) { + __m128 ret; + if( pos == 0) + { + ret = (__m128)lasx_extracti128_lo((__m256i)a); + } else { + ret = (__m128)lasx_extracti128_hi((__m256i)a); + } + return ret; +} + +static __m256i lasx_maddubs_h(__m256i a, __m256i b) { + __m256i tmp1, tmp2; + tmp1 = __lasx_xvmulwev_h_b(a, b); + tmp2 = __lasx_xvmulwod_h_b(a, b); + return __lasx_xvsadd_h(tmp1, tmp2); +} + +static __m256i lasx_madd_h(__m256i a, __m256i b) { + __m256i tmp1, tmp2; + tmp1 = __lasx_xvmulwev_w_h(a, b); + tmp2 = __lasx_xvmulwod_w_h(a, b); + return __lasx_xvadd_w(tmp1, tmp2); +} + +static __m256i lasx_packs_w(__m256i a, __m256i b) { + __m256i tmp, tmp1; + tmp = __lasx_xvsat_w(a, 15); + tmp1 = __lasx_xvsat_w(b, 15); + return __lasx_xvpickev_h(tmp1, tmp); +} + +static __m256i lasx_packs_h(__m256i a, __m256i b) { + __m256i tmp, tmp1; + tmp = __lasx_xvsat_h(a, 7); + tmp1 = __lasx_xvsat_h(b, 7); + return __lasx_xvpickev_b(tmp1, tmp); +} + +static inline __m256i lasx_madd_h_b(__m256i a, __m256i b) { + __m256i tmp1, tmp2; + tmp1 = __lasx_xvmulwev_h_b(a, b); + tmp2 = __lasx_xvmulwod_h_b(a, b); + return __lasx_xvadd_h(tmp1, tmp2); +} + +static inline __m256i lasx_xvrepl128vei_h(__m256i a, const unsigned int b) { + switch (b) { + case 0: return __lasx_xvrepl128vei_h(a, 0); + case 1: return __lasx_xvrepl128vei_h(a, 1); + case 2: return __lasx_xvrepl128vei_h(a, 2); + case 3: return __lasx_xvrepl128vei_h(a, 3); + case 4: return __lasx_xvrepl128vei_h(a, 4); + case 5: return __lasx_xvrepl128vei_h(a, 5); + case 6: return __lasx_xvrepl128vei_h(a, 6); + case 7: return __lasx_xvrepl128vei_h(a, 7); + default: __builtin_unreachable(); + } +} + +static inline __m256i lasx_xvandi_b_bit(__m256i a, const unsigned int b) { + switch (b) { + case 0: return __lasx_xvandi_b(a, 1 << 0); + case 1: return __lasx_xvandi_b(a, 1 << 1); + case 2: return __lasx_xvandi_b(a, 1 << 2); + case 3: return __lasx_xvandi_b(a, 1 << 3); + case 4: return __lasx_xvandi_b(a, 1 << 4); + case 5: return __lasx_xvandi_b(a, 1 << 5); + case 6: return __lasx_xvandi_b(a, 1 << 6); + case 7: return __lasx_xvandi_b(a, 1 << 7); + default: __builtin_unreachable(); + } +} + +// multiply int8_t, add results pairwise twice +static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { + // Get absolute values of x vectors + const __m128i ax = __lsx_vsigncov_b(x, x); + // Sign the values of the y vectors + 
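+    // (illustrative note) elementwise x*y == |x| * copysign(y, x), so feeding
+    // |x| and the sign-adjusted y into the maddubs-style helper mirrors the
+    // x86 _mm_maddubs_epi16 idiom, which treats its first operand as unsigned.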
const __m128i sy = __lsx_vsigncov_b(x, y); + // Perform multiplication and create 16-bit values + const __m128i dot = lsx_maddubs_h(ax, sy); + const __m128i ones = __lsx_vreplgr2vr_h(1); + return lsx_madd_h(ones, dot); +} + +// horizontally add 8 floats +static inline float hsum_float_8(const __m256 x) { + __m128 res = lasx_extractf128(x, 1); + res = __lsx_vfadd_s(res, lasx_extractf128(x, 0)); + res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res)); + res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0)); + return ((v4f32)res)[0]; +} + +// horizontally add 8 int32_t +static inline int hsum_i32_8(const __m256i a) { + + __m256i tmp1 = __lasx_xvpermi_q(a, a, 0x11); + __m256i tmp2 = __lasx_xvpermi_q(a, a, 0x00); + + __m128i tmp1_128 = lasx_extracti128_lo(tmp1); + __m128i tmp2_128 = lasx_extracti128_lo(tmp2); + + __m128i sum128 = __lsx_vadd_w(tmp1_128, tmp2_128); + + __m128i ev = __lsx_vpickev_w(sum128, sum128); + __m128i od = __lsx_vpickod_w(sum128, sum128); + __m128i sum64 = __lsx_vadd_w(ev, od); + + int sum64_1, sum64_2; + sum64_1 = __lsx_vpickve2gr_w(sum64, 0); + sum64_2 = __lsx_vpickve2gr_w(sum64, 1); + + return sum64_1 + sum64_2; +} + +// horizontally add 4 int32_t +static inline int hsum_i32_4(const __m128i a) { + __m128i ev = __lsx_vpickev_w(a, a); + __m128i od = __lsx_vpickod_w(a, a); + __m128i sum64 = __lsx_vadd_w(ev, od); + + int sum64_1, sum64_2; + sum64_1 = __lsx_vpickve2gr_w(sum64, 0); + sum64_2 = __lsx_vpickve2gr_w(sum64, 1); + + return sum64_1 + sum64_2; +} + +// spread 32 bits to 32 bytes { 0x00, 0xFF } +static inline __m256i bytes_from_bits_32(const uint8_t * x) { + + uint32_t x32; + memcpy(&x32, x, sizeof(uint32_t)); + const __m256i shuf_mask = lasx_set_d( + 0x0303030303030303, 0x0202020202020202, + 0x0101010101010101, 0x0000000000000000); + + __m256i bytes = lasx_shuffle_b(__lasx_xvreplgr2vr_w(x32), shuf_mask); + const __m256i bit_mask = __lasx_xvreplgr2vr_d(0x7fbfdfeff7fbfdfe); + bytes = __lasx_xvor_v(bytes, bit_mask); + return __lasx_xvseq_b(bytes, __lasx_xvreplgr2vr_d(-1)); +} + +// Unpack 32 4-bit fields into 32 bytes +// The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval +static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) { + const __m128i lo = __lsx_vld((const __m128i *)rsi, 0); + __m128i hi = __lsx_vsrli_h(lo, 4); + return __lasx_xvandi_b(lasx_insertf128(hi, lo), 0xf); +} + +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m256i x) { + __m256i v = __lasx_xvpackod_h(x, x); + __m256i summed_pairs = __lasx_xvaddwev_w_h(x, v); + return __lasx_xvffint_s_w(summed_pairs); +} + +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { + // Perform multiplication and create 16-bit values + const __m256i dot = lasx_maddubs_h(ax, sy); + return sum_i16_pairs_float(dot); +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { + const __m256i dot = lasx_madd_h_b(x, y); + return sum_i16_pairs_float(dot); +} + +static inline __m128i packNibbles( __m256i bytes ) { + // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh + const __m256i lowByte = __lasx_xvreplgr2vr_h(0xFF); + __m256i high = __lasx_xvandn_v(lowByte, bytes); + __m256i low = __lasx_xvand_v(lowByte, bytes); + high = __lasx_xvsrli_h(high, 4); + bytes = __lasx_xvor_v(low, high); + // Compress uint16_t lanes into bytes + __m128i *r0 = (__m128i *)&bytes; + __m256i tmp_h128 = __lasx_xvpermi_q(bytes, bytes, 0x11); + __m128i *r1 = (__m128i *)&tmp_h128; + + __m128i zero = __lsx_vldi(0); + __m128i tmp, tmp2, tmp3; + + tmp = __lsx_vmax_h(zero, *r0); + tmp2 = __lsx_vsat_hu(tmp, 7); + + tmp = __lsx_vmax_h(zero, *r1); + tmp3 = __lsx_vsat_hu(tmp, 7); + return __lsx_vpickev_b(tmp3, tmp2); +} +#endif //__loongarch_asx + +void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__loongarch_asx) + for (int i = 0; i < nb; i++) { + __m256 v0 = (__m256)__lasx_xvld( x , 0); + __m256 v1 = (__m256)__lasx_xvld( x , 32); + __m256 v2 = (__m256)__lasx_xvld( x , 64); + __m256 v3 = (__m256)__lasx_xvld( x , 96); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f ); + __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 ); + max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) ); + max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) ); + max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) ); + + __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs , 0) ); + max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) ); + __m128 tmp = max4; + max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 )); + const float max_scalar = ((v4f32)max4)[0]; + + // Quantize these floats + const float d = max_scalar / 127.f; + y[i].d = GGML_CPU_FP32_TO_FP16(d); + const float id = ( max_scalar != 0.0f ) ? 
127.f / max_scalar : 0.0f; + const __m256 mul = (__m256)__lasx_xvreplfr2vr_s( id ); + + // Apply the multiplier + v0 = __lasx_xvfmul_s( v0, mul ); + v1 = __lasx_xvfmul_s( v1, mul ); + v2 = __lasx_xvfmul_s( v2, mul ); + v3 = __lasx_xvfmul_s( v3, mul ); + + // Round to nearest integer + __m256i i0 = __lasx_xvftintrne_w_s( v0 ); + __m256i i1 = __lasx_xvftintrne_w_s( v1 ); + __m256i i2 = __lasx_xvftintrne_w_s( v2 ); + __m256i i3 = __lasx_xvftintrne_w_s( v3 ); + + __m128i ni0 = lasx_extracti128( i0, 0 ); + __m128i ni1 = lasx_extracti128( i0, 1); + __m128i ni2 = lasx_extracti128( i1, 0); + __m128i ni3 = lasx_extracti128( i1, 1); + __m128i ni4 = lasx_extracti128( i2, 0); + __m128i ni5 = lasx_extracti128( i2, 1); + __m128i ni6 = lasx_extracti128( i3, 0); + __m128i ni7 = lasx_extracti128( i3, 1); + + // Convert int32 to int16 + ni0 = lsx_packs_w( ni0, ni1 ); + ni2 = lsx_packs_w( ni2, ni3 ); + ni4 = lsx_packs_w( ni4, ni5 ); + ni6 = lsx_packs_w( ni6, ni7 ); + // Convert int16 to int8 + ni0 = lsx_packs_h( ni0, ni2 ); + ni4 = lsx_packs_h( ni4, ni6 ); + + __lsx_vst(ni0, (__m128i *)(y[i].qs + 0), 0); + __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0); + + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_0_ref(x, y, k); +#endif +} + +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + block_q8_1 * GGML_RESTRICT y = vy; + +#if defined(__loongarch_asx) + for (int i = 0; i < nb; i++) { + __m256 v0 = (__m256)__lasx_xvld( x , 0 ); + __m256 v1 = (__m256)__lasx_xvld( x , 32 ); + __m256 v2 = (__m256)__lasx_xvld( x , 64 ); + __m256 v3 = (__m256)__lasx_xvld( x , 96 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f ); + __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 ); + max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) ); + max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) ); + max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) ); + + __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs, 0) ); + max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) ); + __m128 tmp = max4; + max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 )); + const float max_scalar = ((v4f32)max4)[0]; + + // Quantize these floats + const float d = max_scalar / 127.f; + y[i].d = GGML_CPU_FP32_TO_FP16(d); + const float id = ( max_scalar != 0.0f ) ? 
127.f / max_scalar : 0.0f; + const __m256 mul = __lasx_xvreplfr2vr_s( id ); + + // Apply the multiplier + v0 = __lasx_xvfmul_s( v0, mul ); + v1 = __lasx_xvfmul_s( v1, mul ); + v2 = __lasx_xvfmul_s( v2, mul ); + v3 = __lasx_xvfmul_s( v3, mul ); + + // Round to nearest integer + __m256i i0 = __lasx_xvftintrne_w_s( v0 ); + __m256i i1 = __lasx_xvftintrne_w_s( v1 ); + __m256i i2 = __lasx_xvftintrne_w_s( v2 ); + __m256i i3 = __lasx_xvftintrne_w_s( v3 ); + + __m128i ni0 = lasx_extracti128(i0, 0); + __m128i ni1 = lasx_extracti128( i0, 1); + __m128i ni2 = lasx_extracti128( i1, 0); + __m128i ni3 = lasx_extracti128( i1, 1); + __m128i ni4 = lasx_extracti128( i2, 0 ); + __m128i ni5 = lasx_extracti128( i2, 1); + __m128i ni6 = lasx_extracti128( i3, 0); + __m128i ni7 = lasx_extracti128( i3, 1); + + // Compute the sum of the quants and set y[i].s + const __m128i s0 = __lsx_vadd_w(__lsx_vadd_w(ni0, ni1), __lsx_vadd_w(ni2, ni3)); + const __m128i s1 = __lsx_vadd_w(__lsx_vadd_w(ni4, ni5), __lsx_vadd_w(ni6, ni7)); + y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(__lsx_vadd_w(s0, s1))); + + // Convert int32 to int16 + ni0 = lsx_packs_w( ni0, ni1 ); + ni2 = lsx_packs_w( ni2, ni3 ); + ni4 = lsx_packs_w( ni4, ni5 ); + ni6 = lsx_packs_w( ni6, ni7 ); + // Convert int16 to int8 + ni0 = lsx_packs_h( ni0, ni2 ); + ni4 = lsx_packs_h( ni4, ni6 ); + + __lsx_vst(ni0, (__m128i *)(y[i].qs + 0), 0); + __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0); + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_1_ref(x, y, k); +#endif +} + + +//===================================== Dot products ================================= + +// +// Helper functions +// + +#if defined(__loongarch_asx) +// shuffles to pick the required scales in dot products +static inline __m256i get_scale_shuffle_q3k(int i) { + static const uint8_t k_shuffle[128] = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, + }; + return __lasx_xvld((const __m256i*)k_shuffle + i, 0); +} +static inline __m256i get_scale_shuffle_k4(int i) { + static const uint8_t k_shuffle[256] = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, + 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, + 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, + 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15 + }; + return __lasx_xvld((const __m256i*)k_shuffle + i, 0); +} +static inline __m128i get_scale_shuffle(int i) { + static const uint8_t k_shuffle[128] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 
9, 9, 9, 9, 9, 9, + 10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11, + 12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13, + 14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15 + }; + return __lsx_vld((const __m128i*)k_shuffle + i, 0); +} +#endif + +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__loongarch_asx) + // Initialize accumulator with zeros + __m256 acc = (__m256)__lasx_xvldi(0); + + // Main loop + for (; ib < nb; ++ib) { + /* Compute combined scale for the block */ + const __m256 d = __lasx_xvreplfr2vr_s( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) ); + + __m256i qx = bytes_from_nibbles_32(x[ib].qs); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. + const __m256i off = __lasx_xvreplgr2vr_b( 8 ); + qx = __lasx_xvsub_b( qx, off ); + + __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0); + + const __m256 q = mul_sum_i8_pairs_float(qx, qy); + + /* Multiply q with scale and accumulate */ + acc = __lasx_xvfmadd_s( d, q, acc ); + } + + sumf = hsum_float_8(acc); + +#elif defined(__loongarch_sx) + // set constants + const __m128i low_mask = __lsx_vreplgr2vr_b(0xF); + const __m128i off = __lsx_vreplgr2vr_b(8); + + // Initialize accumulator with zeros + __m128 acc_0 = (__m128)__lsx_vldi(0); + __m128 acc_1 = (__m128)__lsx_vldi(0); + __m128 acc_2 = (__m128)__lsx_vldi(0); + __m128 acc_3 = (__m128)__lsx_vldi(0); + + for (; ib + 1 < nb; ib += 2) { + + // Compute combined scale for the block 0 and 1 + const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) ); + + const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0); + + __m128i bx_0 = __lsx_vand_v(low_mask, tmp_0_1); + __m128i by_0 = __lsx_vld((const __m128i *)y[ib].qs, 0); + bx_0 = __lsx_vsub_b(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); + + __m128i bx_1 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_0_1, 4)); + __m128i by_1 = __lsx_vld((const __m128i *)(y[ib].qs + 16), 0); + bx_1 = __lsx_vsub_b(bx_1, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); + + //_mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0); + //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 2 and 3 + const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) ); + + const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0); + + __m128i bx_2 = __lsx_vand_v(low_mask, tmp_2_3); + __m128i by_2 = __lsx_vld((const __m128i *)y[ib + 1].qs, 0); + bx_2 = __lsx_vsub_b(bx_2, off); + const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); + + __m128i bx_3 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_2_3, 4)); + __m128i by_3 = __lsx_vld((const __m128i *)(y[ib + 1].qs + 16), 0); + bx_3 = __lsx_vsub_b(bx_3, off); + const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); + + // Convert int32_t to float + __m128 p0 = __lsx_vffint_s_w(i32_0); + __m128 p1 = __lsx_vffint_s_w(i32_1); + __m128 p2 = __lsx_vffint_s_w(i32_2); + __m128 p3 = __lsx_vffint_s_w(i32_3); + + // Apply the scale + __m128 p0_d = 
__lsx_vfmul_s( d_0_1, p0 );
+        __m128 p1_d = __lsx_vfmul_s( d_0_1, p1 );
+        __m128 p2_d = __lsx_vfmul_s( d_2_3, p2 );
+        __m128 p3_d = __lsx_vfmul_s( d_2_3, p3 );
+
+        // Accumulate
+        acc_0 = __lsx_vfadd_s(p0_d, acc_0);
+        acc_1 = __lsx_vfadd_s(p1_d, acc_1);
+        acc_2 = __lsx_vfadd_s(p2_d, acc_2);
+        acc_3 = __lsx_vfadd_s(p3_d, acc_3);
+    }
+
+    sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+
+#endif
+    for (; ib < nb; ++ib) {
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
+            const int v1 = (x[ib].qs[j] >> 4) - 8;
+
+            sumi0 += (v0 * y[ib].qs[j]);
+            sumi1 += (v1 * y[ib].qs[j + qk/2]);
+        }
+
+        int sumi = sumi0 + sumi1;
+        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
+    }
+
+    *s = sumf;
+}
+
+void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    const int qk = QK8_1;
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q4_1 * GGML_RESTRICT x = vx;
+    const block_q8_1 * GGML_RESTRICT y = vy;
+
+    int ib = 0;
+    float sumf = 0;
+
+#if defined(__loongarch_asx)
+    // Initialize accumulator with zeros
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    float summs = 0;
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
+        const float d1 = GGML_CPU_FP16_TO_FP32(y[ib].d);
+
+        summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
+
+        const __m256 d0v = __lasx_xvreplfr2vr_s( d0 );
+        const __m256 d1v = __lasx_xvreplfr2vr_s( d1 );
+
+        // Compute combined scales
+        const __m256 d0d1 = __lasx_xvfmul_s( d0v, d1v );
+
+        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
+        const __m256i qx = bytes_from_nibbles_32(x[ib].qs);
+        const __m256i qy = __lasx_xvld( (const __m256i *)y[ib].qs, 0);
+
+        const __m256 xy = mul_sum_us8_pairs_float(qx, qy);
+
+        // Accumulate d0*d1*x*y
+        acc = __lasx_xvfmadd_s( d0d1, xy, acc );
+    }
+
+    sumf = hsum_float_8(acc) + summs;
+
+#endif
+    for (; ib < nb; ++ib) {
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const int v0 = (x[ib].qs[j] & 0x0F);
+            const int v1 = (x[ib].qs[j] >> 4);
+
+            sumi0 += (v0 * y[ib].qs[j]);
+            sumi1 += (v1 * y[ib].qs[j + qk/2]);
+        }
+
+        int sumi = sumi0 + sumi1;
+        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
+    }
+
+    *s = sumf;
+}
+
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+
+    int ib = 0;
+    float sumf = 0;
+
+    assert(n % qk == 0);
+    assert(qk == QK5_0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q5_0 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+#if defined(__loongarch_asx)
+    // Initialize accumulator with zeros
+    __m256 acc = (__m256)__lasx_xvldi(0);
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        /* Compute combined scale for the block */
+        const __m256 d = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); //FIXME
+
+        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
+        __m256i bxhi = bytes_from_bits_32(x[ib].qh);
+        bxhi = __lasx_xvandn_v(bxhi, __lasx_xvreplgr2vr_b((char)0xF0));
+        qx = __lasx_xvor_v(qx, bxhi);
+
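+        // (illustrative note) bytes_from_bits_32() spreads the 32 qh bits into
+        // 0x00/0xFF bytes; xvandn then leaves 0xF0 only where the fifth bit is
+        // clear, so the OR above produces (qs | 0xF0), i.e. qs - 16 as int8,
+        // when the bit is 0 and plain qs when it is 1: the same
+        // ((qs | (bit << 4)) - 16) offset the scalar tail below computes.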
+ __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0); + + const __m256 q = mul_sum_i8_pairs_float(qx, qy); + + /* Multiply q with scale and accumulate */ + acc = __lasx_xvfmadd_s(d, q, acc); + } + + sumf = hsum_float_8(acc); + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16); + const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16); + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; + } + + *s = sumf; +} + +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_1); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + +#if defined(__loongarch_asx) + // Initialize accumulator with zeros + __m256 acc = (__m256)__lasx_xvldi(0); + + float summs = 0.0f; + + // Main loop + for (; ib < nb; ++ib) { + const __m256 dx = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d)); + + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); + + __m256i qx = bytes_from_nibbles_32(x[ib].qs); + __m256i bxhi = bytes_from_bits_32(x[ib].qh); + bxhi = __lasx_xvand_v(bxhi, __lasx_xvreplgr2vr_b(0x10)); + qx = __lasx_xvor_v(qx, bxhi); + + const __m256 dy = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib].d)); + const __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0); + + const __m256 q = mul_sum_us8_pairs_float(qx, qy); + + acc = __lasx_xvfmadd_s(q, __lasx_xvfmul_s(dx, dy), acc); + } + + sumf = hsum_float_8(acc) + summs; + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1; + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q8_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__loongarch_asx) + // Initialize accumulator with zeros + __m256 acc = (__m256)__lasx_xvldi(0); + + // Main loop + for (; ib < nb; ++ib) { + // Compute combined scale for the block + const __m256 d = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); + __m256i qx = 
__lasx_xvld((const __m256i *)x[ib].qs, 0); + __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0); + + const __m256 q = mul_sum_i8_pairs_float(qx, qy); + + // Multiply q with scale and accumulate + acc = __lasx_xvfmadd_s( d, q, acc ); + } + + sumf = hsum_float_8(acc); + +#endif + for (; ib < nb; ++ib) { + int sumi = 0; + + for (int j = 0; j < qk; j++) { + sumi += x[ib].qs[j]*y[ib].qs[j]; + } + + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); + } + + *s = sumf; +} + +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q2_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __loongarch_asx + + __m256 acc = (__m256)__lasx_xvldi(0); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const __m128i mins_and_scales128 = __lsx_vld((const __m128i*)x[i].scales, 0); + const __m128i scales128 = __lsx_vandi_b(mins_and_scales128, 0xf); + const __m256i mins = lasx_ext8_16(__lsx_vsrli_b(mins_and_scales128, 4)); + const __m256i prod = lasx_madd_h(mins, __lasx_xvld((const __m256i*)y[i].bsums, 0)); + + acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(dmin), __lasx_xvffint_s_w(prod), acc); + + const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; + const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask)); + + __m256i sumi = __lasx_xvldi(0); + + for (int j = 0; j < QK_K/128; ++j) { + + const __m256i q2bits = __lasx_xvld((const __m256i*)q2, 0); q2 += 32; + + const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + + const __m256i q2_0 = __lasx_xvandi_b(q2bits, 3); + const __m256i q2_1 = __lasx_xvandi_b(__lasx_xvsrli_b(q2bits, 2), 3); + const __m256i q2_2 = __lasx_xvandi_b(__lasx_xvsrli_b(q2bits, 4), 3); + const __m256i q2_3 = __lasx_xvsrli_b(q2bits, 6); + + __m256i p0 = lasx_madd_h_b(q2_0, q8_0); + __m256i p1 = lasx_madd_h_b(q2_1, q8_1); + __m256i p2 = lasx_madd_h_b(q2_2, q8_2); + __m256i p3 = lasx_madd_h_b(q2_3, q8_3); + + p0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p0); + p1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p1); + p2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p2); + p3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p3); + + p0 = __lasx_xvadd_w(p0, p1); + p2 = __lasx_xvadd_w(p2, p3); + + sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p0, p2)); + } + + acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc); + + } + + *s = hsum_float_8(acc); + +#else + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + int summs = 0; + for (int j = 0; j < 16; ++j) { + summs += y[i].bsums[j] * (sc[j] >> 4); + } + + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + 
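+        // q2_K scalar path: sc[] packs a 4-bit scale in the low nibble and a
+        // 4-bit min in the high nibble for each of the 16 sub-blocks; summs has
+        // already paired the mins with the per-sub-block q8 sums (bsums), so the
+        // block contributes dall*isum - dmin*summs at the end.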
+ int isum = 0; + int is = 0; + int d; + for (int k = 0; k < QK_K/128; ++k) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + d = sc[is++] & 0xF; + int isuml = 0; + for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + d = sc[is++] & 0xF; + isuml = 0; + for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + shift += 2; + q8 += 32; + } + q2 += 32; + } + sumf += dall * isum - dmin * summs; + } + *s = sumf; +#endif +} + +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + const block_q3_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __loongarch_asx + + const __m128i m32 = __lsx_vreplgr2vr_b(32); + + __m256 acc = (__m256)__lasx_xvldi(0); + + uint32_t aux[3]; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + // Set up scales + memcpy(aux, x[i].scales, 12); + __m128i scales128 = lsx_set_w( + ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), + ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), + (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), + (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); + scales128 = __lsx_vsub_b(scales128, m32); + + const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; + const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask)); + + // high bit + const __m256i hbits = __lasx_xvld((const __m256i*)x[i].hmask, 0); + + // integer accumulator + __m256i sumi = __lasx_xvldi(0); + + for (int j = 0; j < QK_K/128; ++j) { + // load low 2 bits + const __m256i q3bits = __lasx_xvld((const __m256i*)q3, 0); q3 += 32; + + // prepare low and high bits + const __m256i q3l_0 = __lasx_xvandi_b(q3bits, 3); + const __m256i q3l_1 = __lasx_xvandi_b(__lasx_xvsrli_b(q3bits, 2), 3); + const __m256i q3l_2 = __lasx_xvandi_b(__lasx_xvsrli_b(q3bits, 4), 3); + const __m256i q3l_3 = __lasx_xvsrli_b(q3bits, 6); + const __m256i q3h_0 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 0), 0), 2); + const __m256i q3h_1 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 1), 0), 2); + const __m256i q3h_2 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 2), 0), 2); + const __m256i q3h_3 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 3), 0), 2); + const __m256i q3_0 = __lasx_xvor_v(q3h_0, q3l_0); + const __m256i q3_1 = __lasx_xvor_v(q3h_1, q3l_1); + const __m256i q3_2 = __lasx_xvor_v(q3h_2, q3l_2); + const __m256i q3_3 = __lasx_xvor_v(q3h_3, q3l_3); + + // load Q8 quants + const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + + __m256i p16_0 = lasx_madd_h_b(q8_0, q3_0); + __m256i p16_1 = lasx_madd_h_b(q8_1, q3_1); + __m256i p16_2 = lasx_madd_h_b(q8_2, q3_2); + __m256i p16_3 = lasx_madd_h_b(q8_3, q3_3); + + // multiply with scales + p16_0 = 
lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p16_0); + p16_1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p16_1); + p16_2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p16_2); + p16_3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p16_3); + + // accumulate + p16_0 = __lasx_xvadd_w(p16_0, p16_1); + p16_2 = __lasx_xvadd_w(p16_2, p16_3); + sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_2)); + } + // multiply with block scale and accumulate + acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc); + } + + *s = hsum_float_8(acc); + +#else + // scalar version + // This function is written like this so the compiler can manage to vectorize most of it + // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the + // manually vectorized version above. Every other version I tried would run at least 4 times slower. + // The ideal situation would be if we could just write the code once, and the compiler would + // automatically produce the best possible set of machine instructions, instead of us having to manually + // write vectorized versions for AVX, ARM_NEON, etc. + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + uint32_t auxs[4]; + const int8_t * scales = (const int8_t*)auxs; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 
0 : 4); + a += 32; m <<= 1; + q3 += 32; + } + a = aux8; + + memcpy(auxs, x[i].scales, 12); + uint32_t tmp = auxs[2]; + auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; + +#endif + +} + +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined __loongarch_asx + + __m256 acc = (__m256)__lasx_xvldi(0); + __m128 acc_m = (__m128)__lsx_vldi(0); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]); + const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128); + const __m128i scales128 = __lsx_vsllwil_h_b(mins_and_scales128, 0); + + const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0); + const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1)); + const __m128i prod = lsx_madd_h(mins128, q8s); + acc_m = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(dmin), __lsx_vffint_s_w(prod), acc_m); + + const __m256i scales = lasx_insertf128(scales128, scales128); + + __m256i sumi = __lasx_xvldi(0); + + for (int j = 0; j < QK_K/64; ++j) { + + const __m256i scale_l = lasx_xvrepl128vei_h(scales, 2 * j + 0); + const __m256i scale_h = lasx_xvrepl128vei_h(scales, 2 * j + 1); + + const __m256i q4bits = __lasx_xvld((const __m256i*)q4, 0); q4 += 32; + const __m256i q4l = __lasx_xvandi_b(q4bits, 0xf); + const __m256i q4h = __lasx_xvsrli_b(q4bits, 4); + + const __m256i q8l = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + __m256i p16l = lasx_madd_h_b(q4l, q8l); + p16l = lasx_madd_h(scale_l, p16l); + + const __m256i q8h = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + __m256i p16h = lasx_madd_h_b(q4h, q8h); + p16h = lasx_madd_h(scale_h, p16h); + const __m256i sumj = __lasx_xvadd_w(p16l, p16h); + + sumi = __lasx_xvadd_w(sumi, sumj); + } + + __m256 vd = __lasx_xvreplfr2vr_s(d); + acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc); + + } + + acc_m = 
__lsx_vfadd_s(acc_m, (__m128)__lsx_vpermi_w((__m128i)acc_m, (__m128i)acc_m, 0xee)); + __m128i tmp1 = __lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w((__m128i)acc_m, 1), 0); + acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1); + + + *s = hsum_float_8(acc) + ((v4f32)acc_m)[0]; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + a += 32; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + a += 32; q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined __loongarch_asx + + __m256 acc = (__m256)__lasx_xvldi(0); + __m128 acc_m = (__m128)__lsx_vldi(0); + + for (int i = 0; i < nb; ++i) { + + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]); + const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128); + const __m128i scales128 = 
__lsx_vsllwil_h_b(mins_and_scales128, 0); + + const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0); + const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1)); + const __m128i prod = lsx_madd_h(mins128, q8s); + acc_m = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(dmin), __lsx_vffint_s_w(prod), acc_m); + + const __m256i scales = lasx_insertf128(scales128, scales128); + + const __m256i hbits = __lasx_xvld((const __m256i*)x[i].qh, 0); + + __m256i sumi = __lasx_xvldi(0); + + for (int j = 0; j < QK_K/64; ++j) { + + const __m256i scale_0 = lasx_xvrepl128vei_h(scales, 2 * j + 0); + const __m256i scale_1 = lasx_xvrepl128vei_h(scales, 2 * j + 1); + + const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); q5 += 32; + + const __m256i q5l_0 = __lasx_xvandi_b(q5bits, 0xf); + const __m256i q5l_1 = __lasx_xvsrli_b(q5bits, 4); + const __m256i q5h_0 = __lasx_xvnori_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 2 * j + 0), 0), 0xef); + const __m256i q5h_1 = __lasx_xvnori_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 2 * j + 1), 0), 0xef); + const __m256i q5_0 = __lasx_xvor_v(q5l_0, q5h_0); + const __m256i q5_1 = __lasx_xvor_v(q5l_1, q5h_1); + + const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + + __m256i p16_0 = lasx_madd_h_b(q5_0, q8_0); + __m256i p16_1 = lasx_madd_h_b(q5_1, q8_1); + + p16_0 = lasx_madd_h(scale_0, p16_0); + p16_1 = lasx_madd_h(scale_1, p16_1); + + sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1)); + + } + + __m256 vd = __lasx_xvreplfr2vr_s(d); + acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc); + + } + + acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vbsrl_v(acc_m, 8)); + acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vbsrl_v(acc_m, 4)); + + *s = hsum_float_8(acc) + ((v4f32)acc_m)[0]; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 
16 : 0); + a += 32; m <<= 1; + q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __loongarch_asx + + const __m256i m32s = __lasx_xvreplgr2vr_b(32); + + __m256 acc = (__m256)__lasx_xvldi(0); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const __m128i scales128 = __lsx_vld((const __m128i*)x[i].scales, 0); + const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; + const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask)); + + __m256i sumi = __lasx_xvldi(0); + + for (int j = 0; j < QK_K/128; ++j) { + + const __m256i q4bits1 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32; + const __m256i q4bits2 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32; + const __m256i q4bitsH = __lasx_xvld((const __m256i*)qh, 0); qh += 32; + + const __m256i q4h_0 = __lasx_xvslli_b(__lasx_xvandi_b(q4bitsH, 3), 4); + const __m256i q4h_1 = __lasx_xvslli_b(__lasx_xvandi_b(q4bitsH, 3 << 2), 2); + const __m256i q4h_2 = __lasx_xvandi_b(q4bitsH, 3 << 4); + const __m256i q4h_3 = __lasx_xvsrli_b(__lasx_xvandi_b(q4bitsH, 3 << 6), 2); + + const __m256i q4_0 = __lasx_xvor_v(__lasx_xvandi_b(q4bits1, 0xf), q4h_0); + const __m256i q4_1 = __lasx_xvor_v(__lasx_xvandi_b(q4bits2, 0xf), q4h_1); + const __m256i q4_2 = __lasx_xvor_v(__lasx_xvsrli_b(q4bits1, 4), q4h_2); + const __m256i q4_3 = __lasx_xvor_v(__lasx_xvsrli_b(q4bits2, 4), q4h_3); + + const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + + __m256i p16_0 = lasx_madd_h_b(__lasx_xvsub_b(q4_0, m32s), q8_0); + __m256i 
p16_1 = lasx_madd_h_b(__lasx_xvsub_b(q4_1, m32s), q8_1); + __m256i p16_2 = lasx_madd_h_b(__lasx_xvsub_b(q4_2, m32s), q8_2); + __m256i p16_3 = lasx_madd_h_b(__lasx_xvsub_b(q4_3, m32s), q8_3); + + p16_0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p16_0); + p16_1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p16_1); + p16_2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p16_2); + p16_3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p16_3); + + sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1)); + sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_2, p16_3)); + } + + acc = __lasx_xvfmadd_s((__m256)__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc); + } + + *s = hsum_float_8(acc); + +#else + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + } + a += 128; + q4 += 64; + qh += 32; + } + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/16; ++j) { + int scale = x[i].scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +#if defined(__loongarch_asx) +static const int8_t keven_signs_q2xs[1024] = { + 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, + 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, + 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1, + 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1, + 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, + 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1, + 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1, + 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, + 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, + 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1, + 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, + 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, + 1, 1, 1, 1, -1, 
-1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1, + 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1, + 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1, + 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1, + 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, + 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, + 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1, + 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, + 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1, + 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1, + 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1, + 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, + 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1, + 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1, + 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1, + 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, + 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1, + 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, + 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, + 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, +}; +#endif + +void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__loongarch_asx) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[4]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + __m256 accumf = (__m256)__lasx_xvldi(0); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + __m256i sumi1 = __lasx_xvldi(0); + __m256i sumi2 = __lasx_xvldi(0); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; + + const __m256i q2_1 = lasx_set_d(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]); + const 
__m256i q2_2 = lasx_set_d(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]); + const __m256i s2_1 = lasx_set_d(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], + signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); + const __m256i s2_2 = lasx_set_d(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127], + signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]); + const __m256i q8s_1 = __lasx_xvsigncov_b(s2_1, q8_1); + const __m256i q8s_2 = __lasx_xvsigncov_b(s2_2, q8_2); + const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); + const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); + const uint16_t ls1 = aux32[1] >> 28; + const uint16_t ls2 = aux32[3] >> 28; + const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1)); + const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1)); + sumi1 = __lasx_xvadd_w(sumi1, p1); + sumi2 = __lasx_xvadd_w(sumi2, p2); + } + + accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); + } + + *s = 0.125f * hsum_float_8(accumf); + +#else + + uint32_t aux32[2]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(aux32, q2, 2*sizeof(uint32_t)); + q2 += 4; + const uint32_t ls = 2*(aux32[1] >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); + const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? 
-1 : 1); + } + q8 += 8; + } + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} + +void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__loongarch_asx) + + const __m256i mone = __lasx_xvreplgr2vr_b(1); + static const char block_sign_shuffle_mask_1[32] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + }; + static const char block_sign_shuffle_mask_2[32] = { + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, + 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, + }; + static const uint8_t bit_selector_mask_bytes[32] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m256i bit_selector_mask = __lasx_xvld((const __m256i*)bit_selector_mask_bytes, 0); + const __m256i block_sign_shuffle_1 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_1, 0); + const __m256i block_sign_shuffle_2 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_2, 0); + + static const uint8_t k_bit_helper[32] = { + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + }; + const __m256i bit_helper = __lasx_xvld((const __m256i*)k_bit_helper, 0); + const __m256i m511 = __lasx_xvreplgr2vr_h(511); + const __m128i m4 = __lsx_vreplgr2vr_b(0xf); + const __m128i m1 = __lsx_vreplgr2vr_b(1); + + uint64_t aux64; + + // somewhat hacky, but gives a significant boost in performance + __m256i aux_gindex; + const uint16_t * gindex = (const uint16_t *)&aux_gindex; + + __m256 accumf = (__m256)__lasx_xvldi(0); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + memcpy(&aux64, x[i].scales, 8); + __m128i stmp = __lsx_vreplgr2vr_d(aux64); + stmp = __lsx_vilvl_b( __lsx_vand_v(__lsx_vsrli_h(stmp, 4), m4), __lsx_vand_v(stmp, m4)); + const __m128i scales = __lsx_vadd_b(__lsx_vslli_h(stmp, 1), m1); + + __m256i sumi1 = __lasx_xvldi(0); + __m256i sumi2 = __lasx_xvldi(0); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) { + + const __m256i q2_data = __lasx_xvld((const __m256i*)q2, 0); q2 += 16; + aux_gindex = __lasx_xvand_v(q2_data, m511); + + const __m256i partial_sign_bits = __lasx_xvsrli_h(q2_data, 9); + const __m256i partial_sign_bits_upper = __lasx_xvsrli_h(q2_data, 13); + const __m256i partial_sign_bits_for_counting = __lasx_xvxor_v(partial_sign_bits, partial_sign_bits_upper); + + const __m256i odd_bits = lasx_shuffle_b(bit_helper, partial_sign_bits_for_counting); + const __m256i full_sign_bits = __lasx_xvor_v(partial_sign_bits, odd_bits); + + const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i 
q8_3 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8_4 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + + const __m256i q2_1 = lasx_set_d(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]], + iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]); + const __m256i q2_2 = lasx_set_d(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]], + iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]); + const __m256i q2_3 = lasx_set_d(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]], + iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]); + const __m256i q2_4 = lasx_set_d(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]], + iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]); + + const __m128i full_signs_l = lasx_extracti128(full_sign_bits, 0); + const __m128i full_signs_h = lasx_extracti128(full_sign_bits, 1); + const __m256i full_signs_1 = lasx_insertf128(full_signs_l, full_signs_l); + const __m256i full_signs_2 = lasx_insertf128(full_signs_h, full_signs_h); + + __m256i signs; + signs = lasx_shuffle_b(full_signs_1, block_sign_shuffle_1); + signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_1 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_1); + + signs = lasx_shuffle_b(full_signs_1, block_sign_shuffle_2); + signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_2 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_2); + + signs = lasx_shuffle_b(full_signs_2, block_sign_shuffle_1); + signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_3 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_3); + + signs = lasx_shuffle_b(full_signs_2, block_sign_shuffle_2); + signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_4 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_4); + + const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); + const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); + const __m256i dot3 = lasx_maddubs_h(q2_3, q8s_3); + const __m256i dot4 = lasx_maddubs_h(q2_4, q8s_4); + + const __m256i sc1 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+0))); + const __m256i sc2 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+1))); + const __m256i sc3 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+2))); + const __m256i sc4 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+3))); + + sumi1 = __lasx_xvadd_w(sumi1, lasx_madd_h(dot1, sc1)); + sumi2 = __lasx_xvadd_w(sumi2, lasx_madd_h(dot2, sc2)); + sumi1 = __lasx_xvadd_w(sumi1, lasx_madd_h(dot3, sc3)); + sumi2 = __lasx_xvadd_w(sumi2, lasx_madd_h(dot4, sc4)); + } + + accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + +#else + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1; + const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? 
-1 : 1); + } + q8 += 8; + } + bsum += sumi * ls1; + sumi = 0; + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls2; + q2 += 4; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} + +void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__loongarch_asx) + + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + + const __m128i m4 = __lsx_vreplgr2vr_b(0xf); + const __m128i m1 = __lsx_vreplgr2vr_b(1); + + const __m256i mask1 = __lasx_xvld((const __m256i*)k_mask1, 0); + const __m256i mask2 = __lasx_xvld((const __m256i*)k_mask2, 0); + uint64_t aux64; + + __m256 accumf = (__m256)__lasx_xvldi(0); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + __m128i tmp1; + memcpy(&aux64, x[i].scales, 8); + tmp1 = __lsx_vinsgr2vr_d(tmp1, aux64, 0); + tmp1 = __lsx_vinsgr2vr_d(tmp1, aux64 >> 4, 1); + const __m128i scales8 = __lsx_vadd_b(__lsx_vslli_h(__lsx_vand_v(tmp1, m4), 1), m1); + const __m256i scales16 = lasx_ext8_16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15 + + __m256i sumi1 = __lasx_xvldi(0); + __m256i sumi2 = __lasx_xvldi(0); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q2_1 = lasx_set_d(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)], + iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)], + iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)], + iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]); + const __m256i q2_2 = lasx_set_d(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)], + iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)], + iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)], + iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]); + qs += 8; + + __m256i aux256 = __lasx_xvreplgr2vr_w(signs[0] | ((uint32_t) signs[1] << 16)); + aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2); + const __m256i s2_1 = __lasx_xvseq_b(aux256, mask2); + const __m256i q8s_1 = __lasx_xvsub_b(__lasx_xvxor_v(s2_1, q8_1), s2_1); + + aux256 = __lasx_xvreplgr2vr_w(signs[2] | ((uint32_t) signs[3] << 16)); + aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2); + const __m256i s2_2 = __lasx_xvseq_b(aux256, mask2); + const __m256i q8s_2 = __lasx_xvsub_b(__lasx_xvxor_v(s2_2, q8_2), s2_2); + + signs += 4; + + const 
__m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1 + const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3 + + const __m256i p1 = lasx_madd_h(dot1, lasx_shuffle_b(scales16, get_scale_shuffle_k4(ib32+0))); + const __m256i p2 = lasx_madd_h(dot2, lasx_shuffle_b(scales16, get_scale_shuffle_k4(ib32+1))); + sumi1 = __lasx_xvadd_w(sumi1, p1); + sumi2 = __lasx_xvadd_w(sumi2, p2); + } + + accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); + } + + *s = 0.125f * hsum_float_8(accumf); + +#else + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint8_t * signs = qs + QK_K/8; + + int bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf); + int ls2 = 1 + 2*(x[i].scales[ib32] >> 4); + int sumi1 = 0, sumi2 = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); + for (int j = 0; j < 8; ++j) { + sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); + for (int j = 0; j < 8; ++j) { + sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += ls1 * sumi1 + ls2 * sumi2; + qs += 4; + signs += 4; + } + + sumf += d * bsum; + } + + *s = 0.125f * sumf; + +#endif + +} + +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__loongarch_asx) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[2]; + + __m256 accumf = (__m256)__lasx_xvldi(0); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + __m256i sumi1 = __lasx_xvldi(0); + __m256i sumi2 = __lasx_xvldi(0); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q2_1 = lasx_set_w(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], + iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); + q3 += 8; + const __m256i q2_2 = lasx_set_w(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], + iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); + q3 += 8; + memcpy(aux32, gas, 8); gas += 8; + + const __m256i s2_1 = lasx_set_d(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127], + signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]); + const __m256i s2_2 = lasx_set_d(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], + signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); + const __m256i q8s_1 = __lasx_xvsigncov_b(s2_1, q8_1); + 
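+            // Each 7-bit field of aux32[] selects one of 128 sign patterns from
+            // keven_signs_q2xs (the eighth sign is implied by the table's even
+            // parity), and xvsigncov flips the matching q8 bytes so the
+            // unsigned*signed maddubs below sees correctly signed operands.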
const __m256i q8s_2 = __lasx_xvsigncov_b(s2_2, q8_2); + const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); + const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); + const uint16_t ls1 = aux32[0] >> 28; + const uint16_t ls2 = aux32[1] >> 28; + + const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1)); + const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1)); + sumi1 = __lasx_xvadd_w(sumi1, p1); + sumi2 = __lasx_xvadd_w(sumi2, p2); + } + + accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); + } + + *s = 0.25f * hsum_float_8(accumf); + +#else + + uint32_t aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t); + const uint32_t ls = 2*(aux32 >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]); + const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]); + const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127]; + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + q3 += 8; + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.25f * sumf; +#endif +} + +void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__loongarch_asx) + + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m256i mask1 = __lasx_xvld((const __m256i*)k_mask1, 0); + const __m256i mask2 = __lasx_xvld((const __m256i*)k_mask2, 0); + + __m256i idx_shift = lasx_set_w(1, 2, 3, 4, 5, 6, 7, 8); + const __m256i idx_mask = __lasx_xvreplgr2vr_w(256); + + typedef union { + __m256i vec[2]; + uint32_t index[16]; + } index_t; + + index_t idx; + + __m256 accumf = (__m256)__lasx_xvldi(0); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + __m256i sumi1 = __lasx_xvldi(0); + __m256i sumi2 = __lasx_xvldi(0); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i idx_l = lasx_extu8_16(__lsx_vld(qs, 0)); qs += 16; + idx.vec[0] = 
__lasx_xvreplgr2vr_w(qh[ib32+0]);
+            idx.vec[1] = __lasx_xvreplgr2vr_w(qh[ib32+1]);
+            idx.vec[0] = __lasx_xvand_v(__lasx_xvsll_w(idx.vec[0], idx_shift), idx_mask);
+            idx.vec[1] = __lasx_xvand_v(__lasx_xvsll_w(idx.vec[1], idx_shift), idx_mask);
+            idx.vec[0] = __lasx_xvor_v(idx.vec[0], lasx_ext16_32(lasx_extracti128(idx_l, 0)));
+            idx.vec[1] = __lasx_xvor_v(idx.vec[1], lasx_ext16_32(lasx_extracti128(idx_l, 1)));
+
+            // At least on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
+            //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
+            //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
+            const __m256i q2_1 = lasx_set_w(
+                    iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
+                    iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
+            );
+            const __m256i q2_2 = lasx_set_w(
+                    iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
+                    iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
+            );
+
+            __m256i aux256 = __lasx_xvreplgr2vr_w(signs[0] | (signs[1] << 16));
+            aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
+            const __m256i s2_1 = __lasx_xvseq_b(aux256, mask2);
+            const __m256i q8s_1 = __lasx_xvsub_b(__lasx_xvxor_v(s2_1, q8_1), s2_1);
+
+            aux256 = __lasx_xvreplgr2vr_w(signs[2] | (signs[3] << 16));
+            aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
+            const __m256i s2_2 = __lasx_xvseq_b(aux256, mask2);
+            const __m256i q8s_2 = __lasx_xvsub_b(__lasx_xvxor_v(s2_2, q8_2), s2_2);
+
+            signs += 4;
+
+            const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1);
+            const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2);
+            const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
+            const uint16_t ls2 = x[i].scales[ib32/2] >> 4;
+            const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1));
+            const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1));
+            sumi1 = __lasx_xvadd_w(sumi1, p1);
+            sumi2 = __lasx_xvadd_w(sumi2, p2);
+        }
+
+        accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
+    }
+
+    *s = hsum_float_8(accumf);
+
+#else
+
+    float sumf = 0.f;
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const uint8_t * GGML_RESTRICT signs = x[i].signs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
+        int32_t bsum = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
+            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
+            int32_t sumi = 0;
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
+                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
+                for (int j = 0; j < 4; ++j) {
+                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
+                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ?
-1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls1; + sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls2; + } + sumf += d * bsum; + } + *s = sumf; +#endif +} + +#if defined(__loongarch_asx) +static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { + const __m256i a = __lasx_xvmulwev_h_b(x, y); + const __m256i b = __lasx_xvmulwod_h_b(x, y); + return __lasx_xvadd_h(a, b); +} +#endif + +void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq1_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__loongarch_asx) + + __m256 accum = (__m256)__lasx_xvldi(0); + float accum1 = 0; + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; + + __m256i sumi = __lasx_xvldi(0); + int sumi1 = 0; + for (int ib = 0; ib < QK_K/32; ib += 2) { + __m256i q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)], 0); + q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], 1); + q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)], 2); + q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], 3); + + __m256i q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)], 0); + q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], 1); + q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)], 2); + q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], 3); + + qs += 8; + const __m256i q8b_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + const __m256i q8b_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; + + const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1); + const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2); + const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; + const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1; + + __m256i tmp1, tmp5, tmp6; + tmp1 = __lasx_xvreplgr2vr_h(ls1); + tmp5 = __lasx_xvmulwev_w_h(dot1, tmp1); + tmp6 = __lasx_xvmulwod_w_h(dot1, tmp1); + const __m256i p1 = __lasx_xvadd_w(tmp5, tmp6); + + tmp1 = __lasx_xvreplgr2vr_h(ls2); + tmp5 = __lasx_xvmulwev_w_h(dot2, tmp1); + tmp6 = __lasx_xvmulwod_w_h(dot2, tmp1); + const __m256i p2 = __lasx_xvadd_w(tmp5, tmp6); + + sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p1, p2)); + sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1 + + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? 
-1 : 1) * ls2; + } + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), accum); + accum1 += d * sumi1; + } + + *s = hsum_float_8(accum) + IQ1S_DELTA * accum1; + +#else + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; + + int sumi = 0, sumi1 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + const int ls = 2*((qh[ib] >> 12) & 7) + 1; + const int delta = qh[ib] & 0x8000 ? -1 : 1; + int lsum = 0; + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8))); + for (int j = 0; j < 8; ++j) { + lsum += q8[j] * grid[j]; + } + q8 += 8; + } + sumi += ls * lsum; + sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]); + qs += 4; + } + + sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); + } + + *s = sumf; + +#endif +} + +void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK4_NL == 0); + static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); + + const block_iq4_nl * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + const int nb = n / QK4_NL; + + int ib = 0; + float sumf = 0; + +#if defined (__loongarch_asx) + + const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0); + const __m128i m4b = __lsx_vreplgr2vr_b(0x0f); + const __m256i mone = __lasx_xvreplgr2vr_h(1); + + __m256 accum1 = (__m256)__lasx_xvldi(0); + __m256 accum2 = (__m256)__lasx_xvldi(0); + for (; ib + 1 < nb; ib += 2) { + const __m128i q4bits_1 = __lsx_vld((const __m128i*)x[ib + 0].qs, 0); + const __m128i q4bits_2 = __lsx_vld((const __m128i*)x[ib + 1].qs, 0); + const __m256i q8b_1 = __lasx_xvld((const __m256i *)y[ib + 0].qs, 0); + const __m256i q8b_2 = __lasx_xvld((const __m256i *)y[ib + 1].qs, 0); + const __m256i q4b_1 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_1, 4), m4b)), + lsx_shuffle_b(values128, __lsx_vand_v(q4bits_1, m4b))); + const __m256i q4b_2 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_2, 4), m4b)), + lsx_shuffle_b(values128, __lsx_vand_v(q4bits_2, m4b))); + const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); + const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); + const __m256i p_1 = lasx_madd_h(p16_1, mone); + const __m256i p_2 = lasx_madd_h(p16_2, mone); + accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)), + __lasx_xvffint_s_w(p_1), accum1); + accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)), + __lasx_xvffint_s_w(p_2), accum2); + } + + sumf = hsum_float_8(__lasx_xvfadd_s(accum1, accum2)); + +#endif + for (; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; + sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4]; + } + sumf += d * (sumi1 + sumi2); + } + *s = sumf; +} + +void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t 
by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK_K == 0); + + const block_iq4_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__loongarch_asx) + + const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0); + + __m256 accum = (__m256)__lasx_xvldi(0); + + for (int ibl = 0; ibl < nb; ++ibl) { + const uint8_t * qs = x[ibl].qs; + const int8_t * q8 = y[ibl].qs; + uint16_t sh = x[ibl].scales_h; + __m256i sumi1 = __lasx_xvldi(0); + __m256i sumi2 = __lasx_xvldi(0); + for (int ib = 0; ib < QK_K/32; ib += 2) { + const __m128i q4bits_1 = __lsx_vld((const __m128i*)qs, 0); qs += 16; + const __m128i q4bits_2 = __lsx_vld((const __m128i*)qs, 0); qs += 16; + const __m256i q8b_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q8b_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; + const __m256i q4b_1 = lasx_insertf128(__lsx_vshuf_b(values128, values128, __lsx_vsrli_b(q4bits_1, 4)), + __lsx_vshuf_b(values128, values128, __lsx_vandi_b(q4bits_1, 0xf))); + const __m256i q4b_2 = lasx_insertf128(__lsx_vshuf_b(values128, values128, __lsx_vsrli_b(q4bits_2, 4)), + __lsx_vshuf_b(values128, values128, __lsx_vandi_b(q4bits_2, 0xf))); + const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); + const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); + const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32; + const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32; + sh >>= 4; + const __m256i p_1 = lasx_madd_h(p16_1, __lasx_xvreplgr2vr_h(ls1)); + const __m256i p_2 = lasx_madd_h(p16_2, __lasx_xvreplgr2vr_h(ls2)); + sumi1 = __lasx_xvadd_w(p_1, sumi1); + sumi2 = __lasx_xvadd_w(p_2, sumi2); + } + accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d), + __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accum); + } + + *s = hsum_float_8(accum); + +#else + float sumf = 0; + for (int ibl = 0; ibl < nb; ++ibl) { + const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + uint16_t h = x[ibl].scales_h; + const uint8_t * qs = x[ibl].qs; + const int8_t * q8 = y[ibl].qs; + for (int ib = 0; ib < QK_K/32; ib += 2) { + const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30); + const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30); + h >>= 4; + const float d1 = d4d8*(ls1 - 32); + const float d2 = d4d8*(ls2 - 32); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < 16; ++j) { + sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf]; + sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4]; + } + sumf += d1 * (sumi1 + sumi2); + qs += 16; + q8 += 32; + sumi1 = sumi2 = 0; + for (int j = 0; j < 16; ++j) { + sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf]; + sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4]; + } + sumf += d2 * (sumi1 + sumi2); + qs += 16; + q8 += 32; + } + } + *s = sumf; +#endif +} + diff --git a/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp b/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp new file mode 100644 index 0000000000000..fedd6430278c2 --- /dev/null +++ b/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp @@ -0,0 +1,82 @@ +#include "ggml-backend-impl.h" + +#if defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) + +#if defined(__linux__) +#include <sys/auxv.h> +#endif + +#include <string> + +struct powerpc_features { + std::string platform = ""; + int power_version = -1; + + bool has_vsx = false; + + powerpc_features() { +#if defined(__linux__) + unsigned long auxval = getauxval(AT_PLATFORM); + if (auxval) { 
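+ // getauxval(AT_PLATFORM) returns (as an unsigned long) a pointer to a
+ // platform name string such as "power9" or "power10"; the digits parsed
+ // out below select the POWER ISA level at runtime.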
+ platform = std::string(reinterpret_cast<const char *>(auxval)); + // TBD: Do systems exist that return this in uppercase? + if (platform.substr(0, 5) == "power") { + // Extract a numeric suffix, if one exists + int vpos = -1; + for (int i = platform.length() - 1; i >= 0; i--) { + if (std::isdigit(platform[i])) { + vpos = i; + } else { + break; + } + } + if (vpos > -1) { + power_version = std::stoi(platform.substr(vpos)); + } + } + } +#endif + if (power_version >= 9) { + has_vsx = true; + } + } +}; + +static int ggml_backend_cpu_powerpc_score() { + int score = 1; + powerpc_features pf; + +// Platform scores +#if defined(GGML_USE_POWER7) + if (pf.power_version < 7) { return 0; } + score += 1<<1; +#endif +#if defined(GGML_USE_POWER8) + if (pf.power_version < 8) { return 0; } + score += 1<<2; +#endif +#if defined(GGML_USE_POWER9) + if (pf.power_version < 9) { return 0; } + score += 1<<3; +#endif +#if defined(GGML_USE_POWER10) + if (pf.power_version < 10) { return 0; } + score += 1<<4; +#endif +#if defined(GGML_USE_POWER11) + if (pf.power_version < 11) { return 0; } + score += 1<<5; +#endif + +// Feature scores +#if defined(GGML_USE_VSX) + if (!pf.has_vsx) { return 0; } + score += 1<<6; +#endif + + return score; +} + +GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_powerpc_score) + +#endif // defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) diff --git a/ggml/src/ggml-cpu/arch/powerpc/quants.c b/ggml/src/ggml-cpu/arch/powerpc/quants.c new file mode 100644 index 0000000000000..053d5cbdc7bd8 --- /dev/null +++ b/ggml/src/ggml-cpu/arch/powerpc/quants.c @@ -0,0 +1,2732 @@ +#define GGML_COMMON_IMPL_C +#include "ggml-common.h" +#include "ggml-quants.h" +#include "ggml-impl.h" +#include "ggml-cpu.h" +#include "simd-mappings.h" + +#include "../../quants.h" +#include "../../ggml-cpu-impl.h" + +#include <math.h> +#include <string.h> +#include <assert.h> +#include <float.h> +#include <stdlib.h> // for qsort +#include <stdio.h> // for GGML_ASSERT + +#define GROUP_MAX_EPS 1e-15f +#define GROUP_MAX_EPS_IQ3_XXS 1e-8f +#define GROUP_MAX_EPS_IQ2_S 1e-8f +#define GROUP_MAX_EPS_IQ1_M 1e-7f +#define GROUP_MAX_EPS_IQ1_S 1e-12f + +#define UNUSED GGML_UNUSED + +#if defined(__POWER9_VECTOR__) +#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s +#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) +#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) +#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s) +#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s) +#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s) +#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s) +#define B8(c,s ) B7(c,s, c), B7(c,s, s) + +// precomputed tables for expanding 8bits to 8 bytes: +static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 +static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 +#endif + +void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__POWER9_VECTOR__) + for (int i = 0; i < nb; i++) { + vector float srcv [8]; + vector float asrcv[8]; + vector float amaxv[8]; + vector signed int vi[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(vec_extract(amaxv[0], 0), + 
vec_extract(amaxv[0], 1)), + MAX(vec_extract(amaxv[0], 2), + vec_extract(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + const vector float vid = vec_splats(id); + + y[i].d = GGML_CPU_FP32_TO_FP16(d); + + for (int j = 0; j < 8; j++) { + const vector float v = vec_round(vec_mul(srcv[j], vid)); + vi[j] = vec_cts(v, 0); + } + vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]); + vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]); + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_0_ref(x, y, k); +#endif +} + +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + block_q8_1 * GGML_RESTRICT y = vy; + +#if defined(__POWER9_VECTOR__) + for (int i = 0; i < nb; i++) { + vector float srcv [8]; + vector float asrcv[8]; + vector float amaxv[8]; + vector signed int vi[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(vec_extract(amaxv[0], 0), + vec_extract(amaxv[0], 1)), + MAX(vec_extract(amaxv[0], 2), + vec_extract(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + const vector float vid = vec_splats(id); + + y[i].d = GGML_CPU_FP32_TO_FP16(d); + + vector int accv = vec_splats(0); + + for (int j = 0; j < 8; j++) { + const vector float v = vec_round(vec_mul(srcv[j], vid)); + vi[j] = vec_cts(v, 0); + + accv = vec_add(accv, vi[j]); + } + vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]); + vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]); + + accv = vec_add(accv, vec_sld(accv, accv, 4)); + accv = vec_add(accv, vec_sld(accv, accv, 8)); + y[i].s = GGML_CPU_FP32_TO_FP16(d * vec_extract(accv, 0)); + } + +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_1_ref(x, y, k); +#endif +} + + +//===================================== Dot products ================================= + +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector signed int v0 = vec_splats((int32_t)0); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + const vector signed char v8 = vec_splats((signed char)0x8); + + vector float vsumf0 = vec_splats(0.0f); + +#pragma GCC unroll 8 + for (; ib < nb; ++ib) { + __builtin_prefetch(x[ib].qs, 0, 1); + __builtin_prefetch(y[ib].qs, 0, 1); + + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); + vector float vd = vec_mul(vxd, vyd); + + vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); + vector signed char q8y0 = vec_xl( 0, y[ib].qs); 
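+ // qxs holds one q4_0 block: 32 4-bit weights packed into 16 bytes. The
+ // low/high nibbles are split below and re-centered to [-8, 7] by
+ // subtracting 8, matching the scalar fallback at the end of this function.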
+ vector signed char q8y1 = vec_xl(16, y[ib].qs); + + vector signed char q4x0 = vec_and(qxs, lowMask); + vector signed char q4x1 = vec_sr(qxs, v4); + + q4x0 = vec_sub(q4x0, v8); + q4x1 = vec_sub(q4x1, v8); + + vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1)); + + vector signed int vsumi0 = v0; + + vsumi0 = vec_sum4s(qv0, vsumi0); + vsumi0 = vec_sum4s(qv1, vsumi0); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + } + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + sumf = vec_extract(vsumf0, 0); + +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F) - 8; + const int v1 = (x[ib].qs[j] >> 4) - 8; + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); + } + + *s = sumf; +} + +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector signed int v0 = vec_splats((int32_t)0); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + + vector float vsumf0 = vec_splats(0.0f); + +#pragma GCC unroll 4 + for (; ib < nb; ++ib) { + __builtin_prefetch(x[ib].qs, 0, 1); + __builtin_prefetch(y[ib].qs, 0, 1); + + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); + vector float vd = vec_mul(vxd, vyd); + + vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].m)); + vector float vys = {GGML_CPU_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f}; + vsumf0 = vec_madd(vxmin, vys, vsumf0); + + vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); + vector signed char q8y0 = vec_xl( 0, y[ib].qs); + vector signed char q8y1 = vec_xl(16, y[ib].qs); + + vector unsigned char q4x0 = (vector unsigned char)vec_and(qxs, lowMask); + vector unsigned char q4x1 = (vector unsigned char)vec_sr(qxs, v4); + + vector signed int vsumi0 = v0; + + vsumi0 = vec_msum(q8y0, q4x0, vsumi0); + vsumi0 = vec_msum(q8y1, q4x1, vsumi0); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + } + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + sumf = vec_extract(vsumf0, 0); + +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F); + const int v1 = (x[ib].qs[j] >> 4); + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = 
QK8_0; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector unsigned char v4 = vec_splats((unsigned char)4); + + vector float vsumf0 = vec_splats(0.0f); + +#pragma GCC unroll 4 + for (; ib < nb; ++ib) { + __builtin_prefetch(x[ib].qs, 0, 1); + __builtin_prefetch(y[ib].qs, 0, 1); + + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); + vector float vd = vec_mul(vxd, vyd); + + vector signed long long aux64x2_0 = {(uint64_t)(table_b2b_1[x[ib].qh[0]]), (uint64_t)(table_b2b_1[x[ib].qh[1]])}; + vector signed long long aux64x2_1 = {(uint64_t)(table_b2b_1[x[ib].qh[2]]), (uint64_t)(table_b2b_1[x[ib].qh[3]])}; + + vector signed char qh0 = (vector signed char)aux64x2_0; + vector signed char qh1 = (vector signed char)aux64x2_1; + + vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); + + vector signed char q5x0 = vec_sub(vec_and (qxs, lowMask), qh0); + vector signed char q5x1 = vec_sub(vec_sr(qxs, v4), qh1); + + vector signed char q8y0 = vec_xl( 0, y[ib].qs); + vector signed char q8y1 = vec_xl( 16, y[ib].qs); + + vector signed short qv0 = vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1)); + + qv0 = vec_add(qv0, qv1); + + vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0)); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + } + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + sumf = vec_extract(vsumf0, 0); + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16); + const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16); + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; + } + + *s = sumf; +} + +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_1); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector signed int v0 = vec_splats((int32_t)0); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + + vector float vsumf0 = vec_splats(0.0f); + +#pragma GCC unroll 4 + for (; ib < nb; ++ib) { + __builtin_prefetch(x[ib].qs, 0, 1); + __builtin_prefetch(y[ib].qs, 0, 1); + + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); + vector float vd 
= vec_mul(vxd, vyd); + + vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].m)); + vector float vys = {GGML_CPU_FP16_TO_FP32(y[ib].s), 0.f, 0.f, 0.f}; + vsumf0 = vec_madd(vxmin, vys, vsumf0); + + vector unsigned long long aux64x2_0 = {(uint64_t)(table_b2b_0[x[ib].qh[0]]), (uint64_t)(table_b2b_0[x[ib].qh[1]])}; + vector unsigned long long aux64x2_1 = {(uint64_t)(table_b2b_0[x[ib].qh[2]]), (uint64_t)(table_b2b_0[x[ib].qh[3]])}; + + vector signed char qh0 = (vector signed char)aux64x2_0; + vector signed char qh1 = (vector signed char)aux64x2_1; + + vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); + + vector unsigned char q5x0 = (vector unsigned char)vec_or(vec_and(qxs, lowMask), qh0); + vector unsigned char q5x1 = (vector unsigned char)vec_or(vec_sr(qxs, v4), qh1); + + vector signed char q8y0 = vec_xl( 0, y[ib].qs); + vector signed char q8y1 = vec_xl( 16, y[ib].qs); + + vector signed int vsumi0 = v0; + + vsumi0 = vec_msum(q8y0, q5x0, vsumi0); + vsumi0 = vec_msum(q8y1, q5x1, vsumi0); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + } + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + sumf = vec_extract(vsumf0, 0); + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1; + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q8_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__POWER9_VECTOR__) + const vector signed int v0 = vec_splats((int32_t)0); + vector float vsumf0 = vec_splats(0.0f); + +#pragma GCC unroll 8 + for (; ib < nb; ++ib) { + __builtin_prefetch(x[ib].qs, 0, 1); + __builtin_prefetch(y[ib].qs, 0, 1); + + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); + vector float vd = vec_mul(vxd, vyd); + + vector signed char q8x0 = vec_xl( 0, x[ib].qs); + vector signed char q8x1 = vec_xl(16, x[ib].qs); + vector signed char q8y0 = vec_xl( 0, y[ib].qs); + vector signed char q8y1 = vec_xl(16, y[ib].qs); + + vector signed short qv0 = vec_mule(q8x0, q8y0); + vector signed short qv1 = vec_mulo(q8x0, q8y0); + vector signed short qv2 = vec_mule(q8x1, q8y1); + vector signed short qv3 = vec_mulo(q8x1, q8y1); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + + vsumi0 = vec_sum4s(qv0, vsumi0); + vsumi1 = vec_sum4s(qv1, vsumi1); + vsumi0 = vec_sum4s(qv2, vsumi0); + vsumi1 = vec_sum4s(qv3, vsumi1); + + vsumi0 = vec_add(vsumi0, vsumi1); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + } + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, 
vsumf0, 8)); + + sumf = vec_extract(vsumf0, 0); + +#endif + for (; ib < nb; ++ib) { + int sumi = 0; + + for (int j = 0; j < qk; j++) { + sumi += x[ib].qs[j]*y[ib].qs[j]; + } + + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); + } + + *s = sumf; +} + +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q2_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0x3); + const vector signed char lowScaleMask = vec_splats((signed char)0xF); + const vector int v0 = vec_splats((int32_t)0); + const vector unsigned char v2 = vec_splats((unsigned char)0x2); + const vector unsigned char v6 = vec_splats((unsigned char)0x6); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin)); + vector float vdmin = vec_mul(vxmin, vyd); + + vector signed short q8ysums0 = vec_xl( 0, y[i].bsums); + vector signed short q8ysums1 = vec_xl(16, y[i].bsums); + + vector signed char q2xmins = (vector signed char)vec_xl( 0, x[i].scales); + vector signed char vscales = vec_and(q2xmins, lowScaleMask); + + q2xmins = vec_sr(q2xmins, v4); + vector signed short q2xmins0 = vec_unpackh(q2xmins); + vector signed short q2xmins1 = vec_unpackl(q2xmins); + + vector signed int prod0 = vec_mule(q2xmins0, q8ysums0); + vector signed int prod1 = vec_mulo(q2xmins0, q8ysums0); + vector signed int prod2 = vec_mule(q2xmins1, q8ysums1); + vector signed int prod3 = vec_mulo(q2xmins1, q8ysums1); + + vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0); + vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1); + vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2); + vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + vector signed int vsumi4 = v0; + vector signed int vsumi5 = v0; + vector signed int vsumi6 = v0; + vector signed int vsumi7 = v0; + + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + for (int j = 0; j < QK_K/128; ++j) { + __builtin_prefetch(q2, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector signed char qxs0 = (vector signed char)vec_xl( 0, q2); + vector signed char qxs1 = (vector signed char)vec_xl(16, q2); + q2 += 32; + + vector unsigned char q2x00 = (vector unsigned char)vec_and(qxs0, lowMask); + vector unsigned char q2x01 = (vector unsigned char)vec_and(vec_sr(qxs0, v2), lowMask); + vector unsigned char q2x02 = (vector unsigned char)vec_and(vec_sr(qxs0, v4), lowMask); + vector unsigned char q2x03 = (vector unsigned char)vec_and(vec_sr(qxs0, v6), lowMask); + vector unsigned char q2x10 = (vector unsigned char)vec_and(qxs1, lowMask); + vector unsigned char q2x11 = (vector unsigned char)vec_and(vec_sr(qxs1, v2), lowMask); + vector unsigned char q2x12 = (vector unsigned 
char)vec_and(vec_sr(qxs1, v4), lowMask); + vector unsigned char q2x13 = (vector unsigned char)vec_and(vec_sr(qxs1, v6), lowMask); + + vector signed char q8y00 = vec_xl( 0, q8); + vector signed char q8y10 = vec_xl( 16, q8); + vector signed char q8y01 = vec_xl( 32, q8); + vector signed char q8y11 = vec_xl( 48, q8); + vector signed char q8y02 = vec_xl( 64, q8); + vector signed char q8y12 = vec_xl( 80, q8); + vector signed char q8y03 = vec_xl( 96, q8); + vector signed char q8y13 = vec_xl(112, q8); + q8 += 128; + + vector signed int qv0 = vec_msum(q8y00, q2x00, v0); + vector signed int qv1 = vec_msum(q8y01, q2x01, v0); + vector signed int qv2 = vec_msum(q8y02, q2x02, v0); + vector signed int qv3 = vec_msum(q8y03, q2x03, v0); + vector signed int qv4 = vec_msum(q8y10, q2x10, v0); + vector signed int qv5 = vec_msum(q8y11, q2x11, v0); + vector signed int qv6 = vec_msum(q8y12, q2x12, v0); + vector signed int qv7 = vec_msum(q8y13, q2x13, v0); + + vector signed short vscales_07 = vec_unpackh(vscales); + vector signed int vscales_03 = vec_unpackh(vscales_07); + vector signed int vscales_47 = vec_unpackl(vscales_07); + vector signed int vs0 = vec_splat(vscales_03, 0); + vector signed int vs1 = vec_splat(vscales_03, 1); + vector signed int vs2 = vec_splat(vscales_03, 2); + vector signed int vs3 = vec_splat(vscales_03, 3); + vector signed int vs4 = vec_splat(vscales_47, 0); + vector signed int vs5 = vec_splat(vscales_47, 1); + vector signed int vs6 = vec_splat(vscales_47, 2); + vector signed int vs7 = vec_splat(vscales_47, 3); + vscales = vec_sld(vscales, vscales, 8); + + vsumi0 = vec_add(vec_mul(qv0, vs0), vsumi0); + vsumi1 = vec_add(vec_mul(qv1, vs2), vsumi1); + vsumi2 = vec_add(vec_mul(qv2, vs4), vsumi2); + vsumi3 = vec_add(vec_mul(qv3, vs6), vsumi3); + vsumi4 = vec_add(vec_mul(qv4, vs1), vsumi4); + vsumi5 = vec_add(vec_mul(qv5, vs3), vsumi5); + vsumi6 = vec_add(vec_mul(qv6, vs5), vsumi6); + vsumi7 = vec_add(vec_mul(qv7, vs7), vsumi7); + } + + vsumi0 = vec_add(vsumi0, vsumi4); + vsumi1 = vec_add(vsumi1, vsumi5); + vsumi2 = vec_add(vsumi2, vsumi6); + vsumi3 = vec_add(vsumi3, vsumi7); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#else + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + int summs = 0; + for (int j = 0; j < 16; ++j) { + summs += y[i].bsums[j] * (sc[j] >> 4); + } + + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + int isum = 0; + int is = 0; + int d; + for (int k = 0; k < QK_K/128; ++k) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + d = sc[is++] & 0xF; + int isuml = 0; + for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + d = sc[is++] & 0xF; + isuml = 0; + for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + shift += 2; + q8 += 32; + } + q2 += 32; + } + sumf += dall * isum - dmin * summs; + } + *s = sumf; +#endif +} + +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const 
void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + const block_q3_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0x3); + const vector signed char lowMask1 = vec_splats((int8_t)0xf); + const vector signed char lowMask2 = vec_splats((int8_t)0x30); + const vector int v0 = vec_splats((int32_t)0); + const vector signed char v1 = vec_splats((signed char)0x1); + const vector unsigned char v2 = vec_splats((unsigned char)0x2); + const vector unsigned char v3 = vec_splats((unsigned char)0x3); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + const vector unsigned char v6 = vec_splats((unsigned char)0x6); + const vector signed char off = vec_splats((signed char)0x20); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + UNUSED(kmask1); + UNUSED(kmask2); + + vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8); + vector signed char u1 = vec_and(u0, lowMask1); + vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4); + vector signed char u3 = (vector signed char)vec_mergeh((vector signed int)u2, (vector signed int)vec_sr(u2, v2)); + vector signed char u30 = vec_sl(vec_and(u3, lowMask), v4); + vector signed char u31 = vec_and(u3, lowMask2); + + u1 = vec_or(u1, u30); + u2 = vec_or(vec_sr(u0, v4), u31); + + vector signed char vscales = (vector signed char)vec_mergeh((vector signed long long)u1, (vector signed long long)u2); + vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask); + vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask); + + vscales = vec_sub(vscales, off); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + vector signed int vsumi4 = v0; + vector signed int vsumi5 = v0; + vector signed int vsumi6 = v0; + vector signed int vsumi7 = v0; + + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + for (int j = 0; j < QK_K/128; ++j) { + __builtin_prefetch(q3, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector signed char qxs0 = (vector signed char)vec_xl( 0, q3); + vector signed char qxs1 = (vector signed char)vec_xl(16, q3); + q3 += 32; + + //the low 2 bits + vector signed char qxs00 = vec_and(qxs0, lowMask); + vector signed char qxs01 = vec_and(vec_sr(qxs0, v2), lowMask); + vector signed char qxs02 = vec_and(vec_sr(qxs0, v4), lowMask); + vector signed char qxs03 = vec_and(vec_sr(qxs0, v6), lowMask); + vector signed char qxs10 = vec_and(qxs1, lowMask); + vector signed char qxs11 = vec_and(vec_sr(qxs1, v2), lowMask); + vector signed char qxs12 = vec_and(vec_sr(qxs1, v4), lowMask); + vector signed char qxs13 = vec_and(vec_sr(qxs1, v6), lowMask); + + //the 3rd bit + vector signed char qxh00 = vec_sl(vec_andc(v1, qxhs0), v2); + vector signed char qxh01 = vec_sl(vec_andc(v1, vec_sr(qxhs0, (vector unsigned char)v1)), v2); + vector signed 
char qxh02 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v2)), v2); + vector signed char qxh03 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v3)), v2); + vector signed char qxh10 = vec_sl(vec_andc(v1, qxhs1), v2); + vector signed char qxh11 = vec_sl(vec_andc(v1, vec_sr(qxhs1, (vector unsigned char)v1)), v2); + vector signed char qxh12 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v2)), v2); + vector signed char qxh13 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v3)), v2); + qxhs0 = vec_sr(qxhs0, v4); + qxhs1 = vec_sr(qxhs1, v4); + + vector signed char q3x00 = vec_sub(qxs00, qxh00); + vector signed char q3x01 = vec_sub(qxs01, qxh01); + vector signed char q3x02 = vec_sub(qxs02, qxh02); + vector signed char q3x03 = vec_sub(qxs03, qxh03); + vector signed char q3x10 = vec_sub(qxs10, qxh10); + vector signed char q3x11 = vec_sub(qxs11, qxh11); + vector signed char q3x12 = vec_sub(qxs12, qxh12); + vector signed char q3x13 = vec_sub(qxs13, qxh13); + + vector signed char q8y00 = vec_xl( 0, q8); + vector signed char q8y10 = vec_xl( 16, q8); + vector signed char q8y01 = vec_xl( 32, q8); + vector signed char q8y11 = vec_xl( 48, q8); + vector signed char q8y02 = vec_xl( 64, q8); + vector signed char q8y12 = vec_xl( 80, q8); + vector signed char q8y03 = vec_xl( 96, q8); + vector signed char q8y13 = vec_xl(112, q8); + q8 += 128; + + vector signed short vscales_h = vec_unpackh(vscales); + vector signed short vs0 = vec_splat(vscales_h, 0); + vector signed short vs1 = vec_splat(vscales_h, 1); + vector signed short vs2 = vec_splat(vscales_h, 2); + vector signed short vs3 = vec_splat(vscales_h, 3); + vector signed short vs4 = vec_splat(vscales_h, 4); + vector signed short vs5 = vec_splat(vscales_h, 5); + vector signed short vs6 = vec_splat(vscales_h, 6); + vector signed short vs7 = vec_splat(vscales_h, 7); + vscales = vec_sld(vscales, vscales, 8); + + vector signed short qv00 = vec_add(vec_mule(q3x00, q8y00), vec_mulo(q3x00, q8y00)); + vector signed short qv01 = vec_add(vec_mule(q3x01, q8y01), vec_mulo(q3x01, q8y01)); + vector signed short qv02 = vec_add(vec_mule(q3x02, q8y02), vec_mulo(q3x02, q8y02)); + vector signed short qv03 = vec_add(vec_mule(q3x03, q8y03), vec_mulo(q3x03, q8y03)); + vector signed short qv10 = vec_add(vec_mule(q3x10, q8y10), vec_mulo(q3x10, q8y10)); + vector signed short qv11 = vec_add(vec_mule(q3x11, q8y11), vec_mulo(q3x11, q8y11)); + vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12)); + vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13)); + + vsumi0 = vec_msum(qv00, vs0, vsumi0); + vsumi1 = vec_msum(qv01, vs2, vsumi1); + vsumi2 = vec_msum(qv02, vs4, vsumi2); + vsumi3 = vec_msum(qv03, vs6, vsumi3); + vsumi4 = vec_msum(qv10, vs1, vsumi4); + vsumi5 = vec_msum(qv11, vs3, vsumi5); + vsumi6 = vec_msum(qv12, vs5, vsumi6); + vsumi7 = vec_msum(qv13, vs7, vsumi7); + } + + vsumi0 = vec_add(vsumi0, vsumi4); + vsumi1 = vec_add(vsumi1, vsumi5); + vsumi2 = vec_add(vsumi2, vsumi6); + vsumi3 = vec_add(vsumi3, vsumi7); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#else + // scalar version + // This function is written like this so the compiler can manage 
to vectorize most of it + // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the + // manually vectorized version above. Every other version I tried would run at least 4 times slower. + // The ideal situation would be if we could just write the code once, and the compiler would + // automatically produce the best possible set of machine instructions, instead of us having to manually + // write vectorized versions for AVX, ARM_NEON, etc. + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + uint32_t auxs[4]; + const int8_t * scales = (const int8_t*)auxs; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + q3 += 32; + } + a = aux8; + + memcpy(auxs, x[i].scales, 12); + uint32_t tmp = auxs[2]; + auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; + +#endif + +} + +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector signed char lowMask1 = vec_splats((int8_t)0x3f); + const vector signed char lowMask2 = vec_splats((int8_t)0x30); + const vector int v0 = vec_splats((int32_t)0); + const vector unsigned char v2 = vec_splats((uint8_t)2); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + for (int i = 0; i < 
nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin)); + vector float vdmin = vec_mul(vxmin, vyd); + + vector signed short q8ysums0 = vec_xl( 0, y[i].bsums); + vector signed short q8ysums1 = vec_xl(16, y[i].bsums); + + UNUSED(kmask1); + UNUSED(kmask2); + UNUSED(kmask3); + UNUSED(utmp); + + vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8); + vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2); + vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4); + vector signed char u3 = vec_sr(u2, v4); + + vector signed char u30 = u1; + vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3); + + u1 = vec_and(u0, lowMask1); + u2 = vec_or(u30, u31); + + vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2); + + vector signed short vscales = vec_unpackh(utmps); + vector signed short q4xmins = vec_unpackl(utmps); + vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins); + vector signed short q4xmins1 = vec_mergel(q4xmins, q4xmins); + + vector signed int prod0 = vec_mule(q4xmins0, q8ysums0); + vector signed int prod1 = vec_mule(q4xmins1, q8ysums1); + vector signed int prod2 = vec_mulo(q4xmins0, q8ysums0); + vector signed int prod3 = vec_mulo(q4xmins1, q8ysums1); + + vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0); + vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1); + vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2); + vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + for (int j = 0; j < QK_K/64; j+=2) { + __builtin_prefetch(q4, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector signed char qxs0 = (vector signed char)vec_xl( 0, q4); + vector signed char qxs1 = (vector signed char)vec_xl(16, q4); + vector signed char qxs2 = (vector signed char)vec_xl(32, q4); + vector signed char qxs3 = (vector signed char)vec_xl(48, q4); + q4 += 64; + + vector unsigned char q4x00 = (vector unsigned char)vec_and(qxs0, lowMask); + vector unsigned char q4x01 = (vector unsigned char)vec_sr(qxs0, v4); + vector unsigned char q4x10 = (vector unsigned char)vec_and(qxs1, lowMask); + vector unsigned char q4x11 = (vector unsigned char)vec_sr(qxs1, v4); + vector unsigned char q4x20 = (vector unsigned char)vec_and(qxs2, lowMask); + vector unsigned char q4x21 = (vector unsigned char)vec_sr(qxs2, v4); + vector unsigned char q4x30 = (vector unsigned char)vec_and(qxs3, lowMask); + vector unsigned char q4x31 = (vector unsigned char)vec_sr(qxs3, v4); + + vector signed char q8y00 = vec_xl( 0, q8); + vector signed char q8y10 = vec_xl( 16, q8); + vector signed char q8y01 = vec_xl( 32, q8); + vector signed char q8y11 = vec_xl( 48, q8); + vector signed char q8y20 = vec_xl( 64, q8); + vector signed char q8y30 = vec_xl( 80, q8); + vector signed char q8y21 = vec_xl( 96, q8); + vector signed char q8y31 = vec_xl(112, q8); + q8 += 128; + + vector signed int qv00 = vec_msum(q8y00, q4x00, v0); + vector signed int qv01 = vec_msum(q8y01, q4x01, v0); + vector signed int qv10 = vec_msum(q8y10, q4x10, v0); + vector signed int qv11 = vec_msum(q8y11, q4x11, v0); + vector signed int qv20 
= vec_msum(q8y20, q4x20, v0); + vector signed int qv21 = vec_msum(q8y21, q4x21, v0); + vector signed int qv30 = vec_msum(q8y30, q4x30, v0); + vector signed int qv31 = vec_msum(q8y31, q4x31, v0); + + vector signed int vscales_h = vec_unpackh(vscales); + vector signed int vs0 = vec_splat(vscales_h, 0); + vector signed int vs1 = vec_splat(vscales_h, 1); + vector signed int vs2 = vec_splat(vscales_h, 2); + vector signed int vs3 = vec_splat(vscales_h, 3); + vscales = vec_sld(vscales, vscales, 8); + + vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0); + vsumi1 = vec_add(vec_mul(qv01, vs1), vsumi1); + vsumi2 = vec_add(vec_mul(qv20, vs2), vsumi2); + vsumi3 = vec_add(vec_mul(qv21, vs3), vsumi3); + + vsumi0 = vec_add(vec_mul(qv10, vs0), vsumi0); + vsumi1 = vec_add(vec_mul(qv11, vs1), vsumi1); + vsumi2 = vec_add(vec_mul(qv30, vs2), vsumi2); + vsumi3 = vec_add(vec_mul(qv31, vs3), vsumi3); + } + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + a += 32; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + a += 32; q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const 
block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector signed char lowMask1 = vec_splats((int8_t)0x3f); + const vector signed char lowMask2 = vec_splats((int8_t)0x30); + const vector int v0 = vec_splats((int32_t)0); + const vector unsigned char v1 = vec_splats((unsigned char)0x1); + const vector unsigned char v2 = vec_splats((unsigned char)0x2); + const vector unsigned char v3 = vec_splats((unsigned char)0x3); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin)); + vector float vdmin = vec_mul(vxmin, vyd); + + UNUSED(kmask1); + UNUSED(kmask2); + UNUSED(kmask3); + UNUSED(utmp); + + vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8); + vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2); + vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4); + vector signed char u3 = vec_sr(u2, v4); + + vector signed char u30 = u1; + vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3); + + u1 = vec_and(u0, lowMask1); + u2 = vec_or(u30, u31); + + vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2); + + vector signed short q8ysums0 = vec_xl( 0, y[i].bsums); + vector signed short q8ysums1 = vec_xl(16, y[i].bsums); + + vector signed short vscales = vec_unpackh(utmps); + + vector signed short q5xmins = vec_unpackl(utmps); + vector signed short q5xmins0 = vec_mergeh(q5xmins, q5xmins); + vector signed short q5xmins1 = vec_mergel(q5xmins, q5xmins); + + vector signed int prod0 = vec_mule(q5xmins0, q8ysums0); + vector signed int prod1 = vec_mule(q5xmins1, q8ysums1); + vector signed int prod2 = vec_mulo(q5xmins0, q8ysums0); + vector signed int prod3 = vec_mulo(q5xmins1, q8ysums1); + + vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0); + vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1); + vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2); + vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3); + + vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh); + vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + for (int j = 0; j < QK_K/64; ++j) { + __builtin_prefetch(q5, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector signed char qxs0 = (vector signed char)vec_xl( 0, q5); + vector signed char qxs1 = (vector signed char)vec_xl(16, q5); + q5 += 32; + + vector signed char qxs00 = vec_and(qxs0, lowMask); + vector signed char qxs01 = vec_sr(qxs0, v4); + vector signed char qxs10 = vec_and(qxs1, lowMask); + vector signed char qxs11 = vec_sr(qxs1, v4); + + 
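// The fifth bit of each q5_K weight is stored separately in x[i].qh; the
+ // masks and shifts below move it into bit position 4 so it can be OR-ed
+ // onto the low nibbles extracted above.
+ 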
vector signed char q5h00 = vec_sl(vec_and((vector signed char)v1, qxhs0), v4); + vector signed char q5h01 = vec_sl(vec_and((vector signed char)v2, qxhs0), v3); + vector signed char q5h10 = vec_sl(vec_and((vector signed char)v1, qxhs1), v4); + vector signed char q5h11 = vec_sl(vec_and((vector signed char)v2, qxhs1), v3); + qxhs0 = vec_sr(qxhs0, v2); + qxhs1 = vec_sr(qxhs1, v2); + + vector unsigned char q5x00 = (vector unsigned char)vec_or(q5h00, qxs00); + vector unsigned char q5x01 = (vector unsigned char)vec_or(q5h01, qxs01); + vector unsigned char q5x10 = (vector unsigned char)vec_or(q5h10, qxs10); + vector unsigned char q5x11 = (vector unsigned char)vec_or(q5h11, qxs11); + + vector signed char q8y00 = vec_xl( 0, q8); + vector signed char q8y10 = vec_xl(16, q8); + vector signed char q8y01 = vec_xl(32, q8); + vector signed char q8y11 = vec_xl(48, q8); + q8 += 64; + + vector signed int qv00 = vec_msum(q8y00, q5x00, v0); + vector signed int qv01 = vec_msum(q8y01, q5x01, v0); + vector signed int qv10 = vec_msum(q8y10, q5x10, v0); + vector signed int qv11 = vec_msum(q8y11, q5x11, v0); + + vector signed int vscales_h = vec_unpackh(vscales); + vector signed int vs0 = vec_splat(vscales_h, 0); + vector signed int vs1 = vec_splat(vscales_h, 1); + vscales = vec_sld(vscales, vscales, 12); + + vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0); + vsumi1 = vec_add(vec_mul(qv10, vs0), vsumi1); + vsumi2 = vec_add(vec_mul(qv01, vs1), vsumi2); + vsumi3 = vec_add(vec_mul(qv11, vs1), vsumi3); + } + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 
16 : 0); + a += 32; m <<= 1; + q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector int v0 = vec_splats((int32_t)0); + const vector unsigned char v2 = vec_splats((unsigned char)0x2); + const vector unsigned char v3 = vec_splats((unsigned char)0x3); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + const vector unsigned char v6 = vec_splats((unsigned char)0x6); + const vector signed char off = vec_splats((signed char)0x20); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + vector signed int vsumi4 = v0; + vector signed int vsumi5 = v0; + vector signed int vsumi6 = v0; + vector signed int vsumi7 = v0; + + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT qs = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + for (int j = 0; j < QK_K/128; ++j) { + __builtin_prefetch(q6, 0, 0); + __builtin_prefetch(qh, 0, 0); + __builtin_prefetch(q8, 0, 0); + + vector signed char qxs0 = (vector signed char)vec_xl( 0, q6); + vector signed char qxs1 = (vector signed char)vec_xl(16, q6); + vector signed char qxs2 = (vector signed char)vec_xl(32, q6); + vector signed char qxs3 = (vector signed char)vec_xl(48, q6); + q6 += 64; + + vector signed char qxs00 = vec_and(qxs0, lowMask); + vector signed char qxs01 = vec_sr(qxs0, v4); + vector signed char qxs10 = vec_and(qxs1, lowMask); + vector signed char 
qxs11 = vec_sr(qxs1, v4); + vector signed char qxs20 = vec_and(qxs2, lowMask); + vector signed char qxs21 = vec_sr(qxs2, v4); + vector signed char qxs30 = vec_and(qxs3, lowMask); + vector signed char qxs31 = vec_sr(qxs3, v4); + + vector signed char qxhs0 = (vector signed char)vec_xl( 0, qh); + vector signed char qxhs1 = (vector signed char)vec_xl(16, qh); + qh += 32; + + vector signed char qxh00 = vec_sl(vec_and((vector signed char)v3, qxhs0), v4); + vector signed char qxh01 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v4)), v4); + vector signed char qxh10 = vec_sl(vec_and((vector signed char)v3, qxhs1), v4); + vector signed char qxh11 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v4)), v4); + vector signed char qxh20 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v2)), v4); + vector signed char qxh21 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v6)), v4); + vector signed char qxh30 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v2)), v4); + vector signed char qxh31 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v6)), v4); + + vector signed char q6x00 = vec_sub(vec_or(qxh00, qxs00), off); + vector signed char q6x01 = vec_sub(vec_or(qxh01, qxs01), off); + vector signed char q6x10 = vec_sub(vec_or(qxh10, qxs10), off); + vector signed char q6x11 = vec_sub(vec_or(qxh11, qxs11), off); + vector signed char q6x20 = vec_sub(vec_or(qxh20, qxs20), off); + vector signed char q6x21 = vec_sub(vec_or(qxh21, qxs21), off); + vector signed char q6x30 = vec_sub(vec_or(qxh30, qxs30), off); + vector signed char q6x31 = vec_sub(vec_or(qxh31, qxs31), off); + + vector signed char q8y00 = vec_xl( 0, q8); + vector signed char q8y10 = vec_xl( 16, q8); + vector signed char q8y20 = vec_xl( 32, q8); + vector signed char q8y30 = vec_xl( 48, q8); + vector signed char q8y01 = vec_xl( 64, q8); + vector signed char q8y11 = vec_xl( 80, q8); + vector signed char q8y21 = vec_xl( 96, q8); + vector signed char q8y31 = vec_xl(112, q8); + q8 += 128; + + vector signed short qv00 = vec_add(vec_mule(q6x00, q8y00), vec_mulo(q6x00, q8y00)); + vector signed short qv10 = vec_add(vec_mule(q6x10, q8y10), vec_mulo(q6x10, q8y10)); + vector signed short qv20 = vec_add(vec_mule(q6x20, q8y20), vec_mulo(q6x20, q8y20)); + vector signed short qv30 = vec_add(vec_mule(q6x30, q8y30), vec_mulo(q6x30, q8y30)); + vector signed short qv01 = vec_add(vec_mule(q6x01, q8y01), vec_mulo(q6x01, q8y01)); + vector signed short qv11 = vec_add(vec_mule(q6x11, q8y11), vec_mulo(q6x11, q8y11)); + vector signed short qv21 = vec_add(vec_mule(q6x21, q8y21), vec_mulo(q6x21, q8y21)); + vector signed short qv31 = vec_add(vec_mule(q6x31, q8y31), vec_mulo(q6x31, q8y31)); + + vector signed short vscales = vec_unpackh(vec_xl_len(qs, 8)); + qs += 8; + + vector signed short vs0 = vec_splat(vscales, 0); + vector signed short vs1 = vec_splat(vscales, 1); + vector signed short vs2 = vec_splat(vscales, 2); + vector signed short vs3 = vec_splat(vscales, 3); + vector signed short vs4 = vec_splat(vscales, 4); + vector signed short vs5 = vec_splat(vscales, 5); + vector signed short vs6 = vec_splat(vscales, 6); + vector signed short vs7 = vec_splat(vscales, 7); + + vsumi0 = vec_msum(qv00, vs0, vsumi0); + vsumi1 = vec_msum(qv01, vs4, vsumi1); + vsumi2 = vec_msum(qv10, vs1, vsumi2); + vsumi3 = vec_msum(qv11, vs5, vsumi3); + vsumi4 = vec_msum(qv20, vs2, vsumi4); + vsumi5 = vec_msum(qv21, vs6, vsumi5); + vsumi6 = vec_msum(qv30, vs3, vsumi6); + vsumi7 = vec_msum(qv31, vs7, vsumi7); + } + + vsumi0 = vec_add(vsumi0, vsumi4); + vsumi1 = 
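+        /* fold the eight integer accumulators pairwise down to four before the
+           int->float conversion; vec_ctf + vec_madd below then applies d once. */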
vec_add(vsumi1, vsumi5); + vsumi2 = vec_add(vsumi2, vsumi6); + vsumi3 = vec_add(vsumi3, vsumi7); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#else + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + } + a += 128; + q4 += 64; + qh += 32; + } + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/16; ++j) { + int scale = x[i].scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +#if defined (__POWER9_VECTOR__) +static const int8_t keven_signs_q2xs[1024] = { + 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, + 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, + 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1, + 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1, + 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, + 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1, + 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1, + 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, + 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, + 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1, + 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, + 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, + 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1, + 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, 
-1, -1, 1, -1, -1, 1, -1, + 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1, + 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1, + 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, + 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, + 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1, + 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, + 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1, + 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1, + 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1, + 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, + 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1, + 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1, + 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1, + 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, + 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1, + 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, + 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, + 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, +}; +#endif + +void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__POWER9_VECTOR__) + const vector int v0 = vec_splats((int32_t)0); + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + for (int j = 0; j < QK_K/32; j += 2) { + __builtin_prefetch(q2, 0, 1); + __builtin_prefetch(q8, 0, 1); + + uint32_t aux32[4]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + memcpy(aux32, q2, 4*sizeof(uint32_t)); + q2 += 8; + + vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xxs_grid + 
aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1])}; + vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3])}; + vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9])}; + vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11])}; + + vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((aux32[1] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 7) & 127))}; + vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127))}; + vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((aux32[3] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 7) & 127))}; + vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127))}; + + vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0); + vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1); + vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2); + vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3); + + vector signed char q8y0 = vec_xl( 0, q8); + vector signed char q8y1 = vec_xl(16, q8); + vector signed char q8y2 = vec_xl(32, q8); + vector signed char q8y3 = vec_xl(48, q8); + q8 += 64; + + vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1)); + vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2)); + vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3)); + + const uint16_t ls0 = aux32[1] >> 28; + const uint16_t ls1 = aux32[3] >> 28; + + vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1)); + vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1)); + + vsumi0 = vec_msum(qv0, vscales01, vsumi0); + vsumi1 = vec_msum(qv1, vscales01, vsumi1); + vsumi2 = vec_msum(qv2, vscales23, vsumi2); + vsumi3 = vec_msum(qv3, vscales23, vsumi3); + } + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = 0.125f * vec_extract(vsumf0, 0); + +#else + + uint32_t aux32[2]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(aux32, q2, 2*sizeof(uint32_t)); + q2 += 4; + const uint32_t ls = 2*(aux32[1] >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); + const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127]; + for (int 
j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} + +void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__POWER9_VECTOR__) + const vector int v0 = vec_splats((int32_t)0); + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + for (int j = 0; j < QK_K/64; ++j) { + __builtin_prefetch(q2, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xs_grid + (q2[0] & 511)), *(const int64_t *)(iq2xs_grid + (q2[1] & 511))}; + vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xs_grid + (q2[2] & 511)), *(const int64_t *)(iq2xs_grid + (q2[3] & 511))}; + vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xs_grid + (q2[4] & 511)), *(const int64_t *)(iq2xs_grid + (q2[5] & 511))}; + vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xs_grid + (q2[6] & 511)), *(const int64_t *)(iq2xs_grid + (q2[7] & 511))}; + + vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((q2[0] >> 9))), *(const int64_t *)(signs64 + ((q2[1] >> 9)))}; + vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((q2[2] >> 9))), *(const int64_t *)(signs64 + ((q2[3] >> 9)))}; + vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((q2[4] >> 9))), *(const int64_t *)(signs64 + ((q2[5] >> 9)))}; + vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((q2[6] >> 9))), *(const int64_t *)(signs64 + ((q2[7] >> 9)))}; + q2 += 8; + + vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0); + vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1); + vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2); + vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3); + + vector signed char q8y0 = vec_xl( 0, q8); + vector signed char q8y1 = vec_xl(16, q8); + vector signed char q8y2 = vec_xl(32, q8); + vector signed char q8y3 = vec_xl(48, q8); + q8 += 64; + + vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1)); + vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2)); + vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3)); + + const uint16_t ls0 = 
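+            /* each scale byte packs two 4-bit sub-block scales; the effective
+               multiplier is 2*ls + 1, and the common 1/8 step is restored by the
+               final 0.125f factor on the result. */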
(uint16_t)(sc[0] & 0xf); + const uint16_t ls1 = (uint16_t)(sc[0] >> 4); + const uint16_t ls2 = (uint16_t)(sc[1] & 0xf); + const uint16_t ls3 = (uint16_t)(sc[1] >> 4); + sc += 2; + + vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1)); + vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1)); + vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1)); + vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1)); + + vsumi0 = vec_msum(qv0, vscales0, vsumi0); + vsumi1 = vec_msum(qv1, vscales1, vsumi1); + vsumi2 = vec_msum(qv2, vscales2, vsumi2); + vsumi3 = vec_msum(qv3, vscales3, vsumi3); + } + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = 0.125f * vec_extract(vsumf0, 0); + +#else + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1; + const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls1; + sumi = 0; + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? 
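+                /* ksigns_iq2xs expands a 7-bit sign code into an 8-bit mask (the
+                   8th sign comes from the code's parity); kmask_iq2xs[j] == 1 << j
+                   tests the sign of lane j. */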
-1 : 1); + } + q8 += 8; + } + bsum += sumi * ls2; + q2 += 4; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} + +void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__POWER9_VECTOR__) + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; + + const vector int v0 = vec_splats((int32_t)0); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + const vector unsigned char mask0 = vec_xl( 0, k_mask1); + const vector unsigned char mask1 = vec_xl(16, k_mask1); + const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2); + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + for (int j = 0; j < QK_K/32; j += 2) { + __builtin_prefetch(q2, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector signed long long aux64x2_0 = {*(const int64_t *)(iq2s_grid + (q2[0] | ((qh[0] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[1] | ((qh[0] << 6) & 0x300)))}; + vector signed long long aux64x2_1 = {*(const int64_t *)(iq2s_grid + (q2[2] | ((qh[0] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[3] | ((qh[0] << 2) & 0x300)))}; + vector signed long long aux64x2_2 = {*(const int64_t *)(iq2s_grid + (q2[4] | ((qh[1] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[5] | ((qh[1] << 6) & 0x300)))}; + vector signed long long aux64x2_3 = {*(const int64_t *)(iq2s_grid + (q2[6] | ((qh[1] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[7] | ((qh[1] << 2) & 0x300)))}; + q2 += 8; + qh += 2; + + vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]); + vector signed char vsigns23 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]); + signs += 4; + + vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0); + vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1); + vector signed char vsigns2 = vec_perm(vsigns23, vsigns23, mask0); + vector signed char vsigns3 = vec_perm(vsigns23, vsigns23, mask1); + + vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2); + vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2); + vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2); + vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2); + + vector signed char q2x0 = 
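+            /* conditional negation: each vsigns lane is 0 or -1 (all ones) after
+               vec_cmpeq, so (x ^ s) - s negates exactly the lanes where s == -1
+               (bitwise NOT plus one) and leaves the others untouched. */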
vec_sub(vec_xor(vsigns0, (vector signed char)aux64x2_0), vsigns0); + vector signed char q2x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux64x2_1), vsigns1); + vector signed char q2x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux64x2_2), vsigns2); + vector signed char q2x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux64x2_3), vsigns3); + + vector signed char q8y0 = vec_xl( 0, q8); + vector signed char q8y1 = vec_xl(16, q8); + vector signed char q8y2 = vec_xl(32, q8); + vector signed char q8y3 = vec_xl(48, q8); + q8 += 64; + + vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1)); + vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2)); + vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3)); + + const uint16_t ls0 = (uint16_t)(sc[0] & 0xf); + const uint16_t ls1 = (uint16_t)(sc[0] >> 4); + const uint16_t ls2 = (uint16_t)(sc[1] & 0xf); + const uint16_t ls3 = (uint16_t)(sc[1] >> 4); + sc += 2; + + vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1)); + vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1)); + vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1)); + vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1)); + + vsumi0 = vec_msum(qv0, vscales0, vsumi0); + vsumi1 = vec_msum(qv1, vscales1, vsumi1); + vsumi2 = vec_msum(qv2, vscales2, vsumi2); + vsumi3 = vec_msum(qv3, vscales3, vsumi3); + } + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = 0.125f * vec_extract(vsumf0, 0); + +#else + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint8_t * signs = qs + QK_K/8; + + int bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf); + int ls2 = 1 + 2*(x[i].scales[ib32] >> 4); + int sumi1 = 0, sumi2 = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); + for (int j = 0; j < 8; ++j) { + sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); + for (int j = 0; j < 8; ++j) { + sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? 
-1 : 1); + } + q8 += 8; + } + bsum += ls1 * sumi1 + ls2 * sumi2; + qs += 4; + signs += 4; + } + + sumf += d * bsum; + } + + *s = 0.125f * sumf; + +#endif + +} + +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__POWER9_VECTOR__) + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + const vector int v0 = vec_splats((int32_t)0); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint32_t * GGML_RESTRICT signs = (const uint32_t *)(x[i].qs + QK_K/4); + const int8_t * GGML_RESTRICT q8 = y[i].qs; + +#pragma GCC unroll 1 + for (int j = 0; j < QK_K/32; j += 2) { + __builtin_prefetch(q3, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector unsigned int aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]}; + vector unsigned int aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]}; + vector unsigned int aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]}; + vector unsigned int aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]}; + q3 += 16; + + vector unsigned long long aux64x2_0 = {(uint64_t)(signs64[(signs[0] >> 0) & 127]), (uint64_t)(signs64[(signs[0] >> 7) & 127])}; + vector unsigned long long aux64x2_1 = {(uint64_t)(signs64[(signs[0] >> 14) & 127]), (uint64_t)(signs64[(signs[0] >> 21) & 127])}; + vector unsigned long long aux64x2_2 = {(uint64_t)(signs64[(signs[1] >> 0) & 127]), (uint64_t)(signs64[(signs[1] >> 7) & 127])}; + vector unsigned long long aux64x2_3 = {(uint64_t)(signs64[(signs[1] >> 14) & 127]), (uint64_t)(signs64[(signs[1] >> 21) & 127])}; + + vector signed char q3x0 = vec_mul((vector signed char)aux64x2_0, (vector signed char)aux32x4_0); + vector signed char q3x1 = vec_mul((vector signed char)aux64x2_1, (vector signed char)aux32x4_1); + vector signed char q3x2 = vec_mul((vector signed char)aux64x2_2, (vector signed char)aux32x4_2); + vector signed char q3x3 = vec_mul((vector signed char)aux64x2_3, (vector signed char)aux32x4_3); + + vector signed char q8y0 = vec_xl( 0, q8); + vector signed char q8y1 = vec_xl(16, q8); + vector signed char q8y2 = vec_xl(32, q8); + vector signed char q8y3 = vec_xl(48, q8); + q8 += 64; + + vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1)); + vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2)); + vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3)); + + const uint16_t ls0 = (uint16_t)(signs[0] >> 28); + const uint16_t ls1 = (uint16_t)(signs[1] >> 28); + signs += 2; + + vector 
signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1)); + vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1)); + + vsumi0 = vec_msum(qv0, vscales01, vsumi0); + vsumi1 = vec_msum(qv1, vscales01, vsumi1); + vsumi2 = vec_msum(qv2, vscales23, vsumi2); + vsumi3 = vec_msum(qv3, vscales23, vsumi3); + } + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = 0.25f * vec_extract(vsumf0, 0); + +#else + + uint32_t aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t); + const uint32_t ls = 2*(aux32 >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]); + const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]); + const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127]; + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + q3 += 8; + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.25f * sumf; +#endif +} + +void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__POWER9_VECTOR__) + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; + + const vector int v0 = vec_splats((int32_t)0); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + const vector unsigned char mask0 = vec_xl( 0, k_mask1); + const vector unsigned char mask1 = vec_xl(16, k_mask1); + const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2); + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].signs); + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; 
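+        // Four 32-bit integer accumulators run across the superblock; they are
+        // converted and merged into the float sums once per block via
+        // vec_ctf/vec_madd, keeping the inner loop free of float work.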
+ vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + + for (int j = 0; j < QK_K/32; j += 2) { + __builtin_prefetch(q3, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector unsigned int aux32x4_0 = {iq3s_grid[q3[ 0] | ((qh[0] << 8) & 256)], iq3s_grid[q3[ 1] | ((qh[0] << 7) & 256)], + iq3s_grid[q3[ 2] | ((qh[0] << 6) & 256)], iq3s_grid[q3[ 3] | ((qh[0] << 5) & 256)]}; + vector unsigned int aux32x4_1 = {iq3s_grid[q3[ 4] | ((qh[0] << 4) & 256)], iq3s_grid[q3[ 5] | ((qh[0] << 3) & 256)], + iq3s_grid[q3[ 6] | ((qh[0] << 2) & 256)], iq3s_grid[q3[ 7] | ((qh[0] << 1) & 256)]}; + vector unsigned int aux32x4_2 = {iq3s_grid[q3[ 8] | ((qh[1] << 8) & 256)], iq3s_grid[q3[ 9] | ((qh[1] << 7) & 256)], + iq3s_grid[q3[10] | ((qh[1] << 6) & 256)], iq3s_grid[q3[11] | ((qh[1] << 5) & 256)]}; + vector unsigned int aux32x4_3 = {iq3s_grid[q3[12] | ((qh[1] << 4) & 256)], iq3s_grid[q3[13] | ((qh[1] << 3) & 256)], + iq3s_grid[q3[14] | ((qh[1] << 2) & 256)], iq3s_grid[q3[15] | ((qh[1] << 1) & 256)]}; + q3 += 16; + qh += 2; + + vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]); + vector signed char vsigns02 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]); + signs += 4; + + vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0); + vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1); + vector signed char vsigns2 = vec_perm(vsigns02, vsigns02, mask0); + vector signed char vsigns3 = vec_perm(vsigns02, vsigns02, mask1); + + vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2); + vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2); + vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2); + vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2); + + vector signed char q3x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux32x4_0), vsigns0); + vector signed char q3x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux32x4_1), vsigns1); + vector signed char q3x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux32x4_2), vsigns2); + vector signed char q3x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux32x4_3), vsigns3); + + vector signed char q8y0 = vec_xl( 0, q8); + vector signed char q8y1 = vec_xl(16, q8); + vector signed char q8y2 = vec_xl(32, q8); + vector signed char q8y3 = vec_xl(48, q8); + q8 += 64; + + vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1)); + vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2)); + vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3)); + + const uint16_t ls0 = (uint16_t)(sc[0] & 0xf); + const uint16_t ls1 = (uint16_t)(sc[0] >> 4); + sc ++; + + vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1)); + vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1)); + + vsumi0 = vec_msum(qv0, vscales01, vsumi0); + vsumi1 = vec_msum(qv1, vscales01, vsumi1); + vsumi2 = vec_msum(qv2, vscales23, vsumi2); + vsumi3 = vec_msum(qv3, vscales23, vsumi3); + } + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 
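+    /* horizontal reduction: rotating the vector by 4 and then 8 bytes and
+       adding leaves the sum of all four float lanes in element 0, ready for
+       vec_extract. */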
4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#else + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint8_t * GGML_RESTRICT signs = x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1; + const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls1; + sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls2; + } + sumf += d * bsum; + } + *s = sumf; +#endif +} + +void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq1_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__POWER9_VECTOR__) + const vector unsigned char v0 = vec_splats((unsigned char)0x0); + const vector unsigned short vsign = vec_splats((unsigned short)0x8000); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + for (int i = 0; i < nb; ++i) { + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); + vector float vyd = vec_splats(y[i].d); + vector float vd = vec_mul(vxd, vyd); + + vector signed int vsumi0 = vec_splats((int32_t)0); + vector signed int vsumi1 = vec_splats((int32_t)0); + vector signed int vsumi2 = vec_splats((int32_t)0); + vector signed int vsumi3 = vec_splats((int32_t)0); + vector signed int vsumi8 = vec_splats((int32_t)0); + + const uint8_t * GGML_RESTRICT q1 = x[i].qs; + const uint16_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const int16_t * GGML_RESTRICT qs = y[i].bsums; + + for (int j = 0; j < QK_K/32; j += 2) { + __builtin_prefetch(q1, 0, 1); + __builtin_prefetch(qh, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector signed long long aux64x2_0 = {*(const int64_t *)(iq1s_grid + (q1[0] | ((qh[0] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[1] | ((qh[0] << 5) & 0x700)))}; + vector signed long long aux64x2_1 = {*(const int64_t *)(iq1s_grid + (q1[2] | ((qh[0] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[3] | ((qh[0] >> 1) & 0x700)))}; + vector signed long long aux64x2_2 = {*(const int64_t 
*)(iq1s_grid + (q1[4] | ((qh[1] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[5] | ((qh[1] << 5) & 0x700)))}; + vector signed long long aux64x2_3 = {*(const int64_t *)(iq1s_grid + (q1[6] | ((qh[1] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[7] | ((qh[1] >> 1) & 0x700)))}; + q1 += 8; + + vector signed char q1x0 = (vector signed char)aux64x2_0; + vector signed char q1x1 = (vector signed char)aux64x2_1; + vector signed char q1x2 = (vector signed char)aux64x2_2; + vector signed char q1x3 = (vector signed char)aux64x2_3; + + vector signed char q8y0 = vec_xl( 0, q8); + vector signed char q8y1 = vec_xl(16, q8); + vector signed char q8y2 = vec_xl(32, q8); + vector signed char q8y3 = vec_xl(48, q8); + q8 += 64; + + vector signed short qv0 = vec_add(vec_mule(q1x0, q8y0), vec_mulo(q1x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q1x1, q8y1), vec_mulo(q1x1, q8y1)); + vector signed short qv2 = vec_add(vec_mule(q1x2, q8y2), vec_mulo(q1x2, q8y2)); + vector signed short qv3 = vec_add(vec_mule(q1x3, q8y3), vec_mulo(q1x3, q8y3)); + + const uint16_t ls0 = (uint16_t)((qh[0] >> 12) & 7); + const uint16_t ls1 = (uint16_t)((qh[1] >> 12) & 7); + + vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1)); + vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1)); + vector signed short vscales = vec_sld(vscales23, vscales01, 8); + + vsumi0 = vec_msum(qv0, vscales01, vsumi0); + vsumi1 = vec_msum(qv1, vscales01, vsumi1); + vsumi2 = vec_msum(qv2, vscales23, vsumi2); + vsumi3 = vec_msum(qv3, vscales23, vsumi3); + + vector signed short q8ysums = vec_xl_len(qs, 8); + qs += 4; + q8ysums = vec_mergeh(q8ysums, (vector signed short)v0); + + vector signed short qxh = (vector signed short)vec_sld(vec_splats(qh[1]), vec_splats(qh[0]), 8); + qh += 2; + vector __bool short vsel = vec_cmpge(qxh, (vector signed short)v0); + + vector signed short q8ysum = vec_sel((vector signed short)vec_xor((vector unsigned short)q8ysums, vsign), q8ysums, vsel); + + vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8); + } + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + + vsumf0 = vec_madd(vec_ctf(vsumi8, 0), vec_mul(vd, vec_splats(IQ1S_DELTA)), vsumf0); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#else + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; + + int sumi = 0, sumi1 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + const int ls = 2*((qh[ib] >> 12) & 7) + 1; + const int delta = qh[ib] & 0x8000 ? 
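+            /* bit 15 of qh picks the sign of the per-block delta; its contribution
+               is added through y's precomputed bsums as the IQ1S_DELTA * sumi1
+               term instead of per-weight work. */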
-1 : 1; + int lsum = 0; + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8))); + for (int j = 0; j < 8; ++j) { + lsum += q8[j] * grid[j]; + } + q8 += 8; + } + sumi += ls * lsum; + sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]); + qs += 4; + } + + sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); + } + + *s = sumf; + +#endif +} + +void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK4_NL == 0); + static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); + + const block_iq4_nl * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + const int nb = n / QK4_NL; + + int ib = 0; + float sumf = 0; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector signed int v0 = vec_splats((int32_t)0); + const vector unsigned char v4 = vec_splats((unsigned char)0x4); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + + const vector signed char values = vec_xl( 0, kvalues_iq4nl); + +#pragma GCC unroll 4 + for (; ib < nb; ++ib) { + __builtin_prefetch(x[ib].qs, 0, 1); + __builtin_prefetch(y[ib].qs, 0, 1); + + + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); + vector float vd = vec_mul(vxd, vyd); + + vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); + vector signed char q4x0 = vec_and(qxs, lowMask); + vector signed char q4x1 = vec_sr(qxs, v4); + + q4x0 = vec_perm(values, values, (vector unsigned char)q4x0); + q4x1 = vec_perm(values, values, (vector unsigned char)q4x1); + + vector signed char q8y0 = vec_xl( 0, y[ib].qs); + vector signed char q8y1 = vec_xl(16, y[ib].qs); + + vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1)); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + + vsumi0 = vec_sum4s(qv0, vsumi0); + vsumi1 = vec_sum4s(qv1, vsumi1); + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + } + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + sumf = vec_extract(vsumf0, 0); + +#endif + for (; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; + sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4]; + } + sumf += d * (sumi1 + sumi2); + } + *s = sumf; +} + +void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK_K == 0); + + const block_iq4_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__POWER9_VECTOR__) + const vector signed char lowMask = vec_splats((signed char)0xF); + const vector int v0 = vec_splats((int32_t)0); + const vector unsigned 
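+    /* kvalues_iq4nl is the 16-entry non-linear codebook; loading it into a
+       vector register just below lets vec_perm act as a full 16-way table
+       lookup per nibble. */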
char v4 = vec_splats((unsigned char)0x4); + + vector float vsumf0 = vec_splats(0.0f); + vector float vsumf1 = vec_splats(0.0f); + vector float vsumf2 = vec_splats(0.0f); + vector float vsumf3 = vec_splats(0.0f); + + const vector signed char values = vec_xl( 0, kvalues_iq4nl); + + for (int ibl = 0; ibl < nb; ++ibl) { + + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ibl].d)); + vector float vyd = vec_splats(y[ibl].d); + vector float vd = vec_mul(vxd, vyd); + + vector signed int vsumi0 = v0; + vector signed int vsumi1 = v0; + vector signed int vsumi2 = v0; + vector signed int vsumi3 = v0; + + uint16_t h = x[ibl].scales_h; + + const uint8_t * GGML_RESTRICT q4 = x[ibl].qs; + const uint8_t * GGML_RESTRICT sc = x[ibl].scales_l; + const int8_t * GGML_RESTRICT q8 = y[ibl].qs; + + for (int ib = 0; ib < QK_K/64; ib ++ ) { + __builtin_prefetch(q4, 0, 1); + __builtin_prefetch(q8, 0, 1); + + vector signed char qxs0 = (vector signed char)vec_xl( 0, q4); + vector signed char qxs1 = (vector signed char)vec_xl(16, q4); + q4 += 32; + + vector signed char q4x00 = (vector signed char)vec_and(qxs0, lowMask); + vector signed char q4x01 = (vector signed char)vec_sr(qxs0, v4); + vector signed char q4x10 = (vector signed char)vec_and(qxs1, lowMask); + vector signed char q4x11 = (vector signed char)vec_sr(qxs1, v4); + + q4x00 = vec_perm(values, values, (vector unsigned char)q4x00); + q4x01 = vec_perm(values, values, (vector unsigned char)q4x01); + q4x10 = vec_perm(values, values, (vector unsigned char)q4x10); + q4x11 = vec_perm(values, values, (vector unsigned char)q4x11); + + vector signed char q8y0 = vec_xl( 0, q8); + vector signed char q8y1 = vec_xl(16, q8); + vector signed char q8y2 = vec_xl(32, q8); + vector signed char q8y3 = vec_xl(48, q8); + q8 += 64; + + vector signed short qv0 = vec_add(vec_mule(q4x00, q8y0), vec_mulo(q4x00, q8y0)); + vector signed short qv1 = vec_add(vec_mule(q4x01, q8y1), vec_mulo(q4x01, q8y1)); + vector signed short qv2 = vec_add(vec_mule(q4x10, q8y2), vec_mulo(q4x10, q8y2)); + vector signed short qv3 = vec_add(vec_mule(q4x11, q8y3), vec_mulo(q4x11, q8y3)); + + const uint16_t ls0 = (uint16_t)(((sc[0] & 0xf) | ((h << 4) & 0x30)) - 32); + const uint16_t ls1 = (uint16_t)(((sc[0] >> 4) | ((h << 2) & 0x30)) - 32); + h >>= 4; + sc ++; + + vector signed short vscales01 = vec_splats((int16_t)ls0); + vector signed short vscales23 = vec_splats((int16_t)ls1); + + vsumi0 = vec_msum(qv0, vscales01, vsumi0); + vsumi1 = vec_msum(qv1, vscales01, vsumi1); + vsumi2 = vec_msum(qv2, vscales23, vsumi2); + vsumi3 = vec_msum(qv3, vscales23, vsumi3); + } + + vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); + vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); + vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); + vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); + } + + vsumf0 = vec_add(vsumf0, vsumf2); + vsumf1 = vec_add(vsumf1, vsumf3); + + vsumf0 = vec_add(vsumf0, vsumf1); + + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); + vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); + + *s = vec_extract(vsumf0, 0); + +#else + float sumf = 0; + for (int ibl = 0; ibl < nb; ++ibl) { + const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + uint16_t h = x[ibl].scales_h; + const uint8_t * qs = x[ibl].qs; + const int8_t * q8 = y[ibl].qs; + for (int ib = 0; ib < QK_K/32; ib += 2) { + const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30); + const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30); + h >>= 4; + const float d1 = d4d8*(ls1 - 32); + const float d2 = 
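+            /* iq4_xs block scales are 6 bits: a low nibble from scales_l plus two
+               bits shifted out of scales_h, biased by 32 so scales can be negative. */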
d4d8*(ls2 - 32); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < 16; ++j) { + sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf]; + sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4]; + } + sumf += d1 * (sumi1 + sumi2); + qs += 16; + q8 += 32; + sumi1 = sumi2 = 0; + for (int j = 0; j < 16; ++j) { + sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf]; + sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4]; + } + sumf += d2 * (sumi1 + sumi2); + qs += 16; + q8 += 32; + } + } + *s = sumf; +#endif +} + diff --git a/ggml/src/ggml-cpu/arch/riscv/quants.c b/ggml/src/ggml-cpu/arch/riscv/quants.c new file mode 100644 index 0000000000000..8b64d8adc48f4 --- /dev/null +++ b/ggml/src/ggml-cpu/arch/riscv/quants.c @@ -0,0 +1,2069 @@ +#define GGML_COMMON_IMPL_C +#include "ggml-common.h" +#include "ggml-quants.h" +#include "ggml-impl.h" +#include "ggml-cpu.h" +#include "simd-mappings.h" + +#include "../../quants.h" +#include "../../ggml-cpu-impl.h" + +#include <math.h> +#include <string.h> +#include <assert.h> +#include <float.h> +#include <stdlib.h> // for qsort +#include <stdio.h> // for GGML_ASSERT + +#define GROUP_MAX_EPS 1e-15f +#define GROUP_MAX_EPS_IQ3_XXS 1e-8f +#define GROUP_MAX_EPS_IQ2_S 1e-8f +#define GROUP_MAX_EPS_IQ1_M 1e-7f +#define GROUP_MAX_EPS_IQ1_S 1e-12f + +#define UNUSED GGML_UNUSED + +void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__riscv_v) + + size_t vl = QK8_0; + + for (int i = 0; i < nb; i++) { + // load elements + vfloat32m8_t v_x = __riscv_vle32_v_f32m8(x+i*QK8_0, vl); + + vfloat32m8_t vfabs = __riscv_vfabs_v_f32m8(v_x, vl); + vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m8_f32m1(vfabs, tmp, vl); + float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_CPU_FP32_TO_FP16(d); + + vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl); + + // convert to integer + vint16m4_t vi = __riscv_vfncvt_x_f_w_i16m4(x0, vl); + vint8m2_t vs = __riscv_vncvt_x_x_w_i8m2(vi, vl); + + // store result + __riscv_vse8_v_i8m2(y[i].qs , vs, vl); + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_0_ref(x, y, k); +#endif +} + +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + block_q8_1 * GGML_RESTRICT y = vy; + +#if defined(__riscv_v) + + size_t vl = QK8_1; + + for (int i = 0; i < nb; i++) { + // load elements + vfloat32m8_t v_x = __riscv_vle32_v_f32m8(x+i*QK8_1, vl); + + vfloat32m8_t vfabs = __riscv_vfabs_v_f32m8(v_x, vl); + vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0, vl); + vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m8_f32m1(vfabs, tmp, vl); + float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 
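+        /* d = amax/127 maps the block's largest magnitude onto the int8 range;
+           id is its guarded reciprocal (0 for an all-zero block, avoiding a
+           divide by zero in the multiply below). */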
1.0f/d : 0.0f; + + y[i].d = GGML_CPU_FP32_TO_FP16(d); + + vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl); + + // convert to integer + vint16m4_t vi = __riscv_vfncvt_x_f_w_i16m4(x0, vl); + vint8m2_t vs = __riscv_vncvt_x_x_w_i8m2(vi, vl); + + // store result + __riscv_vse8_v_i8m2(y[i].qs , vs, vl); + + // compute sum for y[i].s + vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl); + vint16m1_t vwrs = __riscv_vwredsum_vs_i8m2_i16m1(vs, tmp2, vl); + + // set y[i].s + int sum = __riscv_vmv_x_s_i16m1_i16(vwrs); + y[i].s = GGML_CPU_FP32_TO_FP16(sum*d); + } + +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_1_ref(x, y, k); +#endif +} + +//===================================== Dot products ================================= + +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__riscv_v) + size_t vl = qk / 2; + + for (; ib < nb; ++ib) { + // load elements + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[ib].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[ib].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[ib].qs+16, vl); + + // mask and store lower part of x, and then upper part + vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + // subtract offset + vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl); + vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmacc_vv_i16m2(vec_mul1, v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); + } + +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F) - 8; + const int v1 = (x[ib].qs[j] >> 4) - 8; + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); + } + + *s = sumf; +} + +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__riscv_v) + size_t vl = qk / 2; + + for (; ib < nb; ++ib) { + // load elements + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[ib].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[ib].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[ib].qs+16, vl); + + // mask and store lower part of x, and then upper part + vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vint8m1_t v0 = 
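+        /* unlike q4_0 there is no -8 offset here: q4_1 nibbles stay unsigned and
+           the block minimum is folded in afterwards through the m * s term. */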
__riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmacc_vv_i16m2(vec_mul1, v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F); + const int v1 = (x[ib].qs[j] >> 4); + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__riscv_v) + size_t vl; + size_t vlenb = __riscv_vlenb(); + + for (; ib < nb; ++ib) { + vl = qk / 2; + vuint8m1_t v0 = __riscv_vle8_v_u8m1(x[ib].qs, vl); + vint8m1_t v0l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(v0, 0x0F, vl)); + vint8m1_t v0h = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(v0, 4, vl)); + vint8m2_t v0c; + if (vlenb == 16) { + v0c = __riscv_vcreate_v_i8m1_i8m2(v0l, v0h); + } else { + v0l = __riscv_vslideup_vx_i8m1(v0l, v0h, 16, 32); + v0c = __riscv_vlmul_ext_v_i8m1_i8m2(v0l); + } + + vl = qk; + vbool4_t qh = __riscv_vlm_v_b4(x[ib].qh, vl); + qh = __riscv_vmnand_mm_b4(qh, qh, vl); + vint8m2_t v0f = __riscv_vsub_vx_i8m2_mu(qh, v0c, v0c, 0x10, vl); + vint8m2_t v1 = __riscv_vle8_v_i8m2(y[ib].qs, vl); + vint16m4_t mul = __riscv_vwmul_vv_i16m4(v0f, v1, vl); + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl); + int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum); + + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; + } + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16); + const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16); + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; + } + + *s = sumf; +} + +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_1); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + 
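+    // q5_1 mirrors the q5_0 kernel above, except the qh bit ORs in +16
+    // where it is set instead of subtracting 16 where it is clear, and the
+    // block minimum enters through the m * y[ib].s term as in q4_1.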
UNUSED(bs); + + const block_q5_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + +#if defined(__riscv_v) + size_t vl; + size_t vlenb = __riscv_vlenb(); + + for (; ib < nb; ++ib) { + vl = qk / 2; + vuint8m1_t v0 = __riscv_vle8_v_u8m1(x[ib].qs, vl); + vint8m1_t v0l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(v0, 0x0F, vl)); + vint8m1_t v0h = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(v0, 4, vl)); + vint8m2_t v0c; + if (vlenb == 16) { + v0c = __riscv_vcreate_v_i8m1_i8m2(v0l, v0h); + } else { + v0l = __riscv_vslideup_vx_i8m1(v0l, v0h, 16, 32); + v0c = __riscv_vlmul_ext_v_i8m1_i8m2(v0l); + } + + vl = qk; + vbool4_t qh = __riscv_vlm_v_b4(x[ib].qh, vl); + vint8m2_t v0f = __riscv_vor_vx_i8m2_mu(qh, v0c, v0c, 0x10, vl); + vint8m2_t v1 = __riscv_vle8_v_i8m2(y[ib].qs, vl); + vint16m4_t mul = __riscv_vwmul_vv_i16m4(v0f, v1, vl); + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl); + int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum); + + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1; + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q8_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__riscv_v) + size_t vl = qk; + + for (; ib < nb; ++ib) { + // load elements + vint8m2_t bx_0 = __riscv_vle8_v_i8m2(x[ib].qs, vl); + vint8m2_t by_0 = __riscv_vle8_v_i8m2(y[ib].qs, vl); + + vint16m4_t vw_mul = __riscv_vwmul_vv_i16m4(bx_0, by_0, vl); + + vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t v_sum = __riscv_vwredsum_vs_i16m4_i32m1(vw_mul, v_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); + + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); + } + +#endif + for (; ib < nb; ++ib) { + int sumi = 0; + + for (int j = 0; j < qk; j++) { + sumi += x[ib].qs[j]*y[ib].qs[j]; + } + + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); + } + + *s = sumf; +} + +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q2_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __riscv_xtheadvector + + float sumf = 0; + uint8_t atmp[16]; + + for (int i = 0; i < nb; ++i) { + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = 
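+        // q2_K: 16 sub-blocks of 16 values; each scales[] byte packs a
+        // 4-bit scale (low nibble) and a 4-bit min (high nibble). The asm
+        // below dots the mins against the q8 block sums (bsums) to form
+        // the dmin correction before the 2-bit quants are multiplied in.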
y[i].qs; + const uint8_t * sc = x[i].scales; + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + uint8_t *patmp = atmp; + int vsums; + int tmp; + __asm__ __volatile__( + "th.vsetvli zero, %[vl16], e8, m1\n\t" + "th.vmv.v.x v8, zero\n\t" + "th.vlb.v v1, (%[sc])\n\t" + "th.vand.vi v0, v1, 0xF\n\t" + "th.vsrl.vi v1, v1, 4\n\t" + "th.vsb.v v0, (%[scale])\n\t" + "th.vwaddu.vx v16, v1, zero\n\t" + "th.vsetvli zero, %[vl16], e16, m2\n\t" + "th.vlh.v v2, (%[bsums])\n\t" + "th.vwmul.vv v4, v16, v2\n\t" + "th.vsetvli zero, %[vl16], e32, m4\n\t" + "th.vredsum.vs v8, v4, v8\n\t" + "th.vmv.x.s %[vsums], v8" + : [tmp] "=&r" (tmp), [vsums] "=&r" (vsums) + : [sc] "r" (sc), [scale] "r" (atmp), [bsums] "r" (y[i].bsums) + , [vl16] "r" (16) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + sumf += dmin * vsums; + int isum = 0; + + for (int j = 0; j < QK_K/128; ++j) { + __asm__ __volatile__( + "th.vsetvli zero, %[vl32], e8, m2\n\t" + "th.vlb.v v0, (%[q2])\n\t" + "th.vsrl.vi v2, v0, 2\n\t" + "th.vsrl.vi v4, v0, 4\n\t" + "th.vsrl.vi v6, v0, 6\n\t" + "th.vand.vi v0, v0, 0x3\n\t" + "th.vand.vi v2, v2, 0x3\n\t" + "th.vand.vi v4, v4, 0x3\n\t" + "th.vsetvli zero, %[vl128], e8, m8\n\t" + "th.vlb.v v8, (%[q8])\n\t" + "th.vsetvli zero, %[vl64], e8, m4\n\t" + "th.vwmul.vv v16, v0, v8\n\t" + "th.vwmul.vv v24, v4, v12\n\t" + "th.vsetvli zero, %[vl16], e16, m2\n\t" + "th.vmv.v.x v0, zero\n\t" + "th.vwredsum.vs v10, v16, v0\n\t" + "th.vwredsum.vs v9, v18, v0\n\t" + "th.vwredsum.vs v8, v20, v0\n\t" + "th.vwredsum.vs v7, v22, v0\n\t" + "th.vwredsum.vs v11, v24, v0\n\t" + "th.vwredsum.vs v12, v26, v0\n\t" + "th.vwredsum.vs v13, v28, v0\n\t" + "th.vwredsum.vs v14, v30, v0\n\t" + "li %[tmp], 4\n\t" + "th.vsetvli zero, %[tmp], e32, m1\n\t" + "th.vslideup.vi v10, v9, 1\n\t" + "th.vslideup.vi v8, v7, 1\n\t" + "th.vslideup.vi v11, v12, 1\n\t" + "th.vslideup.vi v13, v14, 1\n\t" + "th.vslideup.vi v10, v8, 2\n\t" + "th.vslideup.vi v11, v13, 2\n\t" + "li %[tmp], 8\n\t" + "th.vsetvli zero, %[tmp], e32, m2\n\t" + "th.vlbu.v v12, (%[scale])\n\t" + "th.vmul.vv v10, v10, v12\n\t" + "th.vredsum.vs v0, v10, v0\n\t" + "th.vmv.x.s %[tmp], v0\n\t" + "add %[isum], %[isum], %[tmp]" + : [tmp] "=&r" (tmp), [isum] "+&r" (isum) + : [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8) + , [vl16] "r" (16), [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + q2 += 32; q8 += 128; patmp += 8; + } + + sumf += dall * isum; + } + + *s = sumf; + +#elif defined __riscv_v + + float sumf = 0; + uint8_t atmp[16]; + + const int vector_length = __riscv_vlenb() * 8; + uint8_t temp_01[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; + + switch (vector_length) { + case 256: + for (int i = 0; i < nb; ++i) { + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + size_t vl = 16; + + vuint8m1_t scales = __riscv_vle8_v_u8m1(sc, vl); + 
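+            // low nibbles -> per-sub-block scales (aux); high nibbles ->
+            // mins, widened and dotted with bsums in a single reduction.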
vuint8m1_t aux = __riscv_vand_vx_u8m1(scales, 0x0F, vl); + + vint16m1_t q8sums = __riscv_vle16_v_i16m1(y[i].bsums, vl); + + vuint8mf2_t scales_2 = __riscv_vle8_v_u8mf2(sc, vl); + vuint8mf2_t mins8 = __riscv_vsrl_vx_u8mf2(scales_2, 0x4, vl); + vint16m1_t mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl)); + vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, mins, vl); + vint32m1_t vsums = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); + + sumf += dmin * __riscv_vmv_x_s_i32m1_i32(vsums); + + vl = 32; + + vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); + vuint8m1_t v_b = __riscv_vle8_v_u8m1(temp_01, vl); + + uint8_t is = 0; + int isum = 0; + + for (int j = 0; j < QK_K / 128; ++j) { + // load Q2 + vuint8m1_t q2_x = __riscv_vle8_v_u8m1(q2, vl); + + vuint8m1_t q2_0 = __riscv_vand_vx_u8m1(q2_x, 0x03, vl); + vuint8m1_t q2_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x2, vl), 0x03, vl); + vuint8m1_t q2_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x4, vl), 0x03, vl); + vuint8m1_t q2_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x6, vl), 0x03, vl); + + // duplicate scale elements for product + vuint8m1_t sc0 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 0 + is, vl), vl); + vuint8m1_t sc1 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 2 + is, vl), vl); + vuint8m1_t sc2 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 4 + is, vl), vl); + vuint8m1_t sc3 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 6 + is, vl), vl); + + vint16m2_t p0 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_0, sc0, vl)); + vint16m2_t p1 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_1, sc1, vl)); + vint16m2_t p2 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_2, sc2, vl)); + vint16m2_t p3 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_3, sc3, vl)); + + // load Q8 + vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl); + vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8 + 32, vl); + vint8m1_t q8_2 = __riscv_vle8_v_i8m1(q8 + 64, vl); + vint8m1_t q8_3 = __riscv_vle8_v_i8m1(q8 + 96, vl); + + vint32m4_t s0 = __riscv_vwmul_vv_i32m4(p0, __riscv_vwcvt_x_x_v_i16m2(q8_0, vl), vl); + vint32m4_t s1 = __riscv_vwmul_vv_i32m4(p1, __riscv_vwcvt_x_x_v_i16m2(q8_1, vl), vl); + vint32m4_t s2 = __riscv_vwmul_vv_i32m4(p2, __riscv_vwcvt_x_x_v_i16m2(q8_2, vl), vl); + vint32m4_t s3 = __riscv_vwmul_vv_i32m4(p3, __riscv_vwcvt_x_x_v_i16m2(q8_3, vl), vl); + + vint32m1_t isum0 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s0, s1, vl), vzero, vl); + vint32m1_t isum1 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s2, s3, vl), isum0, vl); + + isum += __riscv_vmv_x_s_i32m1_i32(isum1); + + q2 += 32; + q8 += 128; + is = 8; + } + + sumf += dall * isum; + } + break; + case 128: + for (int i = 0; i < nb; ++i) { + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + uint8_t *patmp = atmp; + int vsums; + int tmp; + __asm__ __volatile__( + "vsetivli zero, 16, e8, m1\n\t" + "vmv.v.x v8, zero\n\t" + "vle8.v v1, (%[sc])\n\t" + "vand.vi v0, v1, 0xF\n\t" + "vsrl.vi v1, v1, 4\n\t" + "vse8.v v0, (%[scale])\n\t" + "vsetivli zero, 16, e16, m2\n\t" + "vle16.v v2, (%[bsums])\n\t" + "vzext.vf2 v0, v1\n\t" + "vwmul.vv v4, v0, v2\n\t" + "vsetivli zero, 16, e32, m4\n\t" + "vredsum.vs v8, v4, v8\n\t" + "vmv.x.s %[vsums], v8" + : [tmp] "=&r" 
(tmp), [vsums] "=&r" (vsums) + : [sc] "r" (sc), [scale] "r" (atmp), [bsums] "r" (y[i].bsums) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + sumf += dmin * vsums; + int isum = 0; + + for (int j = 0; j < QK_K/128; ++j) { + __asm__ __volatile__( + "vsetvli zero, %[vl32], e8, m2\n\t" + "vle8.v v0, (%[q2])\n\t" + "vsrl.vi v2, v0, 2\n\t" + "vsrl.vi v4, v0, 4\n\t" + "vsrl.vi v6, v0, 6\n\t" + "vand.vi v0, v0, 0x3\n\t" + "vand.vi v2, v2, 0x3\n\t" + "vand.vi v4, v4, 0x3\n\t" + "vsetvli zero, %[vl128], e8, m8\n\t" + "vle8.v v8, (%[q8])\n\t" + "vsetvli zero, %[vl64], e8, m4\n\t" + "vwmul.vv v16, v0, v8\n\t" + "vwmul.vv v24, v4, v12\n\t" + "vsetivli zero, 16, e16, m2\n\t" + "vmv.v.x v0, zero\n\t" + "vwredsum.vs v10, v16, v0\n\t" + "vwredsum.vs v9, v18, v0\n\t" + "vwredsum.vs v8, v20, v0\n\t" + "vwredsum.vs v7, v22, v0\n\t" + "vwredsum.vs v11, v24, v0\n\t" + "vwredsum.vs v12, v26, v0\n\t" + "vwredsum.vs v13, v28, v0\n\t" + "vwredsum.vs v14, v30, v0\n\t" + "vsetivli zero, 4, e32, m1\n\t" + "vslideup.vi v10, v9, 1\n\t" + "vslideup.vi v8, v7, 1\n\t" + "vslideup.vi v11, v12, 1\n\t" + "vslideup.vi v13, v14, 1\n\t" + "vslideup.vi v10, v8, 2\n\t" + "vslideup.vi v11, v13, 2\n\t" + "vsetivli zero, 8, e32, m2\n\t" + "vle8.v v15, (%[scale])\n\t" + "vzext.vf4 v12, v15\n\t" + "vmul.vv v10, v10, v12\n\t" + "vredsum.vs v0, v10, v0\n\t" + "vmv.x.s %[tmp], v0\n\t" + "add %[isum], %[isum], %[tmp]" + : [tmp] "=&r" (tmp), [isum] "+&r" (isum) + : [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8) + , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + q2 += 32; q8 += 128; patmp += 8; + } + + sumf += dall * isum; + } + break; + default: + assert(false && "Unsupported vector length"); + break; + } + + *s = sumf; + +#else + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + int summs = 0; + for (int j = 0; j < 16; ++j) { + summs += y[i].bsums[j] * (sc[j] >> 4); + } + + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + int isum = 0; + int is = 0; + int d; + for (int k = 0; k < QK_K/128; ++k) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + d = sc[is++] & 0xF; + int isuml = 0; + for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + d = sc[is++] & 0xF; + isuml = 0; + for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + shift += 2; + q8 += 32; + } + q2 += 32; + } + sumf += dall * isum - dmin * summs; + } + *s = sumf; +#endif +} + +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + const block_q3_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __riscv_xtheadvector + + uint32_t 
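+    // q3_K: 2-bit quants plus a separate high-bit mask (hmask), with 16
+    // 6-bit signed scales packed into 12 bytes; the kmask1/kmask2 shuffle
+    // below unpacks them and applies the common -32 offset.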
utmp[4]; + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q3 = x[i].qs; + const uint8_t * restrict qh = x[i].hmask; + const int8_t * restrict q8 = y[i].qs; + + int8_t * scale = (int8_t *)utmp; + int tmp; + __asm__ __volatile__( + "li %[tmp], 12\n\t" + "th.vsetvli zero, %[tmp], e8, m1\n\t" + "th.vlb.v v0, (%[s6b])\n\t" + "th.vmv.v.v v2, v0\n\t" + "li %[tmp], 2\n\t" + "th.vsetvli zero, %[tmp], e64, m1\n\t" + "th.vmv.v.x v9, %[sh]\n\t"\ + "th.vslidedown.vi v1, v0, 1\n\t" + "th.vslide1up.vx v8, v9, zero\n\t" // {0, 0, 4, 4} + "th.vslideup.vi v0, v2, 1\n\t" // {aux[0], aux[1], aux[0], aux[1]} + "li %[tmp], 4\n\t" + "th.vsetvli zero, %[tmp], e32, m1\n\t" + "th.vid.v v9\n\t" + "th.vmv.x.s %[tmp], v1\n\t" + "th.vsll.vi v9, v9, 1\n\t" // {0, 2, 4, 6} + "th.vmv.v.x v1, %[tmp]\n\t" // {aux[2], aux[2], aux[2], aux[2]} + "th.vsrl.vv v4, v1, v9\n\t" + "th.vsrl.vv v2, v0, v8\n\t" + "th.vand.vx v5, v4, %[kmask1]\n\t" + "th.vand.vx v3, v2, %[kmask2]\n\t" + "th.vsll.vi v6, v5, 4\n\t" + "th.vor.vv v7, v6, v3\n\t" + "li %[tmp], 16\n\t" + "th.vsetvli zero, %[tmp], e8, m1\n\t" + "th.vsub.vx v0, v7, %[c]\n\t" + "th.vsb.v v0, (%[scale])" + : [tmp] "=&r" (tmp) + : [sh] "r" (0x0000000400000004), [s6b] "r" (x[i].scales), [c] "r" (32) + , [scale] "r" (scale), [kmask1] "r" (kmask1), [kmask2] "r" (kmask2) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + + uint8_t m = 1; + int isum = 0; + for (int j = 0; j < QK_K; j += 128) { + __asm__ __volatile__( + // fixme: use v0p7 mask layout directly + "th.vsetvli zero, %[vl32], e8, m2\n\t" + "th.vlb.v v8, (%[q3])\n\t" + "th.vsrl.vi v10, v8, 2\n\t" + "th.vsrl.vi v12, v8, 4\n\t" + "th.vsrl.vi v14, v8, 6\n\t" + "th.vand.vi v8, v8, 3\n\t" + "th.vand.vi v10, v10, 3\n\t" + "th.vand.vi v12, v12, 3\n\t" + "th.vlb.v v2, (%[qh])\n\t" + "th.vand.vx v4, v2, %[m]\n\t" + "slli %[m], %[m], 1\n\t" + "th.vmseq.vx v0, v4, zero\n\t" + "th.vadd.vi v8, v8, -4, v0.t\n\t" + "th.vand.vx v4, v2, %[m]\n\t" + "slli %[m], %[m], 1\n\t" + "th.vmseq.vx v0, v4, zero\n\t" + "th.vadd.vi v10, v10, -4, v0.t\n\t" + "th.vand.vx v4, v2, %[m]\n\t" + "slli %[m], %[m], 1\n\t" + "th.vmseq.vx v0, v4, zero\n\t" + "th.vadd.vi v12, v12, -4, v0.t\n\t" + "th.vand.vx v4, v2, %[m]\n\t" + "slli %[m], %[m], 1\n\t" + "th.vmseq.vx v0, v4, zero\n\t" + "th.vadd.vi v14, v14, -4, v0.t\n\t" + "th.vsetvli zero, %[vl128], e8, m8\n\t" + "th.vlb.v v0, (%[q8])\n\t" + "th.vsetvli zero, %[vl64], e8, m4\n\t" + "th.vwmul.vv v16, v0, v8\n\t" + "th.vwmul.vv v24, v4, v12\n\t" + "li %[tmp], 16\n\t" + "th.vsetvli zero, %[tmp], e16, m2\n\t" + "th.vmv.v.x v0, zero\n\t" + "th.vwredsum.vs v10, v16, v0\n\t" + "th.vwredsum.vs v9, v18, v0\n\t" + "th.vwredsum.vs v8, v20, v0\n\t" + "th.vwredsum.vs v7, v22, v0\n\t" + "th.vwredsum.vs v11, v24, v0\n\t" + "th.vwredsum.vs v12, v26, v0\n\t" + "th.vwredsum.vs v13, v28, v0\n\t" + "th.vwredsum.vs v14, v30, v0\n\t" + "li %[tmp], 4\n\t" + "th.vsetvli zero, %[tmp], e32, m1\n\t" + "th.vslideup.vi v10, v9, 1\n\t" + "th.vslideup.vi v8, v7, 1\n\t" + "th.vslideup.vi v11, v12, 1\n\t" + "th.vslideup.vi v13, v14, 1\n\t" + "th.vslideup.vi v10, v8, 2\n\t" + "th.vslideup.vi v11, v13, 2\n\t" + "li %[tmp], 8\n\t" + "th.vsetvli zero, %[tmp], e32, m2\n\t" + "th.vlb.v v12, (%[scale])\n\t" + "th.vmul.vv v10, v10, v12\n\t" + "th.vredsum.vs v0, v10, v0\n\t" + "th.vmv.x.s %[tmp], v0\n\t" + "add %[isum], %[isum], 
%[tmp]" + : [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum) + : [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32) + , [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + q3 += 32; q8 += 128; scale += 8; + } + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + sumf += d * isum; + } + + *s = sumf; + +#elif defined __riscv_v + + uint32_t utmp[4]; + float sumf = 0; + uint32_t aux[3]; + const int vector_length = __riscv_vlenb() * 8; + + switch (vector_length) { + case 256: + for (int i = 0; i < nb; ++i) { + + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + memcpy(aux, x[i].scales, 12); + utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); + utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); + utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); + utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); + + int8_t * scale = (int8_t *)utmp; + for (int j = 0; j < 16; ++j) scale[j] -= 32; + + + size_t vl = 32; + uint8_t m = 1; + + vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); + vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl); + + int sum_t = 0; + + for (int j = 0; j < QK_K; j += 128) { + + vl = 32; + + // load Q3 + vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl); + + vint8m1_t q3_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl)); + vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl)); + vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl)); + vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl)); + + // compute mask for subtraction + vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl); + vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl); + vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl); + m <<= 1; + + vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl); + vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl); + vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl); + m <<= 1; + + vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl); + vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl); + vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl); + m <<= 1; + + vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl); + vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl); + vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl); + m <<= 1; + + // load Q8 and take product with Q3 + vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl); + vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl); + vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl); + vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl); + + vl = 16; + + // retrieve lane to multiply with scale + vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl); + vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl); + vint32m2_t aux1_0 = 
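+                // a0..a3 each hold 32 widened products; every 16-lane half
+                // is weighted by its own 6-bit scale, so scale[0..7] cover
+                // the 128 values of this iteration.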
__riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl); + vint32m2_t aux1_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 1), (scale[3]), vl); + vint32m2_t aux2_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 0), (scale[4]), vl); + vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl); + vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl); + vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl); + + vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl); + vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl); + vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl); + vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl); + + sum_t += __riscv_vmv_x_s_i32m1_i32(isum3); + + q3 += 32; q8 += 128; scale += 8; + + } + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + + sumf += d*sum_t; + + } + break; + case 128: + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q3 = x[i].qs; + const uint8_t * restrict qh = x[i].hmask; + const int8_t * restrict q8 = y[i].qs; + + int8_t * scale = (int8_t *)utmp; + int tmp; + __asm__ __volatile__( + "vsetivli zero, 12, e8, m1\n\t" + "vle8.v v0, (%[s6b])\n\t" + "vmv1r.v v2, v0\n\t" + "vsetivli zero, 2, e64, m1\n\t" + "vmv.v.x v9, %[sh]\n\t"\ + "vslidedown.vi v1, v0, 1\n\t" + "vslide1up.vx v8, v9, zero\n\t" // {0, 0, 4, 4} + "vslideup.vi v0, v2, 1\n\t" // {aux[0], aux[1], aux[0], aux[1]} + "vsetivli zero, 4, e32, m1\n\t" + "vid.v v9\n\t" + "vmv.x.s %[tmp], v1\n\t" + "vsll.vi v9, v9, 1\n\t" // {0, 2, 4, 6} + "vmv.v.x v1, %[tmp]\n\t" // {aux[2], aux[2], aux[2], aux[2]} + "vsrl.vv v4, v1, v9\n\t" + "vsrl.vv v2, v0, v8\n\t" + "vand.vx v5, v4, %[kmask1]\n\t" + "vand.vx v3, v2, %[kmask2]\n\t" + "vsll.vi v6, v5, 4\n\t" + "vor.vv v7, v6, v3\n\t" + "vsetivli zero, 16, e8, m1\n\t" + "vsub.vx v0, v7, %[c]\n\t" + "vse8.v v0, (%[scale])" + : [tmp] "=&r" (tmp) + : [sh] "r" (0x0000000400000004), [s6b] "r" (x[i].scales), [c] "r" (32) + , [scale] "r" (scale), [kmask1] "r" (kmask1), [kmask2] "r" (kmask2) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + + uint8_t m = 1; + int isum = 0; + for (int j = 0; j < QK_K; j += 128) { + __asm__ __volatile__( + "vsetvli zero, %[vl32], e8, m2, ta, mu\n\t" + "vle8.v v8, (%[q3])\n\t" + "vsrl.vi v10, v8, 2\n\t" + "vsrl.vi v12, v8, 4\n\t" + "vsrl.vi v14, v8, 6\n\t" + "vand.vi v8, v8, 3\n\t" + "vand.vi v10, v10, 3\n\t" + "vand.vi v12, v12, 3\n\t" + "vle8.v v2, (%[qh])\n\t" + "vand.vx v4, v2, %[m]\n\t" + "slli %[m], %[m], 1\n\t" + "vmseq.vx v0, v4, zero\n\t" + "vadd.vi v8, v8, -4, v0.t\n\t" + "vand.vx v4, v2, %[m]\n\t" + "slli %[m], %[m], 1\n\t" + "vmseq.vx v0, v4, zero\n\t" + "vadd.vi v10, v10, -4, v0.t\n\t" + "vand.vx v4, v2, %[m]\n\t" + "slli %[m], %[m], 1\n\t" + "vmseq.vx v0, v4, zero\n\t" + "vadd.vi v12, v12, -4, v0.t\n\t" + "vand.vx v4, v2, %[m]\n\t" + "slli %[m], %[m], 1\n\t" + "vmseq.vx v0, v4, zero\n\t" + "vadd.vi v14, v14, -4, v0.t\n\t" + "vsetvli zero, %[vl128], e8, m8\n\t" + "vle8.v v0, (%[q8])\n\t" + "vsetvli zero, %[vl64], e8, m4\n\t" + "vwmul.vv v16, v0, v8\n\t" + 
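+                // the two widening byte multiplies fill v16..v31 with
+                // 16-bit products; the vwredsum chain below collapses them
+                // into eight per-sub-block sums weighted by the scales.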
"vwmul.vv v24, v4, v12\n\t" + "vsetivli zero, 16, e16, m2\n\t" + "vmv.v.x v0, zero\n\t" + "vwredsum.vs v10, v16, v0\n\t" + "vwredsum.vs v9, v18, v0\n\t" + "vwredsum.vs v8, v20, v0\n\t" + "vwredsum.vs v7, v22, v0\n\t" + "vwredsum.vs v11, v24, v0\n\t" + "vwredsum.vs v12, v26, v0\n\t" + "vwredsum.vs v13, v28, v0\n\t" + "vwredsum.vs v14, v30, v0\n\t" + "vsetivli zero, 4, e32, m1\n\t" + "vslideup.vi v10, v9, 1\n\t" + "vslideup.vi v8, v7, 1\n\t" + "vslideup.vi v11, v12, 1\n\t" + "vslideup.vi v13, v14, 1\n\t" + "vslideup.vi v10, v8, 2\n\t" + "vslideup.vi v11, v13, 2\n\t" + "vsetivli zero, 8, e32, m2\n\t" + "vle8.v v15, (%[scale])\n\t" + "vsext.vf4 v12, v15\n\t" + "vmul.vv v10, v10, v12\n\t" + "vredsum.vs v0, v10, v0\n\t" + "vmv.x.s %[tmp], v0\n\t" + "add %[isum], %[isum], %[tmp]" + : [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum) + : [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32) + , [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + q3 += 32; q8 += 128; scale += 8; + } + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + sumf += d * isum; + } + break; + default: + assert(false && "Unsupported vector length"); + break; + } + + *s = sumf; + +#else + // scalar version + // This function is written like this so the compiler can manage to vectorize most of it + // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the + // manually vectorized version above. Every other version I tried would run at least 4 times slower. + // The ideal situation would be if we could just write the code once, and the compiler would + // automatically produce the best possible set of machine instructions, instead of us having to manually + // write vectorized versions for AVX, ARM_NEON, etc. + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + uint32_t auxs[4]; + const int8_t * scales = (const int8_t*)auxs; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 
0 : 4); + a += 32; m <<= 1; + q3 += 32; + } + a = aux8; + + memcpy(auxs, x[i].scales, 12); + uint32_t tmp = auxs[2]; + auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; + +#endif + +} + +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined __riscv_xtheadvector + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + int tmp, tmp2, sumi; + __asm__ __volatile__( + "li %[t1], 12\n\t" + "th.vsetvli zero, %[t1], e8, m1\n\t" + "th.vlb.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]} + "li %[t1], 4\n\t" + "th.vsetvli zero, %[t1], e32, m1\n\t" + "th.vslidedown.vi v2, v1, 2\n\t" + "th.vmv.v.v v3, v2\n\t" + "th.vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]} + "li %[t1], 2\n\t" + "th.vsetvli zero, %[t1], e32, m1\n\t" + "th.vmv.v.i v4, 4\n\t" + "th.vand.vx v8, v1, %[kmask1]\n\t" + "th.vslide1up.vx v5, v4, zero\n\t" // {0, 4} + "th.vsrl.vi v6, v1, 6\n\t" + "th.vsrl.vv v7, v2, v5\n\t" + "th.vand.vx v0, v6, %[kmask3]\n\t" + "th.vand.vx v2, v7, %[kmask2]\n\t" + "th.vsll.vi v6, v0, 4\n\t" + "li %[t2], 8\n\t" + "addi %[t1], %[utmp], 4\n\t" + "th.vor.vv v1, v6, v2\n\t" + "th.vssw.v v8, (%[utmp]), %[t2]\n\t" + "th.vssw.v v1, (%[t1]), %[t2]\n\t" + "th.vsetvli zero, zero, e32, m2\n\t" // vl == 8 + "th.vlw.v v2, (%[bsums])\n\t" + "th.vsetvli zero, %[t2], e16, m1\n\t" + "th.vnsrl.vi v0, v2, 0\n\t" + "th.vnsrl.vi v1, v2, 16\n\t" + "th.vadd.vv v2, v0, v1\n\t" + "th.vlbu.v v4, (%[mins])\n\t" + "th.vwmul.vv v6, v4, v2\n\t" + "th.vmv.v.x v0, zero\n\t" + "th.vsetvli zero, %[t2], e32, m2\n\t" + "th.vredsum.vs v0, v6, v0\n\t" + "th.vmv.x.s %[sumi], v0" + : [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi) + : [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp) + , [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1) + , [kmask2] "r" (kmask2), [kmask3] "r" (kmask3) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + sumf -= dmin * sumi; + + const uint8_t * restrict q4 = x[i].qs; + const 
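+        // the asm above unpacked the eight (scale, min) pairs from the
+        // packed 12-byte layout and folded -dmin * sum(bsums * mins) into
+        // sumf; what follows is the 4-bit dot with one scale per 32 values.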
int8_t * restrict q8 = y[i].qs; + + sumi = 0; + const uint8_t * scale = scales; + + for (int j = 0; j < QK_K/128; ++j) { + int vl128 = 128, vl64 = 64, vl32 = 32; + __asm__ __volatile__( + "th.vsetvli zero, %[vl128], e8, m8\n\t" + "th.vlb.v v8, (%[q8])\n\t" + "th.vsetvli zero, %[vl64], e8, m4\n\t" + "th.vlb.v v0, (%[q4])\n\t" + "th.vsrl.vi v4, v0, 4\n\t" + "th.vand.vi v0, v0, 0xF\n\t" + "th.vsetvli zero, %[vl32], e8, m2\n\t" + "th.vwmul.vv v28, v6, v14\n\t" + "th.vwmul.vv v20, v4, v10\n\t" + "th.vwmul.vv v24, v2, v12\n\t" + "th.vwmul.vv v16, v0, v8\n\t" + "li %[tmp], 4\n\t" + "th.vsetvli zero, %[tmp], e32, m1\n\t" + "th.vlbu.v v1, (%[scale])\n\t" + "th.vmv.v.x v0, zero\n\t" + "th.vsetvli zero, %[vl32], e16, m4\n\t" + "th.vwredsum.vs v6, v24, v0\n\t" + "th.vwredsum.vs v7, v28, v0\n\t" + "th.vwredsum.vs v4, v16, v0\n\t" + "th.vwredsum.vs v5, v20, v0\n\t" + "th.vsetvli zero, %[tmp], e32, m1\n\t" + "th.vslideup.vi v6, v7, 1\n\t" + "th.vslideup.vi v4, v5, 1\n\t" + "th.vslideup.vi v4, v6, 2\n\t" + "th.vmul.vv v8, v4, v1\n\t" + "th.vredsum.vs v0, v8, v0\n\t" + "th.vmv.x.s %[tmp], v0\n\t" + "add %[sumi], %[sumi], %[tmp]" + : [tmp] "=&r" (tmp), [sumi] "+&r" (sumi) + : [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32) + , [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + + q4 += 64; q8 += 128; scale += 4; + } + + sumf += d * sumi; + + } + + *s = sumf; + +#elif defined __riscv_v + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + float sumf = 0; + const int vector_length = __riscv_vlenb() * 8; + + switch (vector_length) { + case 256: + for (int i = 0; i < nb; ++i) { + + size_t vl = 8; + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl); + vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl); + vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl); + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl); + vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl)); + vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl); + + vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); + sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi); + + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + vl = 32; + + int32_t sum_1 = 0; + int32_t sum_2 = 0; + + vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1); + + for (int j = 0; j < QK_K/64; ++j) { + // load Q4 + vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl); + + // load Q8 and multiply it with lower Q4 nibble + vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl); + vint8m1_t q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl)); + vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl); + vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl); + + sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0]; + 
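+                // each 32-byte chunk yields two consecutive 32-value
+                // sub-blocks: low nibbles use scales[2*j+0], high nibbles
+                // scales[2*j+1].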
+ // load Q8 and multiply it with upper Q4 nibble + vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl); + vint8m1_t q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl)); + vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl); + vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl); + + sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[2*j+1]; + + q4 += 32; q8 += 64; + + } + + sumf += d*(sum_1 + sum_2); + + } + break; + case 128: + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + int tmp, tmp2, sumi; + __asm__ __volatile__( + "vsetivli zero, 12, e8, m1\n\t" + "vle8.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]} + "vsetivli zero, 4, e32, m1\n\t" + "vslidedown.vi v2, v1, 2\n\t" + "vmv1r.v v3, v2\n\t" + "vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]} + "vsetivli zero, 2, e32, m1\n\t" + "vmv.v.i v4, 4\n\t" + "vand.vx v8, v1, %[kmask1]\n\t" + "vslide1up.vx v5, v4, zero\n\t" // {0, 4} + "vsrl.vi v6, v1, 6\n\t" + "vsrl.vv v7, v2, v5\n\t" + "vand.vx v0, v6, %[kmask3]\n\t" + "vand.vx v2, v7, %[kmask2]\n\t" + "vsll.vi v6, v0, 4\n\t" + "li %[t2], 8\n\t" + "addi %[t1], %[utmp], 4\n\t" + "vor.vv v1, v6, v2\n\t" + "vsse32.v v8, (%[utmp]), %[t2]\n\t" + "vsse32.v v1, (%[t1]), %[t2]\n\t" + "vsetivli zero, 8, e16, m1\n\t" + "vle32.v v2, (%[bsums])\n\t" + "vnsrl.wi v0, v2, 0\n\t" + "vnsrl.wi v1, v2, 16\n\t" + "vadd.vv v2, v0, v1\n\t" + "vle8.v v3, (%[mins])\n\t" + "vzext.vf2 v4, v3\n\t" + "vwmul.vv v6, v4, v2\n\t" + "vmv.v.x v0, zero\n\t" + "vsetivli zero, 8, e32, m2\n\t" + "vredsum.vs v0, v6, v0\n\t" + "vmv.x.s %[sumi], v0" + : [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi) + : [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp) + , [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1) + , [kmask2] "r" (kmask2), [kmask3] "r" (kmask3) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + sumf -= dmin * sumi; + + const uint8_t * restrict q4 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + sumi = 0; + const uint8_t * scale = scales; + + for (int j = 0; j < QK_K/128; ++j) { + int vl128 = 128, vl64 = 64, vl32 = 32; + __asm__ __volatile__( + "vsetvli zero, %[vl128], e8, m8\n\t" + "vle8.v v8, (%[q8])\n\t" + "vsetvli zero, %[vl64], e8, m4\n\t" + "vle8.v v0, (%[q4])\n\t" + "vsrl.vi v4, v0, 4\n\t" + "vand.vi v0, v0, 0xF\n\t" + "vsetvli zero, %[vl32], e8, m2\n\t" + "vwmul.vv v28, v6, v14\n\t" + "vwmul.vv v20, v4, v10\n\t" + "vwmul.vv v24, v2, v12\n\t" + "vwmul.vv v16, v0, v8\n\t" + "vsetivli zero, 4, e32, m1\n\t" + "vle8.v v2, (%[scale])\n\t" + "vmv.v.x v0, zero\n\t" + "vzext.vf4 v1, v2\n\t" + "vsetvli zero, %[vl32], e16, m4\n\t" + "vwredsum.vs v6, v24, v0\n\t" + "vwredsum.vs v7, v28, v0\n\t" + "vwredsum.vs v4, v16, v0\n\t" + "vwredsum.vs v5, v20, v0\n\t" + "vsetivli zero, 4, e32, m1\n\t" + "vslideup.vi v6, v7, 1\n\t" + "vslideup.vi v4, v5, 1\n\t" + "vslideup.vi v4, v6, 2\n\t" + "vmul.vv v8, v4, v1\n\t" + "vredsum.vs v0, v8, v0\n\t" + "vmv.x.s %[tmp], v0\n\t" + "add %[sumi], %[sumi], %[tmp]" + : [tmp] "=&r" (tmp), [sumi] "+&r" (sumi) + : [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32) + , [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" 
+ , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + + q4 += 64; q8 += 128; scale += 4; + } + + sumf += d * sumi; + } + break; + default: + assert(false && "Unsupported vector length"); + break; + } + + *s = sumf; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + a += 32; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + a += 32; q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined __riscv_v + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + float sumf = 0; + float sums = 0.0; + + size_t vl; + + for (int i = 0; i < nb; ++i) { + + vl = 8; + + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + + vint16m1_t q8sums_0 = __riscv_vlse16_v_i16m1(y[i].bsums, 4, vl); + vint16m1_t q8sums_1 = __riscv_vlse16_v_i16m1(y[i].bsums+1, 4, vl); + vint16m1_t q8sums = __riscv_vadd_vv_i16m1(q8sums_0, q8sums_1, vl); + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const 
uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + vuint8mf2_t mins8 = __riscv_vle8_v_u8mf2(mins, vl); + vint16m1_t v_mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl)); + vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, v_mins, vl); + + vint32m1_t sumi = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); + sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi); + + vl = 32; + int32_t aux32 = 0; + int is = 0; + + uint8_t m = 1; + vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); + vuint8m2_t vqh = __riscv_vle8_v_u8m2(hm, vl); + + for (int j = 0; j < QK_K/64; ++j) { + // load Q5 and Q8 + vuint8m2_t q5_x = __riscv_vle8_v_u8m2(q5, vl); + vint8m2_t q8_y1 = __riscv_vle8_v_i8m2(q8, vl); + vint8m2_t q8_y2 = __riscv_vle8_v_i8m2(q8+32, vl); + + // compute mask for addition + vint8m2_t q5_a = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vand_vx_u8m2(q5_x, 0x0F, vl)); + vuint8m2_t qh_m1 = __riscv_vand_vx_u8m2(vqh, m, vl); + vbool4_t vmask_1 = __riscv_vmsne_vx_u8m2_b4(qh_m1, 0, vl); + vint8m2_t q5_m1 = __riscv_vadd_vx_i8m2_mu(vmask_1, q5_a, q5_a, 16, vl); + m <<= 1; + + vint8m2_t q5_l = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vsrl_vx_u8m2(q5_x, 0x04, vl)); + vuint8m2_t qh_m2 = __riscv_vand_vx_u8m2(vqh, m, vl); + vbool4_t vmask_2 = __riscv_vmsne_vx_u8m2_b4(qh_m2, 0, vl); + vint8m2_t q5_m2 = __riscv_vadd_vx_i8m2_mu(vmask_2, q5_l, q5_l, 16, vl); + m <<= 1; + + vint16m4_t v0 = __riscv_vwmul_vv_i16m4(q5_m1, q8_y1, vl); + vint16m4_t v1 = __riscv_vwmul_vv_i16m4(q5_m2, q8_y2, vl); + + vint32m8_t vs1 = __riscv_vwmul_vx_i32m8(v0, scales[is++], vl); + vint32m8_t vs2 = __riscv_vwmul_vx_i32m8(v1, scales[is++], vl); + + vint32m1_t vacc1 = __riscv_vredsum_vs_i32m8_i32m1(vs1, vzero, vl); + vint32m1_t vacc2 = __riscv_vredsum_vs_i32m8_i32m1(vs2, vacc1, vl); + + aux32 += __riscv_vmv_x_s_i32m1_i32(vacc2); + q5 += 32; q8 += 64; + + } + + sums += aux32 * d; + + } + + *s = sumf+sums; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 
16 : 0); + a += 32; m <<= 1; + q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __riscv_xtheadvector + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + + const uint8_t * restrict q6 = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + const int8_t * restrict scale = x[i].scales; + + int sum_t = 0; + int t0; + + for (int j = 0; j < QK_K/128; ++j) { + __asm__ __volatile__( + "th.vsetvli zero, %[vl32], e8, m2\n\t" // vl == 32 + "th.vlb.v v4, (%[qh])\n\t" + "th.vsll.vi v0, v4, 4\n\t" + "th.vsll.vi v2, v4, 2\n\t" + "th.vsrl.vi v6, v4, 2\n\t" + "th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64 + "th.vlb.v v8, (%[q6])\n\t" + "th.vsrl.vi v12, v8, 4\n\t" + "th.vand.vi v8, v8, 0xF\n\t" + "th.vsetvli zero, %[vl128], e8, m8\n\t" // vl == 128 + "th.vand.vx v0, v0, %[mask]\n\t" + "th.vor.vv v8, v8, v0\n\t" + "th.vlb.v v0, (%[q8])\n\t" + "th.vsub.vx v8, v8, %[vl32]\n\t" + "th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64 + "th.vwmul.vv v16, v0, v8\n\t" + "th.vwmul.vv v24, v4, v12\n\t" + "li %[t0], 16\n\t" + "th.vsetvli zero, %[t0], e16, m2\n\t" // vl == 16 + "th.vmv.v.x v0, zero\n\t" + "th.vwredsum.vs v10, v16, v0\n\t" + "th.vwredsum.vs v9, v18, v0\n\t" + "th.vwredsum.vs v8, v20, v0\n\t" + "th.vwredsum.vs v7, v22, v0\n\t" + "th.vwredsum.vs v11, v24, v0\n\t" + "th.vwredsum.vs v12, v26, v0\n\t" + "th.vwredsum.vs v13, v28, v0\n\t" + "th.vwredsum.vs v14, v30, v0\n\t" + "li %[t0], 4\n\t" + "th.vsetvli zero, %[t0], e32, m1\n\t" // vl == 4 + "th.vslideup.vi v10, v9, 1\n\t" + "th.vslideup.vi v8, v7, 1\n\t" + "th.vslideup.vi v11, v12, 1\n\t" + "th.vslideup.vi v13, v14, 1\n\t" + "th.vslideup.vi v10, v8, 2\n\t" + "th.vslideup.vi v11, v13, 2\n\t" + "li %[t0], 8\n\t" + "th.vsetvli zero, %[t0], e32, m2\n\t" // vl == 8 + "th.vlb.v v4, (%[scale])\n\t" + "th.vmul.vv v2, v4, v10\n\t" + "th.vredsum.vs v0, v2, v0\n\t" + 
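+                // %[vl32] does double duty in this template: it is the
+                // vector length for the 32-byte loads and the constant 32
+                // that recenters the 6-bit values to the signed -32..31.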
"th.vmv.x.s %[t0], v0\n\t" + "add %[sumi], %[sumi], %[t0]" + : [sumi] "+&r" (sum_t), [t0] "=&r" (t0) + : [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale) + , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128) + , [mask] "r" (0x30) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + q6 += 64; qh += 32; q8 += 128; scale += 8; + } + + sumf += d * sum_t; + + } + + *s = sumf; + +#elif defined __riscv_v + + float sumf = 0; + const int vector_length = __riscv_vlenb() * 8; + + switch (vector_length) { + case 256: + for (int i = 0; i < nb; ++i) { + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const int8_t * GGML_RESTRICT scale = x[i].scales; + + size_t vl; + + vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); + + int sum_t = 0; + int is = 0; + + for (int j = 0; j < QK_K/128; ++j) { + + vl = 32; + + // load qh + vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl); + + // load Q6 + vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl); + vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl); + + vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl); + vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl); + vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl); + vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl); + + vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl); + vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl); + vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl); + vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl); + + vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl); + vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl); + vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl); + vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl); + + vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl); + vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl); + vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl); + vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl); + + // load Q8 and take product + vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl); + vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl); + vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl); + vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl); + + vl = 16; + + vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl); + vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl); + vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl); + vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl); + vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl); + vint32m2_t 
vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl); + vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl); + vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl); + + vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl); + vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl); + vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl); + vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl); + + sum_t += __riscv_vmv_x_s_i32m1_i32(isum3); + + q6 += 64; qh += 32; q8 += 128; is=8; + + } + + sumf += d * sum_t; + + } + break; + case 128: + for (int i = 0; i < nb; ++i) { + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + + const uint8_t * restrict q6 = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + const int8_t * restrict scale = x[i].scales; + + int sum_t = 0; + int t0; + + for (int j = 0; j < QK_K/128; ++j) { + __asm__ __volatile__( + "vsetvli zero, %[vl32], e8, m2\n\t" + "vle8.v v4, (%[qh])\n\t" + "vsll.vi v0, v4, 4\n\t" + "vsll.vi v2, v4, 2\n\t" + "vsrl.vi v6, v4, 2\n\t" + "vsetvli zero, %[vl64], e8, m4\n\t" + "vle8.v v8, (%[q6])\n\t" + "vsrl.vi v12, v8, 4\n\t" + "vand.vi v8, v8, 0xF\n\t" + "vsetvli zero, %[vl128], e8, m8\n\t" + "vand.vx v0, v0, %[mask]\n\t" + "vor.vv v8, v8, v0\n\t" + "vle8.v v0, (%[q8])\n\t" + "vsub.vx v8, v8, %[vl32]\n\t" + "vsetvli zero, %[vl64], e8, m4\n\t" + "vwmul.vv v16, v0, v8\n\t" + "vwmul.vv v24, v4, v12\n\t" + "vsetivli zero, 16, e16, m2\n\t" + "vmv.v.x v0, zero\n\t" + "vwredsum.vs v10, v16, v0\n\t" + "vwredsum.vs v9, v18, v0\n\t" + "vwredsum.vs v8, v20, v0\n\t" + "vwredsum.vs v7, v22, v0\n\t" + "vwredsum.vs v11, v24, v0\n\t" + "vwredsum.vs v12, v26, v0\n\t" + "vwredsum.vs v13, v28, v0\n\t" + "vwredsum.vs v14, v30, v0\n\t" + "vsetivli zero, 4, e32, m1\n\t" + "vslideup.vi v10, v9, 1\n\t" + "vslideup.vi v8, v7, 1\n\t" + "vslideup.vi v11, v12, 1\n\t" + "vslideup.vi v13, v14, 1\n\t" + "vslideup.vi v10, v8, 2\n\t" + "vslideup.vi v11, v13, 2\n\t" + "vsetivli zero, 8, e32, m2\n\t" + "vle8.v v2, (%[scale])\n\t" + "vsext.vf4 v4, v2\n\t" + "vmul.vv v2, v4, v10\n\t" + "vredsum.vs v0, v2, v0\n\t" + "vmv.x.s %[t0], v0\n\t" + "add %[sumi], %[sumi], %[t0]" + : [sumi] "+&r" (sum_t), [t0] "=&r" (t0) + : [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale) + , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128) + , [mask] "r" (0x30) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + q6 += 64; qh += 32; q8 += 128; scale += 8; + } + + sumf += d * sum_t; + + } + break; + default: + assert(false && "Unsupported vector length"); + break; + } + + *s = sumf; + +#else + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; 
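+            // reconstruct each 6-bit value: low 4 bits from ql, two high
+            // bits from qh, minus the common 32 offset; e.g. ql nibble 15
+            // with qh bits 3 gives (15 | 48) - 32 = 31.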
++l) {
+                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
+                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
+                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
+                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
+            }
+            a  += 128;
+            q4 += 64;
+            qh += 32;
+        }
+        a = aux8;
+        int is = 0;
+        for (int j = 0; j < QK_K/16; ++j) {
+            int scale = x[i].scales[is++];
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
+            q8 += 8; a += 8;
+        }
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
+    }
+    for (int l = 0; l < 8; ++l) sumf += sums[l];
+    *s = sumf;
+#endif
+}
+
diff --git a/ggml/src/ggml-cpu/arch/riscv/repack.cpp b/ggml/src/ggml-cpu/arch/riscv/repack.cpp
new file mode 100644
index 0000000000000..45c91a694820a
--- /dev/null
+++ b/ggml/src/ggml-cpu/arch/riscv/repack.cpp
@@ -0,0 +1,397 @@
+#define GGML_COMMON_IMPL_CPP
+#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
+#include "ggml-backend-impl.h"
+
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+#include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
+#include "traits.h"
+
+#include <cmath>
+#include <cstring>
+#include <cassert>
+#include <cstdlib> // for qsort
+#include <cstdio>  // for GGML_ASSERT
+
+#define GGML_CPU_CLANG_WORKAROUND
+#include "../../repack.h"
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+#endif
+
+#define UNUSED GGML_UNUSED
+
+void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert (n % qk == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if defined __riscv_v
+    if (__riscv_vlenb() >= QK4_0) {
+        const size_t vl = QK4_0;
+
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+
+            vfloat32m1_t sumf = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+            for (int l = 0; l < nb; l++) {
+                const int64_t a0 = *(const int64_t *)&a_ptr[l].qs[0];
+                const int64_t a1 = *(const int64_t *)&a_ptr[l].qs[8];
+                const int64_t a2 = *(const int64_t *)&a_ptr[l].qs[16];
+                const int64_t a3 = *(const int64_t *)&a_ptr[l].qs[24];
+                __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment constraints
+                const vint8m2_t lhs_0_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a0, vl / 4));
+                const vint8m2_t lhs_1_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a1, vl / 4));
+                const vint8m2_t lhs_2_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a2, vl / 4));
+                const vint8m2_t lhs_3_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a3, vl / 4));
+
+                const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
+                const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
+                const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
+                const vint8m2_t rhs_vec_lo_0 =
__riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0); + const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1); + const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0); + const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1); + + const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); + const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); + const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); + const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); + + const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_hi_m)); + const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl); + const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); + const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); + const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); + const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); + const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); + const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); + const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); + const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); + const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); + const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); + const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); + + // vector version needs Zvfhmin extension + const float a_scale = GGML_CPU_FP16_TO_FP32(a_ptr[l].d); + const float b_scales[8] = { + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7]) + }; + const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4); + const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scale, vl / 4); + sumf = __riscv_vfmacc_vv_f32m1(sumf, tmp1, b_scales_vec, vl / 4); + } + __riscv_vse32_v_f32m1(s + x * ncols_interleaved, sumf, vl / 4); + } + return; + } + +#endif + { + float sumf[8]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } + } +} + +void 
ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + +#if defined __riscv_v + if (__riscv_vlenb() >= QK4_0) { + const size_t vl = QK4_0; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); + vfloat32m1_t sumf0 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); + vfloat32m1_t sumf1 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); + vfloat32m1_t sumf2 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); + vfloat32m1_t sumf3 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); + for (int l = 0; l < nb; l++) { + const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4); + const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4); + const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4); + const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0); + const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1); + const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0); + const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1); + + // vector version needs Zvfhmin extension + const float a_scales[4] = { + GGML_CPU_FP16_TO_FP32(a_ptr[l].d[0]), + GGML_CPU_FP16_TO_FP32(a_ptr[l].d[1]), + GGML_CPU_FP16_TO_FP32(a_ptr[l].d[2]), + GGML_CPU_FP16_TO_FP32(a_ptr[l].d[3]) + }; + const float b_scales[8] = { + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7]) + }; + const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4); + + const int64_t A0 = *(const int64_t *)&a_ptr[l].qs[0]; + const int64_t A4 = *(const int64_t *)&a_ptr[l].qs[32]; + const int64_t A8 = *(const int64_t *)&a_ptr[l].qs[64]; + const int64_t Ac = *(const int64_t *)&a_ptr[l].qs[96]; + __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment + vint16m4_t sumi_l0; + { + const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A0, vl / 4)); + const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A4, vl / 4)); + const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A8, vl / 4)); + const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ac, vl / 4)); + const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); + const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); + const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); + const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); + + sumi_l0 = sumi_hi_m; + } + + { + const vuint32m4_t sumi_i32 = 
__riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l0)); + const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl); + const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); + const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); + const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); + const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); + const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); + const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); + const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); + const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); + const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); + const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); + const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); + + const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[0], vl / 4); + sumf0 = __riscv_vfmacc_vv_f32m1(sumf0, tmp1, b_scales_vec, vl / 4); + } + + const int64_t A1 = *(const int64_t *)&a_ptr[l].qs[8]; + const int64_t A5 = *(const int64_t *)&a_ptr[l].qs[40]; + const int64_t A9 = *(const int64_t *)&a_ptr[l].qs[72]; + const int64_t Ad = *(const int64_t *)&a_ptr[l].qs[104]; + __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment + vint16m4_t sumi_l1; + { + const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A1, vl / 4)); + const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A5, vl / 4)); + const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A9, vl / 4)); + const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ad, vl / 4)); + const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); + const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); + const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); + const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); + + sumi_l1 = sumi_hi_m; + } + + { + const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l1)); + const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl); + const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); + const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); + const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); + const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); + const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); + const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); + const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); + const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); + const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); + const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); + const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); 
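+                        // facc holds the eight per-column dot products for this activation
+                        // row, converted to f32; the multiply and FMA below fold in the
+                        // row's activation scale and the eight per-column weight scales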
+ + const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[1], vl / 4); + sumf1 = __riscv_vfmacc_vv_f32m1(sumf1, tmp1, b_scales_vec, vl / 4); + } + + const int64_t A2 = *(const int64_t *)&a_ptr[l].qs[16]; + const int64_t A6 = *(const int64_t *)&a_ptr[l].qs[48]; + const int64_t Aa = *(const int64_t *)&a_ptr[l].qs[80]; + const int64_t Ae = *(const int64_t *)&a_ptr[l].qs[112]; + __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment + vint16m4_t sumi_l2; + { + const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A2, vl / 4)); + const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A6, vl / 4)); + const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Aa, vl / 4)); + const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ae, vl / 4)); + const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); + const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); + const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); + const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); + + sumi_l2 = sumi_hi_m; + } + + { + const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l2)); + const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl); + const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); + const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); + const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); + const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); + const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); + const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); + const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); + const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); + const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); + const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); + const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); + + const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[2], vl / 4); + sumf2 = __riscv_vfmacc_vv_f32m1(sumf2, tmp1, b_scales_vec, vl / 4); + } + + const int64_t A3 = *(const int64_t *)&a_ptr[l].qs[24]; + const int64_t A7 = *(const int64_t *)&a_ptr[l].qs[56]; + const int64_t Ab = *(const int64_t *)&a_ptr[l].qs[88]; + const int64_t Af = *(const int64_t *)&a_ptr[l].qs[120]; + __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment + vint16m4_t sumi_l3; + { + const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A3, vl / 4)); + const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A7, vl / 4)); + const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ab, vl / 4)); + const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Af, vl / 4)); + const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); + const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); + const 
vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+                        sumi_l3 = sumi_hi_m;
+                    }
+
+                    {
+                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l3));
+                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[3], vl / 4);
+                        sumf3 = __riscv_vfmacc_vv_f32m1(sumf3, tmp1, b_scales_vec, vl / 4);
+                    }
+                }
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 0) * bs + x * ncols_interleaved], sumf0, vl / 4);
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 1) * bs + x * ncols_interleaved], sumf1, vl / 4);
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 2) * bs + x * ncols_interleaved], sumf2, vl / 4);
+                __riscv_vse32_v_f32m1(&s[(y * 4 + 3) * bs + x * ncols_interleaved], sumf3, vl / 4);
+            }
+        }
+
+        return;
+    }
+
+#endif // __riscv_v
+    float sumf[4][8];
+    int sumi;
+
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+            }
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int m = 0; m < 4; m++) {
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sumi = 0;
+                            for (int i = 0; i < blocklen; ++i) {
+                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
+                            }
+                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
+                        }
+                    }
+                }
+            }
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++)
+                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+            }
+        }
+    }
+}
diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c
new file mode 100644
index 0000000000000..a840219a4fc08
--- /dev/null
+++ b/ggml/src/ggml-cpu/arch/s390/quants.c
@@ -0,0 +1,1300 @@
+#define GGML_COMMON_IMPL_C
+#include "ggml-common.h"
+#include "ggml-quants.h"
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+#include "simd-mappings.h"
+
+#include "../../quants.h"
+#include "../../ggml-cpu-impl.h"
+
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+#include <stdlib.h> // for qsort
+#include <stdio.h>  // for GGML_ASSERT
+
+#define GROUP_MAX_EPS 1e-15f
+#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
+#define GROUP_MAX_EPS_IQ2_S 1e-8f
+#define GROUP_MAX_EPS_IQ1_M 1e-7f
+#define GROUP_MAX_EPS_IQ1_S 1e-12f
+
+#define UNUSED GGML_UNUSED
+
+void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+    assert(QK8_0 == 32);
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    block_q8_0 * GGML_RESTRICT y = vy;
+
+#if defined(__VXE__) || defined(__VXE2__)
+    for (int i = 0; i < nb; i++) {
+        __vector float srcv [8];
+        __vector float asrcv[8];
+        __vector float amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = vec_xl(0, x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+                                   vec_extract(amaxv[0], 1)),
+                               MAX(vec_extract(amaxv[0], 2),
+                                   vec_extract(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ?
1.0f / d : 0.0f; + + y[i].d = GGML_CPU_FP32_TO_FP16(d); + + for (int j = 0; j < 8; j++) { + const __vector float v = vec_mul(srcv[j], vec_splats(id)); + const __vector int32_t vi = vec_signed(v); + + y[i].qs[4*j + 0] = vec_extract(vi, 0); + y[i].qs[4*j + 1] = vec_extract(vi, 1); + y[i].qs[4*j + 2] = vec_extract(vi, 2); + y[i].qs[4*j + 3] = vec_extract(vi, 3); + } + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_0_ref(x, y, k); +#endif +} + +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + block_q8_1 * GGML_RESTRICT y = vy; + +#if defined(__VXE__) || defined(__VXE2__) + for (int i = 0; i < nb; i++) { + __vector float srcv [8]; + __vector float asrcv[8]; + __vector float amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]); + for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(vec_extract(amaxv[0], 0), + vec_extract(amaxv[0], 1)), + MAX(vec_extract(amaxv[0], 2), + vec_extract(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f / d : 0.0f; + + y[i].d = GGML_CPU_FP32_TO_FP16(d); + + __vector int32_t acc = vec_splats(0); + + for (int j = 0; j < 8; j++) { + const __vector float v = vec_mul(srcv[j], vec_splats(id)); + const __vector int32_t vi = vec_signed(v); + + y[i].qs[4*j + 0] = vec_extract(vi, 0); + y[i].qs[4*j + 1] = vec_extract(vi, 1); + y[i].qs[4*j + 2] = vec_extract(vi, 2); + y[i].qs[4*j + 3] = vec_extract(vi, 3); + + acc = vec_add(acc, vi); + } + + y[i].s = GGML_CPU_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3])); + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_1_ref(x, y, k); +#endif +} + + +//===================================== Dot products ================================= + +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__VXE__) || defined(__VXE2__) + __vector float acc = vec_splats(0.0f); + + const __vector uint8_t v_m = vec_splats((const uint8_t)0x0F); + const __vector int8_t v_s = vec_splats( (const int8_t)0x08); + + for (; ib < nb; ++ib) { + const __vector uint8_t v_x = vec_xl(0, x[ib].qs); + const __vector int8_t v_xl = (const __vector int8_t)(v_x & v_m); + const __vector int8_t v_xh = (const __vector int8_t)(v_x >> 4); + + const __vector int8_t v_xls = vec_sub(v_xl, v_s); + const __vector int8_t v_xhs = vec_sub(v_xh, v_s); + + const __vector int8_t v_yl = vec_xl(0 , y[ib].qs); + const __vector int8_t v_yh = vec_xl(QK8_0/2, y[ib].qs); + + const __vector int16_t v_xylso = vec_mulo(v_xls, v_yl); + const __vector int16_t v_xylse = vec_mule(v_xls, v_yl); + const __vector int16_t v_xyhso = vec_mulo(v_xhs, v_yh); + const __vector int16_t v_xyhse = vec_mule(v_xhs, v_yh); + + __vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_); + + const __vector float v_xy = vec_float(vec_unpackh(v_xy_)); + 
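+        // v_xy holds four widened partial sums as f32; multiplying by the
+        // product of the two fp16 block scales dequantizes them before they
+        // are accumulated into acc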
const __vector float v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); + + acc = vec_madd(v_xy, v_d, acc); + } + + sumf = acc[0] + acc[1] + acc[2] + acc[3]; + +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F) - 8; + const int v1 = (x[ib].qs[j] >> 4) - 8; + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); + } + + *s = sumf; +} + +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__VXE__) || defined(__VXE2__) + float summs = 0; + float32x4_t acc = vec_splats(0.0f); + + const uint8x16_t v_m = vec_splat_u8(0x0F); + +#pragma GCC unroll 4 + for (; ib < nb; ++ib) { + __builtin_prefetch(x[ib].qs, 0, 1); + __builtin_prefetch(y[ib].qs, 0, 1); + + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); + + const uint8x16_t v_x = vec_xl(0, x[ib].qs); + const int8x16_t v_xl = (const int8x16_t)(v_x & v_m); + const int8x16_t v_xh = (const int8x16_t)(v_x >> 4); + + const int8x16_t v_yl = vec_xl(0 , y[ib].qs); + const int8x16_t v_yh = vec_xl(QK8_1/2, y[ib].qs); + + const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); + const float32x4_t v_xy = vec_float(v_xy_); + + const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); + + acc = vec_madd(v_xy, v_d, acc); + } + + sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs; + +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F); + const int v1 = (x[ib].qs[j] >> 4); + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q8_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__VXE__) || defined(__VXE2__) + __vector float acc = vec_splats(0.0f); + +#pragma GCC unroll 8 + for (; ib < nb; ++ib) { + __builtin_prefetch(x[ib].qs, 0, 1); + __builtin_prefetch(y[ib].qs, 0, 1); + + const int8x16_t v_xl = vec_xl(0 , x[ib].qs); + const int8x16_t v_xh = vec_xl(QK8_0/2, x[ib].qs); + const int8x16_t v_yl = vec_xl(0 , y[ib].qs); + const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs); + + const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); + const float32x4_t v_xy = vec_float(v_xy_); + const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); + + acc = vec_madd(v_xy, 
v_d, acc);
+    }
+
+    sumf = acc[0] + acc[1] + acc[2] + acc[3];
+
+#endif
+    for (; ib < nb; ++ib) {
+        int sumi = 0;
+
+        for (int j = 0; j < qk; j++) {
+            sumi += x[ib].qs[j]*y[ib].qs[j];
+        }
+
+        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
+    }
+
+    *s = sumf;
+}
+
+void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const uint32_t kmask1 = 0x03030303;
+    const uint32_t kmask2 = 0x0f0f0f0f;
+
+    const block_q3_K * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined(__VXE__) || defined(__VXE2__)
+    uint32_t aux[3];
+    uint32_t utmp[4];
+
+    const int32x4_t v_z = vec_splat_s32(0);
+    const uint8x16_t v_3m = vec_splat_u8(0x03);
+
+    const uint8x16_t v_0c = vec_splat_u8(1);
+    const uint8x16_t v_1c = vec_sl(v_0c, 1);
+    const uint8x16_t v_2c = vec_sl(v_0c, 2);
+    const uint8x16_t v_3c = vec_sl(v_0c, 3);
+
+    uint8x16_t q3h[4];
+    uint8x16_t q3b[2];
+    int8x16_t q3bytes[4];
+    int8x16_t q8bytes[8];
+    uint8x16_t qhbits[2];
+
+    float sum = 0;
+
+    for (int i = 0; i < nb; ++i) {
+        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * restrict x0l = x[i].qs;
+        const uint8_t * restrict x0h = x[i].hmask;
+        const int8_t  * restrict y0  = y[i].qs;
+
+        qhbits[0] = vec_xl(0 , x0h);
+        qhbits[1] = vec_xl(16, x0h);
+
+        int32_t isum = 0;
+
+        memcpy(aux, x[i].scales, 12);
+        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
+        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
+        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
+        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
+
+        int8_t * scale = (int8_t *)utmp;
+        for (int j = 0; j < 16; ++j) scale[j] -= 32;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            int32x4_t isum0, isum1, isum2, isum3;
+
+            q3b[0] = vec_xl(0 , x0l);
+            q3b[1] = vec_xl(16, x0l);
+            x0l += 32;
+
+            q8bytes[0] = vec_xl(0  , y0);
+            q8bytes[1] = vec_xl(16 , y0);
+            q8bytes[2] = vec_xl(32 , y0);
+            q8bytes[3] = vec_xl(48 , y0);
+            q8bytes[4] = vec_xl(64 , y0);
+            q8bytes[5] = vec_xl(80 , y0);
+            q8bytes[6] = vec_xl(96 , y0);
+            q8bytes[7] = vec_xl(112, y0);
+            y0 += 128;
+
+            q3h[0] = vec_sl(vec_andc(v_0c, qhbits[0]), 2);
+            q3h[1] = vec_sl(vec_andc(v_0c, qhbits[1]), 2);
+            q3h[2] = vec_sl(vec_andc(v_1c, qhbits[0]), 1);
+            q3h[3] = vec_sl(vec_andc(v_1c, qhbits[1]), 1);
+
+            q3bytes[0] = vec_sub((int8x16_t)vec_and(q3b[0], v_3m), (int8x16_t)q3h[0]);
+            q3bytes[1] = vec_sub((int8x16_t)vec_and(q3b[1], v_3m), (int8x16_t)q3h[1]);
+            q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 2), v_3m), (int8x16_t)q3h[2]);
+            q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 2), v_3m), (int8x16_t)q3h[3]);
+
+            isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[0]);
+            isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[1]);
+            isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[2]);
+            isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[3]);
+
+            isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
+            isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
+            isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
+            isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
+
+            scale += 4;
+
+            q3h[0] = vec_andc(v_2c, qhbits[0]);
+            q3h[1] = vec_andc(v_2c, qhbits[1]);
+            q3h[2] = vec_sr(vec_andc(v_3c, qhbits[0]), 1);
+            q3h[3] = vec_sr(vec_andc(v_3c,
qhbits[1]), 1); + + q3bytes[0] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 4), v_3m), (int8x16_t)q3h[0]); + q3bytes[1] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 4), v_3m), (int8x16_t)q3h[1]); + q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 6), v_3m), (int8x16_t)q3h[2]); + q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 6), v_3m), (int8x16_t)q3h[3]); + + isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[4]); + isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[5]); + isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]); + isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]); + + isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0]; + isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1]; + isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2]; + isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3]; + + scale += 4; + + if (j == 0) { + qhbits[0] = vec_sr(qhbits[0], 4); + qhbits[1] = vec_sr(qhbits[1], 4); + } + } + + sum += d * isum; + } + + *s = sum; + +#else + // scalar version + // This function is written like this so the compiler can manage to vectorize most of it + // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the + // manually vectorized version above. Every other version I tried would run at least 4 times slower. + // The ideal situation would be if we could just write the code once, and the compiler would + // automatically produce the best possible set of machine instructions, instead of us having to manually + // write vectorized versions for AVX, ARM_NEON, etc. + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + uint32_t auxs[4]; + const int8_t * scales = (const int8_t*)auxs; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 
0 : 4); + a += 32; m <<= 1; + q3 += 32; + } + a = aux8; + + memcpy(auxs, x[i].scales, 12); + uint32_t tmp = auxs[2]; + auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; + +#endif + +} + +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined(__VXE__) || defined(__VXE2__) + const uint8x16_t v_lm = vec_splat_u8(0x0F); + const int32x4_t v_z = vec_splat_s32(0); + + uint8x16_t v_x[2]; + int8x16_t v_xl[2]; + int8x16_t v_y[2]; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums); + const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums); + const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh); + + memcpy(utmp, x[i].scales, 12); + + uint32x4_t v_mins8 = { 0 }; + v_mins8 = vec_insert(utmp[1] & kmask1, v_mins8, 0); + v_mins8 = vec_insert(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), v_mins8, 1); + + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[0] &= kmask1; + + const int16x8_t v_minsh = (int16x8_t)vec_unpackh((uint8x16_t)v_mins8); + + const int32x4_t v_minso = vec_mulo(v_ysums, v_minsh); + const int32x4_t v_minse = vec_mule(v_ysums, v_minsh); + const int32x4_t v_mins = v_minso + v_minse; + sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]); + + const uint8_t * scales = (const uint8_t *)utmp; + const uint8_t * GGML_RESTRICT x0 = x[i].qs; + const int8_t * GGML_RESTRICT y0 = y[i].qs; + + int32_t sumi1 = 0; + int32_t sumi2 = 0; + + for (int j = 0; j < QK_K/64; ++j) { + v_x[0] = vec_xl(0 , x0); + v_x[1] = vec_xl(16, x0); + x0 += 32; + + v_y[0] = vec_xl(0 , y0); + v_y[1] = vec_xl(16, y0); + y0 += 32; + + v_xl[0] = (int8x16_t)vec_and(v_x[0], v_lm); + v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm); + + const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]); + sumi1 += (p1[0] + p1[1] + p1[2] + p1[3]) * scales[2*j+0]; + + v_y[0] = vec_xl(0 , y0); + v_y[1] = vec_xl(16, y0); + y0 += 32; + + v_xl[0] = (int8x16_t)vec_sr(v_x[0], 4); + v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4); + + const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]); + sumi2 += (p2[0] + p2[1] + p2[2] + p2[3]) * scales[2*j+1]; + } + + sumf += d * (sumi1 + 
sumi2); + } + + *s = sumf; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + a += 32; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + a += 32; q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined(__VXE__) || defined(__VXE2__) + const uint8x16_t v_lm = vec_splat_u8(0x0F); + const uint8x16_t v_1m = vec_splat_u8(0x01); + const uint8x16_t v_2m = vec_splat_u8(0x02); + + const int32x4_t v_z = vec_splat_s32(0); + + const uchar8x16_t v_minsm = { + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF + }; + + int8x16_t q5b[4]; + uint8x16_t q5h[4]; + + uint8x16_t v_xl[2]; + uint8x16_t v_xh[2]; + int8x16_t v_y[4]; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums); + const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums); + const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh); + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + 
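+        // x[i].scales packs eight 6-bit scales and eight 6-bit mins into 12 bytes;
+        // the kmask shuffle above leaves the scales in utmp[0..1] and the mins in
+        // utmp[2..3], so the permute below can gather all eight mins as one vector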
+ const uint8x16_t v_mins16 = vec_xl(0, (const uint8_t *)utmp); + const uint8x16_t v_mins8 = vec_perm(v_mins16, v_mins16, v_minsm); + const int16x8_t v_minsh = (int16x8_t)vec_unpackh(v_mins8); + + const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh); + const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh); + const int32x4_t v_mins = vec_add(v_minsho, v_minshe); + const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]; + + const uint8_t * scales = (const uint8_t *)utmp; + const uint8_t * GGML_RESTRICT x0l = x[i].qs; + const uint8_t * GGML_RESTRICT x0h = x[i].qh; + const int8_t * GGML_RESTRICT y0 = y[i].qs; + + v_xh[0] = vec_xl(0 , x0h); + v_xh[1] = vec_xl(16, x0h); + + int32_t sumi = 0; + for (int j = 0; j < QK_K/64; ++j) { + v_xl[0] = vec_xl(0 , x0l); + v_xl[1] = vec_xl(16, x0l); + x0l += 32; + + v_y[0] = vec_xl(0 , y0); + v_y[1] = vec_xl(16, y0); + v_y[2] = vec_xl(32, y0); + v_y[3] = vec_xl(48, y0); + y0 += 64; + + q5h[0] = vec_sl(vec_and(v_1m, v_xh[0]), 4); + q5h[1] = vec_sl(vec_and(v_1m, v_xh[1]), 4); + q5h[2] = vec_sl(vec_and(v_2m, v_xh[0]), 3); + q5h[3] = vec_sl(vec_and(v_2m, v_xh[1]), 3); + v_xh[0] = vec_sr(v_xh[0], 2); + v_xh[1] = vec_sr(v_xh[1], 2); + + q5b[0] = (int8x16_t)vec_or(vec_and(v_xl[0], v_lm), q5h[0]); + q5b[1] = (int8x16_t)vec_or(vec_and(v_xl[1], v_lm), q5h[1]); + q5b[2] = (int8x16_t)vec_or(vec_sr(v_xl[0], 4), q5h[2]); + q5b[3] = (int8x16_t)vec_or(vec_sr(v_xl[1], 4), q5h[3]); + + int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]); + int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]); + + sumi += (sumi0[0] + sumi0[1] + sumi0[2] + sumi0[3]) * *scales++; + sumi += (sumi1[0] + sumi1[1] + sumi1[2] + sumi1[3]) * *scales++; + } + + sumf += d * sumi - dmin * mins; + } + + *s = sumf; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 
16 : 0); + a += 32; m <<= 1; + q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__VXE__) || defined(__VXE2__) + float sum = 0; + + // Lower 4-bit and upper 2-bit masks + const uint8x16_t v_lm = vec_splat_u8(0x0F); + const uint8x16_t v_um = vec_splat_u8(0x03); + + const int32x4_t v_z = vec_splat_s32(0); + + int8x16_t q6b[4]; + uint8x16_t q6h[4]; + + uint8x16_t v_xl[4]; + uint8x16_t v_xh[2]; + int8x16_t v_y[4]; + + for (int i = 0; i < nb; ++i) { + const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT x0l = x[i].ql; + const uint8_t * GGML_RESTRICT x0h = x[i].qh; + const int8_t * GGML_RESTRICT y0 = y[i].qs; + + const int8_t * GGML_RESTRICT scale = x[i].scales; + + const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums); + const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums); + + const int8x16_t v_scale = vec_xl(0, scale); + const int16x8_t v_scalel = vec_unpackh(v_scale); + const int16x8_t v_scaleh = vec_unpackl(v_scale); + + const int32x4_t v_minslo = vec_mulo(v_ysumsl, v_scalel); + const int32x4_t v_minsle = vec_mule(v_ysumsl, v_scalel); + const int32x4_t v_minsho = vec_mulo(v_ysumsh, v_scaleh); + const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh); + const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe; + + const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]; + + int32_t isum = 0; + for (int j = 0; j < QK_K/128; ++j) { + // Load model upper 2 bits + v_xh[0] = vec_xl(0 , x0h); + v_xh[1] = vec_xl(16, x0h); + x0h += 32; + + // Load model lower 4 bits + v_xl[0] = vec_xl(0 , x0l); + v_xl[1] = vec_xl(16, x0l); + v_xl[2] = vec_xl(32, x0l); + v_xl[3] = vec_xl(48, x0l); + x0l += 64; + + // Load activation quants + v_y[0] = vec_xl(0 , y0); + v_y[1] = vec_xl(16, y0); + v_y[2] = vec_xl(32, y0); + v_y[3] = vec_xl(48, y0); + y0 += 64; + + q6h[0] = vec_sl(vec_and(v_um, v_xh[0]), 4); + q6h[1] = vec_sl(vec_and(v_um, v_xh[1]), 4); + uint8x16_t shifted = 
vec_sr(v_xh[0], 2); + q6h[2] = vec_sl(vec_and(v_um, shifted), 4); + shifted = vec_sr(v_xh[1], 2); + q6h[3] = vec_sl(vec_and(v_um, shifted), 4); + + q6b[0] = (int8x16_t)(vec_or(vec_and(v_xl[0], v_lm), q6h[0])); + q6b[1] = (int8x16_t)(vec_or(vec_and(v_xl[1], v_lm), q6h[1])); + q6b[2] = (int8x16_t)(vec_or(vec_and(v_xl[2], v_lm), q6h[2])); + q6b[3] = (int8x16_t)(vec_or(vec_and(v_xl[3], v_lm), q6h[3])); + + int32x4_t summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]); + int32x4_t summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]); + int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]); + int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]); + + isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] + + (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] + + (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] + + (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3]; + + scale += 4; + + + // Load activation quants + v_y[0] = vec_xl(0 , y0); + v_y[1] = vec_xl(16, y0); + v_y[2] = vec_xl(32, y0); + v_y[3] = vec_xl(48, y0); + y0 += 64; + + shifted = vec_sr(v_xh[0], 4); + q6h[0] = vec_sl(vec_and(v_um, shifted), 4); + shifted = vec_sr(v_xh[1], 4); + q6h[1] = vec_sl(vec_and(v_um, shifted), 4); + shifted = vec_sr(v_xh[0], 6); + q6h[2] = vec_sl(vec_and(v_um, shifted), 4); + shifted = vec_sr(v_xh[1], 6); + q6h[3] = vec_sl(vec_and(v_um, shifted), 4); + + q6b[0] = (int8x16_t)(vec_or(vec_sr(v_xl[0], 4), q6h[0])); + q6b[1] = (int8x16_t)(vec_or(vec_sr(v_xl[1], 4), q6h[1])); + q6b[2] = (int8x16_t)(vec_or(vec_sr(v_xl[2], 4), q6h[2])); + q6b[3] = (int8x16_t)(vec_or(vec_sr(v_xl[3], 4), q6h[3])); + + summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]); + summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]); + summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]); + summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]); + + isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] + + (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] + + (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] + + (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3]; + + scale += 4; + } + + sum += d_all * y[i].d * (isum - 32 * mins); + } + + *s = sum; + +#else + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + } + a += 128; + q4 += 64; + qh += 32; + } + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/16; ++j) { + int scale = x[i].scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +// #if defined(__VXE__) || 
defined(__VXE2__) +// static const int8_t keven_signs_q2xs[1024] = { +// 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, +// 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, +// 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1, +// 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1, +// 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, +// 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1, +// 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1, +// 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, +// 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, +// 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1, +// 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, +// 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, +// 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1, +// 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1, +// 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1, +// 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1, +// 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, +// 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, +// 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1, +// 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, +// 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1, +// 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1, +// 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1, +// 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, +// 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1, +// 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1, +// 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1, +// 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, +// 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1, +// 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, +// 1, 1, 1, -1, -1, 
-1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, +// 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, +// }; +// #endif + +// void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +// assert(n % QK_K == 0); +// assert(nrc == 1); +// UNUSED(nrc); +// UNUSED(bx); +// UNUSED(by); +// UNUSED(bs); + +// const block_iq2_xxs * GGML_RESTRICT x = vx; +// const block_q8_K * GGML_RESTRICT y = vy; + +// const int nb = n / QK_K; + +// #if defined(__VXE__) || defined(__VXE2__) +// const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + +// uint32_t aux32[4]; +// const uint8_t * aux8 = (const uint8_t *)aux32; + +// float sumf = 0; + +// for (int i = 0; i < nb; ++i) { +// const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; +// const uint16_t * GGML_RESTRICT q2 = x[i].qs; +// const int8_t * GGML_RESTRICT q8 = y[i].qs; + +// float sumf1 = 0, sumf2 = 0; + +// for (int ib32 = 0; ib32 < QK_K/32; ib += 2) { +// int8x16_t q8b0 = vec_xl( 0, q8); +// int8x16_t qb81 = vec_xl(16, q8); +// int8x16_t q8b2 = vec_xl(32, q8); +// int8x16_t q8b3 = vec_xl(48, q8); +// q8 += 64; + +// memcpy(aux32, q2, 4 * sizeof(uint32_t)); +// q2 += 8; + +// int8x16_t q2u0 = { *(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1]) }; +// int8x16_t q2u1 = { *(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3]) }; +// int8x16_t q2u2 = { *(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9]) }; +// int8x16_t q2u3 = { *(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11]) }; + +// int8x16_t q2s0 = { *(const int64_t *)(signs64 + ((aux32[1] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 7) & 127)) }; +// int8x16_t q2s1 = { *(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127)) }; +// int8x16_t q2s2 = { *(const int64_t *)(signs64 + ((aux32[3] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 7) & 127)) }; +// int8x16_t q2s3 = { *(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127)) }; + +// q2u0 = vec_mul(q2u0, q2s0); +// q2u1 = vec_mul(q2u1, q2s1); +// q2u2 = vec_mul(q2u2, q2s2); +// q2u3 = vec_mul(q2u3, q2s3); + +// const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u0, q8b0), q2u1, q8b1); +// const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u2, q8b2), q2u3, q8b3); + +// sumf1 += (p1[0] + p1[1] + p1[2] + p1[3]) * (0.5f + (aux32[1] >> 28)); +// sumf2 += (p2[0] + p2[1] + p2[2] + p2[3]) * (0.5f + (aux32[3] >> 28)); +// } + +// sumf += d * (sumf1 + sumf2); +// } + +// *s = 0.25f * sumf; + +// #else + +// uint32_t aux32[2]; +// const uint8_t * aux8 = (const uint8_t *)aux32; + +// float sumf = 0.f; +// for (int i = 0; i < nb; ++i) { +// const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; +// const uint16_t * GGML_RESTRICT q2 = x[i].qs; +// const int8_t * GGML_RESTRICT q8 = y[i].qs; +// int32_t bsum = 0; +// for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { +// memcpy(aux32, q2, 2*sizeof(uint32_t)); +// q2 += 4; +// const uint32_t ls = 2*(aux32[1] >> 28) + 1; +// int32_t sumi = 0; +// for (int l = 0; l < 4; ++l) { +// const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); +// const uint8_t signs = 
ksigns_iq2xs[(aux32[1] >> 7*l) & 127]; +// for (int j = 0; j < 8; ++j) { +// sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); +// } +// q8 += 8; +// } +// bsum += sumi * ls; +// } +// sumf += d * bsum; +// } +// *s = 0.125f * sumf; +// #endif +// } + +void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK4_NL == 0); + static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); + + const block_iq4_nl * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + const int nb = n / QK4_NL; + + int ib = 0; + float sumf = 0; + +#if defined(__VXE__) || defined(__VXE2__) + const int8x16_t v_k = vec_xl(0, kvalues_iq4nl); + const uint8x16_t v_m = vec_splat_u8(0x0F); + + for (; ib < nb; ++ib) { + const block_iq4_nl * GGML_RESTRICT x0 = &x[ib]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; + + const uint8x16_t v_x = vec_xl(0, x0->qs); + int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m); + int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4); + + v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl); + v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh); + + const int8x16_t v_yl = vec_xl(0 , y0->qs); + const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs); + const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); + + sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]); + } + +#endif + for (; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; + sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4]; + } + sumf += d * (sumi1 + sumi2); + } + *s = sumf; +} + +void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK_K == 0); + + const block_iq4_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__VXE__) || defined(__VXE2__) + const int8x16_t v_k = vec_xl(0, kvalues_iq4nl); + const uint8x16_t v_m = vec_splat_u8(0x0F); + + float sumf = 0; + + for (int ibl = 0; ibl < nb; ++ibl) { + const uint8_t * GGML_RESTRICT q4 = x[ibl].qs; + const int8_t * GGML_RESTRICT q8 = y[ibl].qs; + + uint16_t h = x[ibl].scales_h; + + int sumi1 = 0, sumi2 = 0; + for (int ib = 0; ib < QK_K/64; ++ib) { + const uint8x16_t v_x0 = vec_xl(0 , q4); + const uint8x16_t v_x1 = vec_xl(QK4_NL/2, q4); + q4 += 32; + + int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m); + int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4); + int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m); + int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4); + + v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l); + v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h); + v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l); + v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h); + + const int8x16_t v_y0 = vec_xl( 0, q8); + const int8x16_t v_y1 = vec_xl(16, q8); + const int8x16_t v_y2 = vec_xl(32, q8); + const int8x16_t v_y3 = vec_xl(48, q8); + q8 += 64; + + int32x4_t vsumi0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0), v_x0h, v_y1); + int32x4_t vsumi1 = 
ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y2), v_x1h, v_y3);
+
+            int ls1 = ((x[ibl].scales_l[ib] & 0xF) | ((h << 4) & 0x30)) - 32;
+            int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32;
+
+            h >>= 4;
+
+            sumi1 += (vsumi0[0] + vsumi0[1] + vsumi0[2] + vsumi0[3]) * ls1;
+            sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2;
+        }
+
+        sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
+    }
+
+    *s = sumf;
+
+#else
+    float sumf = 0;
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
+        uint16_t h = x[ibl].scales_h;
+        const uint8_t * qs = x[ibl].qs;
+        const int8_t * q8 = y[ibl].qs;
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
+            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
+            h >>= 4;
+            const float d1 = d4d8*(ls1 - 32);
+            const float d2 = d4d8*(ls2 - 32);
+            int sumi1 = 0, sumi2 = 0;
+            for (int j = 0; j < 16; ++j) {
+                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
+                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
+            }
+            sumf += d1 * (sumi1 + sumi2);
+            qs += 16;
+            q8 += 32;
+            sumi1 = sumi2 = 0;
+            for (int j = 0; j < 16; ++j) {
+                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
+                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
+            }
+            sumf += d2 * (sumi1 + sumi2);
+            qs += 16;
+            q8 += 32;
+        }
+    }
+    *s = sumf;
+#endif
+}
+
diff --git a/ggml/src/ggml-cpu/arch/wasm/quants.c b/ggml/src/ggml-cpu/arch/wasm/quants.c
new file mode 100644
index 0000000000000..b0904d8a3ab5e
--- /dev/null
+++ b/ggml/src/ggml-cpu/arch/wasm/quants.c
@@ -0,0 +1,1481 @@
+#define GGML_COMMON_IMPL_C
+#include "ggml-common.h"
+#include "ggml-quants.h"
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+#include "simd-mappings.h"
+
+#include "../../quants.h"
+#include "../../ggml-cpu-impl.h"
+
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+#include <stdlib.h> // for qsort
+#include <stdio.h>  // for GGML_ASSERT
+
+#define GROUP_MAX_EPS 1e-15f
+#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
+#define GROUP_MAX_EPS_IQ2_S 1e-8f
+#define GROUP_MAX_EPS_IQ1_M 1e-7f
+#define GROUP_MAX_EPS_IQ1_S 1e-12f
+
+#define UNUSED GGML_UNUSED
+
+#if defined(__wasm_simd128__)
+#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
+#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
+#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
+#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
+#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
+#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
+#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
+#define B8(c,s ) B7(c,s, c), B7(c,s, s)
+
+// precomputed tables for expanding 8bits to 8 bytes:
+static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4
+static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
+#endif
+
+void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+    assert(QK8_0 == 32);
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    block_q8_0 * GGML_RESTRICT y = vy;
+
+#if defined __wasm_simd128__
+    for (int i = 0; i < nb; i++) {
+        v128_t srcv [8];
+        v128_t asrcv[8];
+        v128_t amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax =
MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), + wasm_f32x4_extract_lane(amaxv[0], 1)), + MAX(wasm_f32x4_extract_lane(amaxv[0], 2), + wasm_f32x4_extract_lane(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_CPU_FP32_TO_FP16(d); + + for (int j = 0; j < 8; j++) { + const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); + const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); + + y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); + y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); + y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); + y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); + } + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_0_ref(x, y, k); +#endif +} + +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + block_q8_1 * GGML_RESTRICT y = vy; +#if defined __wasm_simd128__ + for (int i = 0; i < nb; i++) { + v128_t srcv [8]; + v128_t asrcv[8]; + v128_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), + wasm_f32x4_extract_lane(amaxv[0], 1)), + MAX(wasm_f32x4_extract_lane(amaxv[0], 2), + wasm_f32x4_extract_lane(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_CPU_FP32_TO_FP16(d); + + v128_t accv = wasm_i32x4_splat(0); + + for (int j = 0; j < 8; j++) { + const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); + const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); + + y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); + y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); + y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); + y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); + + accv = wasm_i32x4_add(accv, vi); + } + + y[i].s = GGML_CPU_FP32_TO_FP16( + d * (wasm_i32x4_extract_lane(accv, 0) + + wasm_i32x4_extract_lane(accv, 1) + + wasm_i32x4_extract_lane(accv, 2) + + wasm_i32x4_extract_lane(accv, 3))); + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_1_ref(x, y, k); +#endif +} + +//===================================== Q8_K ============================================== + +void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { +#ifdef __wasm_simd128__ + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + block_q8_K * GGML_RESTRICT yc = y; // Cast to proper type + + for (int i = 0; i < nb; i++) { + const float * x_block = x + i * QK_K; + + v128_t min_vec = wasm_v128_load(x_block); + v128_t max_vec = min_vec; + + for (int j = 4; j < QK_K; j += 4) { + v128_t x_vec = wasm_v128_load(x_block + j); + max_vec = wasm_f32x4_pmax(max_vec, x_vec); + min_vec = wasm_f32x4_pmin(min_vec, x_vec); + } + max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 2, 3, 0, 1)); + max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 1, 0, 3, 2)); + min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 2, 3, 0, 1)); + min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 1, 0, 3, 2)); + float max = wasm_f32x4_extract_lane(max_vec, 0); 
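+        // NOTE: the pmax/pmin + shuffle pairs above are horizontal reductions:
+        // swapping the 64-bit halves and then the 32-bit pairs leaves the
+        // block-wide maximum (minimum) in every lane, so lane 0 can be
+        // extracted as the scalar result.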
+ float min = wasm_f32x4_extract_lane(min_vec, 0); + float amax = -min > max ? min : max; + + if (amax == 0.0f) { + yc[i].d = 0.0f; + const v128_t zero = wasm_i8x16_splat(0); + for (int j = 0; j < QK_K; j += 16) { + wasm_v128_store(yc[i].qs + j, zero); + } + continue; + } + + const float iscale = -127.0f / amax; + const v128_t scale_vec = wasm_f32x4_splat(iscale); + + // Process 16 elements per iteration + for (int j = 0, jb = 0; j < QK_K; j += 16, jb++) { + // Load and quantize 16 floats + v128_t x0 = wasm_v128_load(x_block + j); + v128_t x1 = wasm_v128_load(x_block + j + 4); + v128_t x2 = wasm_v128_load(x_block + j + 8); + v128_t x3 = wasm_v128_load(x_block + j + 12); + + v128_t q0 = wasm_f32x4_nearest(wasm_f32x4_mul(x0, scale_vec)); + v128_t q1 = wasm_f32x4_nearest(wasm_f32x4_mul(x1, scale_vec)); + v128_t q2 = wasm_f32x4_nearest(wasm_f32x4_mul(x2, scale_vec)); + v128_t q3 = wasm_f32x4_nearest(wasm_f32x4_mul(x3, scale_vec)); + + // Convert to i32 with saturation + v128_t i0 = wasm_i32x4_trunc_sat_f32x4(q0); + v128_t i1 = wasm_i32x4_trunc_sat_f32x4(q1); + v128_t i2 = wasm_i32x4_trunc_sat_f32x4(q2); + v128_t i3 = wasm_i32x4_trunc_sat_f32x4(q3); + + // Pack into 16 i8 values + v128_t i8 = wasm_i8x16_narrow_i16x8( + wasm_i16x8_narrow_i32x4(i0, i1), + wasm_i16x8_narrow_i32x4(i2, i3) + ); + wasm_v128_store(yc[i].qs + j, i8); + + // Calculate bsums using SIMD + v128_t sum16 = wasm_i16x8_add( + wasm_i16x8_extend_low_i8x16(i8), + wasm_i16x8_extend_high_i8x16(i8) + ); + v128_t sum32 = wasm_i32x4_add( + wasm_i32x4_extend_low_i16x8(sum16), + wasm_i32x4_extend_high_i16x8(sum16) + ); + sum32 = wasm_i32x4_add(sum32, wasm_i32x4_shuffle(sum32, sum32, 2, 3, 0, 1)); + sum32 = wasm_i32x4_add(sum32, wasm_i32x4_shuffle(sum32, sum32, 1, 0, 3, 2)); + yc[i].bsums[jb] = wasm_i32x4_extract_lane(sum32, 0); + } + + yc[i].d = 1.0f / iscale; + } +#else + quantize_row_q8_K_ref(x, y, k); +#endif +} + + +//===================================== Dot products ================================= + +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined __wasm_simd128__ + v128_t sumv = wasm_f32x4_splat(0.0f); + + const v128_t m4b = wasm_i8x16_splat(0x0F); + const v128_t s8b = wasm_i8x16_splat(0x8); + + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * GGML_RESTRICT x0 = &x[ib]; + const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + // Load and process x0 + v128_t v0_0 = wasm_v128_load(x0->qs); + v128_t v0_0l = wasm_v128_and(v0_0, m4b); + v128_t v0_0h = wasm_u8x16_shr(v0_0, 4); + v128_t v0_0ls = wasm_i8x16_sub(v0_0l, s8b); + v128_t v0_0hs = wasm_i8x16_sub(v0_0h, s8b); + + // Load y0 vectors + v128_t y0_l = wasm_v128_load(y0->qs); + v128_t y0_h = wasm_v128_load(y0->qs + 16); + + // Extend to i16x8 and compute dot products + v128_t dx0l = wasm_i16x8_extend_low_i8x16(v0_0ls); + v128_t dx0h = wasm_i16x8_extend_high_i8x16(v0_0ls); + v128_t dx0hl = wasm_i16x8_extend_low_i8x16(v0_0hs); + v128_t dx0hh = wasm_i16x8_extend_high_i8x16(v0_0hs); + + v128_t dy0ll = wasm_i16x8_extend_low_i8x16(y0_l); + v128_t dy0lh = 
wasm_i16x8_extend_high_i8x16(y0_l); + v128_t dy0hl = wasm_i16x8_extend_low_i8x16(y0_h); + v128_t dy0hh = wasm_i16x8_extend_high_i8x16(y0_h); + + v128_t dp0 = wasm_i32x4_add( + wasm_i32x4_add( + wasm_i32x4_dot_i16x8(dx0l, dy0ll), + wasm_i32x4_dot_i16x8(dx0h, dy0lh) + ), + wasm_i32x4_add( + wasm_i32x4_dot_i16x8(dx0hl, dy0hl), + wasm_i32x4_dot_i16x8(dx0hh, dy0hh) + ) + ); + + // Load and process x1 + v128_t v0_1 = wasm_v128_load(x1->qs); + v128_t v0_1l = wasm_v128_and(v0_1, m4b); + v128_t v0_1h = wasm_u8x16_shr(v0_1, 4); + v128_t v0_1ls = wasm_i8x16_sub(v0_1l, s8b); + v128_t v0_1hs = wasm_i8x16_sub(v0_1h, s8b); + + // Load y1 vectors + v128_t y1_l = wasm_v128_load(y1->qs); + v128_t y1_h = wasm_v128_load(y1->qs + 16); + + // Extend to i16x8 and compute dot products + v128_t dx1l = wasm_i16x8_extend_low_i8x16(v0_1ls); + v128_t dx1h = wasm_i16x8_extend_high_i8x16(v0_1ls); + v128_t dx1hl = wasm_i16x8_extend_low_i8x16(v0_1hs); + v128_t dx1hh = wasm_i16x8_extend_high_i8x16(v0_1hs); + + v128_t dy1ll = wasm_i16x8_extend_low_i8x16(y1_l); + v128_t dy1lh = wasm_i16x8_extend_high_i8x16(y1_l); + v128_t dy1hl = wasm_i16x8_extend_low_i8x16(y1_h); + v128_t dy1hh = wasm_i16x8_extend_high_i8x16(y1_h); + + v128_t dp1 = wasm_i32x4_add( + wasm_i32x4_add( + wasm_i32x4_dot_i16x8(dx1l, dy1ll), + wasm_i32x4_dot_i16x8(dx1h, dy1lh) + ), + wasm_i32x4_add( + wasm_i32x4_dot_i16x8(dx1hl, dy1hl), + wasm_i32x4_dot_i16x8(dx1hh, dy1hh) + ) + ); + + // Accumulate results with scaling + float scale0 = GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d); + float scale1 = GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d); + + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp0), wasm_f32x4_splat(scale0))); + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp1), wasm_f32x4_splat(scale1))); + } + + sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); + +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F) - 8; + const int v1 = (x[ib].qs[j] >> 4) - 8; + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); + } + + *s = sumf; +} + +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + +#if defined __wasm_simd128__ + v128_t sumv = wasm_f32x4_splat(0.0f); + + uint32_t qh_; + uint64_t tmp[4]; + + // TODO: check if unrolling this is better + for (; ib < nb; ++ib) { + const block_q5_0 * GGML_RESTRICT x0 = &x[ib]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; + + const v128_t m4b = wasm_i8x16_splat(0x0F); + + // extract the 5th bit + memcpy(&qh_, x0->qh, sizeof(qh_)); + + tmp[0] = table_b2b_1[(qh_ >> 0) & 0xFF]; + tmp[1] = table_b2b_1[(qh_ >> 8) & 0xFF]; + tmp[2] = table_b2b_1[(qh_ >> 16) & 0xFF]; + tmp[3] = table_b2b_1[(qh_ >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + 
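+        // NOTE: x0->qs packs two 4-bit quants per byte; the 5th (high) bit of
+        // each of the 32 weights comes from the 32-bit qh field. table_b2b_1
+        // expanded qh above into one byte per element that is 0x10 exactly
+        // when the high bit is clear, so the subtraction below maps the
+        // unpacked nibbles onto the signed range [-16, 15].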
+ // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) + const v128_t v0lf = wasm_i8x16_sub(v0l, qhl); + const v128_t v0hf = wasm_i8x16_sub(v0h, qhh); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + // dot product + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( + wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), + wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)))); + } + + sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16); + const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16); + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; + } + + *s = sumf; +} + +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_1); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + +#if defined __wasm_simd128__ + v128_t sumv = wasm_f32x4_splat(0.0f); + + float summs = 0.0f; + + uint32_t qh_; + uint64_t tmp[4]; + + // TODO: check if unrolling this is better + for (; ib < nb; ++ib) { + const block_q5_1 * GGML_RESTRICT x0 = &x[ib]; + const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; + + summs += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s); + + const v128_t m4b = wasm_i8x16_splat(0x0F); + + // extract the 5th bit + memcpy(&qh_, x0->qh, sizeof(qh_)); + + tmp[0] = table_b2b_0[(qh_ >> 0) & 0xFF]; + tmp[1] = table_b2b_0[(qh_ >> 8) & 0xFF]; + tmp[2] = table_b2b_0[(qh_ >> 16) & 0xFF]; + tmp[3] = table_b2b_0[(qh_ >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + // add high bit + const v128_t v0lf = wasm_v128_or(v0l, qhl); + const v128_t v0hf = wasm_v128_or(v0h, qhh); + + // load y + const 
v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + // dot product + sumv = wasm_f32x4_add(sumv, + wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), + wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)))); + } + + sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs; + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1; + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q8_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined __wasm_simd128__ + v128_t sumv = wasm_f32x4_splat(0.0f); + + for (; ib < nb; ++ib) { + const block_q8_0 * GGML_RESTRICT x0 = &x[ib]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; + + const v128_t x0_0 = wasm_v128_load(x0->qs); + const v128_t x0_1 = wasm_v128_load(x0->qs + 16); + const v128_t y0_0 = wasm_v128_load(y0->qs); + const v128_t y0_1 = wasm_v128_load(y0->qs + 16); + + // Extend 8-bit to 16-bit + const v128_t x0_0l = wasm_i16x8_extend_low_i8x16(x0_0); + const v128_t x0_0h = wasm_i16x8_extend_high_i8x16(x0_0); + const v128_t x0_1l = wasm_i16x8_extend_low_i8x16(x0_1); + const v128_t x0_1h = wasm_i16x8_extend_high_i8x16(x0_1); + + const v128_t y0_0l = wasm_i16x8_extend_low_i8x16(y0_0); + const v128_t y0_0h = wasm_i16x8_extend_high_i8x16(y0_0); + const v128_t y0_1l = wasm_i16x8_extend_low_i8x16(y0_1); + const v128_t y0_1h = wasm_i16x8_extend_high_i8x16(y0_1); + + // Compute dot products + const v128_t dx0_0 = wasm_i32x4_dot_i16x8(x0_0l, y0_0l); + const v128_t dx0_1 = wasm_i32x4_dot_i16x8(x0_0h, y0_0h); + const v128_t dx1_0 = wasm_i32x4_dot_i16x8(x0_1l, y0_1l); + const v128_t dx1_1 = wasm_i32x4_dot_i16x8(x0_1h, y0_1h); + + // Sum all dot products + const v128_t sum_dots = wasm_i32x4_add(wasm_i32x4_add(dx0_0, dx0_1), wasm_i32x4_add(dx1_0, dx1_1)); + + // Convert to float and accumulate + const float scale = 
GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d); + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(sum_dots), wasm_f32x4_splat(scale))); + } + + sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); + +#endif + for (; ib < nb; ++ib) { + int sumi = 0; + + for (int j = 0; j < qk; j++) { + sumi += x[ib].qs[j]*y[ib].qs[j]; + } + + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); + } + + *s = sumf; +} + +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q2_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __wasm_simd128__ + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + // Vectorized summs calculation + v128_t summs_vec = wasm_i32x4_splat(0); + { + v128_t sc_vec = wasm_v128_load(sc); + v128_t sc_upper = wasm_u8x16_shr(sc_vec, 4); + + v128_t sc_low = wasm_u16x8_extend_low_u8x16(sc_upper); + v128_t sc_high = wasm_u16x8_extend_high_u8x16(sc_upper); + + v128_t bsums1 = wasm_v128_load(&y[i].bsums[0]); + v128_t bsums2 = wasm_v128_load(&y[i].bsums[8]); + + summs_vec = wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(sc_low, bsums1), + wasm_i32x4_dot_i16x8(sc_high, bsums2)), + summs_vec + ); + + summs_vec = wasm_i32x4_add(summs_vec, wasm_i32x4_shuffle(summs_vec, summs_vec, 2, 3, 0, 1)); + summs_vec = wasm_i32x4_add(summs_vec, wasm_i32x4_shuffle(summs_vec, summs_vec, 1, 0, 3, 2)); + } + int32_t summs = wasm_i32x4_extract_lane(summs_vec, 0); + + // Vectorized isum calculation + int32_t isum = 0; + const uint8_t * sc_ptr = sc; + const int k_iters = QK_K/128; + + for (int k = 0; k < k_iters; ++k) { + v128_t isum_vec = wasm_i32x4_splat(0); + int shift = 0; + + for (int j = 0; j < 4; ++j) { + const int d0 = (sc_ptr[0] & 0xF); + const int d1 = (sc_ptr[1] & 0xF); + sc_ptr += 2; + + // Process first 16 elements + v128_t q2_0 = wasm_v128_load(q2); + v128_t q8_0 = wasm_v128_load(q8); + v128_t q2_shift_0 = wasm_u8x16_shr(q2_0, shift); + v128_t q2_bits_0 = wasm_v128_and(q2_shift_0, wasm_i8x16_splat(0x03)); + + // Process next 16 elements + v128_t q2_1 = wasm_v128_load(q2 + 16); + v128_t q8_1 = wasm_v128_load(q8 + 16); + v128_t q2_shift_1 = wasm_u8x16_shr(q2_1, shift); + v128_t q2_bits_1 = wasm_v128_and(q2_shift_1, wasm_i8x16_splat(0x03)); + + // Calculate dot products + v128_t p0 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q8_0), + wasm_i16x8_extend_low_i8x16(q2_bits_0) + ); + v128_t p1 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q8_0), + wasm_i16x8_extend_high_i8x16(q2_bits_0) + ); + v128_t p2 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q8_1), + wasm_i16x8_extend_low_i8x16(q2_bits_1) + ); + v128_t p3 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q8_1), + wasm_i16x8_extend_high_i8x16(q2_bits_1) + ); + + // Accumulate scaled results + v128_t scaled = wasm_i32x4_add( + wasm_i32x4_mul(wasm_i32x4_add(p0, p1), wasm_i32x4_splat(d0)), + wasm_i32x4_mul(wasm_i32x4_add(p2, p3), wasm_i32x4_splat(d1)) + ); + + isum_vec = wasm_i32x4_add(isum_vec, scaled); + q8 += 32; + shift += 2; + } + q2 += 32; + + // Horizontal sum of isum_vec + isum_vec = 
wasm_i32x4_add(isum_vec, wasm_i32x4_shuffle(isum_vec, isum_vec, 2, 3, 0, 1)); + isum_vec = wasm_i32x4_add(isum_vec, wasm_i32x4_shuffle(isum_vec, isum_vec, 1, 0, 3, 2)); + isum += wasm_i32x4_extract_lane(isum_vec, 0); + } + + const float dall = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf += dall * isum - dmin * summs; + } + + *s = sumf; + +#else + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + int summs = 0; + for (int j = 0; j < 16; ++j) { + summs += y[i].bsums[j] * (sc[j] >> 4); + } + + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + int isum = 0; + int is = 0; + int d; + for (int k = 0; k < QK_K/128; ++k) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + d = sc[is++] & 0xF; + int isuml = 0; + for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + d = sc[is++] & 0xF; + isuml = 0; + for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + shift += 2; + q8 += 32; + } + q2 += 32; + } + sumf += dall * isum - dmin * summs; + } + *s = sumf; +#endif +} + +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + const block_q3_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __wasm_simd128__ + int8_t aux8[QK_K]; + float sums[8] = {0}; + uint32_t auxs[4]; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + // Process blocks with SIMD + int8_t * a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K; j += 128) { + for (int shift = 0; shift <= 6; shift += 2) { + v128_t v_m = wasm_i8x16_splat(m); + for (int l = 0; l < 32; l += 16) { + v128_t v_q3 = wasm_v128_load(q3 + l); + v128_t v_shift = wasm_i8x16_shr(v_q3, shift); + v128_t v_low2 = wasm_v128_and(v_shift, wasm_i8x16_splat(0x03)); + + v128_t v_hm = wasm_v128_load(hm + l); + v128_t v_mask = wasm_v128_and(v_hm, v_m); + v_mask = wasm_i8x16_ne(v_mask, wasm_i8x16_splat(0)); + + v_low2 = wasm_i8x16_sub(v_low2, wasm_v128_and(wasm_i8x16_splat(4), wasm_v128_not(v_mask))); + wasm_v128_store(a + l, v_low2); + } + a += 32; + m <<= 1; + } + q3 += 32; + } + + // Extract scales + memcpy(auxs, x[i].scales, 12); + uint32_t tmp = auxs[2]; + auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + const int8_t * scales = (const int8_t *)auxs; + + // SIMD dot product with register accumulators + v128_t v_acc0 = wasm_i32x4_splat(0); + v128_t v_acc1 = wasm_i32x4_splat(0); + a = aux8; + for (int j = 0; j < QK_K/16; ++j) { + const v128_t v_scale = wasm_i16x8_splat(scales[j] - 32); + + // Process 16 elements per iteration + for (int k = 0; k < 2; ++k) { + const v128_t v_q8 = wasm_i16x8_load8x8(q8); + const v128_t v_a = wasm_i16x8_load8x8(a); + + 
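+                // NOTE: aux8 holds fully decoded q3 weights in [-4, 3], so the
+                // i16 multiplies below cannot overflow: |q8*a| <= 127*4 = 508
+                // and |q8*a*(scale-32)| <= 508*32 = 16256, within int16_t.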
v128_t v_prod = wasm_i16x8_mul(v_q8, v_a); + v_prod = wasm_i16x8_mul(v_prod, v_scale); + + v_acc0 = wasm_i32x4_add(v_acc0, wasm_i32x4_extend_low_i16x8(v_prod)); + v_acc1 = wasm_i32x4_add(v_acc1, wasm_i32x4_extend_high_i16x8(v_prod)); + + q8 += 8; + a += 8; + } + } + + // Accumulate results + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const v128_t v_d = wasm_f32x4_splat(d); + v128_t v_sum = wasm_f32x4_add( + wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc0), v_d), + wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc1), v_d) + ); + + // Accumulate into sums vector + wasm_v128_store(sums, wasm_f32x4_add(wasm_v128_load(sums), v_sum)); + } + + // Horizontal sum + v128_t v_sum = wasm_f32x4_add(wasm_v128_load(sums), wasm_v128_load(sums + 4)); + sumf = wasm_f32x4_extract_lane(v_sum, 0) + + wasm_f32x4_extract_lane(v_sum, 1) + + wasm_f32x4_extract_lane(v_sum, 2) + + wasm_f32x4_extract_lane(v_sum, 3); + + *s = sumf; + +#else + // scalar version + // This function is written like this so the compiler can manage to vectorize most of it + // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the + // manually vectorized version above. Every other version I tried would run at least 4 times slower. + // The ideal situation would be if we could just write the code once, and the compiler would + // automatically produce the best possible set of machine instructions, instead of us having to manually + // write vectorized versions for AVX, ARM_NEON, etc. + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + uint32_t auxs[4]; + const int8_t * scales = (const int8_t*)auxs; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 
0 : 4); + a += 32; m <<= 1; + q3 += 32; + } + a = aux8; + + memcpy(auxs, x[i].scales, 12); + uint32_t tmp = auxs[2]; + auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; + +#endif + +} + +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined __wasm_simd128__ + const uint8_t * scales = (const uint8_t*)&utmp[0]; + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // Corrected sign + + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + // Process scales and mins + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + // Sum mins * q8sums + int32_t sumi = 0; + const int16_t * GGML_RESTRICT q8sums = y[i].bsums; + const uint8_t * m = (const uint8_t *)&utmp[2]; + for (int j = 0; j < 16; j += 2) { + sumi += (q8sums[j] + q8sums[j+1]) * m[j/2]; + } + sumf -= dmin * sumi; + + int32_t sumi1 = 0; + int32_t sumi2 = 0; + + for (int j = 0; j < QK_K/64; ++j) { + // Load 64 4-bit weights (32 bytes) + const v128_t q4x0 = wasm_v128_load(q4); + const v128_t q4x1 = wasm_v128_load(q4 + 16); + q4 += 32; + + // Split into low/high nibbles + const v128_t q4l0 = wasm_v128_and(q4x0, wasm_i8x16_splat(0x0F)); + const v128_t q4h0 = wasm_u8x16_shr(q4x0, 4); + const v128_t q4l1 = wasm_v128_and(q4x1, wasm_i8x16_splat(0x0F)); + const v128_t q4h1 = wasm_u8x16_shr(q4x1, 4); + + // Load 64 8-bit values (64 bytes) + const v128_t q8x0 = wasm_v128_load(q8); + const v128_t q8x1 = wasm_v128_load(q8 + 16); + const v128_t q8x2 = wasm_v128_load(q8 + 32); + const v128_t q8x3 = wasm_v128_load(q8 + 48); + q8 += 64; + + // Low nibble products + v128_t vacc1 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q4l0), + wasm_i16x8_extend_low_i8x16(q8x0) + ); + vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q4l0), + wasm_i16x8_extend_high_i8x16(q8x0) + )); + vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q4l1), + wasm_i16x8_extend_low_i8x16(q8x1) + )); + vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8( + 
wasm_i16x8_extend_high_i8x16(q4l1), + wasm_i16x8_extend_high_i8x16(q8x1) + )); + + // High nibble products + v128_t vacc2 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q4h0), + wasm_i16x8_extend_low_i8x16(q8x2) + ); + vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q4h0), + wasm_i16x8_extend_high_i8x16(q8x2) + )); + vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q4h1), + wasm_i16x8_extend_low_i8x16(q8x3) + )); + vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q4h1), + wasm_i16x8_extend_high_i8x16(q8x3) + )); + + // Accumulate scaled results + int32_t vacc1_sum = wasm_i32x4_extract_lane(vacc1, 0) + wasm_i32x4_extract_lane(vacc1, 1) + + wasm_i32x4_extract_lane(vacc1, 2) + wasm_i32x4_extract_lane(vacc1, 3); + sumi1 += vacc1_sum * scales[2*j]; + + int32_t vacc2_sum = wasm_i32x4_extract_lane(vacc2, 0) + wasm_i32x4_extract_lane(vacc2, 1) + + wasm_i32x4_extract_lane(vacc2, 2) + wasm_i32x4_extract_lane(vacc2, 3); + sumi2 += vacc2_sum * scales[2*j+1]; + } + + sumf += d * (sumi1 + sumi2); + } + + *s = sumf; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + a += 32; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + a += 32; q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + 
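+    // Scale layout (inferred from the kmask1/2/3 unpacking below): the 12-byte
+    // scales field packs eight 6-bit scales and eight 6-bit mins; bytes 0..7
+    // hold the low 6 bits of the first four scales/mins and, in their top two
+    // bits, the high bits of the second four. The utmp shuffle rearranges this
+    // so that scales and mins can be read bytewise.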
uint32_t utmp[4]; + +#if defined __wasm_simd128__ + //const uint8_t * scales = (const uint8_t*)&utmp[0]; + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // Fixed sign + + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + // Process scales and mins + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + // Sum mins * q8sums + int32_t sumi_mins = 0; + const int16_t * GGML_RESTRICT q8sums = y[i].bsums; + const uint8_t * m = (const uint8_t *)&utmp[2]; + for (int j = 0; j < 16; j += 2) { + sumi_mins += (q8sums[j] + q8sums[j+1]) * m[j/2]; + } + sumf -= dmin * sumi_mins; // Correct subtraction + + v128_t qh0 = wasm_v128_load(qh); + v128_t qh1 = wasm_v128_load(qh + 16); + const uint8_t * sc = (const uint8_t *)utmp; + + int32_t sumi = 0; + + for (int j = 0; j < QK_K/64; ++j) { + const int shift = j * 2; + v128_t qh_shift0 = wasm_u8x16_shr(qh0, shift); + v128_t qh_shift1 = wasm_u8x16_shr(qh1, shift); + + v128_t qh_low0 = wasm_i8x16_shl(wasm_v128_and(qh_shift0, wasm_i8x16_splat(0x01)), 4); + v128_t qh_high0 = wasm_i8x16_shl(wasm_v128_and(qh_shift0, wasm_i8x16_splat(0x02)), 3); + v128_t qh_low1 = wasm_i8x16_shl(wasm_v128_and(qh_shift1, wasm_i8x16_splat(0x01)), 4); + v128_t qh_high1 = wasm_i8x16_shl(wasm_v128_and(qh_shift1, wasm_i8x16_splat(0x02)), 3); + + v128_t q5_0 = wasm_v128_load(q5); + v128_t q5_1 = wasm_v128_load(q5 + 16); + q5 += 32; + + v128_t q5l_0 = wasm_v128_or(wasm_v128_and(q5_0, wasm_i8x16_splat(0x0F)), qh_low0); + v128_t q5h_0 = wasm_v128_or(wasm_u8x16_shr(q5_0, 4), qh_high0); + v128_t q5l_1 = wasm_v128_or(wasm_v128_and(q5_1, wasm_i8x16_splat(0x0F)), qh_low1); + v128_t q5h_1 = wasm_v128_or(wasm_u8x16_shr(q5_1, 4), qh_high1); + + v128_t q8_0 = wasm_v128_load(q8); + v128_t q8_1 = wasm_v128_load(q8 + 16); + v128_t q8_2 = wasm_v128_load(q8 + 32); + v128_t q8_3 = wasm_v128_load(q8 + 48); + q8 += 64; + + // Process low quants + v128_t pl0 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q5l_0), + wasm_i16x8_extend_low_i8x16(q8_0) + ); + pl0 = wasm_i32x4_add(pl0, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q5l_0), + wasm_i16x8_extend_high_i8x16(q8_0) + )); + v128_t pl1 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q5l_1), + wasm_i16x8_extend_low_i8x16(q8_1) + ); + pl1 = wasm_i32x4_add(pl1, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q5l_1), + wasm_i16x8_extend_high_i8x16(q8_1) + )); + v128_t sum_low = wasm_i32x4_add(pl0, pl1); + + // Process high quants + v128_t ph0 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q5h_0), + wasm_i16x8_extend_low_i8x16(q8_2) + ); + ph0 = wasm_i32x4_add(ph0, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q5h_0), + wasm_i16x8_extend_high_i8x16(q8_2) + )); + v128_t ph1 = wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_low_i8x16(q5h_1), + wasm_i16x8_extend_low_i8x16(q8_3) + ); + ph1 = wasm_i32x4_add(ph1, wasm_i32x4_dot_i16x8( + wasm_i16x8_extend_high_i8x16(q5h_1), + wasm_i16x8_extend_high_i8x16(q8_3) + )); + v128_t sum_high = wasm_i32x4_add(ph0, ph1); + + // Accumulate with scale factors + int32_t sl = wasm_i32x4_extract_lane(sum_low, 0) + wasm_i32x4_extract_lane(sum_low, 1) + + wasm_i32x4_extract_lane(sum_low, 2) + 
wasm_i32x4_extract_lane(sum_low, 3); + int32_t sh = wasm_i32x4_extract_lane(sum_high, 0) + wasm_i32x4_extract_lane(sum_high, 1) + + wasm_i32x4_extract_lane(sum_high, 2) + wasm_i32x4_extract_lane(sum_high, 3); + + sumi += sl * sc[2*j] + sh * sc[2*j+1]; + } + + sumf += d * sumi; + } + + *s = sumf; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __wasm_simd128__ + int8_t aux8[QK_K] __attribute__((aligned(16))); + int32_t aux32[8] __attribute__((aligned(16))) = {0}; + float sums[8] __attribute__((aligned(16))) = {0}; + + for (int i = 0; i < nb; ++i) { + // Unpack 6-bit quantized data into aux8 (unchanged) + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + int8_t * a = aux8; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 
32; + } + a += 128; + q4 += 64; + qh += 32; + } + + const int8_t * GGML_RESTRICT a_ptr = aux8; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + v128_t acc0 = wasm_i32x4_splat(0); + v128_t acc1 = wasm_i32x4_splat(0); + + for (int j = 0; j < QK_K/16; ++j) { + const int scale = x[i].scales[j]; + const v128_t vscale = wasm_i32x4_splat(scale); + + // Load 16 elements from a and q8 + const v128_t a_vec = wasm_v128_load(a_ptr); + const v128_t q8_vec = wasm_v128_load(q8); + + // Process low 8 elements + v128_t a_low = wasm_i16x8_extend_low_i8x16(a_vec); + v128_t q8_low = wasm_i16x8_extend_low_i8x16(q8_vec); + v128_t prod_low = wasm_i16x8_mul(a_low, q8_low); + v128_t prod_lo_lo = wasm_i32x4_extend_low_i16x8(prod_low); + v128_t prod_lo_hi = wasm_i32x4_extend_high_i16x8(prod_low); + + // Process high 8 elements + v128_t a_high = wasm_i16x8_extend_high_i8x16(a_vec); + v128_t q8_high = wasm_i16x8_extend_high_i8x16(q8_vec); + v128_t prod_high = wasm_i16x8_mul(a_high, q8_high); + v128_t prod_hi_lo = wasm_i32x4_extend_low_i16x8(prod_high); + v128_t prod_hi_hi = wasm_i32x4_extend_high_i16x8(prod_high); + + // Scale and accumulate + prod_lo_lo = wasm_i32x4_mul(prod_lo_lo, vscale); + prod_lo_hi = wasm_i32x4_mul(prod_lo_hi, vscale); + prod_hi_lo = wasm_i32x4_mul(prod_hi_lo, vscale); + prod_hi_hi = wasm_i32x4_mul(prod_hi_hi, vscale); + + acc0 = wasm_i32x4_add(acc0, wasm_i32x4_add(prod_lo_lo, prod_hi_lo)); + acc1 = wasm_i32x4_add(acc1, wasm_i32x4_add(prod_lo_hi, prod_hi_hi)); + + a_ptr += 16; + q8 += 16; + } + + // Store accumulated results + wasm_v128_store(&aux32[0], acc0); + wasm_v128_store(&aux32[4], acc1); + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) { + sums[l] += d * aux32[l]; + } + } + + // Sum final results + float sumf = 0; + for (int l = 0; l < 8; ++l) { + sumf += sums[l]; + } + *s = sumf; + +#else + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + } + a += 128; + q4 += 64; + qh += 32; + } + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/16; ++j) { + int scale = x[i].scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + diff --git a/ggml/src/ggml-cpu/cpu-feats-x86.cpp b/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp similarity index 100% rename from ggml/src/ggml-cpu/cpu-feats-x86.cpp rename to ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c new file mode 100644 index 
0000000000000..e7527c00a8f17
--- /dev/null
+++ b/ggml/src/ggml-cpu/arch/x86/quants.c
@@ -0,0 +1,4311 @@
+#define GGML_COMMON_IMPL_C
+#include "ggml-common.h"
+#include "ggml-quants.h"
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+#include "simd-mappings.h"
+
+#include "../../quants.h"
+#include "../../ggml-cpu-impl.h"
+
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include <stdlib.h> // for qsort
+#include <stdio.h>  // for GGML_ASSERT
+
+#define GROUP_MAX_EPS 1e-15f
+#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
+#define GROUP_MAX_EPS_IQ2_S 1e-8f
+#define GROUP_MAX_EPS_IQ1_M 1e-7f
+#define GROUP_MAX_EPS_IQ1_S 1e-12f
+
+#define UNUSED GGML_UNUSED
+
+// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
+// multiply int8_t, add results pairwise twice
+static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
+    // Get absolute values of x vectors
+    const __m128i ax = _mm_sign_epi8(x, x);
+    // Sign the values of the y vectors
+    const __m128i sy = _mm_sign_epi8(y, x);
+    // Perform multiplication and create 16-bit values
+    const __m128i dot = _mm_maddubs_epi16(ax, sy);
+    const __m128i ones = _mm_set1_epi16(1);
+    return _mm_madd_epi16(ones, dot);
+}
+
+#if __AVX__ || __AVX2__ || __AVX512F__
+// horizontally add 8 floats
+static inline float hsum_float_8(const __m256 x) {
+    __m128 res = _mm256_extractf128_ps(x, 1);
+    res = _mm_add_ps(res, _mm256_castps256_ps128(x));
+    res = _mm_add_ps(res, _mm_movehl_ps(res, res));
+    res = _mm_add_ss(res, _mm_movehdup_ps(res));
+    return _mm_cvtss_f32(res);
+}
+
+// horizontally add 8 int32_t
+static inline int hsum_i32_8(const __m256i a) {
+    const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
+    const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
+    const __m128i sum64 = _mm_add_epi32(hi64, sum128);
+    const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
+    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
+}
+
+// horizontally add 4 int32_t
+static inline int hsum_i32_4(const __m128i a) {
+    const __m128i hi64 = _mm_unpackhi_epi64(a, a);
+    const __m128i sum64 = _mm_add_epi32(hi64, a);
+    const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
+    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
+}
+
+#if defined(__AVX2__) || defined(__AVX512F__)
+// spread 32 bits to 32 bytes { 0x00, 0xFF }
+static inline __m256i bytes_from_bits_32(const uint8_t * x) {
+    uint32_t x32;
+    memcpy(&x32, x, sizeof(uint32_t));
+    const __m256i shuf_mask = _mm256_set_epi64x(
+            0x0303030303030303, 0x0202020202020202,
+            0x0101010101010101, 0x0000000000000000);
+    __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask);
+    const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
+    bytes = _mm256_or_si256(bytes, bit_mask);
+    return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1));
+}
+
+// Unpack 32 4-bit fields into 32 bytes
+// The output vector contains 32 bytes, each one in [ 0 ..
15 ] interval +static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) +{ + const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); + const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp); + const __m256i lowMask = _mm256_set1_epi8( 0xF ); + return _mm256_and_si256(lowMask, bytes); +} + +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m256i x) { + const __m256i ones = _mm256_set1_epi16(1); + const __m256i summed_pairs = _mm256_madd_epi16(ones, x); + return _mm256_cvtepi32_ps(summed_pairs); +} + +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { +#if defined(__AVX512VNNI__) && defined(__AVX512VL__) + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); + return _mm256_cvtepi32_ps(summed_pairs); +#elif defined(__AVXVNNI__) + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy); + return _mm256_cvtepi32_ps(summed_pairs); +#else + // Perform multiplication and create 16-bit values + const __m256i dot = _mm256_maddubs_epi16(ax, sy); + return sum_i16_pairs_float(dot); +#endif +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { +#if __AVXVNNIINT8__ + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y); + return _mm256_cvtepi32_ps(summed_pairs); +#else + // Get absolute values of x vectors + const __m256i ax = _mm256_sign_epi8(x, x); + // Sign the values of the y vectors + const __m256i sy = _mm256_sign_epi8(y, x); + return mul_sum_us8_pairs_float(ax, sy); +#endif +} + +static inline __m128i packNibbles( __m256i bytes ) +{ + // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh +#if __AVX512F__ + const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000 + bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh + return _mm256_cvtepi16_epi8(bytes); // abcd_efgh +#else + const __m256i lowByte = _mm256_set1_epi16( 0xFF ); + __m256i high = _mm256_andnot_si256( lowByte, bytes ); + __m256i low = _mm256_and_si256( lowByte, bytes ); + high = _mm256_srli_epi16( high, 4 ); + bytes = _mm256_or_si256( low, high ); + + // Compress uint16_t lanes into bytes + __m128i r0 = _mm256_castsi256_si128( bytes ); + __m128i r1 = _mm256_extracti128_si256( bytes, 1 ); + return _mm_packus_epi16( r0, r1 ); +#endif +} +#elif defined(__AVX__) +static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) +{ + // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh + const __m128i lowByte = _mm_set1_epi16( 0xFF ); + __m128i high = _mm_andnot_si128( lowByte, bytes1 ); + __m128i low = _mm_and_si128( lowByte, bytes1 ); + high = _mm_srli_epi16( high, 4 ); + bytes1 = _mm_or_si128( low, high ); + high = _mm_andnot_si128( lowByte, bytes2 ); + low = _mm_and_si128( lowByte, bytes2 ); + high = _mm_srli_epi16( high, 4 ); + bytes2 = _mm_or_si128( low, high ); + + return _mm_packus_epi16( bytes1, bytes2); +} + +static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) { + const __m128i ax = _mm_sign_epi8(x, x); + const __m128i sy = _mm_sign_epi8(y, x); + return _mm_maddubs_epi16(ax, sy); +} + +// spread 32 bits to 32 bytes { 0x00, 0xFF } +static inline __m256i bytes_from_bits_32(const uint8_t * x) { + uint32_t x32; + memcpy(&x32, x, 
sizeof(uint32_t)); + const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); + const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202); + __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl); + __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh); + const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe); + bytesl = _mm_or_si128(bytesl, bit_mask); + bytesh = _mm_or_si128(bytesh, bit_mask); + bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1)); + bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1)); + return MM256_SET_M128I(bytesh, bytesl); +} + +// Unpack 32 4-bit fields into 32 bytes +// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval +static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) +{ + // Load 16 bytes from memory + __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi); + __m128i tmph = _mm_srli_epi16(tmpl, 4); + const __m128i lowMask = _mm_set1_epi8(0xF); + tmpl = _mm_and_si128(lowMask, tmpl); + tmph = _mm_and_si128(lowMask, tmph); + return MM256_SET_M128I(tmph, tmpl); +} + +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) { + const __m128i ones = _mm_set1_epi16(1); + const __m128i summed_pairsl = _mm_madd_epi16(ones, xl); + const __m128i summed_pairsh = _mm_madd_epi16(ones, xh); + const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl); + return _mm256_cvtepi32_ps(summed_pairs); +} + +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { + const __m128i axl = _mm256_castsi256_si128(ax); + const __m128i axh = _mm256_extractf128_si256(ax, 1); + const __m128i syl = _mm256_castsi256_si128(sy); + const __m128i syh = _mm256_extractf128_si256(sy, 1); + // Perform multiplication and create 16-bit values + const __m128i dotl = _mm_maddubs_epi16(axl, syl); + const __m128i doth = _mm_maddubs_epi16(axh, syh); + return sum_i16_pairs_float(doth, dotl); +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { + const __m128i xl = _mm256_castsi256_si128(x); + const __m128i xh = _mm256_extractf128_si256(x, 1); + const __m128i yl = _mm256_castsi256_si128(y); + const __m128i yh = _mm256_extractf128_si256(y, 1); + // Get absolute values of x vectors + const __m128i axl = _mm_sign_epi8(xl, xl); + const __m128i axh = _mm_sign_epi8(xh, xh); + // Sign the values of the y vectors + const __m128i syl = _mm_sign_epi8(yl, xl); + const __m128i syh = _mm_sign_epi8(yh, xh); + // Perform multiplication and create 16-bit values + const __m128i dotl = _mm_maddubs_epi16(axl, syl); + const __m128i doth = _mm_maddubs_epi16(axh, syh); + return sum_i16_pairs_float(doth, dotl); +} + +// larger version of mul_sum_i8_pairs_float where x and y are each represented by four 128-bit vectors +static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_1_1, const __m128i x_2_0, const __m128i x_2_1, + const __m128i y_1_0, const __m128i y_1_1, const __m128i y_2_0, const __m128i y_2_1) { + const __m128i mone = _mm_set1_epi16(1); + + const __m128i p16_1_0 = mul_add_epi8_sse(x_1_0, y_1_0); + const __m128i p16_1_1 = mul_add_epi8_sse(x_1_1, y_1_1); + const __m128i p16_2_0 = mul_add_epi8_sse(x_2_0, y_2_0); + const __m128i p16_2_1 = mul_add_epi8_sse(x_2_1, y_2_1); + const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone); + const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, 
mone); + const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone); + const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone); + const __m128i p_1 = _mm_add_epi32(p_1_0, p_1_1); + const __m128i p_2 = _mm_add_epi32(p_2_0, p_2_1); + return _mm256_cvtepi32_ps(MM256_SET_M128I(p_2, p_1)); +} + +// quad fp16 delta calculation +static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) { + // GGML_CPU_FP16_TO_FP32 is faster than Intel F16C + return _mm256_set_m128(_mm_set1_ps(GGML_CPU_FP16_TO_FP32(x1) * GGML_CPU_FP16_TO_FP32(y1)), + _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x0) * GGML_CPU_FP16_TO_FP32(y0))); +} +#endif +#elif defined(__SSSE3__) +// horizontally add 4x4 floats +static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) { + __m128 res_0 =_mm_hadd_ps(a, b); + __m128 res_1 =_mm_hadd_ps(c, d); + __m128 res =_mm_hadd_ps(res_0, res_1); + res =_mm_hadd_ps(res, res); + res =_mm_hadd_ps(res, res); + + return _mm_cvtss_f32(res); +} +#endif // __AVX__ || __AVX2__ || __AVX512F__ +#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) + +void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__AVX2__) || defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float maxScalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + const float d = maxScalar / 127.f; + y[i].d = GGML_CPU_FP32_TO_FP16(d); + const float id = ( maxScalar != 0.0f ) ? 
127.f / maxScalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + +#if defined(__AVX2__) + // Convert int32 to int16 + i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 + // Convert int16 to int8 + i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 + + // We got our precious signed bytes, but the order is now wrong + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); + + _mm256_storeu_si256((__m256i *)y[i].qs, i0); +#else + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = _mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); + _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); +#endif + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_0_ref(x, y, k); +#endif +} + +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK8_1 == 0); + const int nb = k / QK8_1; + + block_q8_1 * GGML_RESTRICT y = vy; +#if defined(__AVX2__) || defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float max_scalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + 
const float d = max_scalar / 127.f; + y[i].d = GGML_CPU_FP32_TO_FP16(d); + const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + +#if defined(__AVX2__) + // Compute the sum of the quants and set y[i].s + y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)))); + + // Convert int32 to int16 + i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 + // Convert int16 to int8 + i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 + + // We got our precious signed bytes, but the order is now wrong + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); + + _mm256_storeu_si256((__m256i *)y[i].qs, i0); +#else + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = _mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Compute the sum of the quants and set y[i].s + const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); + const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); + y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1))); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); + _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); +#endif + } +#else + GGML_UNUSED(nb); + // scalar + quantize_row_q8_1_ref(x, y, k); +#endif +} + +// placeholder implementation for Apple targets +void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { + quantize_row_q8_K_ref(x, y, k); +} + +//===================================== Dot products ================================= + +// +// Helper functions +// + +#if __AVX__ || __AVX2__ || __AVX512F__ + +// shuffles to pick the required scales in dot products +static inline __m256i get_scale_shuffle_q3k(int i) { + static const uint8_t k_shuffle[128] = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 
0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, + }; + return _mm256_loadu_si256((const __m256i*)k_shuffle + i); +} +static inline __m256i get_scale_shuffle_k4(int i) { + static const uint8_t k_shuffle[256] = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, + 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, + 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, + 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15 + }; + return _mm256_loadu_si256((const __m256i*)k_shuffle + i); +} +static inline __m128i get_scale_shuffle(int i) { + static const uint8_t k_shuffle[128] = { + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, + 10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11, + 12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13, + 14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15 + }; + return _mm_loadu_si128((const __m128i*)k_shuffle + i); +} +#endif + +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (; ib < nb; ++ib) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) ); + + __m256i qx = bytes_from_nibbles_32(x[ib].qs); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. 
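
Aside, not part of the patch: the q4_0 byte layout that bytes_from_nibbles_32 and the offset below undo, written out in scalar form. A minimal sketch mirroring the scalar tail loop at the end of this function; the helper name is ours.

    #include <stdint.h>

    // Each q4_0 byte carries two 4-bit quants biased by +8; unpacking and
    // subtracting 8 recovers the signed values in [-8 .. +7] that the SIMD
    // path above produces 32 at a time.
    static inline void q4_0_unpack_byte(uint8_t packed, int8_t * v_lo, int8_t * v_hi) {
        *v_lo = (int8_t)((packed & 0x0F) - 8); // low nibble -> element j
        *v_hi = (int8_t)((packed >>   4) - 8); // high nibble -> element j + qk/2
    }
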
+ const __m256i off = _mm256_set1_epi8( 8 ); + qx = _mm256_sub_epi8( qx, off ); + + __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs); + + const __m256 q = mul_sum_i8_pairs_float(qx, qy); + + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps( d, q, acc ); + } + + sumf = hsum_float_8(acc); +#elif defined(__AVX__) + __m256 accum = _mm256_setzero_ps(); + for (; ib + 1 < nb; ib += 2) { + const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs); + const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); + const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs); + const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1); + const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); + const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1); + + const __m128i q4b_1_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_1), _mm_set1_epi8(8)); + const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8)); + const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8)); + const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8)); + + const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0); + const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1); + const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0); + const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1); + const __m128i p_1 = _mm_add_epi16(p16_1_0, p16_1_1); + const __m128i p_2 = _mm_add_epi16(p16_2_0, p16_2_1); + const __m256 p = sum_i16_pairs_float(p_2, p_1); + + const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d); + accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum); + } + + sumf = hsum_float_8(accum); +#elif defined(__SSSE3__) + // set constants + const __m128i lowMask = _mm_set1_epi8(0xF); + const __m128i off = _mm_set1_epi8(8); + + // Initialize accumulator with zeros + __m128 acc_0 = _mm_setzero_ps(); + __m128 acc_1 = _mm_setzero_ps(); + __m128 acc_2 = _mm_setzero_ps(); + __m128 acc_3 = _mm_setzero_ps(); + + for (; ib + 1 < nb; ib += 2) { + _mm_prefetch(&x[ib] + sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[ib] + sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 0 and 1 + const __m128 d_0_1 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) ); + + const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[ib].qs); + + __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); + __m128i by_0 = _mm_loadu_si128((const __m128i *)y[ib].qs); + bx_0 = _mm_sub_epi8(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); + + __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); + __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[ib].qs + 16)); + bx_1 = _mm_sub_epi8(bx_1, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); + + _mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 2 and 3 + const __m128 d_2_3 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) ); + + const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); + + __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); + __m128i by_2 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); + bx_2 = _mm_sub_epi8(bx_2, off); + const 
__m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); + + __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); + __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[ib + 1].qs + 16)); + bx_3 = _mm_sub_epi8(bx_3, off); + const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); + + // Convert int32_t to float + __m128 p0 = _mm_cvtepi32_ps(i32_0); + __m128 p1 = _mm_cvtepi32_ps(i32_1); + __m128 p2 = _mm_cvtepi32_ps(i32_2); + __m128 p3 = _mm_cvtepi32_ps(i32_3); + + // Apply the scale + __m128 p0_d = _mm_mul_ps( d_0_1, p0 ); + __m128 p1_d = _mm_mul_ps( d_0_1, p1 ); + __m128 p2_d = _mm_mul_ps( d_2_3, p2 ); + __m128 p3_d = _mm_mul_ps( d_2_3, p3 ); + + // Accumulate + acc_0 = _mm_add_ps(p0_d, acc_0); + acc_1 = _mm_add_ps(p1_d, acc_1); + acc_2 = _mm_add_ps(p2_d, acc_2); + acc_3 = _mm_add_ps(p3_d, acc_3); + } + + sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); + +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F) - 8; + const int v1 = (x[ib].qs[j] >> 4) - 8; + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); + } + + *s = sumf; +} + +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__AVX2__) || defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + float summs = 0; + + // Main loop + for (; ib < nb; ++ib) { + const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); + const float d1 = GGML_CPU_FP16_TO_FP32(y[ib].d); + + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); + + const __m256 d0v = _mm256_set1_ps( d0 ); + const __m256 d1v = _mm256_set1_ps( d1 ); + + // Compute combined scales + const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); + + // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes + const __m256i qx = bytes_from_nibbles_32(x[ib].qs); + const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[ib].qs ); + + const __m256 xy = mul_sum_us8_pairs_float(qx, qy); + + // Accumulate d0*d1*x*y +#if defined(__AVX2__) + acc = _mm256_fmadd_ps( d0d1, xy, acc ); +#else + acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc ); +#endif + } + + sumf = hsum_float_8(acc) + summs; + +#endif + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F); + const int v1 = (x[ib].qs[j] >> 4); + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_0 *
GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (; ib < nb; ++ib) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); + + __m256i qx = bytes_from_nibbles_32(x[ib].qs); + __m256i bxhi = bytes_from_bits_32(x[ib].qh); + bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0)); + qx = _mm256_or_si256(qx, bxhi); + + __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs); + + const __m256 q = mul_sum_i8_pairs_float(qx, qy); + + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps(d, q, acc); + } + + sumf = hsum_float_8(acc); +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + __m128i mask = _mm_set1_epi8((char)0xF0); + + // Main loop + for (; ib < nb; ++ib) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); + + __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs); + const __m256i bxhi = bytes_from_bits_32(x[ib].qh); + __m128i bxhil = _mm256_castsi256_si128(bxhi); + __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); + bxhil = _mm_andnot_si128(bxhil, mask); + bxhih = _mm_andnot_si128(bxhih, mask); + __m128i bxl = _mm256_castsi256_si128(bx_0); + __m128i bxh = _mm256_extractf128_si256(bx_0, 1); + bxl = _mm_or_si128(bxl, bxhil); + bxh = _mm_or_si128(bxh, bxhih); + bx_0 = MM256_SET_M128I(bxh, bxl); + + const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0); + + /* Multiply q with scale and accumulate */ + acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc); + } + + sumf = hsum_float_8(acc); + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16); + const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16); + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; + } + + *s = sumf; +} + +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_1); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + +#if defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + float summs = 0.0f; + + // Main loop + for (; ib < nb; ++ib) { + const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d)); + + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); + + __m256i qx = bytes_from_nibbles_32(x[ib].qs); + __m256i bxhi = bytes_from_bits_32(x[ib].qh); + bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); + qx = _mm256_or_si256(qx, bxhi); + + const __m256 dy = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib].d)); + const __m256i qy = 
_mm256_loadu_si256((const __m256i *)y[ib].qs); + + const __m256 q = mul_sum_us8_pairs_float(qx, qy); + + acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc); + } + + sumf = hsum_float_8(acc) + summs; +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + __m128i mask = _mm_set1_epi8(0x10); + + float summs = 0.0f; + + // Main loop + for (; ib < nb; ++ib) { + const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d)); + + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); + + __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs); + const __m256i bxhi = bytes_from_bits_32(x[ib].qh); + __m128i bxhil = _mm256_castsi256_si128(bxhi); + __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); + bxhil = _mm_and_si128(bxhil, mask); + bxhih = _mm_and_si128(bxhih, mask); + __m128i bxl = _mm256_castsi256_si128(bx_0); + __m128i bxh = _mm256_extractf128_si256(bx_0, 1); + bxl = _mm_or_si128(bxl, bxhil); + bxh = _mm_or_si128(bxh, bxhih); + bx_0 = MM256_SET_M128I(bxh, bxl); + + const __m256 dy = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib].d)); + const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs); + + const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0); + + acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc); + } + + sumf = hsum_float_8(acc) + summs; + +#endif + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1; + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q8_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (; ib < nb; ++ib) { + // Compute combined scale for the block + const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); + __m256i qx = _mm256_loadu_si256((const __m256i *)x[ib].qs); + __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs); + + const __m256 q = mul_sum_i8_pairs_float(qx, qy); + + // Multiply q with scale and accumulate + acc = _mm256_fmadd_ps( d, q, acc ); + } + + sumf = hsum_float_8(acc); +#elif defined(__AVX__) + __m256 accum = _mm256_setzero_ps(); + + for (; ib + 1 < nb; ib += 2) { + const __m128i qx_1_0 = _mm_loadu_si128((const __m128i *)x[ib].qs); + const __m128i qx_1_1 = _mm_loadu_si128((const __m128i *)x[ib].qs + 1); + const __m128i qx_2_0 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); + const __m128i qx_2_1 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs + 1); + const __m128i qy_1_0 = _mm_loadu_si128((const __m128i *)y[ib].qs); + const __m128i qy_1_1 = _mm_loadu_si128((const __m128i *)y[ib].qs + 1); + const __m128i qy_2_0 
= _mm_loadu_si128((const __m128i *)y[ib + 1].qs); + const __m128i qy_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1); + + const __m256 p = mul_sum_i8_quad_float(qx_1_0, qx_1_1, qx_2_0, qx_2_1, qy_1_0, qy_1_1, qy_2_0, qy_2_1); + const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d); + accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum); + } + + sumf = hsum_float_8(accum); + +#endif + for (; ib < nb; ++ib) { + int sumi = 0; + + for (int j = 0; j < qk; j++) { + sumi += x[ib].qs[j]*y[ib].qs[j]; + } + + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); + } + + *s = sumf; +} + +void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_tq1_0 * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__AVX2__) + __m256 sumf = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + // 16-bit sums + __m256i sumi0 = _mm256_setzero_si256(); + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + + // first 32 bytes of 5 elements + { + __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs)); + // 8-bit multiplies with shifts, masks and adds + __m256i qx1 = _mm256_add_epi8(qx0, _mm256_add_epi8(qx0, qx0)); // 1 * 3 + __m256i qx2 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx0, 3), _mm256_set1_epi8(-8)), qx0); // 1 * 9 + __m256i qx3 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx1, 3), _mm256_set1_epi8(-8)), qx1); // 3 * 9 + __m256i qx4 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx2, 3), _mm256_set1_epi8(-8)), qx2); // 9 * 9 + + // TODO: can _mm256_mulhi_epu16 be faster even if 16-bits? 
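
Aside, not part of the patch: the ternary-digit extraction that the avg/sub sequence below vectorizes, restated as scalar C. It is the same pow3 trick used by the scalar fallback at the end of this function; the helper name is ours.

    #include <stdint.h>

    // A tq1_0 byte packs five base-3 digits in fixed point. Multiplying by
    // 3^l (wrapping mod 256) shifts digit l to the top, and (q * 3) >> 8
    // reads it out as 0, 1 or 2; subtracting 1 maps it to {-1, 0, +1}.
    static inline int tq1_0_digit(uint8_t packed, int l) {
        static const uint8_t pow3[5] = {1, 3, 9, 27, 81};
        const uint8_t  q  = (uint8_t)(packed * pow3[l]); // wrap-around is intended
        const uint16_t xi = ((uint16_t) q * 3) >> 8;     // top ternary digit
        return (int) xi - 1;
    }
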
+ + // Cancel the +1 from avg so that it behaves like a halving add + qx0 = _mm256_subs_epu8(qx0, _mm256_set1_epi8(1)); + qx1 = _mm256_subs_epu8(qx1, _mm256_set1_epi8(1)); + qx2 = _mm256_subs_epu8(qx2, _mm256_set1_epi8(1)); + qx3 = _mm256_subs_epu8(qx3, _mm256_set1_epi8(1)); + qx4 = _mm256_subs_epu8(qx4, _mm256_set1_epi8(1)); + // Multiply by 3 and get the top 2 bits + qx0 = _mm256_avg_epu8(qx0, _mm256_avg_epu8(qx0, _mm256_setzero_si256())); + qx1 = _mm256_avg_epu8(qx1, _mm256_avg_epu8(qx1, _mm256_setzero_si256())); + qx2 = _mm256_avg_epu8(qx2, _mm256_avg_epu8(qx2, _mm256_setzero_si256())); + qx3 = _mm256_avg_epu8(qx3, _mm256_avg_epu8(qx3, _mm256_setzero_si256())); + qx4 = _mm256_avg_epu8(qx4, _mm256_avg_epu8(qx4, _mm256_setzero_si256())); + qx0 = _mm256_and_si256(_mm256_srli_epi16(qx0, 6), _mm256_set1_epi8(3)); + qx1 = _mm256_and_si256(_mm256_srli_epi16(qx1, 6), _mm256_set1_epi8(3)); + qx2 = _mm256_and_si256(_mm256_srli_epi16(qx2, 6), _mm256_set1_epi8(3)); + qx3 = _mm256_and_si256(_mm256_srli_epi16(qx3, 6), _mm256_set1_epi8(3)); + qx4 = _mm256_and_si256(_mm256_srli_epi16(qx4, 6), _mm256_set1_epi8(3)); + + const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 0)); + const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 32)); + const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 64)); + const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 96)); + const __m256i qy4 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 128)); + + qx0 = _mm256_maddubs_epi16(qx0, qy0); + qx1 = _mm256_maddubs_epi16(qx1, qy1); + qx2 = _mm256_maddubs_epi16(qx2, qy2); + qx3 = _mm256_maddubs_epi16(qx3, qy3); + qx4 = _mm256_maddubs_epi16(qx4, qy4); + + sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1)); + sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3)); + sumi2 = _mm256_add_epi16(sumi2, qx4); + } + + // last 16 bytes of 5-element, along with the 4 bytes of 4 elements + { + __m128i qx0 = _mm_loadu_si128((const __m128i *) (x[i].qs + 32)); + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned + __m256i qx5_l = _mm256_cvtepu8_epi16(_mm_set1_epi32(qh)); + __m128i qx1 = _mm_add_epi8(qx0, _mm_add_epi8(qx0, qx0)); // 1 * 3 + __m128i qx2 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx0, 3), _mm_set1_epi8(-8)), qx0); // 1 * 9 + __m128i qx3 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx1, 3), _mm_set1_epi8(-8)), qx1); // 3 * 9 + __m128i qx4 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx2, 3), _mm_set1_epi8(-8)), qx2); // 9 * 9 + __m256i qx01 = MM256_SET_M128I(qx1, qx0); + __m256i qx23 = MM256_SET_M128I(qx3, qx2); + + // avx2 does not have 8-bit multiplies, so 16-bit it is.
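
Aside, not part of the patch: the widen-multiply-narrow pattern that the next statements apply to qx5_l, as a free-standing sketch. Assumes AVX2; the helper name is hypothetical.

    #include <immintrin.h>

    // With no 8-bit SIMD multiply, bytes are zero-extended to 16-bit lanes,
    // multiplied there, masked back to their low byte, and re-packed.
    static inline __m128i mul_u8_keep_low(__m128i a_u8, __m128i b_u8) {
        const __m256i a16 = _mm256_cvtepu8_epi16(a_u8);   // 16 bytes -> 16 words
        const __m256i b16 = _mm256_cvtepu8_epi16(b_u8);
        __m256i prod = _mm256_mullo_epi16(a16, b16);      // 16-bit products
        prod = _mm256_and_si256(prod, _mm256_set1_epi16(0xFF)); // keep low bytes
        return _mm_packus_epi16(_mm256_castsi256_si128(prod),   // narrow back
                                _mm256_extracti128_si256(prod, 1));
    }
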
+ qx5_l = _mm256_mullo_epi16(qx5_l, _mm256_set_epi16(27, 27, 27, 27, 9, 9, 9, 9, 3, 3, 3, 3, 1, 1, 1, 1)); + qx5_l = _mm256_and_si256(qx5_l, _mm256_set1_epi16(0xFF)); + __m128i qx5 = _mm_packus_epi16(_mm256_castsi256_si128(qx5_l), _mm256_extracti128_si256(qx5_l, 1)); + + __m256i qx45 = MM256_SET_M128I(qx5, qx4); + + // Cancel the +1 from avg so that it behaves like a halving add + qx01 = _mm256_subs_epu8(qx01, _mm256_set1_epi8(1)); + qx23 = _mm256_subs_epu8(qx23, _mm256_set1_epi8(1)); + qx45 = _mm256_subs_epu8(qx45, _mm256_set1_epi8(1)); + // Multiply by 3 and get the top 2 bits + qx01 = _mm256_avg_epu8(qx01, _mm256_avg_epu8(qx01, _mm256_setzero_si256())); + qx23 = _mm256_avg_epu8(qx23, _mm256_avg_epu8(qx23, _mm256_setzero_si256())); + qx45 = _mm256_avg_epu8(qx45, _mm256_avg_epu8(qx45, _mm256_setzero_si256())); + qx01 = _mm256_and_si256(_mm256_srli_epi16(qx01, 6), _mm256_set1_epi8(3)); + qx23 = _mm256_and_si256(_mm256_srli_epi16(qx23, 6), _mm256_set1_epi8(3)); + qx45 = _mm256_and_si256(_mm256_srli_epi16(qx45, 6), _mm256_set1_epi8(3)); + + const __m256i qy01 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 160)); + const __m256i qy23 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 192)); + const __m256i qy45 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 224)); + + qx01 = _mm256_maddubs_epi16(qx01, qy01); + qx23 = _mm256_maddubs_epi16(qx23, qy23); + qx45 = _mm256_maddubs_epi16(qx45, qy45); + + sumi0 = _mm256_add_epi16(sumi0, qx01); + sumi1 = _mm256_add_epi16(sumi1, qx23); + sumi2 = _mm256_add_epi16(sumi2, qx45); + } + + const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums); + const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d)); + + sumi0 = _mm256_sub_epi16(sumi0, ysum); + sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2)); + sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1)); + + sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf); + } + + *s = hsum_float_8(sumf); + +#else + const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243}; + + float sumf = 0.0f; + + for (int i = 0; i < nb; ++i) { + int sum = 0; + + for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) { + for (size_t l = 0; l < 5; ++l) { + for (size_t m = 0; m < 32; ++m) { + uint8_t q = x[i].qs[j + m] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[j*5 + l*32 + m]; + } + } + } + for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) { + for (size_t l = 0; l < 5; ++l) { + for (size_t m = 0; m < 16; ++m) { + uint8_t q = x[i].qs[j + m] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[j*5 + l*16 + m]; + } + } + } + + for (size_t l = 0; l < 4; ++l) { + for (size_t j = 0; j < sizeof(x->qh); ++j) { + uint8_t q = x[i].qh[j] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j]; + } + } + + sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d); + } + + *s = sumf; +#endif +} + +void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_tq2_0 * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__AVX2__) + __m256 sumf = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + // 16-bit sums, because 256*127 still fits + __m256i sumi0 = 
_mm256_setzero_si256(); + __m256i sumi1 = _mm256_setzero_si256(); + + for (size_t j = 0; j < sizeof(x->qs); j += 32) { + __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs + j)); + __m256i qx1 = _mm256_srli_epi16(qx0, 2); + __m256i qx2 = _mm256_srli_epi16(qx0, 4); + __m256i qx3 = _mm256_srli_epi16(qx0, 6); + + // 0, 1, 2 (should not be 3) + qx0 = _mm256_and_si256(qx0, _mm256_set1_epi8(3)); + qx1 = _mm256_and_si256(qx1, _mm256_set1_epi8(3)); + qx2 = _mm256_and_si256(qx2, _mm256_set1_epi8(3)); + qx3 = _mm256_and_si256(qx3, _mm256_set1_epi8(3)); + + const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 0)); + const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 32)); + const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 64)); + const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 96)); + + qx0 = _mm256_maddubs_epi16(qx0, qy0); + qx1 = _mm256_maddubs_epi16(qx1, qy1); + qx2 = _mm256_maddubs_epi16(qx2, qy2); + qx3 = _mm256_maddubs_epi16(qx3, qy3); + + sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1)); + sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3)); + } + + const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums); + const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d)); + + sumi0 = _mm256_add_epi16(sumi0, sumi1); + sumi0 = _mm256_sub_epi16(sumi0, ysum); + sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1)); + + sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf); + } + + *s = hsum_float_8(sumf); + +#else + float sumf = 0.0f; + + for (int i = 0; i < nb; ++i) { + int32_t sumi = 0; + + for (size_t j = 0; j < sizeof(x->qs); j += 32) { + for (size_t l = 0; l < 4; ++l) { + for (size_t k = 0; k < 32; ++k) { + sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1); + } + } + } + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + + sumf += (float) sumi * d; + } + + *s = sumf; +#endif +} + +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q2_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __AVX2__ + + const __m256i m3 = _mm256_set1_epi8(3); + const __m128i m4 = _mm_set1_epi8(0xF); + + __m256 acc = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); + const __m128i scales8 = _mm_and_si128(mins_and_scales, m4); + const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); + const __m256i mins = _mm256_cvtepi8_epi16(mins8); + const __m256i prod = _mm256_madd_epi16(mins, _mm256_loadu_si256((const __m256i*)y[i].bsums)); + + acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(prod), acc); + + const __m256i all_scales = _mm256_cvtepi8_epi16(scales8); + const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0); + const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1); + const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)}; + + __m256i sumi = 
_mm256_setzero_si256(); + + for (int j = 0; j < QK_K/128; ++j) { + + const __m256i q2bits = _mm256_loadu_si256((const __m256i*)q2); q2 += 32; + + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + + const __m256i q2_0 = _mm256_and_si256(q2bits, m3); + const __m256i q2_1 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), m3); + const __m256i q2_2 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), m3); + const __m256i q2_3 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), m3); + + __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0); + __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1); + __m256i p2 = _mm256_maddubs_epi16(q2_2, q8_2); + __m256i p3 = _mm256_maddubs_epi16(q2_3, q8_3); + + p0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(0)), p0); + p1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(1)), p1); + p2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(2)), p2); + p3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(3)), p3); + + p0 = _mm256_add_epi32(p0, p1); + p2 = _mm256_add_epi32(p2, p3); + + sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p0, p2)); + } + + acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); + + } + + *s = hsum_float_8(acc); + +#elif defined __AVX__ + + const __m128i m3 = _mm_set1_epi8(0x3); + const __m128i m4 = _mm_set1_epi8(0xF); + const __m128i m2 = _mm_set1_epi8(0x2); + + __m256 acc = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + // load mins and scales from block_q2_K.scales[QK_K/16] + const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); + const __m128i scales16 = _mm_and_si128(mins_and_scales, m4); + const __m128i mins16 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); + const __m128i mins_0 = _mm_cvtepi8_epi16(mins16); + const __m128i mins_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(mins16, mins16)); + + // summs = y[i].bsums * (x[i].scales >> 4) in 16bits*8*2 to 32bits*4*2 + const __m128i summs_0 = _mm_madd_epi16(mins_0, _mm_loadu_si128((const __m128i*)&y[i].bsums[0])); + const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8])); + + // sumf += -dmin * summs in 32bits*8 + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc); + + const __m128i scales_0 = _mm_cvtepi8_epi16(scales16); + const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16)); + const __m128i scales[2] = { scales_0, scales_1 }; + + __m128i sumi_0 = _mm_setzero_si128(); + __m128i sumi_1 = _mm_setzero_si128(); + + for (int j = 0; j < QK_K/128; ++j) { + + // load Q8 quants int8*16*8 from block_q8_K.qs[QK_K] + const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const 
__m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + + // load 2bits*16*8 from block_q2_K.qs[QK_K/4] + __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16; + const __m128i q2_0 = _mm_and_si128(q2bits, m3); + const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3); + const __m128i q2_4 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3); + const __m128i q2_6 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3); + q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16; + const __m128i q2_1 = _mm_and_si128(q2bits, m3); + const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3); + const __m128i q2_5 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3); + const __m128i q2_7 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3); + + // isuml = q8[l] * ((q2[l] >> shift) & 3) in 8bits*16*8 to 16bits*8*8 + __m128i p0 = _mm_maddubs_epi16(q2_0, q8_0); + __m128i p1 = _mm_maddubs_epi16(q2_1, q8_1); + __m128i p2 = _mm_maddubs_epi16(q2_2, q8_2); + __m128i p3 = _mm_maddubs_epi16(q2_3, q8_3); + __m128i p4 = _mm_maddubs_epi16(q2_4, q8_4); + __m128i p5 = _mm_maddubs_epi16(q2_5, q8_5); + __m128i p6 = _mm_maddubs_epi16(q2_6, q8_6); + __m128i p7 = _mm_maddubs_epi16(q2_7, q8_7); + + // isum += (x[i].scales[is++] & 0xF) * isuml in 16bits*8*8 to 32bits*4*8 + __m128i shuffle = _mm_set1_epi16(0x0100); + p0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p0); + shuffle = _mm_add_epi16(shuffle, m2); + p1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p1); + shuffle = _mm_add_epi16(shuffle, m2); + p2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p2); + shuffle = _mm_add_epi16(shuffle, m2); + p3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p3); + shuffle = _mm_add_epi16(shuffle, m2); + p4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p4); + shuffle = _mm_add_epi16(shuffle, m2); + p5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p5); + shuffle = _mm_add_epi16(shuffle, m2); + p6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p6); + shuffle = _mm_add_epi16(shuffle, m2); + p7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p7); + + p0 = _mm_add_epi32(p0, p1); + p2 = _mm_add_epi32(p2, p3); + p4 = _mm_add_epi32(p4, p5); + p6 = _mm_add_epi32(p6, p7); + + // isum in 32bits*4*2 + sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p0, p2)); + sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p4, p6)); + } + + // sumf += dall * isum - dmin * summs in 32bits + __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc); + } + + *s = hsum_float_8(acc); + +#else + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + int summs = 0; + for (int j = 0; j < 16; ++j) { + summs += y[i].bsums[j] * (sc[j] >> 4); + } + + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + int isum = 0; + int is = 0; + int d; + for (int k = 0; k < QK_K/128; ++k) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + d = sc[is++] & 0xF; + int isuml = 0; + for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + d = sc[is++] & 0xF; + isuml = 0; + for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + shift += 2; + q8 += 32; 
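
Aside, not part of the patch: the net effect of the scalar q2_K loop in progress here, condensed into one superblock. A sketch under the q2_K layout (16 sub-blocks of 16 values, one scale/min byte each); parameter names are ours, and sub_dot[j] stands in for the inner q2*q8 dot product.

    #include <stdint.h>

    // sc[j] packs a 4-bit scale (low nibble) and a 4-bit min (high nibble).
    // The mins collapse into a single dmin*summs correction because the
    // per-sub-block sums of q8 are precomputed in bsums.
    static inline float q2_K_superblock_dot(float dall, float dmin,
                                            const uint8_t sc[16],
                                            const int16_t bsums[16],
                                            const int     sub_dot[16]) {
        int isum = 0, summs = 0;
        for (int j = 0; j < 16; ++j) {
            isum  += (sc[j] & 0xF) * sub_dot[j];
            summs += (sc[j] >> 4)  * bsums[j];
        }
        return dall * isum - dmin * summs;
    }
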
+ } + q2 += 32; + } + sumf += dall * isum - dmin * summs; + } + *s = sumf; +#endif +} + +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + const block_q3_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __AVX2__ + + const __m256i m3 = _mm256_set1_epi8(3); + const __m256i mone = _mm256_set1_epi8(1); + const __m128i m32 = _mm_set1_epi8(32); + + __m256 acc = _mm256_setzero_ps(); + + uint32_t aux[3]; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + // Set up scales + memcpy(aux, x[i].scales, 12); + __m128i scales128 = _mm_set_epi32( + ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), + ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), + (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), + (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); + scales128 = _mm_sub_epi8(scales128, m32); + const __m256i all_scales = _mm256_cvtepi8_epi16(scales128); + const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0); + const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1); + const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)}; + + // high bit + const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask); + + // integer accumulator + __m256i sumi = _mm256_setzero_si256(); + + int bit = 0; + int is = 0; + + for (int j = 0; j < QK_K/128; ++j) { + // load low 2 bits + const __m256i q3bits = _mm256_loadu_si256((const __m256i*)q3); q3 += 32; + + // prepare low and high bits + const __m256i q3l_0 = _mm256_and_si256(q3bits, m3); + const __m256i q3h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); + ++bit; + + const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 2), m3); + const __m256i q3h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); + ++bit; + + const __m256i q3l_2 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 4), m3); + const __m256i q3h_2 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); + ++bit; + + const __m256i q3l_3 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 6), m3); + const __m256i q3h_3 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); + ++bit; + + // load Q8 quants + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + + // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16, + // and then subtract. 
The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, + // and 2 if the high bit was set) + __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0); + __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1); + __m256i q8s_2 = _mm256_maddubs_epi16(q3h_2, q8_2); + __m256i q8s_3 = _mm256_maddubs_epi16(q3h_3, q8_3); + + __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0); + __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1); + __m256i p16_2 = _mm256_maddubs_epi16(q3l_2, q8_2); + __m256i p16_3 = _mm256_maddubs_epi16(q3l_3, q8_3); + + p16_0 = _mm256_sub_epi16(p16_0, q8s_0); + p16_1 = _mm256_sub_epi16(p16_1, q8s_1); + p16_2 = _mm256_sub_epi16(p16_2, q8s_2); + p16_3 = _mm256_sub_epi16(p16_3, q8s_3); + + // multiply with scales + p16_0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0); + p16_1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1); + p16_2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2); + p16_3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3); + + // accumulate + p16_0 = _mm256_add_epi32(p16_0, p16_1); + p16_2 = _mm256_add_epi32(p16_2, p16_3); + sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_2)); + + } + + // multiply with block scale and accumulate + acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); + + } + + *s = hsum_float_8(acc); + +#elif defined __AVX__ + + const __m128i m3 = _mm_set1_epi8(3); + const __m128i mone = _mm_set1_epi8(1); + const __m128i m32 = _mm_set1_epi8(32); + const __m128i m2 = _mm_set1_epi8(2); + + __m256 acc = _mm256_setzero_ps(); + + const uint32_t *aux; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + // Set up scales + aux = (const uint32_t *)x[i].scales; + __m128i scales128 = _mm_set_epi32( + ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), + ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), + (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), + (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); + scales128 = _mm_sub_epi8(scales128, m32); + const __m128i scales_0 = _mm_cvtepi8_epi16(scales128); + const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales128, scales128)); + const __m128i scales[2] = { scales_0, scales_1 }; + + // high bit *128*2 from block_q3_K.hmask[QK_K/8] + const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].hmask[0]); + const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].hmask[16]); + + // integer accumulator + __m128i sumi_0 = _mm_setzero_si128(); + __m128i sumi_1 = _mm_setzero_si128(); + + for (int j = 0; j < QK_K/128; ++j) { + // load low 2 bits *64*2 from block_q3_K.qs[QK_K/4] + const __m128i q3bits_0 = _mm_loadu_si128((const __m128i*)q3); q3 += 16; + const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16; + + // prepare low and high bits + const int bit = j << 2; + + const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3); + const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3); + const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2); + const __m128i q3h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit)), bit), 2); + + const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 2), m3); + const __m128i q3l_3 = 
_mm_and_si128(_mm_srli_epi16(q3bits_1, 2), m3); + const __m128i q3h_2 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+1)), bit+1), 2); + const __m128i q3h_3 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+1)), bit+1), 2); + + const __m128i q3l_4 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 4), m3); + const __m128i q3l_5 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 4), m3); + const __m128i q3h_4 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+2)), bit+2), 2); + const __m128i q3h_5 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+2)), bit+2), 2); + + const __m128i q3l_6 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 6), m3); + const __m128i q3l_7 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 6), m3); + const __m128i q3h_6 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+3)), bit+3), 2); + const __m128i q3h_7 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+3)), bit+3), 2); + + // load Q8 quants from block_q8_K.qs[QK_K] + const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + + // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16, + // and then subtract. 
The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, + // and 2 if the high bit was set) + __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, q8_0); + __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, q8_1); + __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, q8_2); + __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, q8_3); + __m128i q8s_4 = _mm_maddubs_epi16(q3h_4, q8_4); + __m128i q8s_5 = _mm_maddubs_epi16(q3h_5, q8_5); + __m128i q8s_6 = _mm_maddubs_epi16(q3h_6, q8_6); + __m128i q8s_7 = _mm_maddubs_epi16(q3h_7, q8_7); + + __m128i p16_0 = _mm_maddubs_epi16(q3l_0, q8_0); + __m128i p16_1 = _mm_maddubs_epi16(q3l_1, q8_1); + __m128i p16_2 = _mm_maddubs_epi16(q3l_2, q8_2); + __m128i p16_3 = _mm_maddubs_epi16(q3l_3, q8_3); + __m128i p16_4 = _mm_maddubs_epi16(q3l_4, q8_4); + __m128i p16_5 = _mm_maddubs_epi16(q3l_5, q8_5); + __m128i p16_6 = _mm_maddubs_epi16(q3l_6, q8_6); + __m128i p16_7 = _mm_maddubs_epi16(q3l_7, q8_7); + + p16_0 = _mm_sub_epi16(p16_0, q8s_0); + p16_1 = _mm_sub_epi16(p16_1, q8s_1); + p16_2 = _mm_sub_epi16(p16_2, q8s_2); + p16_3 = _mm_sub_epi16(p16_3, q8s_3); + p16_4 = _mm_sub_epi16(p16_4, q8s_4); + p16_5 = _mm_sub_epi16(p16_5, q8s_5); + p16_6 = _mm_sub_epi16(p16_6, q8s_6); + p16_7 = _mm_sub_epi16(p16_7, q8s_7); + + // multiply with scales + __m128i shuffle = _mm_set1_epi16(0x0100); + p16_0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_0); + shuffle = _mm_add_epi16(shuffle, m2); + p16_1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_1); + shuffle = _mm_add_epi16(shuffle, m2); + p16_2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_2); + shuffle = _mm_add_epi16(shuffle, m2); + p16_3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_3); + shuffle = _mm_add_epi16(shuffle, m2); + p16_4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_4); + shuffle = _mm_add_epi16(shuffle, m2); + p16_5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_5); + shuffle = _mm_add_epi16(shuffle, m2); + p16_6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_6); + shuffle = _mm_add_epi16(shuffle, m2); + p16_7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_7); + + // accumulate + p16_0 = _mm_add_epi32(p16_0, p16_1); + p16_2 = _mm_add_epi32(p16_2, p16_3); + p16_4 = _mm_add_epi32(p16_4, p16_5); + p16_6 = _mm_add_epi32(p16_6, p16_7); + sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); + sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_4, p16_6)); + + } + + // multiply with block scale and accumulate + __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc); + + } + + *s = hsum_float_8(acc); + +#else + // scalar version + // This function is written like this so the compiler can manage to vectorize most of it + // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the + // manually vectorized version above. Every other version I tried would run at least 4 times slower. + // The ideal situation would be if we could just write the code once, and the compiler would + // automatically produce the best possible set of machine instructions, instead of us having to manually + // write vectorized versions for AVX, ARM_NEON, etc. 
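+    // Decode sketch for the scalar path (matches the loops below): each 2-bit
+    // quant from qs is paired with one hmask bit, giving values in [-4, 3]:
+    //   q  = (q3[l] >> shift) & 3;        // low 2 bits
+    //   q -= (hm[l] & m) ? 0 : 4;         // high bit clear -> subtract 4
+    // Each group of 16 quants is then weighted by its 6-bit scale minus 32,
+    // and the per-block fp16 super-scale d is applied at the end.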
+ + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + uint32_t auxs[4]; + const int8_t * scales = (const int8_t*)auxs; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + q3 += 32; + } + a = aux8; + + memcpy(auxs, x[i].scales, 12); + uint32_t tmp = auxs[2]; + auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; + +#endif + +} + +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined __AVX2__ + + const __m256i m4 = _mm256_set1_epi8(0xF); + + __m256 acc = _mm256_setzero_ps(); + __m128 acc_m = _mm_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); + + const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums); + const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1)); + const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s); + acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), 
_mm_cvtepi32_ps(prod), acc_m); + + const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0); + const __m256i scales = MM256_SET_M128I(sc128, sc128); + + __m256i sumi = _mm256_setzero_si256(); + + for (int j = 0; j < QK_K/64; ++j) { + + const __m256i scale_l = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0)); + const __m256i scale_h = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1)); + + const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); q4 += 32; + const __m256i q4l = _mm256_and_si256(q4bits, m4); + const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4); + + const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + __m256i p16l = _mm256_maddubs_epi16(q4l, q8l); + p16l = _mm256_madd_epi16(scale_l, p16l); + + const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + __m256i p16h = _mm256_maddubs_epi16(q4h, q8h); + p16h = _mm256_madd_epi16(scale_h, p16h); + const __m256i sumj = _mm256_add_epi32(p16l, p16h); + + sumi = _mm256_add_epi32(sumi, sumj); + } + + __m256 vd = _mm256_set1_ps(d); + acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc); + + } + + acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m)); + acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m)); + + *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m); + +#elif defined __AVX__ + + const __m128i m4 = _mm_set1_epi8(0xF); + const __m128i m2 = _mm_set1_epi8(0x2); + + __m256 acc = _mm256_setzero_ps(); + __m128 acc_m = _mm_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]); + const __m128i scales = _mm_cvtepu8_epi16(utmps); + const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps)); + + const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]); + const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]); + const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1); + const __m128i prod = _mm_madd_epi16(mins, q8s); + acc_m = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod)), acc_m); + + __m128i sumi_0 = _mm_setzero_si128(); + __m128i sumi_1 = _mm_setzero_si128(); + + __m128i shuffle = _mm_set1_epi16(0x0100); + for (int j = 0; j < QK_K/64; ++j) { + + const __m128i scale_l = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi16(shuffle, m2); + const __m128i scale_h = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi16(shuffle, m2); + + __m128i q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + const __m128i q4l_0 = _mm_and_si128(q4bits, m4); + const __m128i q4h_0 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4); + q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + const __m128i q4l_1 = _mm_and_si128(q4bits, m4); + const __m128i q4h_1 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4); + + const __m128i q8l_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + __m128i p16l = _mm_maddubs_epi16(q4l_0, q8l_0); + p16l = _mm_madd_epi16(scale_l, p16l); + sumi_0 = _mm_add_epi32(sumi_0, p16l); + const __m128i q8l_1 = _mm_loadu_si128((const __m128i*)q8); 
q8 += 16; + p16l = _mm_maddubs_epi16(q4l_1, q8l_1); + p16l = _mm_madd_epi16(scale_l, p16l); + sumi_1 = _mm_add_epi32(sumi_1, p16l); + + const __m128i q8h_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + __m128i p16h = _mm_maddubs_epi16(q4h_0, q8h_0); + p16h = _mm_madd_epi16(scale_h, p16h); + sumi_0 = _mm_add_epi32(sumi_0, p16h); + const __m128i q8h_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + p16h = _mm_maddubs_epi16(q4h_1, q8h_1); + p16h = _mm_madd_epi16(scale_h, p16h); + sumi_1 = _mm_add_epi32(sumi_1, p16h); + + } + + __m256 vd = _mm256_set1_ps(d); + __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); + acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc); + + } + + acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m)); + acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m)); + + *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m); + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + a += 32; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + a += 32; q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + +#if defined __AVX2__ + + const __m256i m4 = _mm256_set1_epi8(0xF); + const __m128i mzero = _mm_setzero_si128(); + const __m256i mone = _mm256_set1_epi8(1); + + __m256 acc = _mm256_setzero_ps(); + + float summs = 0.f; + + for (int i = 0; i < 
nb; ++i) { + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); + + const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums); + const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1)); + const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s); + const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero); + summs += dmin * _mm_extract_epi32(hsum, 0); + + const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0); + const __m256i scales = MM256_SET_M128I(sc128, sc128); + + const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh); + __m256i hmask = mone; + + __m256i sumi = _mm256_setzero_si256(); + + int bit = 0; + + for (int j = 0; j < QK_K/64; ++j) { + + const __m256i scale_0 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0)); + const __m256i scale_1 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1)); + + const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); q5 += 32; + + const __m256i q5l_0 = _mm256_and_si256(q5bits, m4); + const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4); + const __m256i q5_0 = _mm256_add_epi8(q5l_0, q5h_0); + hmask = _mm256_slli_epi16(hmask, 1); + + const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4); + const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4); + const __m256i q5_1 = _mm256_add_epi8(q5l_1, q5h_1); + hmask = _mm256_slli_epi16(hmask, 1); + + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + + __m256i p16_0 = _mm256_maddubs_epi16(q5_0, q8_0); + __m256i p16_1 = _mm256_maddubs_epi16(q5_1, q8_1); + + p16_0 = _mm256_madd_epi16(scale_0, p16_0); + p16_1 = _mm256_madd_epi16(scale_1, p16_1); + + sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1)); + + } + + __m256 vd = _mm256_set1_ps(d); + acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc); + + } + + *s = hsum_float_8(acc) + summs; + +#elif defined __AVX__ + + const __m128i m4 = _mm_set1_epi8(0xF); + const __m128i mzero = _mm_setzero_si128(); + const __m128i mone = _mm_set1_epi8(1); + const __m128i m2 = _mm_set1_epi8(2); + + __m256 acc = _mm256_setzero_ps(); + + float summs = 0.f; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]); + const __m128i scales = _mm_cvtepu8_epi16(utmps); + const __m128i 
mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps)); + + const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]); + const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]); + const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1); + const __m128i prod = _mm_madd_epi16(mins, q8s); + const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero); + summs += dmin * _mm_extract_epi32(hsum, 0); + + const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].qh[0]); + const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].qh[16]); + __m128i hmask = mone; + + __m128i sumi_0 = _mm_setzero_si128(); + __m128i sumi_1 = _mm_setzero_si128(); + + int bit = 0; + + __m128i shuffle = _mm_set1_epi16(0x0100); + for (int j = 0; j < QK_K/64; ++j) { + + const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi16(shuffle, m2); + const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi16(shuffle, m2); + + const __m128i q5bits_0 = _mm_loadu_si128((const __m128i*)q5); q5 += 16; + const __m128i q5bits_1 = _mm_loadu_si128((const __m128i*)q5); q5 += 16; + + __m128i q5l_0 = _mm_and_si128(q5bits_0, m4); + __m128i q5l_1 = _mm_and_si128(q5bits_1, m4); + __m128i q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4); + __m128i q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4); + __m128i q5_0 = _mm_add_epi8(q5l_0, q5h_0); + __m128i q5_1 = _mm_add_epi8(q5l_1, q5h_1); + hmask = _mm_slli_epi16(hmask, 1); + + __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + __m128i p16_0 = _mm_maddubs_epi16(q5_0, q8_0); + __m128i p16_1 = _mm_maddubs_epi16(q5_1, q8_1); + p16_0 = _mm_madd_epi16(scale_0, p16_0); + p16_1 = _mm_madd_epi16(scale_0, p16_1); + + q5l_0 = _mm_and_si128(_mm_srli_epi16(q5bits_0, 4), m4); + q5l_1 = _mm_and_si128(_mm_srli_epi16(q5bits_1, 4), m4); + q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4); + q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4); + q5_0 = _mm_add_epi8(q5l_0, q5h_0); + q5_1 = _mm_add_epi8(q5l_1, q5h_1); + hmask = _mm_slli_epi16(hmask, 1); + + q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + __m128i p16_2 = _mm_maddubs_epi16(q5_0, q8_0); + __m128i p16_3 = _mm_maddubs_epi16(q5_1, q8_1); + p16_2 = _mm_madd_epi16(scale_1, p16_2); + p16_3 = _mm_madd_epi16(scale_1, p16_3); + + sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); + sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3)); + + } + + __m256 vd = _mm256_set1_ps(d); + __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); + acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc); + + } + + *s = hsum_float_8(acc) + summs; + +#else + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 
16 : 0); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __AVX2__ + + const __m256i m4 = _mm256_set1_epi8(0xF); + const __m256i m2 = _mm256_set1_epi8(3); + const __m256i m32s = _mm256_set1_epi8(32); + + __m256 acc = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); + + __m256i sumi = _mm256_setzero_si256(); + + int is = 0; + + for (int j = 0; j < QK_K/128; ++j) { + + const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0)); + const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1)); + const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2)); + const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3)); + is += 4; + + const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32; + const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32; + const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32; + + const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4); + const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4); + const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4); + const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4); + + const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0); + const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1); + const __m256i q4_2 = 
_mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2); + const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3); + + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + + __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0); + __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1); + __m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2); + __m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3); + + __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0); + __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1); + __m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2); + __m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3); + + p16_0 = _mm256_sub_epi16(p16_0, q8s_0); + p16_1 = _mm256_sub_epi16(p16_1, q8s_1); + p16_2 = _mm256_sub_epi16(p16_2, q8s_2); + p16_3 = _mm256_sub_epi16(p16_3, q8s_3); + + p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0); + p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1); + p16_2 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_2), p16_2); + p16_3 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_3), p16_3); + + sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1)); + sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_2, p16_3)); + + } + + acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); + } + + *s = hsum_float_8(acc); + +#elif defined __AVX__ + + const __m128i m3 = _mm_set1_epi8(3); + const __m128i m15 = _mm_set1_epi8(15); + + __m256 acc = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + // handle the q6_k -32 offset separately using bsums + const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums); + const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)y[i].bsums + 1); + const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); + const __m128i scales_16_0 = _mm_cvtepi8_epi16(scales); + const __m128i scales_16_1 = _mm_cvtepi8_epi16(_mm_bsrli_si128(scales, 8)); + const __m128i q8sclsub_0 = _mm_slli_epi32(_mm_madd_epi16(q8sums_0, scales_16_0), 5); + const __m128i q8sclsub_1 = _mm_slli_epi32(_mm_madd_epi16(q8sums_1, scales_16_1), 5); + + __m128i sumi_0 = _mm_setzero_si128(); + __m128i sumi_1 = _mm_setzero_si128(); + + int is = 0; + + for (int j = 0; j < QK_K/128; ++j) { + + const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16; + const __m128i q4bitsH_1 = _mm_loadu_si128((const __m128i*)qh); qh += 16; + + const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4); + const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4); + const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(12)), 2); + const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(12)), 2); + const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(48)); + const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(48)); + const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(-64)), 2); + const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(-64)), 2); + + const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; 
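+            // Three more 16-byte loads follow, completing the 64 low-nibble bytes
+            // of this 128-quant chunk. Each 6-bit quant is rebuilt below as
+            //   (ql & 0xF) | (high 2 bits << 4)
+            // with the implicit -32 offset folded in at the end through the
+            // bsums-based q8sclsub correction computed above.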
+ const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + + const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0); + const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1); + const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2); + const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3); + const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4); + const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5); + const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6); + const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7); + + const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + + __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0); + __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1); + __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2); + __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3); + __m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4); + __m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5); + __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6); + __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7); + + const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0)); + const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1)); + const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2)); + const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3)); + is += 4; + + p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0); + p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_0, 8)), p16_1); + p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2); + p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_1, 8)), p16_3); + p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4); + p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_2, 8)), p16_5); + p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6); + p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_3, 8)), p16_7); + + sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); + sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3)); + sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6)); + sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7)); + + } + + sumi_0 = _mm_sub_epi32(sumi_0, q8sclsub_0); + sumi_1 = _mm_sub_epi32(sumi_1, q8sclsub_1); + const __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)), acc); + } + + *s = hsum_float_8(acc); + +#else + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = 
x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + } + a += 128; + q4 += 64; + qh += 32; + } + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/16; ++j) { + int scale = x[i].scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +#if defined (__AVX__) || defined (__AVX2__) +static const int8_t keven_signs_q2xs[1024] = { + 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, + 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, + 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1, + 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1, + 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, + 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1, + 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1, + 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, + 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, + 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1, + 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, + 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, + 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1, + 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1, + 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1, + 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1, + 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, + 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, + 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1, + 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, + 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1, + 
1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1, + 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1, + 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, + 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1, + 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1, + 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1, + 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, + 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1, + 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, + 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, + 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, +}; +#endif + +void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__AVX2__) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[4]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; + const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]); + const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]); + const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], + signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); + const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127], + signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]); + const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); + const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2); + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + const uint16_t ls1 = aux32[1] >> 28; + const uint16_t ls2 = aux32[3] >> 28; + const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); + const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); + sumi1 = _mm256_add_epi32(sumi1, p1); + sumi2 = _mm256_add_epi32(sumi2, 
p2); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + +#elif defined(__AVX__) + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[4]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + __m128i sumi2_0 = _mm_setzero_si128(); + __m128i sumi2_1 = _mm_setzero_si128(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; + const __m128i q2_1_0 = _mm_set_epi64x(iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]); + const __m128i q2_1_1 = _mm_set_epi64x(iq2xxs_grid[aux8[3]], iq2xxs_grid[aux8[2]]); + const __m128i q2_2_0 = _mm_set_epi64x(iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]); + const __m128i q2_2_1 = _mm_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]]); + const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); + const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]); + const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]); + const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127]); + const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0); + const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1); + const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0); + const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1); + const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); + const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); + const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); + const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); + const uint16_t ls1 = aux32[1] >> 28; + const uint16_t ls2 = aux32[3] >> 28; + const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1)); + const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1)); + const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1)); + const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1)); + sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); + sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); + sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); + sumi2_1 = _mm_add_epi32(sumi2_1, p2_1); + } + + accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + +#else + + uint32_t aux32[2]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(aux32, q2, 2*sizeof(uint32_t)); + q2 += 4; + const uint32_t ls = 2*(aux32[1] >> 28) + 1; + 
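+            // Block layout sketch: aux32[0] carries four 8-bit indices into
+            // iq2xxs_grid, while aux32[1] packs 4*7 sign bits plus a 4-bit scale
+            // in the top nibble. The scale enters as 2*s + 1; the shared 1/8
+            // factor is deferred to the final 0.125f multiply.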
int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); + const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} + +void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__AVX2__) + + const __m256i mone = _mm256_set1_epi8(1); + static const char block_sign_shuffle_mask_1[32] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + }; + static const char block_sign_shuffle_mask_2[32] = { + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, + 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, + }; + static const uint8_t bit_selector_mask_bytes[32] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes); + const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1); + const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2); + + static const uint8_t k_bit_helper[32] = { + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + }; + const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper); + const __m256i m511 = _mm256_set1_epi16(511); + const __m128i m4 = _mm_set1_epi8(0xf); + const __m128i m1 = _mm_set1_epi8(1); + + uint64_t aux64; + + // somewhat hacky, but gives a significant boost in performance + __m256i aux_gindex; + const uint16_t * gindex = (const uint16_t *)&aux_gindex; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + memcpy(&aux64, x[i].scales, 8); + __m128i stmp = _mm_set1_epi64x(aux64); + stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4)); + const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1); + + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) { + + const __m256i q2_data = _mm256_loadu_si256((const __m256i*)q2); q2 += 16; + aux_gindex = _mm256_and_si256(q2_data, m511); + + const __m256i partial_sign_bits = _mm256_srli_epi16(q2_data, 9); + const __m256i partial_sign_bits_upper = _mm256_srli_epi16(q2_data, 13); + const __m256i partial_sign_bits_for_counting = _mm256_xor_si256(partial_sign_bits, partial_sign_bits_upper); + + const __m256i odd_bits = 
_mm256_shuffle_epi8(bit_helper, partial_sign_bits_for_counting); + const __m256i full_sign_bits = _mm256_or_si256(partial_sign_bits, odd_bits); + + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_3 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_4 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + + const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]], + iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]); + const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]], + iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]); + const __m256i q2_3 = _mm256_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]], + iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]); + const __m256i q2_4 = _mm256_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]], + iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]); + + const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits); + const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1); + const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l); + const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h); + + __m256i signs; + signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1); + signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone)); + + signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_2); + signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone)); + + signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_1); + signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_3 = _mm256_sign_epi8(q8_3, _mm256_or_si256(signs, mone)); + + signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_2); + signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_4 = _mm256_sign_epi8(q8_4, _mm256_or_si256(signs, mone)); + + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + const __m256i dot3 = _mm256_maddubs_epi16(q2_3, q8s_3); + const __m256i dot4 = _mm256_maddubs_epi16(q2_4, q8s_4); + + const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0))); + const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1))); + const __m256i sc3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2))); + const __m256i sc4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3))); + + sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1)); + sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2)); + sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot3, sc3)); + sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot4, sc4)); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + +#elif defined(__AVX__) + const __m128i mone = _mm_set1_epi8(1); + static const char block_sign_shuffle_mask_1[32] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x04, 
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + }; + static const char block_sign_shuffle_mask_2[32] = { + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, + 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, + }; + static const uint8_t bit_selector_mask_bytes[32] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m128i bit_selector_mask_0 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes); + const __m128i bit_selector_mask_1 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes + 1); + const __m128i block_sign_shuffle_1_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1); + const __m128i block_sign_shuffle_1_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1 + 1); + const __m128i block_sign_shuffle_2_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2); + const __m128i block_sign_shuffle_2_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2 + 1); + + static const uint8_t k_bit_helper[32] = { + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + }; + const __m128i bit_helper_0 = _mm_loadu_si128((const __m128i*)k_bit_helper); + const __m128i bit_helper_1 = _mm_loadu_si128((const __m128i*)k_bit_helper + 1); + const __m128i m511 = _mm_set1_epi16(511); + const __m128i m4 = _mm_set1_epi8(0xf); + const __m128i m1 = _mm_set1_epi8(1); + + uint64_t aux64; + + // somewhat hacky, but gives a significant boost in performance + __m256i aux_gindex; + const uint16_t * gindex = (const uint16_t *)&aux_gindex; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + memcpy(&aux64, x[i].scales, 8); + __m128i stmp = _mm_set1_epi64x(aux64); + stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4)); + const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1); + + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + __m128i sumi2_0 = _mm_setzero_si128(); + __m128i sumi2_1 = _mm_setzero_si128(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) { + + const __m128i q2_data_0 = _mm_loadu_si128((const __m128i*)q2); + const __m128i q2_data_1 = _mm_loadu_si128((const __m128i*)q2 + 1); q2 += 16; + aux_gindex = MM256_SET_M128I(_mm_and_si128(q2_data_1, m511), _mm_and_si128(q2_data_0, m511)); + + const __m128i partial_sign_bits_0 = _mm_srli_epi16(q2_data_0, 9); + const __m128i partial_sign_bits_1 = _mm_srli_epi16(q2_data_1, 9); + const __m128i partial_sign_bits_upper_0 = _mm_srli_epi16(q2_data_0, 13); + const __m128i partial_sign_bits_upper_1 = _mm_srli_epi16(q2_data_1, 13); + const __m128i partial_sign_bits_for_counting_0 = _mm_xor_si128(partial_sign_bits_0, partial_sign_bits_upper_0); + const __m128i partial_sign_bits_for_counting_1 = _mm_xor_si128(partial_sign_bits_1, partial_sign_bits_upper_1); + + const __m128i odd_bits_0 = _mm_shuffle_epi8(bit_helper_0, partial_sign_bits_for_counting_0); + const __m128i odd_bits_1 = _mm_shuffle_epi8(bit_helper_1, partial_sign_bits_for_counting_1); + 
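+            // iq2_xs stores only 7 of the 8 sign bits per group of 8 quants; the
+            // 8th is implied by even sign parity. XOR-ing bits 9..12 with bits
+            // 13..15 folds the 7 stored bits into 4, and k_bit_helper is a 4-bit
+            // popcount-parity table, so the shuffle yields 0x80 (the missing sign
+            // bit) exactly when the stored parity is odd.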
const __m128i full_sign_bits_0 = _mm_or_si128(partial_sign_bits_0, odd_bits_0); + const __m128i full_sign_bits_1 = _mm_or_si128(partial_sign_bits_1, odd_bits_1); + + const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_3_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_3_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_4_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_4_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + + const __m128i q2_1_0 = _mm_set_epi64x(iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]); + const __m128i q2_1_1 = _mm_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]]); + const __m128i q2_2_0 = _mm_set_epi64x(iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]); + const __m128i q2_2_1 = _mm_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]]); + const __m128i q2_3_0 = _mm_set_epi64x(iq2xs_grid[gindex[9]], iq2xs_grid[gindex[8]]); + const __m128i q2_3_1 = _mm_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]]); + const __m128i q2_4_0 = _mm_set_epi64x(iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]); + const __m128i q2_4_1 = _mm_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]]); + + // AVX2 full_signs_1 is full_sign_bits_0 here + // AVX2 full_signs_2 is full_sign_bits_1 here + __m128i signs_0, signs_1; + signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_0); + signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_1); + signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); + signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); + const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, _mm_or_si128(signs_0, mone)); + const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, _mm_or_si128(signs_1, mone)); + + signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_0); + signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_1); + signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); + signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); + const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, _mm_or_si128(signs_0, mone)); + const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, _mm_or_si128(signs_1, mone)); + + signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_0); + signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_1); + signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); + signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); + const __m128i q8s_3_0 = _mm_sign_epi8(q8_3_0, _mm_or_si128(signs_0, mone)); + const __m128i q8s_3_1 = _mm_sign_epi8(q8_3_1, _mm_or_si128(signs_1, mone)); + + signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_0); + signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_1); + signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); + signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); + const __m128i q8s_4_0 = _mm_sign_epi8(q8_4_0, _mm_or_si128(signs_0, mone)); + const __m128i q8s_4_1 = _mm_sign_epi8(q8_4_1, _mm_or_si128(signs_1, mone)); + + const __m128i 
dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); + const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); + const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); + const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); + const __m128i dot3_0 = _mm_maddubs_epi16(q2_3_0, q8s_3_0); + const __m128i dot3_1 = _mm_maddubs_epi16(q2_3_1, q8s_3_1); + const __m128i dot4_0 = _mm_maddubs_epi16(q2_4_0, q8s_4_0); + const __m128i dot4_1 = _mm_maddubs_epi16(q2_4_1, q8s_4_1); + + __m128i sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0)); + const __m128i sc1_0 = _mm_cvtepi8_epi16(sc_tmp); + const __m128i sc1_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); + sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)); + const __m128i sc2_0 = _mm_cvtepi8_epi16(sc_tmp); + const __m128i sc2_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); + sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2)); + const __m128i sc3_0 = _mm_cvtepi8_epi16(sc_tmp); + const __m128i sc3_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); + sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3)); + const __m128i sc4_0 = _mm_cvtepi8_epi16(sc_tmp); + const __m128i sc4_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); + + sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot1_0, sc1_0)); + sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot1_1, sc1_1)); + sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot2_0, sc2_0)); + sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot2_1, sc2_1)); + sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot3_0, sc3_0)); + sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot3_1, sc3_1)); + sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot4_0, sc4_0)); + sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot4_1, sc4_1)); + } + + accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + +#else + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1; + const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls1; + sumi = 0; + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? 
-1 : 1); + } + q8 += 8; + } + bsum += sumi * ls2; + q2 += 4; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} + +void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__AVX2__) + + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m128i m4 = _mm_set1_epi8(0xf); + const __m128i m1 = _mm_set1_epi8(1); + + const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1); + const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2); + + uint64_t aux64; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + memcpy(&aux64, x[i].scales, 8); + const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1); + const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15 + + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q2_1 = _mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)], + iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)], + iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)], + iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]); + const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)], + iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)], + iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)], + iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]); + qs += 8; + + __m256i aux256 = _mm256_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16)); + aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); + const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2); + const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1); + + aux256 = _mm256_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16)); + aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); + const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2); + const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2); + + signs += 4; + + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1 + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3 + + const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0))); + const __m256i p2 = _mm256_madd_epi16(dot2, 
_mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1))); + sumi1 = _mm256_add_epi32(sumi1, p1); + sumi2 = _mm256_add_epi32(sumi2, p2); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + +#elif defined(__AVX__) + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m128i m4 = _mm_set1_epi8(0xf); + const __m128i m1 = _mm_set1_epi8(1); + + const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1); + const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1); + const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2); + const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1); + + uint64_t aux64; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + memcpy(&aux64, x[i].scales, 8); + const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1); + const __m128i scales16_0 = _mm_cvtepi8_epi16(scales8); + const __m128i scales16_1 = _mm_cvtepi8_epi16(_mm_srli_si128(scales8, 8)); + + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + __m128i sumi2_0 = _mm_setzero_si128(); + __m128i sumi2_1 = _mm_setzero_si128(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q2_1_0 = _mm_set_epi64x(iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)], + iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]); + const __m128i q2_1_1 = _mm_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)], + iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)]); + const __m128i q2_2_0 = _mm_set_epi64x(iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)], + iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]); + const __m128i q2_2_1 = _mm_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)], + iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)]); + qs += 8; + + __m128i aux128_0 = _mm_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16)); + __m128i aux128_1 = aux128_0; + aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); + aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); + const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); + const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); + const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0); + const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1); + + aux128_0 = _mm_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16)); + aux128_1 = aux128_0; + aux128_0 = 
_mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); + aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); + const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); + const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); + const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0); + const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1); + + signs += 4; + + const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); + const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); + const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); + const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); + + const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 0))); + const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 1))); + const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 0))); + const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 1))); + sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); + sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); + sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); + sumi2_1 = _mm_add_epi32(sumi2_1, p2_1); + } + + accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + +#else + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint8_t * signs = qs + QK_K/8; + + int bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf); + int ls2 = 1 + 2*(x[i].scales[ib32] >> 4); + int sumi1 = 0, sumi2 = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); + for (int j = 0; j < 8; ++j) { + sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); + for (int j = 0; j < 8; ++j) { + sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? 
-1 : 1); + } + q8 += 8; + } + bsum += ls1 * sumi1 + ls2 * sumi2; + qs += 4; + signs += 4; + } + + sumf += d * bsum; + } + + *s = 0.125f * sumf; + +#endif + +} + +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__AVX2__) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[2]; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], + iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); + q3 += 8; + const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], + iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); + q3 += 8; + memcpy(aux32, gas, 8); gas += 8; + const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127], + signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]); + const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], + signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); + const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); + const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2); + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + const uint16_t ls1 = aux32[0] >> 28; + const uint16_t ls2 = aux32[1] >> 28; + const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); + const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); + sumi1 = _mm256_add_epi32(sumi1, p1); + sumi2 = _mm256_add_epi32(sumi2, p2); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = 0.25f * hsum_float_8(accumf); + +#elif defined(__AVX__) + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[2]; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + __m128i sumi2_0 = _mm_setzero_si128(); + __m128i sumi2_1 = _mm_setzero_si128(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + 
const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q2_1_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); + const __m128i q2_1_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]); + q3 += 8; + const __m128i q2_2_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); + const __m128i q2_2_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]); + q3 += 8; + memcpy(aux32, gas, 8); gas += 8; + const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]); + const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127]); + const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); + const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]); + const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0); + const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1); + const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0); + const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1); + const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); + const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); + const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); + const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); + const uint16_t ls1 = aux32[0] >> 28; + const uint16_t ls2 = aux32[1] >> 28; + const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1)); + const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1)); + const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1)); + const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1)); + sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); + sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); + sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); + sumi2_1 = _mm_add_epi32(sumi2_1, p2_1); + } + + accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); + + } + + *s = 0.25f * hsum_float_8(accumf); + +#else + + uint32_t aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t); + const uint32_t ls = 2*(aux32 >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]); + const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]); + const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127]; + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? 
-1 : 1); + } + q8 += 8; + } + q3 += 8; + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.25f * sumf; +#endif +} + +void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined(__AVX2__) + + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1); + const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2); + + const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + const __m256i idx_mask = _mm256_set1_epi32(256); + + typedef union { + __m256i vec[2]; + uint32_t index[16]; + } index_t; + + index_t idx; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16; + idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]); + idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]); + idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask); + idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask); + idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l))); + idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1))); + + // At least on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
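+ // A plausible explanation (not verified here): iq3s_grid is small enough to stay hot in L1, so the eight scalar loads behind each _mm256_set_epi32 schedule cheaply, while vpgatherdd decodes to many uops on this core. + // In scalar form the two lookups below are simply: + // uint32_t q2[16]; + // for (int k = 0; k < 16; ++k) q2[k] = iq3s_grid[idx.index[k]]; + // with q2[0..7] packed into q2_1 and q2[8..15] into q2_2.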
+ //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4); + //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4); + const __m256i q2_1 = _mm256_set_epi32( + iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]], + iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]] + ); + const __m256i q2_2 = _mm256_set_epi32( + iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]], + iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]] + ); + + __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16)); + aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); + const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2); + const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1); + + aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16)); + aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); + const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2); + const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2); + + signs += 4; + + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + const uint16_t ls1 = x[i].scales[ib32/2] & 0xf; + const uint16_t ls2 = x[i].scales[ib32/2] >> 4; + const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); + const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); + sumi1 = _mm256_add_epi32(sumi1, p1); + sumi2 = _mm256_add_epi32(sumi2, p2); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = hsum_float_8(accumf); + +#elif defined(__AVX__) + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1); + const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1); + const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2); + const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1); + + const __m128i idx_mul_0 = _mm_set_epi32(32, 64, 128, 256); + const __m128i idx_mul_1 = _mm_set_epi32(2, 4, 8, 16); + const __m128i idx_mask = _mm_set1_epi32(256); + + typedef union { + __m128i vec[4]; + uint32_t index[16]; + } index_t; + + index_t idx; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + __m128i sumi2_0 = _mm_setzero_si128(); + __m128i sumi2_1 = _mm_setzero_si128(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_1_1 = _mm_loadu_si128((const __m128i 
*)q8); q8 += 16; + const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i qs_tmp = _mm_loadu_si128((const __m128i *)qs); + const __m128i idx_l_0 = _mm_cvtepu8_epi16(qs_tmp); + const __m128i idx_l_1 = _mm_cvtepu8_epi16(_mm_srli_si128(qs_tmp, 8)); qs += 16; + idx.vec[0] = _mm_set1_epi32(qh[ib32+0]); + idx.vec[1] = idx.vec[0]; + idx.vec[2] = _mm_set1_epi32(qh[ib32+1]); + idx.vec[3] = idx.vec[2]; + + idx.vec[0] = _mm_and_si128(_mm_mullo_epi32(idx.vec[0], idx_mul_0), idx_mask); + idx.vec[1] = _mm_and_si128(_mm_mullo_epi32(idx.vec[1], idx_mul_1), idx_mask); + idx.vec[2] = _mm_and_si128(_mm_mullo_epi32(idx.vec[2], idx_mul_0), idx_mask); + idx.vec[3] = _mm_and_si128(_mm_mullo_epi32(idx.vec[3], idx_mul_1), idx_mask); + + idx.vec[0] = _mm_or_si128(idx.vec[0], _mm_cvtepi16_epi32(idx_l_0)); + idx.vec[1] = _mm_or_si128(idx.vec[1], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_0, 8))); + idx.vec[2] = _mm_or_si128(idx.vec[2], _mm_cvtepi16_epi32(idx_l_1)); + idx.vec[3] = _mm_or_si128(idx.vec[3], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_1, 8))); + + const __m128i q2_1_0 = _mm_set_epi32(iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]); + const __m128i q2_1_1 = _mm_set_epi32(iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]]); + const __m128i q2_2_0 = _mm_set_epi32(iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[9]], iq3s_grid[idx.index[8]]); + const __m128i q2_2_1 = _mm_set_epi32(iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]]); + + __m128i aux128_0 = _mm_set1_epi32(signs[0] | (signs[1] << 16)); + __m128i aux128_1 = aux128_0; + aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); + aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); + const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); + const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); + const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0); + const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1); + + aux128_0 = _mm_set1_epi32(signs[2] | (signs[3] << 16)); + aux128_1 = aux128_0; + aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); + aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); + const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); + const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); + const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0); + const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1); + + signs += 4; + + const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); + const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); + const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); + const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); + const uint16_t ls1 = x[i].scales[ib32/2] & 0xf; + const uint16_t ls2 = x[i].scales[ib32/2] >> 4; + const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1)); + const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1)); + const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1)); + const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1)); + sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); + sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); + sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); + sumi2_1 = _mm_add_epi32(sumi2_1, 
p2_1); + } + + accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); + + } + + *s = hsum_float_8(accumf); + +#else + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint8_t * GGML_RESTRICT signs = x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1; + const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls1; + sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls2; + } + sumf += d * bsum; + } + *s = sumf; +#endif +} + +#if defined(__AVX2__) +static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { + const __m256i ax = _mm256_sign_epi8(x, x); + const __m256i sy = _mm256_sign_epi8(y, x); + return _mm256_maddubs_epi16(ax, sy); +} +#endif + +void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq1_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __AVX2__ + + __m256 accum = _mm256_setzero_ps(); + float accum1 = 0; + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; + + __m256i sumi = _mm256_setzero_si256(); + int sumi1 = 0; + for (int ib = 0; ib < QK_K/32; ib += 2) { +#ifdef __BMI2__ + const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib], 0x700070007000700ULL); + const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib + 1], 0x700070007000700ULL); + const uint16_t *idx1 = (const uint16_t *)(&packed_idx1); + const uint16_t *idx2 = (const uint16_t *)(&packed_idx2); + const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]); + const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]); +#else + const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)], + iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], 
iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]); + const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)], + iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]); +#endif + qs += 8; + const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + + const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1); + const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2); + const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; + const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1; + const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(ls1)); + const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(ls2)); + + sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p1, p2)); + sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1 + + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2; + } + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum); + accum1 += d * sumi1; + + } + + *s = hsum_float_8(accum) + IQ1S_DELTA * accum1; + +#elif defined __AVX__ + __m256 accum = _mm256_setzero_ps(); + float accum1 = 0; + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; + + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + int sumi1 = 0; + for (int ib = 0; ib < QK_K/32; ib += 2) { + const __m128i q1b_1_0 = _mm_set_epi64x(iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]); + const __m128i q1b_1_1 = _mm_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)]); + const __m128i q1b_2_0 = _mm_set_epi64x(iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]); + const __m128i q1b_2_1 = _mm_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)]); + qs += 8; + const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + + const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0); + const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1); + const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0); + const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1); + const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; + const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1; + const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(ls1)); + const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(ls1)); + const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(ls2)); + const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(ls2)); + + sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0)); + sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1)); + sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1 + + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? 
-1 : 1) * ls2; + } + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum); + accum1 += d * sumi1; + + } + + *s = hsum_float_8(accum) + IQ1S_DELTA * accum1; + +#else + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; + + int sumi = 0, sumi1 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + const int ls = 2*((qh[ib] >> 12) & 7) + 1; + const int delta = qh[ib] & 0x8000 ? -1 : 1; + int lsum = 0; + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8))); + for (int j = 0; j < 8; ++j) { + lsum += q8[j] * grid[j]; + } + q8 += 8; + } + sumi += ls * lsum; + sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]); + qs += 4; + } + + sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); + } + + *s = sumf; + +#endif +} + +void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq1_m * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + iq1m_scale_t scale; + +#if defined __AVX2__ + + const __m256i mask = _mm256_set1_epi16(0x7); + const __m256i mone = _mm256_set1_epi16(1); + const __m256i mone8 = _mm256_set1_epi8(1); + const __m256i mtwo8 = _mm256_set1_epi8(2); + // VPSHUFB cannot cross 128-bit lanes so odd shifts go to upper half. + const __m256i scales_shift = _mm256_set_epi64x(9, 3, 6, 0); + + __m256 accum1 = _mm256_setzero_ps(); + __m256 accum2 = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint16_t * sc = (const uint16_t *)x[i].scales; + + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + // Extract 3-bit scales (16 values) + __m256i scales = _mm256_set1_epi64x(*(const uint64_t*)sc); + scales = _mm256_srlv_epi64(scales, scales_shift); + scales = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scales, mask), 1), mone); + + // Indices to repeat each scale 8 times. 
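+ // 0x0100 is the byte pair {0x00,0x01}, so scales_idx1 makes VPSHUFB broadcast the first 16-bit scale of each 128-bit half into all eight lanes of that half; scales_idx2 picks the scale one qword (8 bytes) further in, and the mtwo8 increments below advance both indices to the next 16-bit scale on every iteration.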
+ __m256i scales_idx1 = _mm256_set1_epi16(0x0100); + __m256i scales_idx2 = _mm256_add_epi8(scales_idx1, _mm256_set1_epi8(8)); + + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib = 0; ib < QK_K/32; ib += 2) { +#ifdef __BMI2__ + const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) + | _pdep_u64(*(const uint16_t*)(qh) & 0x7777, 0xf000f000f000f00ULL); + const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) + | _pdep_u64(*(const uint16_t*)(qh + 2) & 0x7777, 0xf000f000f000f00ULL); + const uint16_t *idx1 = (const uint16_t *)(&packed_idx1); + const uint16_t *idx2 = (const uint16_t *)(&packed_idx2); + const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]); + const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]); + + // Convert signs to bytes 0x81 (negative) or 0x01 (positive) + const uint64_t delta_sign = _pdep_u64(*(const uint32_t*)(qh) & 0x88888888, 0xf0f0f0f0f0f0f0f0ULL); + const __m256i delta1 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign))); + const __m256i delta2 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign >> 32))); +#else + const __m256i q1b_1 = _mm256_set_epi64x( + iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)], + iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)] + ); + const __m256i q1b_2 = _mm256_set_epi64x( + iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)], + iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)] + ); + + const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101, + qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); + const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101, + qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[2] & 0x08 ? 
0xffffffffffffffff : 0x0101010101010101); +#endif + const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + + const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1); + const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2); + const __m256i dot3 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_1, delta1)); + const __m256i dot4 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_2, delta2)); + + __m256i scale1 = _mm256_shuffle_epi8(scales, scales_idx1); + __m256i scale2 = _mm256_shuffle_epi8(scales, scales_idx2); + + scales_idx1 = _mm256_add_epi8(scales_idx1, mtwo8); + scales_idx2 = _mm256_add_epi8(scales_idx2, mtwo8); + + const __m256i p1 = _mm256_madd_epi16(dot1, scale1); + const __m256i p2 = _mm256_madd_epi16(dot2, scale2); + const __m256i p3 = _mm256_madd_epi16(dot3, scale1); + const __m256i p4 = _mm256_madd_epi16(dot4, scale2); + + sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2)); + sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4)); + + qs += 8; qh += 4; + } + + const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16)); + + accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1); + accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2); + } + + *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2); + +#elif defined __AVX__ + const __m128i mask = _mm_set1_epi16(0x7); + const __m128i mone = _mm_set1_epi16(1); + + __m256 accum1 = _mm256_setzero_ps(); + __m256 accum2 = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint16_t * sc = (const uint16_t *)x[i].scales; + + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + __m128i sumi2_0 = _mm_setzero_si128(); + __m128i sumi2_1 = _mm_setzero_si128(); + for (int ib = 0; ib < QK_K/32; ib += 2) { + const __m128i q1b_1_0 = _mm_set_epi64x( + iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]); + const __m128i q1b_1_1 = _mm_set_epi64x( + iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)]); + const __m128i q1b_2_0 = _mm_set_epi64x( + iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]); + const __m128i q1b_2_1 = _mm_set_epi64x( + iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)]); + const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + + const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0); + const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1); + const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0); + const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1); + + const __m128i delta1_0 = _mm_set_epi64x(qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); + const __m128i delta1_1 = _mm_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[1] & 0x08 ? 
0xffffffffffffffff : 0x0101010101010101); + const __m128i delta2_0 = _mm_set_epi64x(qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); + const __m128i delta2_1 = _mm_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); + + const __m128i dot3_0 = mul_add_epi8_sse(delta1_0, q8b_1_0); + const __m128i dot3_1 = mul_add_epi8_sse(delta1_1, q8b_1_1); + const __m128i dot4_0 = mul_add_epi8_sse(delta2_0, q8b_2_0); + const __m128i dot4_1 = mul_add_epi8_sse(delta2_1, q8b_2_1); + + __m128i scale1_0 = _mm_set1_epi16(sc[ib/2] >> 0); + __m128i scale1_1 = _mm_set1_epi16(sc[ib/2] >> 3); + __m128i scale2_0 = _mm_set1_epi16(sc[ib/2] >> 6); + __m128i scale2_1 = _mm_set1_epi16(sc[ib/2] >> 9); + + scale1_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_0, mask), 1), mone); + scale1_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_1, mask), 1), mone); + scale2_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_0, mask), 1), mone); + scale2_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_1, mask), 1), mone); + const __m128i p1_0 = _mm_madd_epi16(dot1_0, scale1_0); + const __m128i p1_1 = _mm_madd_epi16(dot1_1, scale1_1); + const __m128i p2_0 = _mm_madd_epi16(dot2_0, scale2_0); + const __m128i p2_1 = _mm_madd_epi16(dot2_1, scale2_1); + const __m128i p3_0 = _mm_madd_epi16(dot3_0, scale1_0); + const __m128i p3_1 = _mm_madd_epi16(dot3_1, scale1_1); + const __m128i p4_0 = _mm_madd_epi16(dot4_0, scale2_0); + const __m128i p4_1 = _mm_madd_epi16(dot4_1, scale2_1); + + sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0)); + sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1)); + sumi2_0 = _mm_add_epi32(sumi2_0, _mm_add_epi32(p3_0, p4_0)); + sumi2_1 = _mm_add_epi32(sumi2_1, _mm_add_epi32(p3_1, p4_1)); + + qs += 8; qh += 4; + } + + const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16)); + + accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1); + accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2); + } + + *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2); + +#else + + int sum1[2], sum2[2], delta[4]; + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint16_t * sc = (const uint16_t *)x[i].scales; + + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + + int sumi1 = 0, sumi2 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + delta[0] = qh[0] & 0x08 ? -1 : 1; + delta[1] = qh[0] & 0x80 ? -1 : 1; + delta[2] = qh[1] & 0x08 ? -1 : 1; + delta[3] = qh[1] & 0x80 ? 
-1 : 1; + sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0; + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700))); + int lsum1 = 0, lsum2 = 0; + for (int j = 0; j < 8; ++j) { + lsum1 += q8[j] * grid[j]; + lsum2 += q8[j]; + } + q8 += 8; + sum1[l/2] += lsum1; + sum2[l/2] += lsum2*delta[l]; + } + + const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1; + const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1; + + sumi1 += sum1[0] * ls1 + sum1[1] * ls2; + sumi2 += sum2[0] * ls1 + sum2[1] * ls2; + qs += 4; + qh += 2; + } + + sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); + } + + *s = sumf; + +#endif +} + +void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK4_NL == 0); + static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); + + const block_iq4_nl * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + const int nb = n / QK4_NL; + + int ib = 0; + float sumf = 0; + +#if defined __AVX2__ + + const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); + const __m128i m4b = _mm_set1_epi8(0x0f); + const __m256i mone = _mm256_set1_epi16(1); + + __m256 accum1 = _mm256_setzero_ps(); + __m256 accum2 = _mm256_setzero_ps(); + for (; ib + 1 < nb; ib += 2) { + const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs); + const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs); + const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs); + const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs); + const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)), + _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b))); + const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)), + _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b))); + const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); + const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); + const __m256i p_1 = _mm256_madd_epi16(p16_1, mone); + const __m256i p_2 = _mm256_madd_epi16(p16_2, mone); + accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)), + _mm256_cvtepi32_ps(p_1), accum1); + accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)), + _mm256_cvtepi32_ps(p_2), accum2); + } + + sumf = hsum_float_8(_mm256_add_ps(accum1, accum2)); + +#elif defined __AVX__ + const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); + const __m128i m4b = _mm_set1_epi8(0x0f); + + __m256 accum = _mm256_setzero_ps(); + for (; ib + 1 < nb; ib += 2) { + const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs); + const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); + const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs); + const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1); + const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); + const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1); + + const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)); + const __m128i q4b_1_1 = 
_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)); + const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)); + const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)); + + const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1); + const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d); + accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum); + } + + sumf = hsum_float_8(accum); + +#endif + for (; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; + sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4]; + } + sumf += d * (sumi1 + sumi2); + } + *s = sumf; +} + +void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK_K == 0); + + const block_iq4_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + +#if defined __AVX2__ + + const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); + const __m128i m4b = _mm_set1_epi8(0x0f); + + __m256 accum = _mm256_setzero_ps(); + for (int ibl = 0; ibl < nb; ++ibl) { + const uint8_t * qs = x[ibl].qs; + const int8_t * q8 = y[ibl].qs; + uint16_t sh = x[ibl].scales_h; + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib = 0; ib < QK_K/32; ib += 2) { + const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)qs); qs += 16; + const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16; + const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)), + _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b))); + const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)), + _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b))); + const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); + const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); + const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32; + const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32; + sh >>= 4; + const __m256i p_1 = _mm256_madd_epi16(p16_1, _mm256_set1_epi16(ls1)); + const __m256i p_2 = _mm256_madd_epi16(p16_2, _mm256_set1_epi16(ls2)); + sumi1 = _mm256_add_epi32(p_1, sumi1); + sumi2 = _mm256_add_epi32(p_2, sumi2); + } + accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d), + _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum); + } + + *s = hsum_float_8(accum); + +#elif defined __AVX__ + const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); + const __m128i m4b = _mm_set1_epi8(0x0f); + + __m256 accum = _mm256_setzero_ps(); + for (int ibl = 0; ibl < nb; ++ibl) { + const uint8_t * qs = x[ibl].qs; + const int8_t * q8 = y[ibl].qs; + uint16_t sh = x[ibl].scales_h; + __m128i sumi1_0 = _mm_setzero_si128(); + __m128i sumi1_1 = _mm_setzero_si128(); + __m128i sumi2_0 = 
_mm_setzero_si128(); + __m128i sumi2_1 = _mm_setzero_si128(); + for (int ib = 0; ib < QK_K/32; ib += 2) { + const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)qs); qs += 16; + const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)qs); qs += 16; + const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; + const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)); + const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)); + const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)); + const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)); + const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0); + const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1); + const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0); + const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1); + const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32; + const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32; + sh >>= 4; + const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, _mm_set1_epi16(ls1)); + const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, _mm_set1_epi16(ls1)); + const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, _mm_set1_epi16(ls2)); + const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, _mm_set1_epi16(ls2)); + sumi1_0 = _mm_add_epi32(p_1_0, sumi1_0); + sumi1_1 = _mm_add_epi32(p_1_1, sumi1_1); + sumi2_0 = _mm_add_epi32(p_2_0, sumi2_0); + sumi2_1 = _mm_add_epi32(p_2_1, sumi2_1); + } + __m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0); + __m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1); + accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d), + _mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum); + } + + *s = hsum_float_8(accum); + +#else + float sumf = 0; + for (int ibl = 0; ibl < nb; ++ibl) { + const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + uint16_t h = x[ibl].scales_h; + const uint8_t * qs = x[ibl].qs; + const int8_t * q8 = y[ibl].qs; + for (int ib = 0; ib < QK_K/32; ib += 2) { + const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30); + const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30); + h >>= 4; + const float d1 = d4d8*(ls1 - 32); + const float d2 = d4d8*(ls2 - 32); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < 16; ++j) { + sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf]; + sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4]; + } + sumf += d1 * (sumi1 + sumi2); + qs += 16; + q8 += 32; + sumi1 = sumi2 = 0; + for (int j = 0; j < 16; ++j) { + sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf]; + sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4]; + } + sumf += d2 * (sumi1 + sumi2); + qs += 16; + q8 += 32; + } + } + *s = sumf; +#endif +} + diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp similarity index 67% rename from ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp rename to ggml/src/ggml-cpu/arch/x86/repack.cpp index 0a3ff867cfeca..c00c1e541cb44 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp @@ -3,72 +3,20 @@ #include "ggml-common.h" #include "ggml-backend-impl.h" -#include "ggml-quants.h" #include "ggml-impl.h" 
#include "ggml-cpu.h" #include "ggml-cpu-impl.h" -#include "ggml-cpu-traits.h" +#include "simd-mappings.h" +#include "traits.h" #include #include #include -#include #include // for qsort #include // for GGML_ASSERT -#include "ggml-cpu-aarch64.h" - -// TODO: move to include file? -template constexpr int QK_0() { - if constexpr (K == 4) { - return QK4_0; - } - if constexpr (K == 8) { - return QK8_0; - } - return -1; -} - -template struct block { - ggml_half d[N]; // deltas for N qK_0 blocks - int8_t qs[(QK_0() * N * K) / 8]; // quants for N qK_0 blocks -}; - -// control size -static_assert(sizeof(block<4, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding"); -static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding"); -static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding"); -static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding"); - -using block_q4_0x4 = block<4, 4>; -using block_q4_0x8 = block<4, 8>; -using block_q8_0x4 = block<8, 4>; -using block_q8_0x8 = block<8, 8>; - - -struct block_q4_Kx8 { - ggml_half d[8]; // super-block scale for quantized scales - ggml_half dmin[8]; // super-block scale for quantized mins - uint8_t scales[96]; // scales and mins, quantized with 6 bits - uint8_t qs[1024]; // 4--bit quants -}; - -static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding"); - -struct block_q8_Kx4 { - float d[4]; // delta - int8_t qs[QK_K * 4]; // quants - int16_t bsums[QK_K / 4]; // sum of quants in groups of 16 -}; - -static_assert(sizeof(block_q8_Kx4) == sizeof(float) * 4 + QK_K * 4 + (QK_K / 4) * sizeof(int16_t), "wrong q8_K block size/padding"); - -struct block_iq4_nlx4 { - ggml_half d[4]; // deltas for 4 iq4_nl blocks - uint8_t qs[QK4_NL * 2]; // nibbles / quants for 4 iq4_nl blocks -}; - -static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding"); +#define GGML_CPU_CLANG_WORKAROUND +#include "../../repack.h" #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Woverlength-strings" @@ -76,27 +24,6 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro #define UNUSED GGML_UNUSED -static inline int nearest_int(float fval) { - assert(fabsf(fval) <= 4194303.f); - float val = fval + 12582912.f; - int i; memcpy(&i, &val, sizeof(int)); - return (i & 0x007fffff) - 0x00400000; -} - -// Functions to create the interleaved data layout formats - -// interleave 4 block_q4_0s in blocks of blck_size_interleave -// returns an interleaved block_q4_0x4 -// in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks -// first, then interleave quants from 4 block_q4_0s in blocks of blck_size_interleave -// -// - in : an array of block_q4_0 pointers -// - blck_size_interleave : the block_q4_0 quants bytes are interleaved in blocks of -// blck_size_interleave bytes -// - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes -// from bias offset form to pure sign form (this saves subtract -// operations durin unpacking) -// #if defined(__AVX__) #if defined(__F16C__) #if defined(__AVX512F__) @@ -113,11 +40,11 @@ static inline __m512 __avx512_f32cx8x2_load(ggml_fp16_t *x, ggml_fp16_t *y) { float tmp[16]; for (int i = 0; i < 8; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); } for (int i = 0; i < 8; i++) { - tmp[i + 8] = 
GGML_FP16_TO_FP32(y[i]); + tmp[i + 8] = GGML_CPU_FP16_TO_FP32(y[i]); } return _mm512_loadu_ps(tmp); @@ -128,10 +55,10 @@ static inline __m512 __avx512_repeat_f32cx16_load(__m128i x) { _mm_storeu_si128((__m128i*)tmphalf, x); for (int i = 0; i < 4; i++) { - tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]); - tmp[i + 4] = GGML_FP16_TO_FP32(tmphalf[i]); - tmp[i + 8] = GGML_FP16_TO_FP32(tmphalf[i]); - tmp[i + 12] = GGML_FP16_TO_FP32(tmphalf[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]); + tmp[i + 4] = GGML_CPU_FP16_TO_FP32(tmphalf[i]); + tmp[i + 8] = GGML_CPU_FP16_TO_FP32(tmphalf[i]); + tmp[i + 12] = GGML_CPU_FP16_TO_FP32(tmphalf[i]); } return _mm512_loadu_ps(tmp); @@ -141,7 +68,7 @@ static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) { float tmp[8]; for (int i = 0; i < 8; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); } return _mm256_loadu_ps(tmp); @@ -150,8 +77,8 @@ static inline __m256 __avx_repeat_f32cx8_load(ggml_fp16_t *x) { float tmp[8]; for (int i = 0; i < 4; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); - tmp[i + 4] = GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); + tmp[i + 4] = GGML_CPU_FP16_TO_FP32(x[i]); } return _mm256_loadu_ps(tmp); @@ -162,7 +89,7 @@ static inline __m256 __avx_rearranged_f32cx8_load(ggml_fp16_t *x, __m128i arrang _mm_storeu_si128((__m128i*)tmphalf, _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask)); for (int i = 0; i < 8; i++) { - tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]); } return _mm256_loadu_ps(tmp); @@ -178,6 +105,12 @@ static inline __m256 __avx_rearranged_f32cx8_load(ggml_fp16_t *x, __m128i arrang #endif #endif +static inline int nearest_int(float fval) { + assert(fabsf(fval) <= 4194303.f); + float val = fval + 12582912.f; + int i; memcpy(&i, &val, sizeof(int)); + return (i & 0x007fffff) - 0x00400000; +} #if defined(__AVX2__) || defined(__AVX512F__) #if defined(__AVX512F__) @@ -242,188 +175,14 @@ static inline __m256i mul_sum_i8_pairs_acc_int32x8(const __m256i acc, const __m2 } #endif -static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; - -static void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - assert(QK8_0 == 32); - assert(k % QK8_0 == 0); - const int nb = k / QK8_0; - - block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; - -#if defined(__ARM_NEON) - float32x4_t srcv[4][8]; - float id[4]; - - for (int i = 0; i < nb; i++) { - float32x4_t asrcv[8]; - float32x4_t amaxv[8]; - - for (int row_iter = 0; row_iter < 4; row_iter++) { - for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j); - for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]); - - for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]); - for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]); - for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]); - - const float amax = vmaxvq_f32(amaxv[0]); - - const float d = amax / ((1 << 7) - 1); - id[row_iter] = d ? 
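/* nearest_int uses the classic magic-constant rounding trick: for |fval| <= 2^22 - 1,
   adding 12582912.0f (1.5 * 2^23) leaves the rounded integer in the low mantissa
   bits, so masking with 0x007fffff and subtracting the 0x00400000 bias recovers a
   round-to-nearest result without calling lrintf. E.g. nearest_int(3.7f):
   bits(3.7f + 12582912.0f) & 0x007fffff = 0x400004, minus 0x00400000 gives 4. */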
1.0f / d : 0.0f; - - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); - } - - for (int j = 0; j < 8; j++) { - float32x4_t v = vmulq_n_f32(srcv[0][j], id[0]); - int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 3] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[1][j], id[1]); - vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 4] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 5] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 6] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 7] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[2][j], id[2]); - vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 8] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 9] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 10] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 11] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[3][j], id[3]); - vi = vcvtnq_s32_f32(v); - y[i].qs[16 * j + 12] = vgetq_lane_s32(vi, 0); - y[i].qs[16 * j + 13] = vgetq_lane_s32(vi, 1); - y[i].qs[16 * j + 14] = vgetq_lane_s32(vi, 2); - y[i].qs[16 * j + 15] = vgetq_lane_s32(vi, 3); - } - } -#else - // scalar - const int blck_size_interleave = 4; - float srcv[4][QK8_0]; - float id[4]; - - for (int i = 0; i < nb; i++) { - for (int row_iter = 0; row_iter < 4; row_iter++) { - float amax = 0.0f; // absolute max - - for (int j = 0; j < QK8_0; j++) { - srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j]; - amax = MAX(amax, fabsf(srcv[row_iter][j])); - } - - const float d = amax / ((1 << 7) - 1); - id[row_iter] = d ? 1.0f / d : 0.0f; - - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); - } - - for (int j = 0; j < QK8_0 * 4; j++) { - int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; - int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; - src_offset += (j % blck_size_interleave); - - float x0 = srcv[src_id][src_offset] * id[src_id]; - y[i].qs[j] = roundf(x0); - } - } -#endif -} - -static void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { +void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; -#if defined(__ARM_NEON) - float32x4_t srcv[4][8]; - float id[4]; - - for (int i = 0; i < nb; i++) { - float32x4_t asrcv[8]; - float32x4_t amaxv[8]; - - for (int row_iter = 0; row_iter < 4; row_iter++) { - for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j); - for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]); - - for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]); - for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]); - for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]); - - const float amax = vmaxvq_f32(amaxv[0]); - - const float d = amax / ((1 << 7) - 1); - id[row_iter] = d ? 
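/* Row quantization recipe visible above: d = amax / 127 maps the largest magnitude
   onto the int8 range, the cached inverse id = 1/d turns each step into a multiply,
   and q = round(x * id) lands in [-127, 127] with d stored as fp16. For amax = 2.54,
   d = 0.02 and an input of 1.0 quantizes to round(1.0 * 50) = 50. */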
1.0f / d : 0.0f; - - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); - } - - for (int j = 0; j < 4; j++) { - float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]); - int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 3] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 4] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 5] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 6] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 7] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[1][2 * j], id[1]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 8] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 9] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 10] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 11] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 12] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 13] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 14] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 15] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[2][2 * j], id[2]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 16] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 17] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 18] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 19] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[2][2 * j + 1], id[2]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 20] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 21] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 22] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 23] = vgetq_lane_s32(vi, 3); - - v = vmulq_n_f32(srcv[3][2 * j], id[3]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 24] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 25] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 26] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 27] = vgetq_lane_s32(vi, 3); - v = vmulq_n_f32(srcv[3][2 * j + 1], id[3]); - vi = vcvtnq_s32_f32(v); - y[i].qs[32 * j + 28] = vgetq_lane_s32(vi, 0); - y[i].qs[32 * j + 29] = vgetq_lane_s32(vi, 1); - y[i].qs[32 * j + 30] = vgetq_lane_s32(vi, 2); - y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3); - } - } -#elif defined(__AVX2__) || defined(__AVX__) +#if defined(__AVX2__) || defined(__AVX__) float id[4]; __m256 srcv[4][4]; __m256 idvec[4]; @@ -453,7 +212,7 @@ static void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGM id[row_iter] = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; //d ? 1.0f / d : 0.0f; // Store the scale for the individual block - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); // Store the values in blocks of eight values - Aim is to use these later for block interleaving srcv[row_iter][0] = v0; @@ -520,6 +279,7 @@ static void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGM #endif } } + #else // scalar const int blck_size_interleave = 8; @@ -538,7 +298,7 @@ static void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGM const float d = amax / ((1 << 7) - 1); id[row_iter] = d ? 
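/* The scalar tail below reproduces the interleaved layout the SIMD paths assume:
   block_q8_0x4 packs four source rows, and with blck_size_interleave = 8 the quants
   land as 8-byte runs (row 0 bytes 0-7, row 1 bytes 8-15, ...), so a GEMM kernel
   can load one contiguous vector spanning the same column block of all four rows. */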
1.0f / d : 0.0f; - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); } for (int j = 0; j < QK8_0 * 4; j++) { @@ -553,7 +313,7 @@ static void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGM #endif } -static void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { +void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(QK_K == 256); assert(k % QK_K == 0); const int nb = k / QK_K; @@ -817,203 +577,7 @@ static void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGM #endif } -template <int64_t INTER_SIZE, ggml_type PARAM_TYPE> -void ggml_quantize_mat_t(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row); - -template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { - assert(nrow == 4); - UNUSED(nrow); - ggml_quantize_mat_q8_0_4x4(x, vy, n_per_row); -} - -template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { - assert(nrow == 4); - UNUSED(nrow); - ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row); -} - -template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { - assert(nrow == 4); - UNUSED(nrow); - ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row); -} - -static void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { - const int qk = QK8_0; - const int nb = n / qk; - const int ncols_interleaved = 4; - const int blocklen = 4; - - assert (n % qk == 0); - assert (nc % ncols_interleaved == 0); - - UNUSED(s); - UNUSED(bs); - UNUSED(vx); - UNUSED(vy); - UNUSED(nr); - UNUSED(nc); - UNUSED(nb); - UNUSED(ncols_interleaved); - UNUSED(blocklen); - -#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) - if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { - const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx; - - for (int c = 0; c < nc; c += ncols_interleaved) { - const block_q8_0 * a_ptr = (const block_q8_0 *) vy; - float32x4_t acc = vdupq_n_f32(0); - for (int b = 0; b < nb; b++) { - int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs); - int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16); - int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32); - int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48); - float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d); - - int8x16_t a0 = vld1q_s8(a_ptr->qs); - int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2); - float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d); - - int32x4_t ret = vdupq_n_s32(0); - - ret = vdotq_laneq_s32(ret, b0 << 4, a0, 0); - ret = vdotq_laneq_s32(ret, b1 << 4, a0, 1); - ret = vdotq_laneq_s32(ret, b2 << 4, a0, 2); - ret = vdotq_laneq_s32(ret, b3 << 4, a0, 3); - - ret = vdotq_laneq_s32(ret, b0 & 0xf0U, a1, 0); - ret = vdotq_laneq_s32(ret, b1 & 0xf0U, a1, 1); - ret = vdotq_laneq_s32(ret, b2 & 0xf0U, a1, 2); - ret = vdotq_laneq_s32(ret, b3 & 0xf0U, a1, 3); - - acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4), - vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd))); - a_ptr++; - b_ptr++; - } - vst1q_f32(s, acc); - s += ncols_interleaved; - } - return; - } -#endif // #if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) - float sumf[4]; - int sumi; - - const block_q8_0 * a_ptr = (const block_q8_0 *) vy; - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); - - for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; - for (int l = 0; l < nb; l++) { - for (int k = 0; k < (qk / (2 * blocklen)); k++) { - for (int j = 0; j < ncols_interleaved; j++) { - sumi = 0; - for (int i = 0; i < blocklen; ++i) { - const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); - const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); - sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; - } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); - } - } - } - for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; - } -} - -static void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { - const int qk = QK8_0; - const int nb = n / qk; - const int ncols_interleaved = 4; - const int blocklen = 8; - - assert (n % qk == 0); - assert (nc % ncols_interleaved == 0); - - UNUSED(s); - UNUSED(bs); - UNUSED(vx); - UNUSED(vy); - UNUSED(nr); - UNUSED(nc); - UNUSED(nb); - UNUSED(ncols_interleaved); - UNUSED(blocklen); - -#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) - if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { - const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx; - - for (int c = 0; c < nc; c += ncols_interleaved) { - const block_q8_0 * a_ptr = (const block_q8_0 *) vy; - float32x4_t acc = vdupq_n_f32(0); - for (int b = 0; b < nb; b++) { - int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs); - int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16); - int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32); - int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48); - float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d); - - int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs); - int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1); - int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2); - int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3); - float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d); - - int32x4_t ret0 = vdupq_n_s32(0); - int32x4_t ret1 = vdupq_n_s32(0); - - ret0 = vdotq_s32(ret0, b0 << 4, a0); - ret1 = vdotq_s32(ret1, b1 << 4, a0); - ret0 = vdotq_s32(ret0, b2 << 4, a1); - ret1 = vdotq_s32(ret1, b3 << 4, a1); - - ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2); - ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2); - ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3); - ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3); - - int32x4_t ret = vpaddq_s32(ret0, ret1); - - acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4), - vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd))); - a_ptr++; - b_ptr++; - } - vst1q_f32(s, acc); - s += ncols_interleaved; - } - return; - } -#endif // #if ! ((defined(_MSC_VER)) && ! 
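/* Shared q4_0 trick in these fallbacks: the repacked nibbles are already in signed
   form, so (int8_t)(b << 4) and (int8_t)(b & 0xF0) evaluate to 16 * q_lo and
   16 * q_hi; after the multiply-accumulate, the >> 4 (or the fixed-point
   vcvtq_n_f32_s32(..., 4) / scvtf #0x4 in the SIMD paths) divides the factor of
   16 back out, avoiding a per-byte unpack and subtract. */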
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) - float sumf[4]; - int sumi; - - const block_q8_0 * a_ptr = (const block_q8_0 *) vy; - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); - - for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; - for (int l = 0; l < nb; l++) { - for (int k = 0; k < (qk / (2 * blocklen)); k++) { - for (int j = 0; j < ncols_interleaved; j++) { - sumi = 0; - for (int i = 0; i < blocklen; ++i) { - const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); - const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); - sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; - } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); - } - } - } - for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; - } -} - -static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; const int ncols_interleaved = 8; @@ -1032,75 +596,7 @@ static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c UNUSED(ncols_interleaved); UNUSED(blocklen); -#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) -#if defined(__ARM_FEATURE_SVE) - if (ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) { - const void * b_ptr = vx; - const void * a_ptr = vy; - float * res_ptr = s; - - __asm__ __volatile__( - "ptrue p0.b\n" - "add %x[b_ptr], %x[b_ptr], #0x10\n" - "1:" // Column loop - "add x22, %x[a_ptr], #0x2\n" - "mov z31.b, #0x0\n" - "mov x21, %x[nb]\n" - "2:" // Block loop - "ld1b { z30.b }, p0/Z, [%x[b_ptr]]\n" - "ld1b { z29.b }, p0/Z, [%x[b_ptr], #1, MUL VL]\n" - "mov z28.s, #0x0\n" - "mov z27.s, #0x0\n" - "ld1rd { z26.d }, p0/Z, [x22]\n" - "ld1b { z25.b }, p0/Z, [%x[b_ptr], #2, MUL VL]\n" - "sub x20, x22, #0x2\n" - "sub x21, x21, #0x1\n" - "ld1b { z24.b }, p0/Z, [%x[b_ptr], #3, MUL VL]\n" - "ld1rd { z23.d }, p0/Z, [x22, #8]\n" - "lsl z22.b, z30.b, #0x4\n" - "lsl z16.b, z29.b, #0x4\n" - "and z30.b, z30.b, #0xf0\n" - "and z29.b, z29.b, #0xf0\n" - "ld1rd { z21.d }, p0/Z, [x22, #16]\n" - "ld1rd { z20.d }, p0/Z, [x22, #24]\n" - "lsl z19.b, z25.b, #0x4\n" - "and z25.b, z25.b, #0xf0\n" - "ld1rh { z17.h }, p0/Z, [x20]\n" - "ld1h { z18.s }, p0/Z, [%x[b_ptr], #-1, MUL VL]\n" - "sdot z28.s, z22.b, z26.b\n" - "sdot z27.s, z16.b, z26.b\n" - "lsl z16.b, z24.b, #0x4\n" - "add x22, x22, #0x22\n" - "and z24.b, z24.b, #0xf0\n" - "add %x[b_ptr], %x[b_ptr], #0x90\n" - "fcvt z17.s, p0/m, z17.h\n" - "fcvt z18.s, p0/m, z18.h\n" - "sdot z28.s, z19.b, z23.b\n" - "sdot z27.s, z16.b, z23.b\n" - "fmul z18.s, z18.s, z17.s\n" - "sdot z28.s, z30.b, z21.b\n" - "sdot z27.s, z29.b, z21.b\n" - "sdot z28.s, z25.b, z20.b\n" - "sdot z27.s, z24.b, z20.b\n" - "uzp1 z17.s, z28.s, z27.s\n" - "uzp2 z16.s, z28.s, z27.s\n" - "add z17.s, z17.s, z16.s\n" - "asr z17.s, z17.s, #0x4\n" - "scvtf z17.s, p0/m, z17.s\n" - "fmla z31.s, p0/M, z17.s, z18.s\n" - "cbnz x21, 2b\n" - "sub %x[nc], %x[nc], #0x8\n" - "st1w { z31.s }, p0, [%x[res_ptr]]\n" - "add %x[res_ptr], %x[res_ptr], #0x20\n" - "cbnz %x[nc], 
1b\n" - : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc) - : [a_ptr] "r" (a_ptr), [nb] "r" (nb) - : "memory", "p0", "x20", "x21", "x22", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" - ); - return; - } -#endif // #if defined(__ARM_FEATURE_SVE) -#elif defined(__AVX2__) +#if defined(__AVX2__) // Lookup table to convert signed nibbles to signed bytes __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0)); signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0); @@ -1152,7 +648,7 @@ static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c const __m256 col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask); // Load and convert to FP32 scale from block_q8_0 - const __m256 row_scale_f32 = _mm256_set1_ps(GGML_FP16_TO_FP32(a_ptr[b].d)); + const __m256 row_scale_f32 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(a_ptr[b].d)); // Load the block values in block_q8_0 in batches of 16 bytes and replicate the same across 256 bit vector __m256i lhs_vec_0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)a_ptr[b].qs)); @@ -1191,74 +687,8 @@ static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c } } return; -#elif defined __riscv_v - if (__riscv_vlenb() >= QK4_0) { - const size_t vl = QK4_0; - - const block_q8_0 * a_ptr = (const block_q8_0 *) vy; - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); - vfloat32m1_t sumf = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); - for (int l = 0; l < nb; l++) { - const int64_t a0 = *(const int64_t *)&a_ptr[l].qs[0]; - const int64_t a1 = *(const int64_t *)&a_ptr[l].qs[8]; - const int64_t a2 = *(const int64_t *)&a_ptr[l].qs[16]; - const int64_t a3 = *(const int64_t *)&a_ptr[l].qs[24]; - __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment - const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a0, vl / 4)); - const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a1, vl / 4)); - const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a2, vl / 4)); - const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a3, vl / 4)); - - const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4); - const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4); - const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4); - const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0); - const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1); - const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0); - const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1); - - const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); - const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); - const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); - const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); - - const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_hi_m)); - const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl); 
- const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); - const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); - const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); - const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); - const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); - const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); - const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); - const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); - const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); - const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); - const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); - - // vector version needs Zvfhmin extension - const float a_scale = GGML_FP16_TO_FP32(a_ptr[l].d); - const float b_scales[8] = { - GGML_FP16_TO_FP32(b_ptr[l].d[0]), - GGML_FP16_TO_FP32(b_ptr[l].d[1]), - GGML_FP16_TO_FP32(b_ptr[l].d[2]), - GGML_FP16_TO_FP32(b_ptr[l].d[3]), - GGML_FP16_TO_FP32(b_ptr[l].d[4]), - GGML_FP16_TO_FP32(b_ptr[l].d[5]), - GGML_FP16_TO_FP32(b_ptr[l].d[6]), - GGML_FP16_TO_FP32(b_ptr[l].d[7]) - }; - const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4); - const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scale, vl / 4); - sumf = __riscv_vfmacc_vv_f32m1(sumf, tmp1, b_scales_vec, vl / 4); - } - __riscv_vse32_v_f32m1(s + x * ncols_interleaved, sumf, vl / 4); - } - return; - } -#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) +#endif { float sumf[8]; int sumi; @@ -1277,7 +707,7 @@ static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); } } } @@ -1286,7 +716,7 @@ static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c } } -static void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK_K; const int nb = n / qk; const int ncols_interleaved = 8; @@ -1543,13 +973,13 @@ static void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, c sumi2 = sumi2 * scales_1[j]; sumi += sumi1 + sumi2; } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; } } for (int sb = 0; sb < 8; sb++) { uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16; for (int j = 0; j < ncols_interleaved; j++) { - sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; + sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; } } } @@ -1560,14 +990,14 @@ static void ggml_gemv_q4_K_8x8_q8_K(int 
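/* q4_K bookkeeping in the fallback above: sumf accumulates sumi * d[j] * d8 per
   sub-block while sum_minf collects mins[j] * bsums * dmin[j] * d8; the store
   (elided from this hunk) presumably writes sumf - sum_minf, matching q4_K's
   scale-and-min super-block encoding. */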
n, float * GGML_RESTRICT s, size_t bs, c #endif } - -static void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; - const int ncols_interleaved = 4; - const int blocklen = 4; + const int ncols_interleaved = 8; + const int blocklen = 8; assert (n % qk == 0); + assert (nr % 4 == 0); assert (nc % ncols_interleaved == 0); UNUSED(s); @@ -1580,1529 +1010,49 @@ static void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, UNUSED(ncols_interleaved); UNUSED(blocklen); -#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) - if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { - const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl); - const block_q8_0 * a_ptr = (const block_q8_0 *) vy; - float * res_ptr = s; +#if defined(__AVX2__) || defined(__AVX512F__) + { + const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx; + const block_q8_0x4 * a_ptr_start = (const block_q8_0x4 *)vy; + int64_t b_nb = n / QK4_0; + int64_t y = 0; + // Mask to mask out nibbles from packed bytes + const __m256i m4b = _mm256_set1_epi8(0x0F); + const __m128i loadMask = _mm_blend_epi32(_mm_setzero_si128(), _mm_set1_epi32(0xFFFFFFFF), 3); + // Lookup table to convert signed nibbles to signed bytes + __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0)); + signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0); + // Permute mask used for easier vector processing at later stages + __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4); + int64_t xstart = 0; + int anr = nr - nr%16; // Used to align nr with boundary of 16 + #ifdef __AVX512F__ + int anc = nc - nc%16; // Used to align nc with boundary of 16 + // Mask to mask out nibbles from packed bytes expanded to 512 bit length + const __m512i m4bexpanded = _mm512_set1_epi8(0x0F); + // Lookup table to convert signed nibbles to signed bytes expanded to 512 bit length + __m512i signextendlutexpanded = _mm512_inserti32x8(_mm512_castsi256_si512(signextendlut), signextendlut, 1); - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); + // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation + for (; y < anr / 4; y += 4) { - float32x4_t sumf = vdupq_n_f32(0); - for (int l = 0; l < nb; l++) { - uint8x16_t b_0 = vld1q_u8(b_ptr[l].qs + 0); - uint8x16_t b_1 = vld1q_u8(b_ptr[l].qs + 16); - uint8x16_t b_2 = vld1q_u8(b_ptr[l].qs + 32); - uint8x16_t b_3 = vld1q_u8(b_ptr[l].qs + 48); - - int8x16_t b_0_hi = vqtbl1q_s8(kvalues, b_0 >> 4); - int8x16_t b_0_lo = vqtbl1q_s8(kvalues, b_0 & 0x0F); - int8x16_t b_1_hi = vqtbl1q_s8(kvalues, b_1 >> 4); - int8x16_t b_1_lo = vqtbl1q_s8(kvalues, b_1 & 0x0F); - int8x16_t b_2_hi = vqtbl1q_s8(kvalues, b_2 >> 4); - int8x16_t b_2_lo = vqtbl1q_s8(kvalues, b_2 & 0x0F); - int8x16_t b_3_hi = vqtbl1q_s8(kvalues, b_3 >> 4); - int8x16_t b_3_lo = vqtbl1q_s8(kvalues, b_3 & 0x0F); - - int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 0); - int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16); - - int32x4_t sumi = vdupq_n_s32(0); - sumi = vdotq_laneq_s32(sumi, b_0_lo, a_0, 0); - sumi = 
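/* Both lookup strategies here decode 4-bit codes with one shuffle: the NEON path
   uses vqtbl1q_s8 against the kvalues table for iq4_nl's non-linear levels, while
   the AVX2 path uses pshufb against signextendlut, whose byte order
   (-1, -2, ..., -8, 7, 6, ..., 0 listed high-to-low) maps a masked nibble n to n
   for n <= 7 and to n - 16 otherwise, i.e. a sign extension. */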
vdotq_laneq_s32(sumi, b_0_hi, a_1, 0); - sumi = vdotq_laneq_s32(sumi, b_1_lo, a_0, 1); - sumi = vdotq_laneq_s32(sumi, b_1_hi, a_1, 1); - sumi = vdotq_laneq_s32(sumi, b_2_lo, a_0, 2); - sumi = vdotq_laneq_s32(sumi, b_2_hi, a_1, 2); - sumi = vdotq_laneq_s32(sumi, b_3_lo, a_0, 3); - sumi = vdotq_laneq_s32(sumi, b_3_hi, a_1, 3); - - float32x4_t a_d = vcvt_f32_f16(vld1_dup_f16((const float16_t *)&a_ptr[l].d)); - float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d)); - float32x4_t d = a_d * b_d; - - sumf = vmlaq_f32(sumf, d, vcvtq_f32_s32(sumi)); + const block_q8_0x4 * a_ptrs[4]; + + a_ptrs[0] = a_ptr_start + (y * nb); + for (int i = 0; i < 3; ++i) { + a_ptrs[i + 1] = a_ptrs[i] + nb; } - vst1q_f32(res_ptr + x * 4, sumf); - } - return; - } -#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) - { - float sumf[4]; - int sumi; + // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation + for (int64_t x = 0; x < anc / 8; x += 2) { - const block_q8_0 * a_ptr = (const block_q8_0 *) vy; - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); + const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x) * b_nb); + const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb); - for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; - for (int l = 0; l < nb; l++) { - for (int k = 0; k < (qk / (2 * blocklen)); k++) { - for (int j = 0; j < ncols_interleaved; j++) { - sumi = 0; - for (int i = 0; i < blocklen; ++i) { - const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F]; - const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; - sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])); - } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); - } - } - } - for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; - } - } -} - -static void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { - const int qk = QK8_0; - const int nb = n / qk; - const int ncols_interleaved = 4; - const int blocklen = 4; - - assert (n % qk == 0); - assert (nr % 4 == 0); - assert (nc % ncols_interleaved == 0); - - UNUSED(s); - UNUSED(bs); - UNUSED(vx); - UNUSED(vy); - UNUSED(nr); - UNUSED(nc); - UNUSED(nb); - UNUSED(ncols_interleaved); - UNUSED(blocklen); - -#if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) - if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { - const void * b_ptr = vx; - const void * a_ptr = vy; - float * res_ptr = s; - size_t res_stride = bs * sizeof(float); - - __asm__ __volatile__( - "mov x10, %x[nr]\n" - "mov x9, #0x88\n" - "cmp x10, #0x10\n" - "mul x9, %x[nb], x9\n" - "blt 4f\n" - "1:" // Row loop - "add x28, %x[b_ptr], #0x8\n" - "mov x27, %x[nc]\n" - "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" - "2:" // Column loop - "add x25, %x[a_ptr], #0x8\n" - "movi v15.16b, #0x0\n" - "movi v19.16b, #0x0\n" - "mov x24, %x[nb]\n" - "add x23, x25, x9\n" - "movi v18.16b, #0x0\n" - "movi v14.16b, #0x0\n" - "add x22, x23, x9\n" - "movi v11.16b, #0x0\n" - "movi v13.16b, #0x0\n" - "add x21, x22, x9\n" - "movi v23.16b, #0x0\n" - "movi v16.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v7.16b, #0x0\n" - "movi v0.16b, #0x0\n" - "movi v4.16b, #0x0\n" - "movi v5.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "movi v8.16b, #0x0\n" - "movi v1.16b, #0x0\n" - "3:" // Block loop - "ldr q3, [x28, #0x0]\n" - "ldr q31, [x25, #0x0]\n" - "movi v28.16b, #0x4\n" - "movi v10.4s, #0x0\n" - "ldr q22, [x28, #0x10]\n" - "ldr q6, [x25, #0x10]\n" - "movi v29.4s, #0x0\n" - "movi v9.4s, #0x0\n" - "ldr q27, [x28, #0x20]\n" - "ldr q30, [x28, #0x30]\n" - "movi v20.4s, #0x0\n" - "movi v24.16b, #0xf0\n" - "ldr d2, [x25, #-0x8]\n" - "ldr d26, [x23, #-0x8]\n" - "sshl v12.16b, v3.16b, v28.16b\n" - "sub x20, x28, #0x8\n" - "ldr d17, [x20, #0x0]\n" - "and v3.16b, v3.16b, v24.16b\n" - "subs x24, x24, #0x1\n" - "add x28, x28, #0x48\n" - ".inst 0x4f9fe18a // sdot v10.4s, v12.16b, v31.4b[0]\n" - ".inst 0x4fbfe19d // sdot v29.4s, v12.16b, v31.4b[1]\n" - ".inst 0x4f9fe989 // sdot v9.4s, v12.16b, v31.4b[2]\n" - ".inst 0x4fbfe994 // sdot v20.4s, v12.16b, v31.4b[3]\n" - "sshl v31.16b, v22.16b, v28.16b\n" - "and v22.16b, v22.16b, v24.16b\n" - "fcvtl v17.4s, v17.4h\n" - "fcvtl v2.4s, v2.4h\n" - "fcvtl v26.4s, v26.4h\n" - ".inst 0x4f86e3ea // sdot v10.4s, v31.16b, v6.4b[0]\n" - ".inst 0x4fa6e3fd // sdot v29.4s, v31.16b, v6.4b[1]\n" - ".inst 0x4f86ebe9 // sdot v9.4s, v31.16b, v6.4b[2]\n" - ".inst 0x4fa6ebf4 // sdot v20.4s, v31.16b, v6.4b[3]\n" - "sshl v6.16b, v27.16b, v28.16b\n" - "sshl v28.16b, v30.16b, v28.16b\n" - "and v27.16b, v27.16b, v24.16b\n" - "and v30.16b, v30.16b, v24.16b\n" - "ldr q24, [x25, #0x20]\n" - ".inst 0x4f98e0ca // sdot v10.4s, v6.16b, v24.4b[0]\n" - ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n" - ".inst 0x4f98e8c9 // sdot v9.4s, v6.16b, v24.4b[2]\n" - ".inst 0x4fb8e8d4 // sdot v20.4s, v6.16b, v24.4b[3]\n" - "ldr q24, [x25, #0x30]\n" - ".inst 0x4f98e38a // sdot v10.4s, v28.16b, v24.4b[0]\n" - ".inst 0x4fb8e39d // sdot v29.4s, v28.16b, v24.4b[1]\n" - ".inst 0x4f98eb89 // sdot v9.4s, v28.16b, v24.4b[2]\n" - ".inst 0x4fb8eb94 // sdot v20.4s, v28.16b, v24.4b[3]\n" - "ldr q24, [x25, #0x40]\n" - ".inst 0x4f98e06a // sdot v10.4s, v3.16b, v24.4b[0]\n" - ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n" - ".inst 0x4f98e869 // sdot v9.4s, v3.16b, v24.4b[2]\n" - ".inst 0x4fb8e874 // sdot v20.4s, v3.16b, v24.4b[3]\n" - "ldr q24, [x25, #0x50]\n" - ".inst 0x4f98e2ca // sdot v10.4s, v22.16b, v24.4b[0]\n" - ".inst 0x4fb8e2dd // sdot v29.4s, v22.16b, v24.4b[1]\n" - ".inst 0x4f98eac9 // sdot v9.4s, v22.16b, v24.4b[2]\n" - ".inst 0x4fb8ead4 // sdot v20.4s, v22.16b, v24.4b[3]\n" - "ldr q24, [x25, #0x60]\n" - ".inst 0x4f98e36a // sdot v10.4s, v27.16b, v24.4b[0]\n" - ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n" - ".inst 0x4f98eb69 // sdot v9.4s, 
v27.16b, v24.4b[2]\n" - ".inst 0x4fb8eb74 // sdot v20.4s, v27.16b, v24.4b[3]\n" - "ldr q24, [x25, #0x70]\n" - "add x25, x25, #0x88\n" - ".inst 0x4f98e3ca // sdot v10.4s, v30.16b, v24.4b[0]\n" - ".inst 0x4fb8e3dd // sdot v29.4s, v30.16b, v24.4b[1]\n" - ".inst 0x4f98ebc9 // sdot v9.4s, v30.16b, v24.4b[2]\n" - ".inst 0x4fb8ebd4 // sdot v20.4s, v30.16b, v24.4b[3]\n" - "fmul v24.4s, v17.4s, v2.s[0]\n" - "scvtf v10.4s, v10.4s, #0x4\n" - "scvtf v29.4s, v29.4s, #0x4\n" - "scvtf v9.4s, v9.4s, #0x4\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "fmla v15.4s, v10.4s, v24.4s\n" - "ldr q24, [x23, #0x0]\n" - "fmul v10.4s, v17.4s, v2.s[1]\n" - "fmla v19.4s, v29.4s, v10.4s\n" - "ldr q10, [x23, #0x10]\n" - "fmul v29.4s, v17.4s, v2.s[2]\n" - "fmul v2.4s, v17.4s, v2.s[3]\n" - "fmla v18.4s, v9.4s, v29.4s\n" - "movi v9.4s, #0x0\n" - "movi v29.4s, #0x0\n" - ".inst 0x4f98e189 // sdot v9.4s, v12.16b, v24.4b[0]\n" - ".inst 0x4fb8e19d // sdot v29.4s, v12.16b, v24.4b[1]\n" - "fmla v14.4s, v20.4s, v2.4s\n" - "movi v20.4s, #0x0\n" - "movi v2.4s, #0x0\n" - ".inst 0x4f98e994 // sdot v20.4s, v12.16b, v24.4b[2]\n" - ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n" - "ldr q24, [x23, #0x20]\n" - ".inst 0x4f8ae3e9 // sdot v9.4s, v31.16b, v10.4b[0]\n" - ".inst 0x4faae3fd // sdot v29.4s, v31.16b, v10.4b[1]\n" - ".inst 0x4f8aebf4 // sdot v20.4s, v31.16b, v10.4b[2]\n" - ".inst 0x4faaebe2 // sdot v2.4s, v31.16b, v10.4b[3]\n" - "ldr q10, [x23, #0x30]\n" - ".inst 0x4f98e0c9 // sdot v9.4s, v6.16b, v24.4b[0]\n" - ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n" - ".inst 0x4f98e8d4 // sdot v20.4s, v6.16b, v24.4b[2]\n" - ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n" - "ldr q24, [x23, #0x40]\n" - ".inst 0x4f8ae389 // sdot v9.4s, v28.16b, v10.4b[0]\n" - ".inst 0x4faae39d // sdot v29.4s, v28.16b, v10.4b[1]\n" - ".inst 0x4f8aeb94 // sdot v20.4s, v28.16b, v10.4b[2]\n" - ".inst 0x4faaeb82 // sdot v2.4s, v28.16b, v10.4b[3]\n" - "ldr q10, [x23, #0x50]\n" - ".inst 0x4f98e069 // sdot v9.4s, v3.16b, v24.4b[0]\n" - ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n" - ".inst 0x4f98e874 // sdot v20.4s, v3.16b, v24.4b[2]\n" - ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n" - "ldr q24, [x23, #0x60]\n" - ".inst 0x4f8ae2c9 // sdot v9.4s, v22.16b, v10.4b[0]\n" - ".inst 0x4faae2dd // sdot v29.4s, v22.16b, v10.4b[1]\n" - ".inst 0x4f8aead4 // sdot v20.4s, v22.16b, v10.4b[2]\n" - ".inst 0x4faaeac2 // sdot v2.4s, v22.16b, v10.4b[3]\n" - "ldr q10, [x23, #0x70]\n" - "add x23, x23, #0x88\n" - ".inst 0x4f98e369 // sdot v9.4s, v27.16b, v24.4b[0]\n" - ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n" - ".inst 0x4f98eb74 // sdot v20.4s, v27.16b, v24.4b[2]\n" - ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n" - "ldr q24, [x22, #0x0]\n" - ".inst 0x4f8ae3c9 // sdot v9.4s, v30.16b, v10.4b[0]\n" - ".inst 0x4faae3dd // sdot v29.4s, v30.16b, v10.4b[1]\n" - ".inst 0x4f8aebd4 // sdot v20.4s, v30.16b, v10.4b[2]\n" - ".inst 0x4faaebc2 // sdot v2.4s, v30.16b, v10.4b[3]\n" - "fmul v10.4s, v17.4s, v26.s[0]\n" - "scvtf v9.4s, v9.4s, #0x4\n" - "scvtf v29.4s, v29.4s, #0x4\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "scvtf v2.4s, v2.4s, #0x4\n" - "fmla v11.4s, v9.4s, v10.4s\n" - "ldr q9, [x22, #0x10]\n" - "fmul v10.4s, v17.4s, v26.s[1]\n" - "fmla v13.4s, v29.4s, v10.4s\n" - "ldr d29, [x22, #-0x8]\n" - "fmul v10.4s, v17.4s, v26.s[2]\n" - "fmul v26.4s, v17.4s, v26.s[3]\n" - "fcvtl v29.4s, v29.4h\n" - "fmla v23.4s, v20.4s, v10.4s\n" - "movi v20.4s, #0x0\n" - "movi v10.4s, #0x0\n" - "fmla v16.4s, v2.4s, v26.4s\n" - "movi v26.4s, #0x0\n" - "movi v2.4s, #0x0\n" - 
".inst 0x4f98e194 // sdot v20.4s, v12.16b, v24.4b[0]\n" - ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n" - ".inst 0x4f98e99a // sdot v26.4s, v12.16b, v24.4b[2]\n" - ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n" - "ldr q24, [x22, #0x20]\n" - ".inst 0x4f89e3f4 // sdot v20.4s, v31.16b, v9.4b[0]\n" - ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n" - ".inst 0x4f89ebfa // sdot v26.4s, v31.16b, v9.4b[2]\n" - ".inst 0x4fa9ebe2 // sdot v2.4s, v31.16b, v9.4b[3]\n" - "ldr q9, [x22, #0x30]\n" - ".inst 0x4f98e0d4 // sdot v20.4s, v6.16b, v24.4b[0]\n" - ".inst 0x4fb8e0ca // sdot v10.4s, v6.16b, v24.4b[1]\n" - ".inst 0x4f98e8da // sdot v26.4s, v6.16b, v24.4b[2]\n" - ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n" - "ldr q24, [x22, #0x40]\n" - ".inst 0x4f89e394 // sdot v20.4s, v28.16b, v9.4b[0]\n" - ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n" - ".inst 0x4f89eb9a // sdot v26.4s, v28.16b, v9.4b[2]\n" - ".inst 0x4fa9eb82 // sdot v2.4s, v28.16b, v9.4b[3]\n" - "ldr q9, [x22, #0x50]\n" - ".inst 0x4f98e074 // sdot v20.4s, v3.16b, v24.4b[0]\n" - ".inst 0x4fb8e06a // sdot v10.4s, v3.16b, v24.4b[1]\n" - ".inst 0x4f98e87a // sdot v26.4s, v3.16b, v24.4b[2]\n" - ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n" - "ldr q24, [x22, #0x60]\n" - ".inst 0x4f89e2d4 // sdot v20.4s, v22.16b, v9.4b[0]\n" - ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n" - ".inst 0x4f89eada // sdot v26.4s, v22.16b, v9.4b[2]\n" - ".inst 0x4fa9eac2 // sdot v2.4s, v22.16b, v9.4b[3]\n" - "ldr q9, [x22, #0x70]\n" - "add x22, x22, #0x88\n" - ".inst 0x4f98e374 // sdot v20.4s, v27.16b, v24.4b[0]\n" - ".inst 0x4fb8e36a // sdot v10.4s, v27.16b, v24.4b[1]\n" - ".inst 0x4f98eb7a // sdot v26.4s, v27.16b, v24.4b[2]\n" - ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n" - "ldr q24, [x21, #0x0]\n" - ".inst 0x4f89e3d4 // sdot v20.4s, v30.16b, v9.4b[0]\n" - ".inst 0x4fa9e3ca // sdot v10.4s, v30.16b, v9.4b[1]\n" - ".inst 0x4f89ebda // sdot v26.4s, v30.16b, v9.4b[2]\n" - ".inst 0x4fa9ebc2 // sdot v2.4s, v30.16b, v9.4b[3]\n" - "fmul v9.4s, v17.4s, v29.s[0]\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "scvtf v10.4s, v10.4s, #0x4\n" - "scvtf v26.4s, v26.4s, #0x4\n" - "scvtf v2.4s, v2.4s, #0x4\n" - "fmla v25.4s, v20.4s, v9.4s\n" - "ldr q9, [x21, #0x10]\n" - "fmul v20.4s, v17.4s, v29.s[1]\n" - "fmla v7.4s, v10.4s, v20.4s\n" - "ldr d20, [x21, #-0x8]\n" - "fmul v10.4s, v17.4s, v29.s[2]\n" - "fmul v29.4s, v17.4s, v29.s[3]\n" - "fcvtl v20.4s, v20.4h\n" - "fmla v0.4s, v26.4s, v10.4s\n" - "movi v26.4s, #0x0\n" - "movi v10.4s, #0x0\n" - "fmla v4.4s, v2.4s, v29.4s\n" - "movi v2.4s, #0x0\n" - "movi v29.4s, #0x0\n" - ".inst 0x4f98e19a // sdot v26.4s, v12.16b, v24.4b[0]\n" - ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n" - ".inst 0x4f98e982 // sdot v2.4s, v12.16b, v24.4b[2]\n" - ".inst 0x4fb8e99d // sdot v29.4s, v12.16b, v24.4b[3]\n" - "ldr q12, [x21, #0x20]\n" - "fmul v24.4s, v17.4s, v20.s[0]\n" - ".inst 0x4f89e3fa // sdot v26.4s, v31.16b, v9.4b[0]\n" - ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n" - ".inst 0x4f89ebe2 // sdot v2.4s, v31.16b, v9.4b[2]\n" - ".inst 0x4fa9ebfd // sdot v29.4s, v31.16b, v9.4b[3]\n" - "ldr q9, [x21, #0x30]\n" - "fmul v31.4s, v17.4s, v20.s[1]\n" - ".inst 0x4f8ce0da // sdot v26.4s, v6.16b, v12.4b[0]\n" - ".inst 0x4face0ca // sdot v10.4s, v6.16b, v12.4b[1]\n" - ".inst 0x4f8ce8c2 // sdot v2.4s, v6.16b, v12.4b[2]\n" - ".inst 0x4face8dd // sdot v29.4s, v6.16b, v12.4b[3]\n" - "ldr q12, [x21, #0x40]\n" - "fmul v6.4s, v17.4s, v20.s[2]\n" - "fmul v20.4s, v17.4s, v20.s[3]\n" - ".inst 0x4f89e39a // 
sdot v26.4s, v28.16b, v9.4b[0]\n" - ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n" - ".inst 0x4f89eb82 // sdot v2.4s, v28.16b, v9.4b[2]\n" - ".inst 0x4fa9eb9d // sdot v29.4s, v28.16b, v9.4b[3]\n" - "ldr q9, [x21, #0x50]\n" - ".inst 0x4f8ce07a // sdot v26.4s, v3.16b, v12.4b[0]\n" - ".inst 0x4face06a // sdot v10.4s, v3.16b, v12.4b[1]\n" - ".inst 0x4f8ce862 // sdot v2.4s, v3.16b, v12.4b[2]\n" - ".inst 0x4face87d // sdot v29.4s, v3.16b, v12.4b[3]\n" - "ldr q12, [x21, #0x60]\n" - ".inst 0x4f89e2da // sdot v26.4s, v22.16b, v9.4b[0]\n" - ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n" - ".inst 0x4f89eac2 // sdot v2.4s, v22.16b, v9.4b[2]\n" - ".inst 0x4fa9eadd // sdot v29.4s, v22.16b, v9.4b[3]\n" - "ldr q17, [x21, #0x70]\n" - "add x21, x21, #0x88\n" - ".inst 0x4f8ce37a // sdot v26.4s, v27.16b, v12.4b[0]\n" - ".inst 0x4face36a // sdot v10.4s, v27.16b, v12.4b[1]\n" - ".inst 0x4f8ceb62 // sdot v2.4s, v27.16b, v12.4b[2]\n" - ".inst 0x4faceb7d // sdot v29.4s, v27.16b, v12.4b[3]\n" - ".inst 0x4f91e3da // sdot v26.4s, v30.16b, v17.4b[0]\n" - ".inst 0x4fb1e3ca // sdot v10.4s, v30.16b, v17.4b[1]\n" - ".inst 0x4f91ebc2 // sdot v2.4s, v30.16b, v17.4b[2]\n" - ".inst 0x4fb1ebdd // sdot v29.4s, v30.16b, v17.4b[3]\n" - "scvtf v26.4s, v26.4s, #0x4\n" - "scvtf v10.4s, v10.4s, #0x4\n" - "fmla v5.4s, v26.4s, v24.4s\n" - "scvtf v2.4s, v2.4s, #0x4\n" - "scvtf v29.4s, v29.4s, #0x4\n" - "fmla v21.4s, v10.4s, v31.4s\n" - "fmla v8.4s, v2.4s, v6.4s\n" - "fmla v1.4s, v29.4s, v20.4s\n" - "bgt 3b\n" - "mov x20, %x[res_ptr]\n" - "subs x27, x27, #0x4\n" - "add %x[res_ptr], %x[res_ptr], #0x10\n" - "str q15, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q19, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q18, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q14, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q11, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q13, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q23, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q16, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q25, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q7, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q0, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q4, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q5, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q21, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q8, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q1, [x20, #0x0]\n" - "bne 2b\n" - "mov x20, #0x4\n" - "sub x10, x10, #0x10\n" - "cmp x10, #0x10\n" - "mov %x[res_ptr], x26\n" - "madd %x[a_ptr], x20, x9, %x[a_ptr]\n" - "bge 1b\n" - "4:" // Row loop skip - "cbz x10, 9f\n" - "5:" // Row tail: Row loop - "add x24, %x[b_ptr], #0x8\n" - "mov x23, %x[nc]\n" - "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" - "6:" // Row tail: Column loop - "movi v15.16b, #0x0\n" - "movi v19.16b, #0x0\n" - "add x25, %x[a_ptr], #0x8\n" - "mov x21, %x[nb]\n" - "movi v18.16b, #0x0\n" - "movi v14.16b, #0x0\n" - "7:" // Row tail: Block loop - "ldr q7, [x24, #0x0]\n" - "ldr q5, [x25, #0x0]\n" - "movi v9.16b, #0x4\n" - "movi v4.4s, #0x0\n" - "ldr q3, [x24, #0x10]\n" - "ldr q2, [x25, #0x10]\n" - "movi v1.4s, #0x0\n" - "movi v0.4s, #0x0\n" - "ldr q13, [x24, #0x20]\n" - "ldr q31, [x25, #0x20]\n" - "movi v30.4s, #0x0\n" - "movi v29.16b, #0xf0\n" - "ldr q28, [x24, #0x30]\n" - "ldr q27, [x25, #0x30]\n" - "sshl v20.16b, v7.16b, v9.16b\n" - "sub x20, x24, #0x8\n" - "ldr q26, [x25, #0x40]\n" - 
"ldr q25, [x25, #0x50]\n" - "sshl v17.16b, v3.16b, v9.16b\n" - "and v7.16b, v7.16b, v29.16b\n" - "ldr q24, [x25, #0x60]\n" - "ldr q16, [x25, #0x70]\n" - "sshl v22.16b, v13.16b, v9.16b\n" - "and v3.16b, v3.16b, v29.16b\n" - "ldr d21, [x20, #0x0]\n" - "ldr d12, [x25, #-0x8]\n" - ".inst 0x4f85e284 // sdot v4.4s, v20.16b, v5.4b[0]\n" - ".inst 0x4fa5e281 // sdot v1.4s, v20.16b, v5.4b[1]\n" - ".inst 0x4f85ea80 // sdot v0.4s, v20.16b, v5.4b[2]\n" - ".inst 0x4fa5ea9e // sdot v30.4s, v20.16b, v5.4b[3]\n" - "sshl v9.16b, v28.16b, v9.16b\n" - "subs x21, x21, #0x1\n" - "and v13.16b, v13.16b, v29.16b\n" - "and v28.16b, v28.16b, v29.16b\n" - "add x25, x25, #0x88\n" - "add x24, x24, #0x48\n" - "fcvtl v21.4s, v21.4h\n" - "fcvtl v12.4s, v12.4h\n" - ".inst 0x4f82e224 // sdot v4.4s, v17.16b, v2.4b[0]\n" - ".inst 0x4fa2e221 // sdot v1.4s, v17.16b, v2.4b[1]\n" - ".inst 0x4f82ea20 // sdot v0.4s, v17.16b, v2.4b[2]\n" - ".inst 0x4fa2ea3e // sdot v30.4s, v17.16b, v2.4b[3]\n" - "fmul v11.4s, v21.4s, v12.s[0]\n" - "fmul v23.4s, v21.4s, v12.s[1]\n" - "fmul v17.4s, v21.4s, v12.s[2]\n" - ".inst 0x4f9fe2c4 // sdot v4.4s, v22.16b, v31.4b[0]\n" - "fmul v6.4s, v21.4s, v12.s[3]\n" - ".inst 0x4fbfe2c1 // sdot v1.4s, v22.16b, v31.4b[1]\n" - ".inst 0x4f9feac0 // sdot v0.4s, v22.16b, v31.4b[2]\n" - ".inst 0x4fbfeade // sdot v30.4s, v22.16b, v31.4b[3]\n" - ".inst 0x4f9be124 // sdot v4.4s, v9.16b, v27.4b[0]\n" - ".inst 0x4fbbe121 // sdot v1.4s, v9.16b, v27.4b[1]\n" - ".inst 0x4f9be920 // sdot v0.4s, v9.16b, v27.4b[2]\n" - ".inst 0x4fbbe93e // sdot v30.4s, v9.16b, v27.4b[3]\n" - ".inst 0x4f9ae0e4 // sdot v4.4s, v7.16b, v26.4b[0]\n" - ".inst 0x4fbae0e1 // sdot v1.4s, v7.16b, v26.4b[1]\n" - ".inst 0x4f9ae8e0 // sdot v0.4s, v7.16b, v26.4b[2]\n" - ".inst 0x4fbae8fe // sdot v30.4s, v7.16b, v26.4b[3]\n" - ".inst 0x4f99e064 // sdot v4.4s, v3.16b, v25.4b[0]\n" - ".inst 0x4fb9e061 // sdot v1.4s, v3.16b, v25.4b[1]\n" - ".inst 0x4f99e860 // sdot v0.4s, v3.16b, v25.4b[2]\n" - ".inst 0x4fb9e87e // sdot v30.4s, v3.16b, v25.4b[3]\n" - ".inst 0x4f98e1a4 // sdot v4.4s, v13.16b, v24.4b[0]\n" - ".inst 0x4fb8e1a1 // sdot v1.4s, v13.16b, v24.4b[1]\n" - ".inst 0x4f98e9a0 // sdot v0.4s, v13.16b, v24.4b[2]\n" - ".inst 0x4fb8e9be // sdot v30.4s, v13.16b, v24.4b[3]\n" - ".inst 0x4f90e384 // sdot v4.4s, v28.16b, v16.4b[0]\n" - ".inst 0x4fb0e381 // sdot v1.4s, v28.16b, v16.4b[1]\n" - ".inst 0x4f90eb80 // sdot v0.4s, v28.16b, v16.4b[2]\n" - ".inst 0x4fb0eb9e // sdot v30.4s, v28.16b, v16.4b[3]\n" - "scvtf v4.4s, v4.4s, #0x4\n" - "scvtf v1.4s, v1.4s, #0x4\n" - "scvtf v0.4s, v0.4s, #0x4\n" - "fmla v15.4s, v4.4s, v11.4s\n" - "scvtf v30.4s, v30.4s, #0x4\n" - "fmla v19.4s, v1.4s, v23.4s\n" - "fmla v18.4s, v0.4s, v17.4s\n" - "fmla v14.4s, v30.4s, v6.4s\n" - "bgt 7b\n" - "mov x20, %x[res_ptr]\n" - "cmp x10, #0x1\n" - "str q15, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x10, #0x2\n" - "str q19, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x10, #0x3\n" - "str q18, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "str q14, [x20, #0x0]\n" - "8:" // Row tail: Accumulator store skip - "subs x23, x23, #0x4\n" - "add %x[res_ptr], %x[res_ptr], #0x10\n" - "bne 6b\n" - "subs x10, x10, #0x4\n" - "add %x[a_ptr], %x[a_ptr], x9\n" - "mov %x[res_ptr], x22\n" - "bgt 5b\n" - "9:" // Row tail: Row loop skip - : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) - : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", 
"v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" - ); - return; - } -#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) - { - float sumf[4][4]; - int sumi; - - for (int y = 0; y < nr / 4; y++) { - const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); - for (int m = 0; m < 4; m++) { - for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; - } - for (int l = 0; l < nb; l++) { - for (int k = 0; k < (qk / (2 * blocklen)); k++) { - for (int m = 0; m < 4; m++) { - for (int j = 0; j < ncols_interleaved; j++) { - sumi = 0; - for (int i = 0; i < blocklen; ++i) { - const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); - const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); - sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + - (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; - } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); - } - } - } - } - for (int m = 0; m < 4; m++) { - for (int j = 0; j < ncols_interleaved; j++) - s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; - } - } - } - } -} - -static void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { - const int qk = QK8_0; - const int nb = n / qk; - const int ncols_interleaved = 4; - const int blocklen = 8; - - assert (n % qk == 0); - assert (nr % 4 == 0); - assert (nc % ncols_interleaved == 0); - - UNUSED(s); - UNUSED(bs); - UNUSED(vx); - UNUSED(vy); - UNUSED(nr); - UNUSED(nc); - UNUSED(nb); - UNUSED(ncols_interleaved); - UNUSED(blocklen); - -#if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) - if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { - const void * b_ptr = vx; - const void * a_ptr = vy; - float * res_ptr = s; - size_t res_stride = bs * sizeof(float); - - __asm__ __volatile__( - "mov x10, %x[nr]\n" - "mov x9, #0x88\n" - "cmp x10, #0x10\n" - "mul x9, %x[nb], x9\n" - "blt 4f\n" - "1:" // Row loop - "add x28, %x[b_ptr], #0x8\n" - "mov x27, %x[nc]\n" - "add x26, %x[res_ptr], %x[res_stride], LSL #4\n" - "2:" // Column loop - "add x25, %x[a_ptr], #0x8\n" - "movi v2.16b, #0x0\n" - "movi v10.16b, #0x0\n" - "mov x24, %x[nb]\n" - "add x23, x25, x9\n" - "movi v12.16b, #0x0\n" - "movi v28.16b, #0x0\n" - "add x22, x23, x9\n" - "movi v11.16b, #0x0\n" - "movi v13.16b, #0x0\n" - "add x21, x22, x9\n" - "movi v22.16b, #0x0\n" - "movi v23.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v5.16b, #0x0\n" - "movi v7.16b, #0x0\n" - "movi v4.16b, #0x0\n" - "movi v6.16b, #0x0\n" - "movi v30.16b, #0x0\n" - "movi v24.16b, #0x0\n" - "movi v14.16b, #0x0\n" - "3:" // Block loop - "ldr q21, [x28, #0x0]\n" - "ldr q16, [x28, #0x10]\n" - "movi v1.16b, #0x4\n" - "movi v19.4s, #0x0\n" - "ldr q27, [x25, #0x0]\n" - "ldr q15, [x25, #0x10]\n" - "movi v26.4s, #0x0\n" - "movi v18.4s, #0x0\n" - "ldr q29, [x28, #0x20]\n" - "ldr q3, [x28, #0x30]\n" - "movi v17.4s, #0x0\n" - "movi v0.16b, #0xf0\n" - "ldr d20, [x25, #-0x8]\n" - "ldr d9, [x23, #-0x8]\n" - "sshl v8.16b, v21.16b, v1.16b\n" - "sshl v31.16b, v16.16b, v1.16b\n" - "and v21.16b, v21.16b, v0.16b\n" - "and v16.16b, v16.16b, v0.16b\n" - "sub x20, x28, #0x8\n" - "subs x24, x24, #0x1\n" - "add x28, x28, #0x48\n" - ".inst 0x4e88a773 // smmla v19.4s, v27.16b, v8.16b\n" - ".inst 0x4e9fa77a // smmla v26.4s, v27.16b, v31.16b\n" - "ldr q27, [x25, #0x20]\n" - ".inst 0x4e88a5f2 // smmla v18.4s, v15.16b, v8.16b\n" - ".inst 0x4e9fa5f1 // smmla v17.4s, v15.16b, v31.16b\n" - "sshl v15.16b, v29.16b, v1.16b\n" - "sshl v1.16b, v3.16b, v1.16b\n" - "and v29.16b, v29.16b, v0.16b\n" - "and v3.16b, v3.16b, v0.16b\n" - "ldr q0, [x25, #0x30]\n" - "fcvtl v20.4s, v20.4h\n" - ".inst 0x4e8fa773 // smmla v19.4s, v27.16b, v15.16b\n" - "fcvtl v9.4s, v9.4h\n" - ".inst 0x4e81a77a // smmla v26.4s, v27.16b, v1.16b\n" - "ldr q27, [x25, #0x40]\n" - ".inst 0x4e8fa412 // smmla v18.4s, v0.16b, v15.16b\n" - ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n" - "ldr q0, [x25, #0x50]\n" - ".inst 0x4e95a773 // smmla v19.4s, v27.16b, v21.16b\n" - ".inst 0x4e90a77a // smmla v26.4s, v27.16b, v16.16b\n" - "ldr q27, [x25, #0x60]\n" - ".inst 0x4e95a412 // smmla v18.4s, v0.16b, v21.16b\n" - ".inst 0x4e90a411 // smmla v17.4s, v0.16b, v16.16b\n" - "ldr q0, [x25, #0x70]\n" - "add x25, x25, #0x88\n" - ".inst 0x4e9da773 // smmla v19.4s, v27.16b, v29.16b\n" - ".inst 0x4e83a77a // smmla v26.4s, v27.16b, v3.16b\n" - "ldr d27, [x20, #0x0]\n" - ".inst 0x4e9da412 // smmla v18.4s, v0.16b, v29.16b\n" - ".inst 0x4e83a411 // smmla v17.4s, v0.16b, v3.16b\n" - "fcvtl v27.4s, v27.4h\n" - "uzp1 v0.2d, v19.2d, v26.2d\n" - "uzp2 v26.2d, v19.2d, v26.2d\n" - "fmul v19.4s, v27.4s, v20.s[0]\n" - "scvtf v0.4s, v0.4s, #0x4\n" - "scvtf v26.4s, v26.4s, #0x4\n" - "fmla v2.4s, v0.4s, v19.4s\n" - "ldr q19, [x23, #0x0]\n" - "uzp1 v0.2d, v18.2d, v17.2d\n" - "uzp2 v18.2d, v18.2d, v17.2d\n" - "fmul v17.4s, v27.4s, v20.s[1]\n" - "scvtf v0.4s, v0.4s, #0x4\n" - "scvtf v18.4s, v18.4s, #0x4\n" - "fmla v10.4s, v26.4s, v17.4s\n" - "ldr q17, [x23, #0x10]\n" - "fmul v26.4s, v27.4s, v20.s[2]\n" - "fmul v20.4s, v27.4s, v20.s[3]\n" - "fmla v12.4s, 
v0.4s, v26.4s\n" - "ldr d0, [x22, #-0x8]\n" - "ldr d26, [x21, #-0x8]\n" - "fcvtl v0.4s, v0.4h\n" - "fmla v28.4s, v18.4s, v20.4s\n" - "movi v20.4s, #0x0\n" - "movi v18.4s, #0x0\n" - ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n" - ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" - "ldr q19, [x23, #0x20]\n" - "fcvtl v26.4s, v26.4h\n" - ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" - ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" - "ldr q19, [x23, #0x40]\n" - ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" - ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" - "ldr q19, [x23, #0x60]\n" - ".inst 0x4e9da674 // smmla v20.4s, v19.16b, v29.16b\n" - ".inst 0x4e83a672 // smmla v18.4s, v19.16b, v3.16b\n" - "uzp1 v19.2d, v20.2d, v18.2d\n" - "scvtf v19.4s, v19.4s, #0x4\n" - "uzp2 v20.2d, v20.2d, v18.2d\n" - "fmul v18.4s, v27.4s, v9.s[0]\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "fmla v11.4s, v19.4s, v18.4s\n" - "ldr q18, [x22, #0x0]\n" - "fmul v19.4s, v27.4s, v9.s[1]\n" - "fmla v13.4s, v20.4s, v19.4s\n" - "movi v19.4s, #0x0\n" - "movi v20.4s, #0x0\n" - ".inst 0x4e88a633 // smmla v19.4s, v17.16b, v8.16b\n" - ".inst 0x4e9fa634 // smmla v20.4s, v17.16b, v31.16b\n" - "ldr q17, [x23, #0x30]\n" - ".inst 0x4e8fa633 // smmla v19.4s, v17.16b, v15.16b\n" - ".inst 0x4e81a634 // smmla v20.4s, v17.16b, v1.16b\n" - "ldr q17, [x23, #0x50]\n" - ".inst 0x4e95a633 // smmla v19.4s, v17.16b, v21.16b\n" - ".inst 0x4e90a634 // smmla v20.4s, v17.16b, v16.16b\n" - "ldr q17, [x23, #0x70]\n" - "add x23, x23, #0x88\n" - ".inst 0x4e9da633 // smmla v19.4s, v17.16b, v29.16b\n" - ".inst 0x4e83a634 // smmla v20.4s, v17.16b, v3.16b\n" - "uzp1 v17.2d, v19.2d, v20.2d\n" - "scvtf v17.4s, v17.4s, #0x4\n" - "uzp2 v20.2d, v19.2d, v20.2d\n" - "fmul v19.4s, v27.4s, v9.s[2]\n" - "fmul v9.4s, v27.4s, v9.s[3]\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "fmla v22.4s, v17.4s, v19.4s\n" - "ldr q17, [x22, #0x10]\n" - "movi v19.4s, #0x0\n" - ".inst 0x4e88a653 // smmla v19.4s, v18.16b, v8.16b\n" - "fmla v23.4s, v20.4s, v9.4s\n" - "movi v20.4s, #0x0\n" - "movi v9.4s, #0x0\n" - ".inst 0x4e9fa654 // smmla v20.4s, v18.16b, v31.16b\n" - "ldr q18, [x22, #0x20]\n" - ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" - ".inst 0x4e8fa653 // smmla v19.4s, v18.16b, v15.16b\n" - ".inst 0x4e81a654 // smmla v20.4s, v18.16b, v1.16b\n" - "ldr q18, [x22, #0x40]\n" - ".inst 0x4e95a653 // smmla v19.4s, v18.16b, v21.16b\n" - ".inst 0x4e90a654 // smmla v20.4s, v18.16b, v16.16b\n" - "ldr q18, [x22, #0x60]\n" - ".inst 0x4e9da653 // smmla v19.4s, v18.16b, v29.16b\n" - ".inst 0x4e83a654 // smmla v20.4s, v18.16b, v3.16b\n" - "movi v18.4s, #0x0\n" - ".inst 0x4e9fa632 // smmla v18.4s, v17.16b, v31.16b\n" - "ldr q17, [x22, #0x30]\n" - ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" - ".inst 0x4e81a632 // smmla v18.4s, v17.16b, v1.16b\n" - "ldr q17, [x22, #0x50]\n" - ".inst 0x4e95a629 // smmla v9.4s, v17.16b, v21.16b\n" - ".inst 0x4e90a632 // smmla v18.4s, v17.16b, v16.16b\n" - "ldr q17, [x22, #0x70]\n" - "add x22, x22, #0x88\n" - ".inst 0x4e9da629 // smmla v9.4s, v17.16b, v29.16b\n" - ".inst 0x4e83a632 // smmla v18.4s, v17.16b, v3.16b\n" - "uzp1 v17.2d, v19.2d, v20.2d\n" - "uzp2 v20.2d, v19.2d, v20.2d\n" - "fmul v19.4s, v27.4s, v0.s[0]\n" - "scvtf v17.4s, v17.4s, #0x4\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "fmla v25.4s, v17.4s, v19.4s\n" - "ldr q19, [x21, #0x0]\n" - "fmul v17.4s, v27.4s, v0.s[1]\n" - "fmla v5.4s, v20.4s, v17.4s\n" - "ldr q17, [x21, #0x10]\n" - "uzp1 v20.2d, v9.2d, v18.2d\n" - "uzp2 v9.2d, v9.2d, v18.2d\n" - "fmul v18.4s, 
v27.4s, v0.s[2]\n" - "fmul v0.4s, v27.4s, v0.s[3]\n" - "scvtf v20.4s, v20.4s, #0x4\n" - "scvtf v9.4s, v9.4s, #0x4\n" - "fmla v7.4s, v20.4s, v18.4s\n" - "movi v20.4s, #0x0\n" - "movi v18.4s, #0x0\n" - ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n" - ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n" - "ldr q19, [x21, #0x20]\n" - "fmla v4.4s, v9.4s, v0.4s\n" - "movi v9.4s, #0x0\n" - "movi v0.4s, #0x0\n" - ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n" - "fmul v8.4s, v27.4s, v26.s[0]\n" - ".inst 0x4e9fa620 // smmla v0.4s, v17.16b, v31.16b\n" - "ldr q17, [x21, #0x30]\n" - ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n" - "fmul v31.4s, v27.4s, v26.s[1]\n" - ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n" - "ldr q19, [x21, #0x40]\n" - ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n" - "fmul v15.4s, v27.4s, v26.s[2]\n" - "fmul v27.4s, v27.4s, v26.s[3]\n" - ".inst 0x4e81a620 // smmla v0.4s, v17.16b, v1.16b\n" - "ldr q1, [x21, #0x50]\n" - ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n" - ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n" - "ldr q26, [x21, #0x60]\n" - ".inst 0x4e95a429 // smmla v9.4s, v1.16b, v21.16b\n" - ".inst 0x4e90a420 // smmla v0.4s, v1.16b, v16.16b\n" - "ldr q21, [x21, #0x70]\n" - "add x21, x21, #0x88\n" - ".inst 0x4e9da754 // smmla v20.4s, v26.16b, v29.16b\n" - ".inst 0x4e83a752 // smmla v18.4s, v26.16b, v3.16b\n" - ".inst 0x4e9da6a9 // smmla v9.4s, v21.16b, v29.16b\n" - ".inst 0x4e83a6a0 // smmla v0.4s, v21.16b, v3.16b\n" - "uzp1 v29.2d, v20.2d, v18.2d\n" - "uzp2 v21.2d, v20.2d, v18.2d\n" - "scvtf v29.4s, v29.4s, #0x4\n" - "uzp1 v18.2d, v9.2d, v0.2d\n" - "uzp2 v16.2d, v9.2d, v0.2d\n" - "scvtf v21.4s, v21.4s, #0x4\n" - "fmla v6.4s, v29.4s, v8.4s\n" - "scvtf v18.4s, v18.4s, #0x4\n" - "scvtf v16.4s, v16.4s, #0x4\n" - "fmla v30.4s, v21.4s, v31.4s\n" - "fmla v24.4s, v18.4s, v15.4s\n" - "fmla v14.4s, v16.4s, v27.4s\n" - "bgt 3b\n" - "mov x20, %x[res_ptr]\n" - "subs x27, x27, #0x4\n" - "add %x[res_ptr], %x[res_ptr], #0x10\n" - "str q2, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q10, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q12, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q28, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q11, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q13, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q22, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q23, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q25, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q5, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q7, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q4, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q6, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q30, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q24, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "str q14, [x20, #0x0]\n" - "bne 2b\n" - "mov x20, #0x4\n" - "sub x10, x10, #0x10\n" - "cmp x10, #0x10\n" - "mov %x[res_ptr], x26\n" - "madd %x[a_ptr], x20, x9, %x[a_ptr]\n" - "bge 1b\n" - "4:" // Row loop skip - "cbz x10, 9f\n" - "5:" // Row tail: Row loop - "add x24, %x[b_ptr], #0x8\n" - "mov x23, %x[nc]\n" - "add x22, %x[res_ptr], %x[res_stride], LSL #2\n" - "6:" // Row tail: Column loop - "movi v2.16b, #0x0\n" - "movi v10.16b, #0x0\n" - "add x25, %x[a_ptr], #0x8\n" - "mov x21, %x[nb]\n" - "movi v12.16b, #0x0\n" - "movi v28.16b, #0x0\n" - "7:" // Row tail: Block loop - "ldr q6, [x24, #0x0]\n" - "ldr q5, 
[x24, #0x10]\n" - "movi v17.16b, #0x4\n" - "movi v8.4s, #0x0\n" - "ldr q4, [x25, #0x0]\n" - "ldr q13, [x25, #0x10]\n" - "movi v27.4s, #0x0\n" - "movi v0.4s, #0x0\n" - "ldr q31, [x24, #0x20]\n" - "ldr q14, [x24, #0x30]\n" - "movi v29.4s, #0x0\n" - "movi v22.16b, #0xf0\n" - "ldr q11, [x25, #0x20]\n" - "ldr q23, [x25, #0x30]\n" - "sshl v21.16b, v6.16b, v17.16b\n" - "sshl v16.16b, v5.16b, v17.16b\n" - "ldr q20, [x25, #0x40]\n" - "ldr q26, [x25, #0x50]\n" - "and v6.16b, v6.16b, v22.16b\n" - "and v5.16b, v5.16b, v22.16b\n" - "ldr q25, [x25, #0x60]\n" - "ldr q3, [x25, #0x70]\n" - "sshl v19.16b, v31.16b, v17.16b\n" - "sshl v18.16b, v14.16b, v17.16b\n" - "ldr d17, [x25, #-0x8]\n" - ".inst 0x4e95a488 // smmla v8.4s, v4.16b, v21.16b\n" - ".inst 0x4e90a49b // smmla v27.4s, v4.16b, v16.16b\n" - "and v31.16b, v31.16b, v22.16b\n" - ".inst 0x4e95a5a0 // smmla v0.4s, v13.16b, v21.16b\n" - ".inst 0x4e90a5bd // smmla v29.4s, v13.16b, v16.16b\n" - "and v14.16b, v14.16b, v22.16b\n" - "sub x20, x24, #0x8\n" - "ldr d16, [x20, #0x0]\n" - "subs x21, x21, #0x1\n" - "add x25, x25, #0x88\n" - "fcvtl v17.4s, v17.4h\n" - "add x24, x24, #0x48\n" - ".inst 0x4e93a568 // smmla v8.4s, v11.16b, v19.16b\n" - ".inst 0x4e92a57b // smmla v27.4s, v11.16b, v18.16b\n" - ".inst 0x4e93a6e0 // smmla v0.4s, v23.16b, v19.16b\n" - ".inst 0x4e92a6fd // smmla v29.4s, v23.16b, v18.16b\n" - "fcvtl v16.4s, v16.4h\n" - ".inst 0x4e86a688 // smmla v8.4s, v20.16b, v6.16b\n" - ".inst 0x4e85a69b // smmla v27.4s, v20.16b, v5.16b\n" - "fmul v23.4s, v16.4s, v17.s[0]\n" - "fmul v21.4s, v16.4s, v17.s[1]\n" - "fmul v1.4s, v16.4s, v17.s[2]\n" - "fmul v20.4s, v16.4s, v17.s[3]\n" - ".inst 0x4e86a740 // smmla v0.4s, v26.16b, v6.16b\n" - ".inst 0x4e85a75d // smmla v29.4s, v26.16b, v5.16b\n" - ".inst 0x4e9fa728 // smmla v8.4s, v25.16b, v31.16b\n" - ".inst 0x4e8ea73b // smmla v27.4s, v25.16b, v14.16b\n" - ".inst 0x4e9fa460 // smmla v0.4s, v3.16b, v31.16b\n" - ".inst 0x4e8ea47d // smmla v29.4s, v3.16b, v14.16b\n" - "uzp1 v19.2d, v8.2d, v27.2d\n" - "uzp2 v18.2d, v8.2d, v27.2d\n" - "scvtf v19.4s, v19.4s, #0x4\n" - "uzp1 v17.2d, v0.2d, v29.2d\n" - "uzp2 v16.2d, v0.2d, v29.2d\n" - "scvtf v18.4s, v18.4s, #0x4\n" - "fmla v2.4s, v19.4s, v23.4s\n" - "scvtf v17.4s, v17.4s, #0x4\n" - "scvtf v16.4s, v16.4s, #0x4\n" - "fmla v10.4s, v18.4s, v21.4s\n" - "fmla v12.4s, v17.4s, v1.4s\n" - "fmla v28.4s, v16.4s, v20.4s\n" - "bgt 7b\n" - "mov x20, %x[res_ptr]\n" - "cmp x10, #0x1\n" - "str q2, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x10, #0x2\n" - "str q10, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x10, #0x3\n" - "str q12, [x20, #0x0]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "str q28, [x20, #0x0]\n" - "8:" // Row tail: Accumulator store skip - "subs x23, x23, #0x4\n" - "add %x[res_ptr], %x[res_ptr], #0x10\n" - "bne 6b\n" - "subs x10, x10, #0x4\n" - "add %x[a_ptr], %x[a_ptr], x9\n" - "mov %x[res_ptr], x22\n" - "bgt 5b\n" - "9:" // Row tail: Row loop skip - : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) - : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" - ); - return; - } -#endif // #if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) - float sumf[4][4]; - int sumi; - - for (int y = 0; y < nr / 4; y++) { - const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); - for (int m = 0; m < 4; m++) { - for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; - } - for (int l = 0; l < nb; l++) { - for (int k = 0; k < (qk / (2 * blocklen)); k++) { - for (int m = 0; m < 4; m++) { - for (int j = 0; j < ncols_interleaved; j++) { - sumi = 0; - for (int i = 0; i < blocklen; ++i) { - const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); - const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); - sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + - (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; - } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); - } - } - } - } - for (int m = 0; m < 4; m++) { - for (int j = 0; j < ncols_interleaved; j++) - s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; - } - } - } -} - -static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { - const int qk = QK8_0; - const int nb = n / qk; - const int ncols_interleaved = 8; - const int blocklen = 8; - - assert (n % qk == 0); - assert (nr % 4 == 0); - assert (nc % ncols_interleaved == 0); - - UNUSED(s); - UNUSED(bs); - UNUSED(vx); - UNUSED(vy); - UNUSED(nr); - UNUSED(nc); - UNUSED(nb); - UNUSED(ncols_interleaved); - UNUSED(blocklen); - -#if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) -#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) - if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) { - const void * b_ptr = vx; - const void * a_ptr = vy; - float * res_ptr = s; - size_t res_stride = bs * sizeof(float); - - __asm__ __volatile__( - "mov x20, #0x4\n" - "mov x13, %x[nr]\n" - "mov z28.s, #-0x4\n" - "mov x12, #0x88\n" - "ptrue p1.b\n" - "whilelt p0.s, XZR, x20\n" - "cmp x13, #0x10\n" - "mul x12, %x[nb], x12\n" - "blt 4f\n" - "1:" // Row loop - "add x11, %x[b_ptr], #0x10\n" - "mov x10, %x[nc]\n" - "add x9, %x[res_ptr], %x[res_stride], LSL #4\n" - "2:" // Column loop - "add x28, %x[a_ptr], #0x8\n" - "mov z24.b, #0x0\n" - "mov z15.b, #0x0\n" - "mov x27, %x[nb]\n" - "add x26, x28, x12\n" - "mov z12.b, #0x0\n" - "mov z0.b, #0x0\n" - "add x25, x26, x12\n" - "mov z13.b, #0x0\n" - "mov z1.b, #0x0\n" - "add x24, x25, x12\n" - "mov z20.b, #0x0\n" - "mov z25.b, #0x0\n" - "mov z11.b, #0x0\n" - "mov z16.b, #0x0\n" - "mov z19.b, #0x0\n" - "mov z26.b, #0x0\n" - "mov z8.b, #0x0\n" - "mov z29.b, #0x0\n" - "mov z27.b, #0x0\n" - "mov z10.b, #0x0\n" - "3:" // Block loop - "ld1b { z30.b }, p1/Z, [x11]\n" - "ld1b { z21.b }, p1/Z, [x11, #1, MUL VL]\n" - "mov z18.s, #0x0\n" - "mov z7.s, #0x0\n" - "ld1rqb { z3.b }, p1/Z, [x28]\n" - "ld1rqb { z5.b }, p1/Z, [x28, #16]\n" - "mov z9.s, #0x0\n" - "mov z22.s, #0x0\n" - "ld1b { z4.b }, p1/Z, [x11, #2, MUL VL]\n" - "ld1b { z17.b }, p1/Z, [x11, #3, MUL VL]\n" - "sub x20, x11, #0x10\n" - "sub x23, x28, #0x8\n" - "lsl z31.b, z30.b, #0x4\n" - "lsl z6.b, z21.b, #0x4\n" - "ld1h { z23.s }, p1/Z, [x20]\n" - "sub x22, x26, #0x8\n" - "and z30.b, z30.b, #0xf0\n" - "and z21.b, z21.b, #0xf0\n" - "sub x21, x25, #0x8\n" - "sub x20, x24, #0x8\n" - "lsl z14.b, z4.b, #0x4\n" - "lsl z2.b, z17.b, #0x4\n" - "subs x27, x27, #0x1\n" - "add x11, x11, #0x90\n" - ".inst 0x451f9872 // smmla z18.s, z3.b, z31.b\n" - ".inst 0x45069867 // smmla z7.s, z3.b, z6.b\n" - "ld1rqb { z3.b }, p1/Z, [x28, #32]\n" - "and z4.b, z4.b, #0xf0\n" - ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" - ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" - "ld1rqb { z5.b }, p1/Z, [x28, #48]\n" - "and z17.b, z17.b, #0xf0\n" - "fcvt z23.s, p1/m, z23.h\n" - ".inst 0x450e9872 // smmla z18.s, z3.b, z14.b\n" - ".inst 0x45029867 // smmla z7.s, z3.b, z2.b\n" - "ld1rqb { z3.b }, p1/Z, [x28, #64]\n" - ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" - ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" - "ld1rqb { z5.b }, p1/Z, [x28, #80]\n" - "fscale z23.s, p1/m, z23.s, z28.s\n" - ".inst 0x451e9872 // smmla z18.s, z3.b, z30.b\n" - ".inst 0x45159867 // smmla z7.s, z3.b, z21.b\n" - "ld1rqb { z3.b }, p1/Z, [x28, #96]\n" - ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" - ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" - "ld1rqb { z5.b }, p1/Z, [x28, #112]\n" - "add x28, x28, #0x88\n" - ".inst 0x45049872 // smmla z18.s, z3.b, z4.b\n" - ".inst 0x45119867 // smmla z7.s, z3.b, z17.b\n" - "ld1h { z3.s }, p0/Z, [x23]\n" - ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" - ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" - "fcvt z3.s, p1/m, z3.h\n" - "uzp1 z5.d, z18.d, z7.d\n" - "uzp2 z18.d, z18.d, z7.d\n" - "mov z3.q, z3.q[0]\n" - "uzp1 z7.d, z9.d, z22.d\n" - "uzp2 z22.d, z9.d, z22.d\n" - "fmul z9.s, z23.s, z3.s[0]\n" - "scvtf z5.s, p1/m, z5.s\n" - "scvtf z18.s, p1/m, z18.s\n" - "scvtf z7.s, p1/m, z7.s\n" - "scvtf z22.s, p1/m, z22.s\n" - "fmla z24.s, p1/M, z5.s, z9.s\n" - "ld1rqb { z5.b }, p1/Z, [x26]\n" - "fmul z9.s, z23.s, z3.s[1]\n" 
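// NOTE (editorial annotation, not part of the original listing): each iteration of the
// SVE block loop above follows one pattern: LSL #4 moves the low nibble of every packed
// Q4_0 byte into the high bits (scaling it by 16), while AND #0xf0 keeps the high nibble
// in place (also scaled by 16); SMMLA accumulates 2x2 int32 tiles of dot products, which
// UZP1/UZP2 de-interleave into per-row sums. The factor of 16 carried by every product is
// compensated on the scale side: the fp16 block deltas are widened with FCVT and then
// multiplied by 2^-4 through FSCALE (z28 holds -4) before SCVTF + FMLA accumulate the
// scaled sums into the float result registers.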
- "fmla z15.s, p1/M, z18.s, z9.s\n" - "ld1rqb { z18.b }, p1/Z, [x26, #16]\n" - "fmul z9.s, z23.s, z3.s[2]\n" - "fmul z3.s, z23.s, z3.s[3]\n" - "fmla z12.s, p1/M, z7.s, z9.s\n" - "mov z9.s, #0x0\n" - "ld1h { z7.s }, p0/Z, [x22]\n" - ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n" - "fmla z0.s, p1/M, z22.s, z3.s\n" - "mov z22.s, #0x0\n" - "ld1h { z3.s }, p0/Z, [x21]\n" - ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n" - "ld1rqb { z5.b }, p1/Z, [x26, #32]\n" - "fcvt z7.s, p1/m, z7.h\n" - "fcvt z3.s, p1/m, z3.h\n" - ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n" - ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n" - "ld1rqb { z5.b }, p1/Z, [x26, #64]\n" - "mov z7.q, z7.q[0]\n" - "mov z3.q, z3.q[0]\n" - ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n" - ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n" - "ld1rqb { z5.b }, p1/Z, [x26, #96]\n" - ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n" - ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n" - "uzp1 z5.d, z9.d, z22.d\n" - "scvtf z5.s, p1/m, z5.s\n" - "uzp2 z22.d, z9.d, z22.d\n" - "fmul z9.s, z23.s, z7.s[0]\n" - "scvtf z22.s, p1/m, z22.s\n" - "fmla z13.s, p1/M, z5.s, z9.s\n" - "ld1rqb { z9.b }, p1/Z, [x25]\n" - "fmul z5.s, z23.s, z7.s[1]\n" - "fmla z1.s, p1/M, z22.s, z5.s\n" - "mov z5.s, #0x0\n" - "mov z22.s, #0x0\n" - ".inst 0x451f9a45 // smmla z5.s, z18.b, z31.b\n" - ".inst 0x45069a56 // smmla z22.s, z18.b, z6.b\n" - "ld1rqb { z18.b }, p1/Z, [x26, #48]\n" - ".inst 0x450e9a45 // smmla z5.s, z18.b, z14.b\n" - ".inst 0x45029a56 // smmla z22.s, z18.b, z2.b\n" - "ld1rqb { z18.b }, p1/Z, [x26, #80]\n" - ".inst 0x451e9a45 // smmla z5.s, z18.b, z30.b\n" - ".inst 0x45159a56 // smmla z22.s, z18.b, z21.b\n" - "ld1rqb { z18.b }, p1/Z, [x26, #112]\n" - "add x26, x26, #0x88\n" - ".inst 0x45049a45 // smmla z5.s, z18.b, z4.b\n" - ".inst 0x45119a56 // smmla z22.s, z18.b, z17.b\n" - "uzp1 z18.d, z5.d, z22.d\n" - "scvtf z18.s, p1/m, z18.s\n" - "uzp2 z22.d, z5.d, z22.d\n" - "fmul z5.s, z23.s, z7.s[2]\n" - "fmul z7.s, z23.s, z7.s[3]\n" - "scvtf z22.s, p1/m, z22.s\n" - "fmla z20.s, p1/M, z18.s, z5.s\n" - "ld1rqb { z18.b }, p1/Z, [x25, #16]\n" - "ld1h { z5.s }, p0/Z, [x20]\n" - "fcvt z5.s, p1/m, z5.h\n" - "fmla z25.s, p1/M, z22.s, z7.s\n" - "mov z22.s, #0x0\n" - "mov z7.s, #0x0\n" - ".inst 0x451f9936 // smmla z22.s, z9.b, z31.b\n" - ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" - "ld1rqb { z9.b }, p1/Z, [x25, #32]\n" - "mov z5.q, z5.q[0]\n" - ".inst 0x450e9936 // smmla z22.s, z9.b, z14.b\n" - ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" - "ld1rqb { z9.b }, p1/Z, [x25, #64]\n" - ".inst 0x451e9936 // smmla z22.s, z9.b, z30.b\n" - ".inst 0x45159927 // smmla z7.s, z9.b, z21.b\n" - "ld1rqb { z9.b }, p1/Z, [x25, #96]\n" - ".inst 0x45049936 // smmla z22.s, z9.b, z4.b\n" - ".inst 0x45119927 // smmla z7.s, z9.b, z17.b\n" - "uzp1 z9.d, z22.d, z7.d\n" - "scvtf z9.s, p1/m, z9.s\n" - "uzp2 z22.d, z22.d, z7.d\n" - "fmul z7.s, z23.s, z3.s[0]\n" - "scvtf z22.s, p1/m, z22.s\n" - "fmla z11.s, p1/M, z9.s, z7.s\n" - "ld1rqb { z9.b }, p1/Z, [x24]\n" - "fmul z7.s, z23.s, z3.s[1]\n" - "fmla z16.s, p1/M, z22.s, z7.s\n" - "mov z22.s, #0x0\n" - "mov z7.s, #0x0\n" - ".inst 0x451f9a56 // smmla z22.s, z18.b, z31.b\n" - ".inst 0x45069a47 // smmla z7.s, z18.b, z6.b\n" - "ld1rqb { z18.b }, p1/Z, [x25, #48]\n" - ".inst 0x450e9a56 // smmla z22.s, z18.b, z14.b\n" - ".inst 0x45029a47 // smmla z7.s, z18.b, z2.b\n" - "ld1rqb { z18.b }, p1/Z, [x25, #80]\n" - ".inst 0x451e9a56 // smmla z22.s, z18.b, z30.b\n" - ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" - "ld1rqb { z18.b }, p1/Z, [x25, #112]\n" - "add 
x25, x25, #0x88\n" - ".inst 0x45049a56 // smmla z22.s, z18.b, z4.b\n" - ".inst 0x45119a47 // smmla z7.s, z18.b, z17.b\n" - "uzp1 z18.d, z22.d, z7.d\n" - "scvtf z18.s, p1/m, z18.s\n" - "uzp2 z7.d, z22.d, z7.d\n" - "fmul z22.s, z23.s, z3.s[2]\n" - "fmul z3.s, z23.s, z3.s[3]\n" - "scvtf z7.s, p1/m, z7.s\n" - "fmla z19.s, p1/M, z18.s, z22.s\n" - "ld1rqb { z18.b }, p1/Z, [x24, #16]\n" - "fmul z22.s, z23.s, z5.s[0]\n" - "fmla z26.s, p1/M, z7.s, z3.s\n" - "mov z3.s, #0x0\n" - "mov z7.s, #0x0\n" - ".inst 0x451f9923 // smmla z3.s, z9.b, z31.b\n" - ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n" - "ld1rqb { z9.b }, p1/Z, [x24, #32]\n" - ".inst 0x450e9923 // smmla z3.s, z9.b, z14.b\n" - ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n" - "mov z9.s, #0x0\n" - ".inst 0x451f9a49 // smmla z9.s, z18.b, z31.b\n" - "mov z31.s, #0x0\n" - ".inst 0x45069a5f // smmla z31.s, z18.b, z6.b\n" - "ld1rqb { z6.b }, p1/Z, [x24, #48]\n" - "ld1rqb { z18.b }, p1/Z, [x24, #64]\n" - ".inst 0x450e98c9 // smmla z9.s, z6.b, z14.b\n" - "fmul z14.s, z23.s, z5.s[1]\n" - ".inst 0x450298df // smmla z31.s, z6.b, z2.b\n" - "ld1rqb { z6.b }, p1/Z, [x24, #80]\n" - "fmul z2.s, z23.s, z5.s[2]\n" - "fmul z23.s, z23.s, z5.s[3]\n" - ".inst 0x451e9a43 // smmla z3.s, z18.b, z30.b\n" - ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n" - "ld1rqb { z5.b }, p1/Z, [x24, #96]\n" - ".inst 0x451e98c9 // smmla z9.s, z6.b, z30.b\n" - ".inst 0x451598df // smmla z31.s, z6.b, z21.b\n" - "ld1rqb { z18.b }, p1/Z, [x24, #112]\n" - "add x24, x24, #0x88\n" - ".inst 0x450498a3 // smmla z3.s, z5.b, z4.b\n" - ".inst 0x451198a7 // smmla z7.s, z5.b, z17.b\n" - ".inst 0x45049a49 // smmla z9.s, z18.b, z4.b\n" - ".inst 0x45119a5f // smmla z31.s, z18.b, z17.b\n" - "uzp1 z18.d, z3.d, z7.d\n" - "uzp2 z5.d, z3.d, z7.d\n" - "scvtf z18.s, p1/m, z18.s\n" - "uzp1 z6.d, z9.d, z31.d\n" - "uzp2 z9.d, z9.d, z31.d\n" - "scvtf z5.s, p1/m, z5.s\n" - "fmla z8.s, p1/M, z18.s, z22.s\n" - "scvtf z6.s, p1/m, z6.s\n" - "scvtf z9.s, p1/m, z9.s\n" - "fmla z29.s, p1/M, z5.s, z14.s\n" - "fmla z27.s, p1/M, z6.s, z2.s\n" - "fmla z10.s, p1/M, z9.s, z23.s\n" - "bgt 3b\n" - "mov x20, %x[res_ptr]\n" - "subs x10, x10, #0x8\n" - "add %x[res_ptr], %x[res_ptr], #0x20\n" - "st1w { z24.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z15.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z12.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z0.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z13.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z1.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z20.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z25.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z11.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z16.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z19.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z26.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z8.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z29.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z27.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "st1w { z10.s }, p1, [x20]\n" - "bne 2b\n" - "mov x20, #0x4\n" - "sub x13, x13, #0x10\n" - "cmp x13, #0x10\n" - "mov %x[res_ptr], x9\n" - "madd %x[a_ptr], x20, x12, %x[a_ptr]\n" - "bge 1b\n" - "4:" // Row loop skip - "cbz x13, 9f\n" - "5:" // Row tail: Row loop - "add x25, %x[b_ptr], #0x10\n" - "mov x24, %x[nc]\n" - "add x23, %x[res_ptr], %x[res_stride], 
LSL #2\n" - "6:" // Row tail: Column loop - "mov z24.b, #0x0\n" - "mov z15.b, #0x0\n" - "add x28, %x[a_ptr], #0x8\n" - "mov x22, %x[nb]\n" - "mov z12.b, #0x0\n" - "mov z0.b, #0x0\n" - "7:" // Row tail: Block loop - "ld1b { z3.b }, p1/Z, [x25]\n" - "ld1b { z6.b }, p1/Z, [x25, #1, MUL VL]\n" - "mov z2.s, #0x0\n" - "mov z25.s, #0x0\n" - "ld1rqb { z26.b }, p1/Z, [x28]\n" - "ld1rqb { z21.b }, p1/Z, [x28, #16]\n" - "mov z27.s, #0x0\n" - "mov z19.s, #0x0\n" - "ld1b { z29.b }, p1/Z, [x25, #2, MUL VL]\n" - "ld1b { z16.b }, p1/Z, [x25, #3, MUL VL]\n" - "sub x21, x25, #0x10\n" - "sub x20, x28, #0x8\n" - "lsl z20.b, z3.b, #0x4\n" - "lsl z4.b, z6.b, #0x4\n" - "ld1rqb { z10.b }, p1/Z, [x28, #32]\n" - "ld1rqb { z23.b }, p1/Z, [x28, #48]\n" - "and z3.b, z3.b, #0xf0\n" - "and z6.b, z6.b, #0xf0\n" - "ld1rqb { z11.b }, p1/Z, [x28, #64]\n" - "ld1rqb { z7.b }, p1/Z, [x28, #80]\n" - "lsl z8.b, z29.b, #0x4\n" - "lsl z14.b, z16.b, #0x4\n" - "ld1rqb { z18.b }, p1/Z, [x28, #96]\n" - "ld1rqb { z30.b }, p1/Z, [x28, #112]\n" - ".inst 0x45149b42 // smmla z2.s, z26.b, z20.b\n" - ".inst 0x45049b59 // smmla z25.s, z26.b, z4.b\n" - "and z29.b, z29.b, #0xf0\n" - "ld1h { z17.s }, p1/Z, [x21]\n" - ".inst 0x45149abb // smmla z27.s, z21.b, z20.b\n" - ".inst 0x45049ab3 // smmla z19.s, z21.b, z4.b\n" - "and z16.b, z16.b, #0xf0\n" - "ld1h { z4.s }, p0/Z, [x20]\n" - "subs x22, x22, #0x1\n" - "add x28, x28, #0x88\n" - "fcvt z17.s, p1/m, z17.h\n" - "add x25, x25, #0x90\n" - ".inst 0x45089942 // smmla z2.s, z10.b, z8.b\n" - ".inst 0x450e9959 // smmla z25.s, z10.b, z14.b\n" - "fcvt z4.s, p1/m, z4.h\n" - ".inst 0x45089afb // smmla z27.s, z23.b, z8.b\n" - ".inst 0x450e9af3 // smmla z19.s, z23.b, z14.b\n" - "fscale z17.s, p1/m, z17.s, z28.s\n" - "mov z4.q, z4.q[0]\n" - ".inst 0x45039962 // smmla z2.s, z11.b, z3.b\n" - ".inst 0x45069979 // smmla z25.s, z11.b, z6.b\n" - "fmul z23.s, z17.s, z4.s[0]\n" - "fmul z9.s, z17.s, z4.s[1]\n" - "fmul z21.s, z17.s, z4.s[2]\n" - "fmul z4.s, z17.s, z4.s[3]\n" - ".inst 0x450398fb // smmla z27.s, z7.b, z3.b\n" - ".inst 0x450698f3 // smmla z19.s, z7.b, z6.b\n" - ".inst 0x451d9a42 // smmla z2.s, z18.b, z29.b\n" - ".inst 0x45109a59 // smmla z25.s, z18.b, z16.b\n" - ".inst 0x451d9bdb // smmla z27.s, z30.b, z29.b\n" - ".inst 0x45109bd3 // smmla z19.s, z30.b, z16.b\n" - "uzp1 z31.d, z2.d, z25.d\n" - "uzp2 z13.d, z2.d, z25.d\n" - "scvtf z31.s, p1/m, z31.s\n" - "uzp1 z17.d, z27.d, z19.d\n" - "uzp2 z18.d, z27.d, z19.d\n" - "scvtf z13.s, p1/m, z13.s\n" - "fmla z24.s, p1/M, z31.s, z23.s\n" - "scvtf z17.s, p1/m, z17.s\n" - "scvtf z18.s, p1/m, z18.s\n" - "fmla z15.s, p1/M, z13.s, z9.s\n" - "fmla z12.s, p1/M, z17.s, z21.s\n" - "fmla z0.s, p1/M, z18.s, z4.s\n" - "bgt 7b\n" - "mov x20, %x[res_ptr]\n" - "cmp x13, #0x1\n" - "st1w { z24.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x13, #0x2\n" - "st1w { z15.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "cmp x13, #0x3\n" - "st1w { z12.s }, p1, [x20]\n" - "add x20, x20, %x[res_stride]\n" - "ble 8f\n" - "st1w { z0.s }, p1, [x20]\n" - "8:" // Row tail: Accumulator store skip - "subs x24, x24, #0x8\n" - "add %x[res_ptr], %x[res_ptr], #0x20\n" - "bne 6b\n" - "subs x13, x13, #0x4\n" - "add %x[a_ptr], %x[a_ptr], x12\n" - "mov %x[res_ptr], x23\n" - "bgt 5b\n" - "9:" // Row tail: Row loop skip - : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr) - : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc) - : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", 
"x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" - ); - return; - } -#endif // #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) -#elif defined(__AVX2__) || defined(__AVX512F__) - { - const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx; - const block_q8_0x4 * a_ptr_start = (const block_q8_0x4 *)vy; - int64_t b_nb = n / QK4_0; - int64_t y = 0; - // Mask to mask out nibbles from packed bytes - const __m256i m4b = _mm256_set1_epi8(0x0F); - const __m128i loadMask = _mm_blend_epi32(_mm_setzero_si128(), _mm_set1_epi32(0xFFFFFFFF), 3); - // Lookup table to convert signed nibbles to signed bytes - __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0)); - signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0); - // Permute mask used for easier vector processing at later stages - __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4); - int64_t xstart = 0; - int anr = nr - nr%16; // Used to align nr with boundary of 16 - #ifdef __AVX512F__ - int anc = nc - nc%16; // Used to align nc with boundary of 16 - // Mask to mask out nibbles from packed bytes expanded to 512 bit length - const __m512i m4bexpanded = _mm512_set1_epi8(0x0F); - // Lookup table to convert signed nibbles to signed bytes expanded to 512 bit length - __m512i signextendlutexpanded = _mm512_inserti32x8(_mm512_castsi256_si512(signextendlut), signextendlut, 1); - - // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation - for (; y < anr / 4; y += 4) { - - const block_q8_0x4 * a_ptrs[4]; - - a_ptrs[0] = a_ptr_start + (y * nb); - for (int i = 0; i < 3; ++i) { - a_ptrs[i + 1] = a_ptrs[i] + nb; - } - - // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation - for (int64_t x = 0; x < anc / 8; x += 2) { - - const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x) * b_nb); - const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb); - - // Master FP accumulators - __m512 acc_rows[16]; - for (int i = 0; i < 16; i++) { - acc_rows[i] = _mm512_setzero_ps(); + // Master FP accumulators + __m512 acc_rows[16]; + for (int i = 0; i < 16; i++) { + acc_rows[i] = _mm512_setzero_ps(); } for (int64_t b = 0; b < nb; b++) { @@ -3783,207 +1733,7 @@ static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c } return; } -#elif defined __riscv_v - if (__riscv_vlenb() >= QK4_0) { - const size_t vl = QK4_0; - - for (int y = 0; y < nr / 4; y++) { - const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); - vfloat32m1_t sumf0 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); - vfloat32m1_t sumf1 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); - vfloat32m1_t sumf2 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); - vfloat32m1_t sumf3 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4); - for (int l = 0; l < nb; l++) { - const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4); - const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4); - const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4); - const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 
0); - const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1); - const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0); - const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1); - - // vector version needs Zvfhmin extension - const float a_scales[4] = { - GGML_FP16_TO_FP32(a_ptr[l].d[0]), - GGML_FP16_TO_FP32(a_ptr[l].d[1]), - GGML_FP16_TO_FP32(a_ptr[l].d[2]), - GGML_FP16_TO_FP32(a_ptr[l].d[3]) - }; - const float b_scales[8] = { - GGML_FP16_TO_FP32(b_ptr[l].d[0]), - GGML_FP16_TO_FP32(b_ptr[l].d[1]), - GGML_FP16_TO_FP32(b_ptr[l].d[2]), - GGML_FP16_TO_FP32(b_ptr[l].d[3]), - GGML_FP16_TO_FP32(b_ptr[l].d[4]), - GGML_FP16_TO_FP32(b_ptr[l].d[5]), - GGML_FP16_TO_FP32(b_ptr[l].d[6]), - GGML_FP16_TO_FP32(b_ptr[l].d[7]) - }; - const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4); - - const int64_t A0 = *(const int64_t *)&a_ptr[l].qs[0]; - const int64_t A4 = *(const int64_t *)&a_ptr[l].qs[32]; - const int64_t A8 = *(const int64_t *)&a_ptr[l].qs[64]; - const int64_t Ac = *(const int64_t *)&a_ptr[l].qs[96]; - __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment - vint16m4_t sumi_l0; - { - const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A0, vl / 4)); - const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A4, vl / 4)); - const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A8, vl / 4)); - const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ac, vl / 4)); - const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); - const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); - const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); - const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); - - sumi_l0 = sumi_hi_m; - } - - { - const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l0)); - const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl); - const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); - const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); - const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); - const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); - const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); - const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); - const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); - const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); - const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); - const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); - const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); - - const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[0], vl / 4); - sumf0 = __riscv_vfmacc_vv_f32m1(sumf0, tmp1, b_scales_vec, vl / 4); - } - - const int64_t A1 = *(const int64_t *)&a_ptr[l].qs[8]; - const int64_t A5 = *(const int64_t *)&a_ptr[l].qs[40]; - const int64_t A9 = *(const int64_t *)&a_ptr[l].qs[72]; - const int64_t Ad = *(const int64_t *)&a_ptr[l].qs[104]; - __asm__ 
__volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment - vint16m4_t sumi_l1; - { - const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A1, vl / 4)); - const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A5, vl / 4)); - const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A9, vl / 4)); - const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ad, vl / 4)); - const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); - const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); - const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); - const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); - - sumi_l1 = sumi_hi_m; - } - - { - const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l1)); - const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl); - const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); - const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); - const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); - const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); - const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); - const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); - const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); - const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); - const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); - const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); - const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); - - const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[1], vl / 4); - sumf1 = __riscv_vfmacc_vv_f32m1(sumf1, tmp1, b_scales_vec, vl / 4); - } - - const int64_t A2 = *(const int64_t *)&a_ptr[l].qs[16]; - const int64_t A6 = *(const int64_t *)&a_ptr[l].qs[48]; - const int64_t Aa = *(const int64_t *)&a_ptr[l].qs[80]; - const int64_t Ae = *(const int64_t *)&a_ptr[l].qs[112]; - __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment - vint16m4_t sumi_l2; - { - const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A2, vl / 4)); - const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A6, vl / 4)); - const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Aa, vl / 4)); - const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ae, vl / 4)); - const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); - const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); - const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); - const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); - - sumi_l2 = sumi_hi_m; - } - - { - const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l2)); - const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 
0, vl); - const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); - const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); - const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); - const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); - const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); - const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); - const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); - const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); - const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); - const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); - const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); - - const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[2], vl / 4); - sumf2 = __riscv_vfmacc_vv_f32m1(sumf2, tmp1, b_scales_vec, vl / 4); - } - - const int64_t A3 = *(const int64_t *)&a_ptr[l].qs[24]; - const int64_t A7 = *(const int64_t *)&a_ptr[l].qs[56]; - const int64_t Ab = *(const int64_t *)&a_ptr[l].qs[88]; - const int64_t Af = *(const int64_t *)&a_ptr[l].qs[120]; - __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment - vint16m4_t sumi_l3; - { - const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A3, vl / 4)); - const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A7, vl / 4)); - const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ab, vl / 4)); - const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Af, vl / 4)); - const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2); - const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2); - const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2); - const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2); - - sumi_l3 = sumi_hi_m; - } - { - const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l3)); - const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl); - const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl); - const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl); - const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2); - const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2); - const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2); - const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2); - const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4); - const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4)); - const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4)); - const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4); - const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); - - const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[3], vl / 4); - sumf3 = __riscv_vfmacc_vv_f32m1(sumf3, tmp1, b_scales_vec, vl / 
4); - } - } - __riscv_vse32_v_f32m1(&s[(y * 4 + 0) * bs + x * ncols_interleaved], sumf0, vl / 4); - __riscv_vse32_v_f32m1(&s[(y * 4 + 1) * bs + x * ncols_interleaved], sumf1, vl / 4); - __riscv_vse32_v_f32m1(&s[(y * 4 + 2) * bs + x * ncols_interleaved], sumf2, vl / 4); - __riscv_vse32_v_f32m1(&s[(y * 4 + 3) * bs + x * ncols_interleaved], sumf3, vl / 4); - } - } - - return; - } #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) float sumf[4][8]; int sumi; @@ -4006,7 +1756,7 @@ static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } @@ -4019,7 +1769,7 @@ static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c } } -static void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { +void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK_K; const int nb = n / qk; const int ncols_interleaved = 8; @@ -5510,7 +3260,7 @@ static void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, c sumi2 = sumi2 * scales_1[j]; sumi += sumi1 + sumi2; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; } } } @@ -5519,7 +3269,7 @@ static void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, c for(int m = 0; m < 4; m++) { const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6); for(int j = 0; j < ncols_interleaved; j++) { - sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; + sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; } } } @@ -5533,899 +3283,3 @@ static void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, c } #endif } - -static void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { - const int qk = QK8_0; - const int nb = n / qk; - const int ncols_interleaved = 4; - const int blocklen = 4; - - assert (n % qk == 0); - assert (nr % 4 == 0); - assert (nc % ncols_interleaved == 0); - - UNUSED(s); - UNUSED(bs); - UNUSED(vx); - UNUSED(vy); - UNUSED(nr); - UNUSED(nc); - UNUSED(nb); - UNUSED(ncols_interleaved); - UNUSED(blocklen); - -#if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) - if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { - const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl); - - for (int y = 0; y < nr / 4; y++) { - const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); - for (int x = 0; x < nc / ncols_interleaved; x++) { - const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); - - float32x4_t sumf[4]; - for (int m = 0; m < 4; m++) { - sumf[m] = vdupq_n_f32(0); - } - - for (int l = 0; l < nb; l++) { - float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *)a_ptr[l].d)); - float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d)); - - int32x4_t sumi_0 = vdupq_n_s32(0); - int32x4_t sumi_1 = vdupq_n_s32(0); - int32x4_t sumi_2 = vdupq_n_s32(0); - int32x4_t sumi_3 = vdupq_n_s32(0); - - for (int k = 0; k < 4; k++) { - int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 16 * k + 0); - int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16 * k + 64); - - uint8x16_t b = vld1q_u8(b_ptr[l].qs + 16 * k); - int8x16_t b_hi = vqtbl1q_s8(kvalues, b >> 4); - int8x16_t b_lo = vqtbl1q_s8(kvalues, b & 0xF); - - sumi_0 = vdotq_laneq_s32(sumi_0, b_lo, a_0, 0); - sumi_1 = vdotq_laneq_s32(sumi_1, b_lo, a_0, 1); - sumi_2 = vdotq_laneq_s32(sumi_2, b_lo, a_0, 2); - sumi_3 = vdotq_laneq_s32(sumi_3, b_lo, a_0, 3); - sumi_0 = vdotq_laneq_s32(sumi_0, b_hi, a_1, 0); - sumi_1 = vdotq_laneq_s32(sumi_1, b_hi, a_1, 1); - sumi_2 = vdotq_laneq_s32(sumi_2, b_hi, a_1, 2); - sumi_3 = vdotq_laneq_s32(sumi_3, b_hi, a_1, 3); - } - - sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0)); - sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1)); - sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2)); - sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3)); - } - - for (int m = 0; m < 4; m++) { - vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]); - } - } - } - return; - } -#endif // #if ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    {
-        float sumf[4][4];
-        int sumi;
-
-        for (int y = 0; y < nr / 4; y++) {
-            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
-            for (int x = 0; x < nc / ncols_interleaved; x++) {
-                const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
-                for (int m = 0; m < 4; m++) {
-                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
-                }
-                for (int l = 0; l < nb; l++) {
-                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                        for (int m = 0; m < 4; m++) {
-                            for (int j = 0; j < ncols_interleaved; j++) {
-                                sumi = 0;
-                                for (int i = 0; i < blocklen; ++i) {
-                                    const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
-                                    const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
-                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
-                                             (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
-                                }
-                                sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
-                            }
-                        }
-                    }
-                }
-                for (int m = 0; m < 4; m++) {
-                    for (int j = 0; j < ncols_interleaved; j++)
-                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
-                }
-            }
-        }
-    }
-}
-
-static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
-    block_q4_0x4 out;
-
-    for (int i = 0; i < 4; i++) {
-        out.d[i] = in[i].d;
-    }
-
-    const int end = QK4_0 * 2 / blck_size_interleave;
-
-    if (blck_size_interleave == 8) {
-        const uint64_t xor_mask = 0x8888888888888888ULL;
-        for (int i = 0; i < end; ++i) {
-            int src_id = i % 4;
-            int src_offset = (i / 4) * blck_size_interleave;
-            int dst_offset = i * blck_size_interleave;
-
-            uint64_t elems;
-            // Using memcpy to avoid unaligned memory accesses
-            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
-            elems ^= xor_mask;
-            memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
-        }
-    } else if (blck_size_interleave == 4) {
-        const uint32_t xor_mask = 0x88888888;
-        for (int i = 0; i < end; ++i) {
-            int src_id = i % 4;
-            int src_offset = (i / 4) * blck_size_interleave;
-            int dst_offset = i * blck_size_interleave;
-
-            uint32_t elems;
-            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
-            elems ^= xor_mask;
-            memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
-        }
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    return out;
-}
-
-// interleave 8 block_q4_0s in blocks of blck_size_interleave
-// returns an interleaved block_q4_0x8
-// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
-// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
-static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
-    block_q4_0x8 out;
-
-    for (int i = 0; i < 8; i++) {
-        out.d[i] = in[i].d;
-    }
-
-    const int end = QK4_0 * 4 / blck_size_interleave;
-    const uint64_t xor_mask = 0x8888888888888888ULL;
-
-    for (int i = 0; i < end; ++i) {
-        int src_id = i % 8;
-        int src_offset = (i / 8) * blck_size_interleave;
-        int dst_offset = i * blck_size_interleave;
-
-        uint64_t elems;
-        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
-        elems ^= xor_mask;
-        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
-    }
-
-    return out;
-}
-
-static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) {
-    block_q4_Kx8 out;
-    // Delta (scale) and dmin values of the eight Q4_K structures are copied onto the output interleaved structure
-    for (int i = 0; i < 8; i++) {
-        out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
-    }
-
-    for (int i = 0; i < 8; i++) {
-        out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
-    }
-
-    const int end = QK_K * 4 / blck_size_interleave;
-
-    // Interleave Q4_K quants by taking 8 bytes at a time
-    for (int i = 0; i < end; ++i) {
-        int src_id = i % 8;
-        int src_offset = (i / 8) * blck_size_interleave;
-        int dst_offset = i * blck_size_interleave;
-
-        uint64_t elems;
-        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
-        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
-    }
-
-    // The logic below unpacks and rearranges the scale and min values of the eight Q4_K structures.
-    // A Q4_K structure packs its 8 scales and 8 mins into 12 bytes (6 bits per value).
-    // The output Q4_Kx8 structure holds 96 bytes of scales: every 12-byte group contains the
-    // scales and mins of the corresponding sub-block of all eight inputs, e.g. the first 12 bytes
-    // hold the 8 scales and 8 mins of the first sub-block of each Q4_K structure.
-    uint8_t s[8], m[8];
-
-    for (int i = 0; i < 4; i++) {
-        for (int j = 0; j < 8; j++) {
-            s[j] = in[j].scales[i] & 63;
-            m[j] = in[j].scales[i + 4] & 63;
-        }
-
-        out.scales[i * 12]      = (s[0] & 63) + ((s[4] & 48) << 2);
-        out.scales[i * 12 + 1]  = (s[1] & 63) + ((s[5] & 48) << 2);
-        out.scales[i * 12 + 2]  = (s[2] & 63) + ((s[6] & 48) << 2);
-        out.scales[i * 12 + 3]  = (s[3] & 63) + ((s[7] & 48) << 2);
-        out.scales[i * 12 + 4]  = (m[0] & 63) + ((m[4] & 48) << 2);
-        out.scales[i * 12 + 5]  = (m[1] & 63) + ((m[5] & 48) << 2);
-        out.scales[i * 12 + 6]  = (m[2] & 63) + ((m[6] & 48) << 2);
-        out.scales[i * 12 + 7]  = (m[3] & 63) + ((m[7] & 48) << 2);
-        out.scales[i * 12 + 8]  = (s[4] & 15) + ((m[4] & 15) << 4);
-        out.scales[i * 12 + 9]  = (s[5] & 15) + ((m[5] & 15) << 4);
-        out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
-        out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);
-    }
-
-    for (int i = 0; i < 4; i++) {
-        for (int j = 0; j < 8; j++) {
-            s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i + 8] & 15);
-            m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i + 8] & 240) >> 4);
-        }
-
-        out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
-        out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2);
-        out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2);
-        out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2);
-        out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2);
-        out.scales[i * 12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2);
-        out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2);
-        out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2);
-        out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4);
-        out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
-        out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
-        out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);
-    }
-
-    return out;
-}
-
-static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
-    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
-    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
-    constexpr int nrows_interleaved = 4;
-
-    block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
-    const block_q4_0 * src = (const block_q4_0 *)data;
-    block_q4_0 dst_tmp[4];
-    int nrow = ggml_nrows(t);
-    int nblocks = t->ne[0] / QK4_0;
-
-    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
-
-    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
-        return -1;
-    }
-
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
-        for (int64_t x = 0; x < nblocks; x++) {
-            for (int i = 0; i < nrows_interleaved; i++) {
-                dst_tmp[i] = src[x + i * nblocks];
-            }
-            *dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
-        }
-        src += nrows_interleaved * nblocks;
-    }
-    return 0;
-
-    GGML_UNUSED(data_size);
-}
-static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
-    GGML_ASSERT(t->type == GGML_TYPE_Q4_K);
-    GGML_ASSERT(interleave_block == 8);
-    constexpr int nrows_interleaved = 8;
-
-    block_q4_Kx8 * dst = (block_q4_Kx8 *)t->data;
-    const block_q4_K * src = (const block_q4_K *) data;
-    block_q4_K dst_tmp[8];
-    int nrow = ggml_nrows(t);
-    int nblocks = t->ne[0] / QK_K;
-
-    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K));
-
-    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
-        return -1;
-    }
-
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
-        for (int64_t x = 0; x < nblocks; x++) {
-            for (int i = 0; i < nrows_interleaved; i++) {
-                dst_tmp[i] = src[x + i * nblocks];
-            }
-            *dst++ = make_block_q4_Kx8(dst_tmp, interleave_block);
-        }
-        src += nrows_interleaved * nblocks;
-    }
-    return 0;
-
-    GGML_UNUSED(data_size);
-}
-
-static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
-    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
-    GGML_ASSERT(interleave_block == 8);
-    constexpr int nrows_interleaved = 8;
-
-    block_q4_0x8 * dst = (block_q4_0x8 *)t->data;
-    const block_q4_0 * src = (const block_q4_0 *) data;
-    block_q4_0 dst_tmp[8];
-    int nrow = ggml_nrows(t);
-    int nblocks = t->ne[0] / QK4_0;
-
-    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
-
-    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
-        return -1;
-    }
-
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
-        for (int64_t x = 0; x < nblocks; x++) {
-            for (int i = 0; i < nrows_interleaved; i++) {
-                dst_tmp[i] = src[x + i * nblocks];
-            }
-            *dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
-        }
-        src += nrows_interleaved * nblocks;
-    }
-    return 0;
-
-    GGML_UNUSED(data_size);
-}
-
-static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
-    block_iq4_nlx4 out;
-
-    for (int i = 0; i < 4; i++) {
-        out.d[i] = in[i].d;
-    }
-
-    const int end = QK4_NL * 2 / blck_size_interleave;
-
-    // TODO: this branch seems wrong
-    //if (blck_size_interleave == 8) {
-    //    for (int i = 0; i < end; ++i) {
-    //        int src_id = i % 4;
-    //        int src_offset = (i / 4) * blck_size_interleave;
-    //        int dst_offset = i * blck_size_interleave;
-
-    //        // Using memcpy to avoid unaligned memory accesses
-    //        memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
-    //    }
-    //} else
-    if (blck_size_interleave == 4) {
-        for (int i = 0; i < end; ++i) {
-            int src_id = i % 4;
-            int src_offset = (i / 4) * blck_size_interleave;
-            int dst_offset = i * blck_size_interleave;
-
-            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
-        }
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    return out;
-}
-
-static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
-    GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
-    //GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
-    GGML_ASSERT(interleave_block == 4);
-
-    block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
-    const block_iq4_nl * src = (const block_iq4_nl *)data;
-    block_iq4_nl dst_tmp[4];
-    int nrow = ggml_nrows(t);
-    int nrows_interleaved = 4;
-    int nblocks = t->ne[0] / QK4_0;
-
-    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
-
-    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
-        return -1;
-    }
-
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
-        for (int64_t x = 0; x < nblocks; x++) {
-            for (int i = 0; i < nrows_interleaved; i++) {
-                dst_tmp[i] = src[x + i * nblocks];
-            }
-            *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block);
-        }
-        src += nrows_interleaved * nblocks;
-    }
-    return 0;
-
-    GGML_UNUSED(data_size);
-}
-
-namespace ggml::cpu::aarch64 {
-// repack
-template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
-int repack(struct ggml_tensor *, const void *, size_t);
-
-// TODO: generalise.
-template <> int repack<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(struct ggml_tensor * t, const void * data, size_t data_size) {
-    return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size);
-}
-
-template <> int repack<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(struct ggml_tensor * t, const void * data, size_t data_size) {
-    return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size);
-}
-
-template <> int repack<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(struct ggml_tensor * t, const void * data, size_t data_size) {
-    return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size);
-}
-
-template <> int repack<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(struct ggml_tensor * t, const void * data, size_t data_size) {
-    return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
-}
-
-template <> int repack<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(struct ggml_tensor * t, const void * data, size_t data_size) {
-    return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
-}
-
-// TODO: needs to be revisited
-//template <> int repack<block_iq4_nl, 8, 4, GGML_TYPE_Q8_0>(struct ggml_tensor * t, const void * data, size_t data_size) {
-//    return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
-//}
-
-// gemv
-template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
-void gemv(int, float *, size_t, const void *, const void *, int, int);
-
-template <> void gemv<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
-}
-
-template <> void gemv<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
-}
-
-template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
-}
-
-template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
-}
-
-template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
-}
-
-// gemm
-template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
-void gemm(int, float *, size_t, const void *, const void *, int, int);
-
-template <> void gemm<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
-}
-
-template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
-}
-
-template <> void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
-}
-
-template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
-}
-
-template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
-    ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
-}
-
-class tensor_traits_base : public ggml::cpu::tensor_traits {
-  public:
-    virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
-};
-
-template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> class tensor_traits : public tensor_traits_base {
-
-    bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
-        // not really a GGML_TYPE_Q8_0, but it has the same size.
-        switch (op->op) {
-            case GGML_OP_MUL_MAT:
-                size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
-                return true;
-            case GGML_OP_MUL_MAT_ID:
-                size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
-                size = GGML_PAD(size, sizeof(int64_t)); // + padding for the next block.
-                size += sizeof(int64_t) * (1 + op->src[0]->ne[2]) * op->src[1]->ne[2];
-                return true;
-            default:
-                // GGML_ABORT("fatal error");
-                break;
-        }
-        return false;
-    }
-
-    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
-        switch (op->op) {
-            case GGML_OP_MUL_MAT:
-                forward_mul_mat(params, op);
-                return true;
-            case GGML_OP_MUL_MAT_ID:
-                forward_mul_mat_id(params, op);
-                return true;
-            default:
-                // GGML_ABORT("fatal error");
-                break;
-        }
-        return false;
-    }
-
-    void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
-        const ggml_tensor * src0 = op->src[0];
-        const ggml_tensor * src1 = op->src[1];
-        ggml_tensor * dst = op;
-
-        GGML_TENSOR_BINARY_OP_LOCALS
-
-        const int ith = params->ith;
-        const int nth = params->nth;
-
-        GGML_ASSERT(ne0 == ne01);
-        GGML_ASSERT(ne1 == ne11);
-        GGML_ASSERT(ne2 == ne12);
-        GGML_ASSERT(ne3 == ne13);
-
-        // dst cannot be transposed or permuted
-        GGML_ASSERT(nb0 == sizeof(float));
-        GGML_ASSERT(nb0 <= nb1);
-        GGML_ASSERT(nb1 <= nb2);
-        GGML_ASSERT(nb2 <= nb3);
-
-        GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-        GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
-        // GGML_ASSERT(ggml_n_dims(op->src[1]) == 2);
-
-        char * wdata = static_cast<char *>(params->wdata);
-        const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);
-
-        assert(params->wsize >= nbw1 * ne11);
-
-        const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;
-
-        int64_t i11_processed = 0;
-        for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
-            ggml_quantize_mat_t<4, PARAM_TYPE>((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10);
-        }
-
-        i11_processed = ne11 - ne11 % 4;
-        for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
-            from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
-        }
-
-        ggml_barrier(params->threadpool);
-
-        const void * src1_wdata = params->wdata;
-        const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);
-        int64_t src0_start = (ith * ne01) / nth;
-        int64_t src0_end   = ((ith + 1) * ne01) / nth;
-        src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
-        src0_end   = (src0_end   % NB_COLS) ? src0_end   + NB_COLS - (src0_end   % NB_COLS) : src0_end;
-        if (src0_start >= src0_end) {
-            return;
-        }
-
-        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
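        // NOTE (editorial annotation, not part of the original source): src1 rows were
        // quantized above in groups of 4 (block_q8_0x4), so the GEMM kernel below is fed
        // only the largest multiple of 4 rows (ne11 - ne11 % 4); the remaining 0-3 rows
        // are dispatched one at a time to the GEMV kernel. For example, with ne11 == 11,
        // rows 0-7 go through gemm() and rows 8, 9 and 10 through gemv().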
- if (ne11 > 3) { - gemm(ne00, - (float *) ((char *) dst->data) + src0_start, ne01, - (const char *) src0->data + src0_start * nb01, - (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start); - } - for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) { - gemv(ne00, - (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01, - (const char *) src0->data + src0_start * nb01, - (const char *) src1_wdata + (src1_col_stride * iter), 1, - src0_end - src0_start); - } - } - - void forward_mul_mat_id(ggml_compute_params * params, ggml_tensor * op) { - const ggml_tensor * src0 = op->src[0]; - const ggml_tensor * src1 = op->src[1]; - const ggml_tensor * ids = op->src[2]; - ggml_tensor * dst = op; - - GGML_TENSOR_BINARY_OP_LOCALS - - const int ith = params->ith; - const int nth = params->nth; - - const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float; - - // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == ggml_type_size(src0->type)); - GGML_ASSERT(nb10 == ggml_type_size(src1->type)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - GGML_ASSERT(ne03 == 1); - GGML_ASSERT(ne13 == 1); - GGML_ASSERT(ne3 == 1); - - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - // row groups - const int n_ids = ids->ne[0]; // n_expert_used - const int n_as = ne02; // n_expert - - const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10); - const size_t nbw2 = nbw1*ne11; - const size_t nbw3 = nbw2*ne12; - - struct mmid_row_mapping { - int32_t i1; - int32_t i2; - }; - - GGML_ASSERT(params->wsize >= (GGML_PAD(nbw3, sizeof(int64_t)) + n_as * sizeof(int64_t) + - n_as * ne12 * sizeof(mmid_row_mapping))); - - auto * wdata = (char *) params->wdata; - auto * wdata_src1_end = (char *) wdata + GGML_PAD(nbw3, sizeof(int64_t)); - auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] - - struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12] - - // src1: float32 => param type - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = ith; i11 < ne11; i11 += nth) { - from_float((float *)((char *) src1->data + i12 * nb12 + i11 * nb11), - (void *) (wdata + i12 * nbw2 + i11 * nbw1), - ne10); - } - } - -#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ne12 + (i1)] - - if (ith == 0) { - // initialize matrix_row_counts - memset(matrix_row_counts, 0, n_as * sizeof(int64_t)); - - // group rows by src0 matrix - for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) { - for (int32_t id = 0; id < n_ids; ++id) { - const int32_t i02 = - *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]); - - GGML_ASSERT(i02 >= 0 && i02 < n_as); - - MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = { id, iid1 }; - matrix_row_counts[i02] += 1; - } - } - } - - ggml_barrier(params->threadpool); - - // compute each matrix multiplication in sequence - for (int cur_a = 0; cur_a < n_as; ++cur_a) { - const int64_t cne1 = matrix_row_counts[cur_a]; - - if (cne1 == 0) { - continue; - } - - const auto * src0_cur = (const char *) src0->data + cur_a*nb02; - - //const int64_t nr0 = ne01; // src0 rows - const int64_t nr1 = cne1; // src1 rows - - int64_t src0_cur_start = (ith * ne01) / nth; - int64_t src0_cur_end = ((ith + 1) * ne01) / nth; - - src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start; - src0_cur_end = (src0_cur_end % NB_COLS) ? 
src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end; - - if (src0_cur_start >= src0_cur_end) { - return; - } - - for (int ir1 = 0; ir1 < nr1; ir1++) { - struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1); - - const int id = row_mapping.i1; // selected expert index - - const int64_t i11 = id % ne11; - const int64_t i12 = row_mapping.i2; // row index in src1 - - const int64_t i1 = id; // selected expert index - const int64_t i2 = i12; // row - - const auto * src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2); - - gemv(ne00, - (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01, - src0_cur + src0_cur_start * nb01, - src1_col, 1, src0_cur_end - src0_cur_start); - } - } -#undef MMID_MATRIX_ROW - } - - int repack(struct ggml_tensor * t, const void * data, size_t data_size) override { - GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type), - (int) NB_COLS, (int) INTER_SIZE); - return ggml::cpu::aarch64::repack(t, data, data_size); - } -}; - -// instance for Q4 -static const tensor_traits q4_0_4x4_q8_0; -static const tensor_traits q4_0_4x8_q8_0; -static const tensor_traits q4_0_8x8_q8_0; -static const tensor_traits q4_K_8x8_q8_K; - -// instance for IQ4 -static const tensor_traits iq4_nl_4x4_q8_0; - -} // namespace ggml::cpu::aarch64 - -static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) { - if (cur->type == GGML_TYPE_Q4_0) { - if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) { - if (cur->ne[1] % 8 == 0) { - return &ggml::cpu::aarch64::q4_0_8x8_q8_0; - } - } - if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { - if (cur->ne[1] % 4 == 0) { - return &ggml::cpu::aarch64::q4_0_4x8_q8_0; - } - } - if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { - if (cur->ne[1] % 4 == 0) { - return &ggml::cpu::aarch64::q4_0_4x4_q8_0; - } - } - } else if (cur->type == GGML_TYPE_Q4_K) { - if (ggml_cpu_has_avx2()) { - if (cur->ne[1] % 8 == 0) { - return &ggml::cpu::aarch64::q4_K_8x8_q8_K; - } - } - } else if (cur->type == GGML_TYPE_IQ4_NL) { - if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { - if (cur->ne[1] % 4 == 0) { - return &ggml::cpu::aarch64::iq4_nl_4x4_q8_0; - } - } - } - - return nullptr; -} - -static enum ggml_status ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { - tensor->extra = (void *) const_cast(ggml_aarch64_get_optimal_repack_type(tensor)); - - GGML_UNUSED(buffer); - return GGML_STATUS_SUCCESS; -} - -static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, - const void * data, size_t offset, size_t size) { - GGML_ASSERT(offset == 0); - GGML_ASSERT(size == ggml_nbytes(tensor)); - - auto tensor_traits = (ggml::cpu::aarch64::tensor_traits_base *) tensor->extra; - auto OK = tensor_traits->repack(tensor, data, size); - - GGML_ASSERT(OK == 0); - GGML_UNUSED(buffer); -} - -static const char * ggml_backend_cpu_aarch64_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - return "CPU_AARCH64"; - - GGML_UNUSED(buft); -} - -static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); - - if (buffer == nullptr) { - return nullptr; - } - - buffer->buft = buft; - buffer->iface.init_tensor = 
ggml_backend_cpu_aarch64_buffer_init_tensor; - buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor; - buffer->iface.get_tensor = nullptr; - buffer->iface.cpy_tensor = nullptr; - return buffer; -} - -static size_t ggml_backend_cpu_aarch64_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return TENSOR_ALIGNMENT; - - GGML_UNUSED(buft); -} - -namespace ggml::cpu::aarch64 { -class extra_buffer_type : ggml::cpu::extra_buffer_type { - bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override { - if ( op->op == GGML_OP_MUL_MAT && - op->src[0]->buffer && - (ggml_n_dims(op->src[0]) == 2) && - op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type() && - ggml_aarch64_get_optimal_repack_type(op->src[0]) - ) { - if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { - return false; - } - if (op->src[1]->type == GGML_TYPE_F32) { - return true; - } - //if (op->src[1]->type == GGML_TYPE_Q8_0) { - // return true; - //} - // may be possible if Q8_0 packed... - } else if (op->op == GGML_OP_MUL_MAT_ID - && op->src[0]->buffer - && (ggml_n_dims(op->src[0]) == 3) - && op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type() - && ggml_aarch64_get_optimal_repack_type(op->src[0]) - ) { - if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { - return false; - } - if (op->src[1]->type == GGML_TYPE_F32) { - return true; - } - //if (op->src[1]->type == GGML_TYPE_Q8_0) { - // return true; - //} - } - return false; - } - - ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override { - if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) { - if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type()) { - return (ggml::cpu::tensor_traits *) op->src[0]->extra; - } - } - return nullptr; - } -}; -} // namespace ggml::cpu::aarch64 - -ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) { - static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = { - /* .iface = */ { - /* .get_name = */ ggml_backend_cpu_aarch64_buffer_type_get_name, - /* .alloc_buffer = */ ggml_backend_cpu_aarch64_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cpu_aarch64_buffer_type_get_alignment, - /* .get_max_size = */ nullptr, // defaults to SIZE_MAX - /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes - /* .is_host = */ nullptr, - }, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), - /* .context = */ new ggml::cpu::aarch64::extra_buffer_type(), - }; - - return &ggml_backend_cpu_buffer_type_aarch64; -} diff --git a/ggml/src/ggml-cpu/common.h b/ggml/src/ggml-cpu/common.h index 3df01c1edffeb..353563dc35c5d 100644 --- a/ggml/src/ggml-cpu/common.h +++ b/ggml/src/ggml-cpu/common.h @@ -1,9 +1,10 @@ #pragma once #include "ggml.h" -#include "ggml-cpu-traits.h" +#include "traits.h" #include "ggml-cpu-impl.h" #include "ggml-impl.h" +#include "simd-mappings.h" #ifdef __cplusplus @@ -12,11 +13,11 @@ // convenience functions/macros for use in template calls // note: these won't be required after the 'traits' lookup table is used. 
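// Sketch (illustrative, not from the patch): the wrappers below are
// one-liners around ggml macros, but the underlying conversion is worth
// seeing in scalar form. BF16 keeps the top 16 bits of an IEEE-754 float
// with round-to-nearest-even; ggml's real converter also special-cases NaN.
// Assumes <string.h>/<stdint.h>; the name is hypothetical:
static inline uint16_t f32_to_bf16_scalar(float x) {
    uint32_t u;
    memcpy(&u, &x, sizeof(u));      // bit-cast float -> uint32
    u += 0x7FFF + ((u >> 16) & 1);  // round to nearest, ties to even
    return (uint16_t) (u >> 16);    // sign + exponent + top 7 mantissa bits
}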
static inline ggml_fp16_t f32_to_f16(float x) { - return GGML_FP32_TO_FP16(x); + return GGML_CPU_FP32_TO_FP16(x); } static inline float f16_to_f32(ggml_fp16_t x) { - return GGML_FP16_TO_FP32(x); + return GGML_CPU_FP16_TO_FP32(x); } static inline ggml_bf16_t f32_to_bf16(float x) { diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h deleted file mode 100644 index 6e84c826b4091..0000000000000 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +++ /dev/null @@ -1,8 +0,0 @@ -#pragma once - -#include "ggml-cpu-traits.h" -#include "ggml.h" - -// GGML internal header - -ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void); diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index b3f1b5ca79092..d839cf5c55e81 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -62,11 +62,17 @@ struct ggml_compute_params { #if defined(__s390x__) && defined(__VEC__) #ifndef __VXE__ #define __VXE__ -#endif +#endif // __VXE__ #ifndef __VXE2__ #define __VXE2__ -#endif -#endif +#endif // __VXE2__ +#endif // __s390x__ && __VEC__ + +#if defined(__s390x__) && defined(GGML_NNPA) +#ifndef __NNPA__ +#define __NNPA__ +#endif // __NNPA__ +#endif // __s390x__ && GGML_NNPA #if defined(__ARM_FEATURE_SVE) #include @@ -371,7 +377,7 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) #define vec_xor(a, b) ((a) ^ (b)) // Vector XOR #endif -typedef signed char char8x16_t __attribute__((vector_size(16))); +typedef signed char char8x16_t __attribute__((vector_size(16))); typedef unsigned char uchar8x16_t __attribute__((vector_size(16))); typedef int8_t int8x16_t __attribute__((vector_size(16))); @@ -382,10 +388,10 @@ typedef uint8_t uint8x16_t __attribute__((vector_size(16))); typedef uint16_t uint16x8_t __attribute__((vector_size(16))); typedef uint32_t uint32x4_t __attribute__((vector_size(16))); -typedef float float32x4_t __attribute__((vector_size(16))); -typedef double double64x2_t __attribute((vector_size(16))); +typedef float float32x4_t __attribute__((vector_size(16))); +typedef double double64x2_t __attribute__((vector_size(16))); -typedef signed long long long64x2_t __attribute((vector_size(16))); +typedef signed long long long64x2_t __attribute__((vector_size(16))); typedef unsigned long long ulong64x2_t __attribute__((vector_size(16))); typedef struct ggml_uint8x16x2_t { @@ -503,6 +509,9 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) { // TODO: move to ggml-threading void ggml_barrier(struct ggml_threadpool * tp); +void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value); +int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c deleted file mode 100644 index 40bded4767b47..0000000000000 --- a/ggml/src/ggml-cpu/ggml-cpu-quants.c +++ /dev/null @@ -1,13891 +0,0 @@ -#define GGML_COMMON_IMPL_C -#include "ggml-common.h" - -#include "ggml-quants.h" -#include "ggml-cpu-quants.h" -#include "ggml-impl.h" -#include "ggml-cpu-impl.h" -#include "ggml-cpu.h" - -#include -#include -#include -#include -#include // for qsort -#include // for GGML_ASSERT - -#define GROUP_MAX_EPS 1e-15f -#define GROUP_MAX_EPS_IQ3_XXS 1e-8f -#define GROUP_MAX_EPS_IQ2_S 1e-8f -#define GROUP_MAX_EPS_IQ1_M 1e-7f -#define GROUP_MAX_EPS_IQ1_S 1e-12f - -#define UNUSED GGML_UNUSED - -// some compilers don't provide _mm256_set_m128i, e.g. 
gcc 7 -#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) - -#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) -// multiply int8_t, add results pairwise twice -static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { - // Get absolute values of x vectors - const __m128i ax = _mm_sign_epi8(x, x); - // Sign the values of the y vectors - const __m128i sy = _mm_sign_epi8(y, x); - // Perform multiplication and create 16-bit values - const __m128i dot = _mm_maddubs_epi16(ax, sy); - const __m128i ones = _mm_set1_epi16(1); - return _mm_madd_epi16(ones, dot); -} - -#if __AVX__ || __AVX2__ || __AVX512F__ -// horizontally add 8 floats -static inline float hsum_float_8(const __m256 x) { - __m128 res = _mm256_extractf128_ps(x, 1); - res = _mm_add_ps(res, _mm256_castps256_ps128(x)); - res = _mm_add_ps(res, _mm_movehl_ps(res, res)); - res = _mm_add_ss(res, _mm_movehdup_ps(res)); - return _mm_cvtss_f32(res); -} - -// horizontally add 8 int32_t -static inline int hsum_i32_8(const __m256i a) { - const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); - const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128); - const __m128i sum64 = _mm_add_epi32(hi64, sum128); - const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); - return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); -} - -// horizontally add 4 int32_t -static inline int hsum_i32_4(const __m128i a) { - const __m128i hi64 = _mm_unpackhi_epi64(a, a); - const __m128i sum64 = _mm_add_epi32(hi64, a); - const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); - return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); -} - -#if defined(__AVX2__) || defined(__AVX512F__) -// spread 32 bits to 32 bytes { 0x00, 0xFF } -static inline __m256i bytes_from_bits_32(const uint8_t * x) { - uint32_t x32; - memcpy(&x32, x, sizeof(uint32_t)); - const __m256i shuf_mask = _mm256_set_epi64x( - 0x0303030303030303, 0x0202020202020202, - 0x0101010101010101, 0x0000000000000000); - __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask); - const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); - bytes = _mm256_or_si256(bytes, bit_mask); - return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); -} - -// Unpack 32 4-bit fields into 32 bytes -// The output vector contains 32 bytes, each one in [ 0 .. 
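// Sketch (illustrative, not from the patch): the abs/sign pairing in
// mul_sum_i8_pairs(_float) exists because _mm_maddubs_epi16 multiplies an
// *unsigned* byte vector by a signed one. The identity
// x*y == |x| * (sign(x)*y) moves x's sign onto y so the first operand can be
// treated as unsigned. Scalar form of one pair, with a hypothetical name
// (ignoring the int16 saturation of the intrinsic):
static inline int32_t mul_sum_i8_pair_scalar(int8_t x0, int8_t y0, int8_t x1, int8_t y1) {
    const uint8_t ax0 = (uint8_t) (x0 < 0 ? -x0 : x0);      // |x|
    const uint8_t ax1 = (uint8_t) (x1 < 0 ? -x1 : x1);
    const int     sy0 = x0 < 0 ? -y0 : (x0 == 0 ? 0 : y0);  // sign(x)*y
    const int     sy1 = x1 < 0 ? -y1 : (x1 == 0 ? 0 : y1);
    return (int32_t) ax0 * sy0 + (int32_t) ax1 * sy1;       // == x0*y0 + x1*y1
}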
15 ] interval -static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) -{ - const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); - const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp); - const __m256i lowMask = _mm256_set1_epi8( 0xF ); - return _mm256_and_si256(lowMask, bytes); -} - -// add int16_t pairwise and return as float vector -static inline __m256 sum_i16_pairs_float(const __m256i x) { - const __m256i ones = _mm256_set1_epi16(1); - const __m256i summed_pairs = _mm256_madd_epi16(ones, x); - return _mm256_cvtepi32_ps(summed_pairs); -} - -static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { -#if defined(__AVX512VNNI__) && defined(__AVX512VL__) - const __m256i zero = _mm256_setzero_si256(); - const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); - return _mm256_cvtepi32_ps(summed_pairs); -#elif defined(__AVXVNNI__) - const __m256i zero = _mm256_setzero_si256(); - const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy); - return _mm256_cvtepi32_ps(summed_pairs); -#else - // Perform multiplication and create 16-bit values - const __m256i dot = _mm256_maddubs_epi16(ax, sy); - return sum_i16_pairs_float(dot); -#endif -} - -// multiply int8_t, add results pairwise twice and return as float vector -static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { -#if __AVXVNNIINT8__ - const __m256i zero = _mm256_setzero_si256(); - const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y); - return _mm256_cvtepi32_ps(summed_pairs); -#else - // Get absolute values of x vectors - const __m256i ax = _mm256_sign_epi8(x, x); - // Sign the values of the y vectors - const __m256i sy = _mm256_sign_epi8(y, x); - return mul_sum_us8_pairs_float(ax, sy); -#endif -} - -static inline __m128i packNibbles( __m256i bytes ) -{ - // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh -#if __AVX512F__ - const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000 - bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh - return _mm256_cvtepi16_epi8(bytes); // abcd_efgh -#else - const __m256i lowByte = _mm256_set1_epi16( 0xFF ); - __m256i high = _mm256_andnot_si256( lowByte, bytes ); - __m256i low = _mm256_and_si256( lowByte, bytes ); - high = _mm256_srli_epi16( high, 4 ); - bytes = _mm256_or_si256( low, high ); - - // Compress uint16_t lanes into bytes - __m128i r0 = _mm256_castsi256_si128( bytes ); - __m128i r1 = _mm256_extracti128_si256( bytes, 1 ); - return _mm_packus_epi16( r0, r1 ); -#endif -} -#elif defined(__AVX__) -static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) -{ - // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh - const __m128i lowByte = _mm_set1_epi16( 0xFF ); - __m128i high = _mm_andnot_si128( lowByte, bytes1 ); - __m128i low = _mm_and_si128( lowByte, bytes1 ); - high = _mm_srli_epi16( high, 4 ); - bytes1 = _mm_or_si128( low, high ); - high = _mm_andnot_si128( lowByte, bytes2 ); - low = _mm_and_si128( lowByte, bytes2 ); - high = _mm_srli_epi16( high, 4 ); - bytes2 = _mm_or_si128( low, high ); - - return _mm_packus_epi16( bytes1, bytes2); -} - -static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) { - const __m128i ax = _mm_sign_epi8(x, x); - const __m128i sy = _mm_sign_epi8(y, x); - return _mm_maddubs_epi16(ax, sy); -} - -// spread 32 bits to 32 bytes { 0x00, 0xFF } -static inline __m256i bytes_from_bits_32(const uint8_t * x) { - uint32_t x32; - memcpy(&x32, x, 
sizeof(uint32_t)); - const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); - const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202); - __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl); - __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh); - const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe); - bytesl = _mm_or_si128(bytesl, bit_mask); - bytesh = _mm_or_si128(bytesh, bit_mask); - bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1)); - bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1)); - return MM256_SET_M128I(bytesh, bytesl); -} - -// Unpack 32 4-bit fields into 32 bytes -// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval -static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) -{ - // Load 16 bytes from memory - __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi); - __m128i tmph = _mm_srli_epi16(tmpl, 4); - const __m128i lowMask = _mm_set1_epi8(0xF); - tmpl = _mm_and_si128(lowMask, tmpl); - tmph = _mm_and_si128(lowMask, tmph); - return MM256_SET_M128I(tmph, tmpl); -} - -// add int16_t pairwise and return as float vector -static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) { - const __m128i ones = _mm_set1_epi16(1); - const __m128i summed_pairsl = _mm_madd_epi16(ones, xl); - const __m128i summed_pairsh = _mm_madd_epi16(ones, xh); - const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl); - return _mm256_cvtepi32_ps(summed_pairs); -} - -static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { - const __m128i axl = _mm256_castsi256_si128(ax); - const __m128i axh = _mm256_extractf128_si256(ax, 1); - const __m128i syl = _mm256_castsi256_si128(sy); - const __m128i syh = _mm256_extractf128_si256(sy, 1); - // Perform multiplication and create 16-bit values - const __m128i dotl = _mm_maddubs_epi16(axl, syl); - const __m128i doth = _mm_maddubs_epi16(axh, syh); - return sum_i16_pairs_float(doth, dotl); -} - -// multiply int8_t, add results pairwise twice and return as float vector -static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { - const __m128i xl = _mm256_castsi256_si128(x); - const __m128i xh = _mm256_extractf128_si256(x, 1); - const __m128i yl = _mm256_castsi256_si128(y); - const __m128i yh = _mm256_extractf128_si256(y, 1); - // Get absolute values of x vectors - const __m128i axl = _mm_sign_epi8(xl, xl); - const __m128i axh = _mm_sign_epi8(xh, xh); - // Sign the values of the y vectors - const __m128i syl = _mm_sign_epi8(yl, xl); - const __m128i syh = _mm_sign_epi8(yh, xh); - // Perform multiplication and create 16-bit values - const __m128i dotl = _mm_maddubs_epi16(axl, syl); - const __m128i doth = _mm_maddubs_epi16(axh, syh); - return sum_i16_pairs_float(doth, dotl); -} - -// larger version of mul_sum_i8_pairs_float where x and y are each represented by four 128-bit vectors -static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_1_1, const __m128i x_2_0, const __m128i x_2_1, - const __m128i y_1_0, const __m128i y_1_1, const __m128i y_2_0, const __m128i y_2_1) { - const __m128i mone = _mm_set1_epi16(1); - - const __m128i p16_1_0 = mul_add_epi8_sse(x_1_0, y_1_0); - const __m128i p16_1_1 = mul_add_epi8_sse(x_1_1, y_1_1); - const __m128i p16_2_0 = mul_add_epi8_sse(x_2_0, y_2_0); - const __m128i p16_2_1 = mul_add_epi8_sse(x_2_1, y_2_1); - const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone); - const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, 
mone); - const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone); - const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone); - const __m128i p_1 = _mm_add_epi32(p_1_0, p_1_1); - const __m128i p_2 = _mm_add_epi32(p_2_0, p_2_1); - return _mm256_cvtepi32_ps(MM256_SET_M128I(p_2, p_1)); -} - -// quad fp16 delta calculation -static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) { - // GGML_FP16_TO_FP32 is faster than Intel F16C - return _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x1) * GGML_FP16_TO_FP32(y1)), - _mm_set1_ps(GGML_FP16_TO_FP32(x0) * GGML_FP16_TO_FP32(y0))); -} -#endif -#elif defined(__SSSE3__) -// horizontally add 4x4 floats -static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) { - __m128 res_0 =_mm_hadd_ps(a, b); - __m128 res_1 =_mm_hadd_ps(c, d); - __m128 res =_mm_hadd_ps(res_0, res_1); - res =_mm_hadd_ps(res, res); - res =_mm_hadd_ps(res, res); - - return _mm_cvtss_f32(res); -} -#endif // __AVX__ || __AVX2__ || __AVX512F__ -#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) - -#if defined(__ARM_NEON) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__) -#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s -#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) -#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) -#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s) -#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s) -#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s) -#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s) -#define B8(c,s ) B7(c,s, c), B7(c,s, s) - -// precomputed tables for expanding 8bits to 8 bytes: -static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 -static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 -#endif - -#if defined(__loongarch_sx) - -static __m128i lsx_packs_w(__m128i a, __m128i b) { - __m128i tmp, tmp1; - tmp = __lsx_vsat_w(a, 15); - tmp1 = __lsx_vsat_w(b, 15); - return __lsx_vpickev_h(tmp1, tmp); -} - -static __m128i lsx_packs_h(__m128i a, __m128i b) { - __m128i tmp, tmp1; - tmp = __lsx_vsat_h(a, 7); - tmp1 = __lsx_vsat_h(b, 7); - return __lsx_vpickev_b(tmp1, tmp); -} - -static __m128i lsx_packus_h(__m128i a, __m128i b) { - __m128i tmp, tmp1; - tmp = __lsx_vsat_hu(a, 7); - tmp1 = __lsx_vsat_hu(b, 7); - return __lsx_vpickev_b(tmp1, tmp); -} - -static __m128i lsx_maddubs_h(__m128i a, __m128i b) { - __m128i tmp1, tmp2; - tmp1 = __lsx_vmulwev_h_b(a, b); - tmp2 = __lsx_vmulwod_h_b(a, b); - return __lsx_vsadd_h(tmp1, tmp2); -} - -static __m128i lsx_madd_h(__m128i a, __m128i b) { - __m128i tmp1, tmp2; - tmp1 = __lsx_vmulwev_w_h(a, b); - tmp2 = __lsx_vmulwod_w_h(a, b); - return __lsx_vadd_w(tmp1, tmp2); -} - -static __m128i lsx_set_w(int32_t a, int32_t b, int32_t c, int32_t d) { - v4i32 __ret = {d, c, b, a}; - return (__m128i)__ret; -} - -static __m128i lsx_shuffle_b(__m128i a, __m128i b) { - __m128i mask_f, zero, tmp0, tmp2, mask; - int f = 0x8f; - mask_f = __lsx_vreplgr2vr_b(f); - zero = __lsx_vldi(0); - tmp0 = __lsx_vand_v(b, mask_f); // get mask with low 4 bit and sign bits - tmp0 = __lsx_vori_b(tmp0, 0x10); // make each mask or with 0x10 prepare for positive - mask = __lsx_vsle_b(zero, tmp0); // if mask >= 0, set mask - tmp2 = __lsx_vand_v(tmp0, mask); // maskout the in2 < ones - return __lsx_vshuf_b(a, zero, tmp2); -} - -static __m128i lsx_hadd_h(__m128i a, __m128i b) { - __m128i tmp1 = __lsx_vpickev_h(b, a); - __m128i tmp2 = __lsx_vpickod_h(b, a); - return __lsx_vadd_h(tmp1, tmp2); -} - -static 
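// Sketch (illustrative, not from the patch): the B1..B8 macro tower above
// enumerates, at compile time, the 8-byte expansion of every possible byte
// value. Assuming byte i of each table entry corresponds to bit i of the
// index, a runtime equivalent of table_b2b_0[b] would be (table_b2b_1 is
// the complement; the name is hypothetical):
static inline uint64_t b2b_0_scalar(uint8_t b) {
    uint64_t out = 0;
    for (int i = 0; i < 8; ++i) {
        out |= (uint64_t) (((b >> i) & 1) << 4) << (8 * i);  // ( b) << 4
    }
    return out;
}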
__m128i lsx_hadd_w(__m128i a, __m128i b) { - __m128i tmp1 = __lsx_vpickev_w(b, a); - __m128i tmp2 = __lsx_vpickod_w(b, a); - return __lsx_vadd_w(tmp1, tmp2); -} - -static __m128 lsx_hadd_s(__m128 a, __m128 b) { - __m128 tmp1 = (__m128)__lsx_vpickev_w((__m128i)b, (__m128i)a); - __m128 tmp2 = (__m128)__lsx_vpickod_w((__m128i)b, (__m128i)a); - - return __lsx_vfadd_s(tmp1, tmp2); -} - -static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) { - __m128 res_0 =lsx_hadd_s(a, b); - __m128 res_1 =lsx_hadd_s(c, d); - __m128 res =lsx_hadd_s(res_0, res_1); - res =lsx_hadd_s(res, res); - res =lsx_hadd_s(res, res); - - return ((v4f32)res)[0]; -} -#endif - -#if defined(__loongarch_asx) - -#ifdef __clang__ -#define VREGS_PREFIX "$vr" -#define XREGS_PREFIX "$xr" -#else // GCC -#define VREGS_PREFIX "$f" -#define XREGS_PREFIX "$f" -#endif -#define __ALL_REGS "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31" -// Convert __m128i to __m256i -static inline __m256i ____m256i(__m128i in) { - __m256i out = __lasx_xvldi(0); - __asm__ volatile ( - ".irp i," __ALL_REGS "\n\t" - " .ifc %[out], " XREGS_PREFIX"\\i \n\t" - " .irp j," __ALL_REGS "\n\t" - " .ifc %[in], " VREGS_PREFIX "\\j \n\t" - " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t" - " .endif \n\t" - " .endr \n\t" - " .endif \n\t" - ".endr \n\t" - : [out] "+f" (out) : [in] "f" (in) - ); - return out; -} -// Convert two __m128i to __m256i -static inline __m256i lasx_set_q(__m128i inhi, __m128i inlo) { - __m256i out; - __asm__ volatile ( - ".irp i," __ALL_REGS "\n\t" - " .ifc %[hi], " VREGS_PREFIX "\\i \n\t" - " .irp j," __ALL_REGS "\n\t" - " .ifc %[lo], " VREGS_PREFIX "\\j \n\t" - " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t" - " .endif \n\t" - " .endr \n\t" - " .endif \n\t" - ".endr \n\t" - ".ifnc %[out], %[hi] \n\t" - ".irp i," __ALL_REGS "\n\t" - " .ifc %[out], " XREGS_PREFIX "\\i \n\t" - " .irp j," __ALL_REGS "\n\t" - " .ifc %[hi], " VREGS_PREFIX "\\j \n\t" - " xvori.b $xr\\i, $xr\\j, 0 \n\t" - " .endif \n\t" - " .endr \n\t" - " .endif \n\t" - ".endr \n\t" - ".endif \n\t" - : [out] "=f" (out), [hi] "+f" (inhi) - : [lo] "f" (inlo) - ); - return out; -} -// Convert __m256i low part to __m128i -static inline __m128i lasx_extracti128_lo(__m256i in) { - __m128i out; - __asm__ volatile ( - ".ifnc %[out], %[in] \n\t" - ".irp i," __ALL_REGS "\n\t" - " .ifc %[out], " VREGS_PREFIX "\\i \n\t" - " .irp j," __ALL_REGS "\n\t" - " .ifc %[in], " XREGS_PREFIX "\\j \n\t" - " vori.b $vr\\i, $vr\\j, 0 \n\t" - " .endif \n\t" - " .endr \n\t" - " .endif \n\t" - ".endr \n\t" - ".endif \n\t" - : [out] "=f" (out) : [in] "f" (in) - ); - return out; -} -// Convert __m256i high part to __m128i -static inline __m128i lasx_extracti128_hi(__m256i in) { - __m128i out; - __asm__ volatile ( - ".irp i," __ALL_REGS "\n\t" - " .ifc %[out], " VREGS_PREFIX "\\i \n\t" - " .irp j," __ALL_REGS "\n\t" - " .ifc %[in], " XREGS_PREFIX "\\j \n\t" - " xvpermi.q $xr\\i, $xr\\j, 0x11 \n\t" - " .endif \n\t" - " .endr \n\t" - " .endif \n\t" - ".endr \n\t" - : [out] "=f" (out) : [in] "f" (in) - ); - return out; -} - -static __m256i lasx_set_w(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) { - v8i32 __ret = {e0, e1, e2, e3, e4, e5, e6, e7}; - return (__m256i)__ret; -} - -static __m256i lasx_set_d(int64_t a, int64_t b, int64_t c, int64_t d) { - v4i64 __ret = {d, c, b, a}; - return (__m256i)__ret; -} - -static __m256i lasx_insertf128( __m128i x, __m128i y) { - return lasx_set_q(x, y); -} - -static __m256i lasx_shuffle_b(__m256i a, 
__m256i b) { - __m256i mask_f, zero, tmp0, tmp2, mask; - int f = 0x8f; - mask_f = __lasx_xvreplgr2vr_b(f); - zero = __lasx_xvldi(0); - tmp0 = __lasx_xvand_v(b, mask_f); // get mask with low 4 bit and sign bits - tmp0 = __lasx_xvori_b(tmp0, 0x10); // make each mask or with 0x10 prepare for positive - mask = __lasx_xvsle_b(zero, tmp0); // if mask >= 0, set mask - tmp2 = __lasx_xvand_v(tmp0, mask); // maskout the in2 < ones - return __lasx_xvshuf_b(a, zero, tmp2); -} - -static __m256i lasx_extu8_16(__m128i a) { - return __lasx_vext2xv_hu_bu(____m256i(a)); -} - -static __m256i lasx_ext8_16(__m128i a) { - return __lasx_vext2xv_h_b(____m256i(a)); -} - -static __m256i lasx_ext16_32(__m128i a) { - return __lasx_vext2xv_w_h(____m256i(a)); -} - -static __m128i lasx_extracti128( __m256i a, int pos) { - __m128i ret; - if( pos == 0) - { - ret = lasx_extracti128_lo(a); - } else { - ret = lasx_extracti128_hi(a); - } - return ret; -} - -static __m128 lasx_extractf128( __m256 a, int pos) { - __m128 ret; - if( pos == 0) - { - ret = (__m128)lasx_extracti128_lo((__m256i)a); - } else { - ret = (__m128)lasx_extracti128_hi((__m256i)a); - } - return ret; -} - -static __m256i lasx_maddubs_h(__m256i a, __m256i b) { - __m256i tmp1, tmp2; - tmp1 = __lasx_xvmulwev_h_b(a, b); - tmp2 = __lasx_xvmulwod_h_b(a, b); - return __lasx_xvsadd_h(tmp1, tmp2); -} - -static __m256i lasx_madd_h(__m256i a, __m256i b) { - __m256i tmp1, tmp2; - tmp1 = __lasx_xvmulwev_w_h(a, b); - tmp2 = __lasx_xvmulwod_w_h(a, b); - return __lasx_xvadd_w(tmp1, tmp2); -} - -static __m256i lasx_packs_w(__m256i a, __m256i b) { - __m256i tmp, tmp1; - tmp = __lasx_xvsat_w(a, 15); - tmp1 = __lasx_xvsat_w(b, 15); - return __lasx_xvpickev_h(tmp1, tmp); -} - -static __m256i lasx_packs_h(__m256i a, __m256i b) { - __m256i tmp, tmp1; - tmp = __lasx_xvsat_h(a, 7); - tmp1 = __lasx_xvsat_h(b, 7); - return __lasx_xvpickev_b(tmp1, tmp); -} - -static inline __m256i lasx_madd_h_b(__m256i a, __m256i b) { - __m256i tmp1, tmp2; - tmp1 = __lasx_xvmulwev_h_b(a, b); - tmp2 = __lasx_xvmulwod_h_b(a, b); - return __lasx_xvadd_h(tmp1, tmp2); -} - -static inline __m256i lasx_xvrepl128vei_h(__m256i a, const unsigned int b) { - switch (b) { - case 0: return __lasx_xvrepl128vei_h(a, 0); - case 1: return __lasx_xvrepl128vei_h(a, 1); - case 2: return __lasx_xvrepl128vei_h(a, 2); - case 3: return __lasx_xvrepl128vei_h(a, 3); - case 4: return __lasx_xvrepl128vei_h(a, 4); - case 5: return __lasx_xvrepl128vei_h(a, 5); - case 6: return __lasx_xvrepl128vei_h(a, 6); - case 7: return __lasx_xvrepl128vei_h(a, 7); - default: __builtin_unreachable(); - } -} - -static inline __m256i lasx_xvandi_b_bit(__m256i a, const unsigned int b) { - switch (b) { - case 0: return __lasx_xvandi_b(a, 1 << 0); - case 1: return __lasx_xvandi_b(a, 1 << 1); - case 2: return __lasx_xvandi_b(a, 1 << 2); - case 3: return __lasx_xvandi_b(a, 1 << 3); - case 4: return __lasx_xvandi_b(a, 1 << 4); - case 5: return __lasx_xvandi_b(a, 1 << 5); - case 6: return __lasx_xvandi_b(a, 1 << 6); - case 7: return __lasx_xvandi_b(a, 1 << 7); - default: __builtin_unreachable(); - } -} - -// multiply int8_t, add results pairwise twice -static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { - // Get absolute values of x vectors - const __m128i ax = __lsx_vsigncov_b(x, x); - // Sign the values of the y vectors - const __m128i sy = __lsx_vsigncov_b(x, y); - // Perform multiplication and create 16-bit values - const __m128i dot = lsx_maddubs_h(ax, sy); - const __m128i ones = __lsx_vreplgr2vr_h(1); - return lsx_madd_h(ones, 
dot); -} - -// horizontally add 8 floats -static inline float hsum_float_8(const __m256 x) { - __m128 res = lasx_extractf128(x, 1); - res = __lsx_vfadd_s(res, lasx_extractf128(x, 0)); - res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res)); - res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0)); - return ((v4f32)res)[0]; -} - -// horizontally add 8 int32_t -static inline int hsum_i32_8(const __m256i a) { - - __m256i tmp1 = __lasx_xvpermi_q(a, a, 0x11); - __m256i tmp2 = __lasx_xvpermi_q(a, a, 0x00); - - __m128i tmp1_128 = lasx_extracti128_lo(tmp1); - __m128i tmp2_128 = lasx_extracti128_lo(tmp2); - - __m128i sum128 = __lsx_vadd_w(tmp1_128, tmp2_128); - - __m128i ev = __lsx_vpickev_w(sum128, sum128); - __m128i od = __lsx_vpickod_w(sum128, sum128); - __m128i sum64 = __lsx_vadd_w(ev, od); - - int sum64_1, sum64_2; - sum64_1 = __lsx_vpickve2gr_w(sum64, 0); - sum64_2 = __lsx_vpickve2gr_w(sum64, 1); - - return sum64_1 + sum64_2; -} - -// horizontally add 4 int32_t -static inline int hsum_i32_4(const __m128i a) { - __m128i ev = __lsx_vpickev_w(a, a); - __m128i od = __lsx_vpickod_w(a, a); - __m128i sum64 = __lsx_vadd_w(ev, od); - - int sum64_1, sum64_2; - sum64_1 = __lsx_vpickve2gr_w(sum64, 0); - sum64_2 = __lsx_vpickve2gr_w(sum64, 1); - - return sum64_1 + sum64_2; -} - -// spread 32 bits to 32 bytes { 0x00, 0xFF } -static inline __m256i bytes_from_bits_32(const uint8_t * x) { - - uint32_t x32; - memcpy(&x32, x, sizeof(uint32_t)); - const __m256i shuf_mask = lasx_set_d( - 0x0303030303030303, 0x0202020202020202, - 0x0101010101010101, 0x0000000000000000); - - __m256i bytes = lasx_shuffle_b(__lasx_xvreplgr2vr_w(x32), shuf_mask); - const __m256i bit_mask = __lasx_xvreplgr2vr_d(0x7fbfdfeff7fbfdfe); - bytes = __lasx_xvor_v(bytes, bit_mask); - return __lasx_xvseq_b(bytes, __lasx_xvreplgr2vr_d(-1)); -} - -// Unpack 32 4-bit fields into 32 bytes -// The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval -static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) { - const __m128i lo = __lsx_vld((const __m128i *)rsi, 0); - __m128i hi = __lsx_vsrli_h(lo, 4); - return __lasx_xvandi_b(lasx_insertf128(hi, lo), 0xf); -} - -// add int16_t pairwise and return as float vector -static inline __m256 sum_i16_pairs_float(const __m256i x) { - __m256i v = __lasx_xvpackod_h(x, x); - __m256i summed_pairs = __lasx_xvaddwev_w_h(x, v); - return __lasx_xvffint_s_w(summed_pairs); -} - -static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { - // Perform multiplication and create 16-bit values - const __m256i dot = lasx_maddubs_h(ax, sy); - return sum_i16_pairs_float(dot); -} - -// multiply int8_t, add results pairwise twice and return as float vector -static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { - const __m256i dot = lasx_madd_h_b(x, y); - return sum_i16_pairs_float(dot); -} - -static inline __m128i packNibbles( __m256i bytes ) { - // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh - const __m256i lowByte = __lasx_xvreplgr2vr_h(0xFF); - __m256i high = __lasx_xvandn_v(lowByte, bytes); - __m256i low = __lasx_xvand_v(lowByte, bytes); - high = __lasx_xvsrli_h(high, 4); - bytes = __lasx_xvor_v(low, high); - // Compress uint16_t lanes into bytes - __m128i *r0 = (__m128i *)&bytes; - __m256i tmp_h128 = __lasx_xvpermi_q(bytes, bytes, 0x11); - __m128i *r1 = (__m128i *)&tmp_h128; - - __m128i zero = __lsx_vldi(0); - __m128i tmp, tmp2, tmp3; - - tmp = __lsx_vmax_h(zero, *r0); - tmp2 = __lsx_vsat_hu(tmp, 7); - - tmp = __lsx_vmax_h(zero, *r1); - tmp3 = __lsx_vsat_hu(tmp, 7); - return __lsx_vpickev_b(tmp3, tmp2); -} -#endif //__loongarch_asx - -void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { - quantize_row_q4_0_ref(x, y, k); -} - -void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { - quantize_row_q4_1_ref(x, y, k); -} - -void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { - quantize_row_q5_0_ref(x, y, k); -} - -void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { - quantize_row_q5_1_ref(x, y, k); -} - -void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - assert(QK8_0 == 32); - assert(k % QK8_0 == 0); - const int nb = k / QK8_0; - - block_q8_0 * GGML_RESTRICT y = vy; - -#if defined(__ARM_NEON) - for (int i = 0; i < nb; i++) { - float32x4_t srcv [8]; - float32x4_t asrcv[8]; - float32x4_t amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); - - const float amax = vmaxvq_f32(amaxv[0]); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 
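// Sketch (illustrative, not from the patch): every SIMD branch of
// quantize_row_q8_0/q8_1 below implements the same scalar recipe as the
// quantize_row_q8_*_ref fallbacks. Per 32-float block: find the absolute
// maximum, derive the scale d = amax/127, round each value to a signed byte;
// Q8_1 additionally stores s = d * sum(q) for dot-product bias terms.
// Assumes <math.h> and QK8_0 == 32; roundf rounds half away from zero,
// while the SIMD paths round to nearest-even, so results can differ by one
// on exact ties. The name is hypothetical:
static void quantize_block_q8_sketch(const float * x, int8_t * q, float * d_out, float * s_out) {
    float amax = 0.0f;
    for (int j = 0; j < 32; ++j) {
        const float ax = fabsf(x[j]);
        if (ax > amax) amax = ax;
    }
    const float d  = amax / 127.0f;
    const float id = d ? 1.0f / d : 0.0f;       // avoid div-by-zero for all-zero blocks
    int sum = 0;
    for (int j = 0; j < 32; ++j) {
        const int v = (int) roundf(x[j] * id);  // in [-127, 127]
        q[j] = (int8_t) v;
        sum += v;
    }
    *d_out = d;                            // stored as FP16 in the real structs
    if (s_out) *s_out = d * (float) sum;   // Q8_1 only
}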
1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - for (int j = 0; j < 8; j++) { - const float32x4_t v = vmulq_n_f32(srcv[j], id); - const int32x4_t vi = vcvtnq_s32_f32(v); - - y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); - } - } -#elif defined __wasm_simd128__ - for (int i = 0; i < nb; i++) { - v128_t srcv [8]; - v128_t asrcv[8]; - v128_t amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); - - const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), - wasm_f32x4_extract_lane(amaxv[0], 1)), - MAX(wasm_f32x4_extract_lane(amaxv[0], 2), - wasm_f32x4_extract_lane(amaxv[0], 3))); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - for (int j = 0; j < 8; j++) { - const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); - const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); - - y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); - y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); - y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); - y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); - } - } -#elif defined(__AVX2__) || defined(__AVX__) - for (int i = 0; i < nb; i++) { - // Load elements into 4 AVX vectors - __m256 v0 = _mm256_loadu_ps( x ); - __m256 v1 = _mm256_loadu_ps( x + 8 ); - __m256 v2 = _mm256_loadu_ps( x + 16 ); - __m256 v3 = _mm256_loadu_ps( x + 24 ); - x += 32; - - // Compute max(abs(e)) for the block - const __m256 signBit = _mm256_set1_ps( -0.0f ); - __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); - - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); - max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); - max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); - const float maxScalar = _mm_cvtss_f32( max4 ); - - // Quantize these floats - const float d = maxScalar / 127.f; - y[i].d = GGML_FP32_TO_FP16(d); - const float id = ( maxScalar != 0.0f ) ? 
127.f / maxScalar : 0.0f; - const __m256 mul = _mm256_set1_ps( id ); - - // Apply the multiplier - v0 = _mm256_mul_ps( v0, mul ); - v1 = _mm256_mul_ps( v1, mul ); - v2 = _mm256_mul_ps( v2, mul ); - v3 = _mm256_mul_ps( v3, mul ); - - // Round to nearest integer - v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); - v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); - v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); - v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); - - // Convert floats to integers - __m256i i0 = _mm256_cvtps_epi32( v0 ); - __m256i i1 = _mm256_cvtps_epi32( v1 ); - __m256i i2 = _mm256_cvtps_epi32( v2 ); - __m256i i3 = _mm256_cvtps_epi32( v3 ); - -#if defined(__AVX2__) - // Convert int32 to int16 - i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 - i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 - // Convert int16 to int8 - i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 - - // We got our precious signed bytes, but the order is now wrong - // These AVX2 pack instructions process 16-byte pieces independently - // The following instruction is fixing the order - const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); - i0 = _mm256_permutevar8x32_epi32( i0, perm ); - - _mm256_storeu_si256((__m256i *)y[i].qs, i0); -#else - // Since we don't have in AVX some necessary functions, - // we split the registers in half and call AVX2 analogs from SSE - __m128i ni0 = _mm256_castsi256_si128( i0 ); - __m128i ni1 = _mm256_extractf128_si256( i0, 1); - __m128i ni2 = _mm256_castsi256_si128( i1 ); - __m128i ni3 = _mm256_extractf128_si256( i1, 1); - __m128i ni4 = _mm256_castsi256_si128( i2 ); - __m128i ni5 = _mm256_extractf128_si256( i2, 1); - __m128i ni6 = _mm256_castsi256_si128( i3 ); - __m128i ni7 = _mm256_extractf128_si256( i3, 1); - - // Convert int32 to int16 - ni0 = _mm_packs_epi32( ni0, ni1 ); - ni2 = _mm_packs_epi32( ni2, ni3 ); - ni4 = _mm_packs_epi32( ni4, ni5 ); - ni6 = _mm_packs_epi32( ni6, ni7 ); - // Convert int16 to int8 - ni0 = _mm_packs_epi16( ni0, ni2 ); - ni4 = _mm_packs_epi16( ni4, ni6 ); - - _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); - _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); -#endif - } -#elif defined(__riscv_v) - - size_t vl = QK8_0; - - for (int i = 0; i < nb; i++) { - // load elements - vfloat32m8_t v_x = __riscv_vle32_v_f32m8(x+i*QK8_0, vl); - - vfloat32m8_t vfabs = __riscv_vfabs_v_f32m8(v_x, vl); - vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m8_f32m1(vfabs, tmp, vl); - float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 
1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl); - - // convert to integer - vint16m4_t vi = __riscv_vfncvt_x_f_w_i16m4(x0, vl); - vint8m2_t vs = __riscv_vncvt_x_x_w_i8m2(vi, vl); - - // store result - __riscv_vse8_v_i8m2(y[i].qs , vs, vl); - } - -#elif defined(__POWER9_VECTOR__) - for (int i = 0; i < nb; i++) { - vector float srcv [8]; - vector float asrcv[8]; - vector float amaxv[8]; - vector signed int vi[8]; - - for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]); - - const float amax = MAX(MAX(vec_extract(amaxv[0], 0), - vec_extract(amaxv[0], 1)), - MAX(vec_extract(amaxv[0], 2), - vec_extract(amaxv[0], 3))); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - const vector float vid = vec_splats(id); - - y[i].d = GGML_FP32_TO_FP16(d); - - for (int j = 0; j < 8; j++) { - const vector float v = vec_round(vec_mul(srcv[j], vid)); - vi[j] = vec_cts(v, 0); - } - vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]); - vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]); - } - -#elif defined(__loongarch_asx) - for (int i = 0; i < nb; i++) { - __m256 v0 = (__m256)__lasx_xvld( x , 0); - __m256 v1 = (__m256)__lasx_xvld( x , 32); - __m256 v2 = (__m256)__lasx_xvld( x , 64); - __m256 v3 = (__m256)__lasx_xvld( x , 96); - x += 32; - - // Compute max(abs(e)) for the block - const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f ); - __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 ); - max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) ); - max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) ); - max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) ); - - __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs , 0) ); - max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) ); - __m128 tmp = max4; - max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 )); - const float max_scalar = ((v4f32)max4)[0]; - - // Quantize these floats - const float d = max_scalar / 127.f; - y[i].d = GGML_FP32_TO_FP16(d); - const float id = ( max_scalar != 0.0f ) ? 
127.f / max_scalar : 0.0f; - const __m256 mul = (__m256)__lasx_xvreplfr2vr_s( id ); - - // Apply the multiplier - v0 = __lasx_xvfmul_s( v0, mul ); - v1 = __lasx_xvfmul_s( v1, mul ); - v2 = __lasx_xvfmul_s( v2, mul ); - v3 = __lasx_xvfmul_s( v3, mul ); - - // Round to nearest integer - __m256i i0 = __lasx_xvftintrne_w_s( v0 ); - __m256i i1 = __lasx_xvftintrne_w_s( v1 ); - __m256i i2 = __lasx_xvftintrne_w_s( v2 ); - __m256i i3 = __lasx_xvftintrne_w_s( v3 ); - - __m128i ni0 = lasx_extracti128( i0, 0 ); - __m128i ni1 = lasx_extracti128( i0, 1); - __m128i ni2 = lasx_extracti128( i1, 0); - __m128i ni3 = lasx_extracti128( i1, 1); - __m128i ni4 = lasx_extracti128( i2, 0); - __m128i ni5 = lasx_extracti128( i2, 1); - __m128i ni6 = lasx_extracti128( i3, 0); - __m128i ni7 = lasx_extracti128( i3, 1); - - // Convert int32 to int16 - ni0 = lsx_packs_w( ni0, ni1 ); - ni2 = lsx_packs_w( ni2, ni3 ); - ni4 = lsx_packs_w( ni4, ni5 ); - ni6 = lsx_packs_w( ni6, ni7 ); - // Convert int16 to int8 - ni0 = lsx_packs_h( ni0, ni2 ); - ni4 = lsx_packs_h( ni4, ni6 ); - - __lsx_vst(ni0, (__m128i *)(y[i].qs + 0), 0); - __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0); - - } -#elif defined(__VXE__) || defined(__VXE2__) - for (int i = 0; i < nb; i++) { - __vector float srcv [8]; - __vector float asrcv[8]; - __vector float amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]); - for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]); - - const float amax = MAX(MAX(vec_extract(amaxv[0], 0), - vec_extract(amaxv[0], 1)), - MAX(vec_extract(amaxv[0], 2), - vec_extract(amaxv[0], 3))); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f / d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - for (int j = 0; j < 8; j++) { - const __vector float v = vec_mul(srcv[j], vec_splats(id)); - const __vector int32_t vi = vec_signed(v); - - y[i].qs[4*j + 0] = vec_extract(vi, 0); - y[i].qs[4*j + 1] = vec_extract(vi, 1); - y[i].qs[4*j + 2] = vec_extract(vi, 2); - y[i].qs[4*j + 3] = vec_extract(vi, 3); - } - } -#else - GGML_UNUSED(nb); - // scalar - quantize_row_q8_0_ref(x, y, k); -#endif -} - -void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - assert(k % QK8_1 == 0); - const int nb = k / QK8_1; - - block_q8_1 * GGML_RESTRICT y = vy; - -#if defined(__ARM_NEON) - for (int i = 0; i < nb; i++) { - float32x4_t srcv [8]; - float32x4_t asrcv[8]; - float32x4_t amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); - - const float amax = vmaxvq_f32(amaxv[0]); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 
1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - int32x4_t accv = vdupq_n_s32(0); - - for (int j = 0; j < 8; j++) { - const float32x4_t v = vmulq_n_f32(srcv[j], id); - const int32x4_t vi = vcvtnq_s32_f32(v); - - y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); - - accv = vaddq_s32(accv, vi); - } - - y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv)); - } -#elif defined __wasm_simd128__ - for (int i = 0; i < nb; i++) { - v128_t srcv [8]; - v128_t asrcv[8]; - v128_t amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); - - const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), - wasm_f32x4_extract_lane(amaxv[0], 1)), - MAX(wasm_f32x4_extract_lane(amaxv[0], 2), - wasm_f32x4_extract_lane(amaxv[0], 3))); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - v128_t accv = wasm_i32x4_splat(0); - - for (int j = 0; j < 8; j++) { - const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); - const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); - - y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); - y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); - y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); - y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); - - accv = wasm_i32x4_add(accv, vi); - } - - y[i].s = GGML_FP32_TO_FP16( - d * (wasm_i32x4_extract_lane(accv, 0) + - wasm_i32x4_extract_lane(accv, 1) + - wasm_i32x4_extract_lane(accv, 2) + - wasm_i32x4_extract_lane(accv, 3))); - } -#elif defined(__AVX2__) || defined(__AVX__) - for (int i = 0; i < nb; i++) { - // Load elements into 4 AVX vectors - __m256 v0 = _mm256_loadu_ps( x ); - __m256 v1 = _mm256_loadu_ps( x + 8 ); - __m256 v2 = _mm256_loadu_ps( x + 16 ); - __m256 v3 = _mm256_loadu_ps( x + 24 ); - x += 32; - - // Compute max(abs(e)) for the block - const __m256 signBit = _mm256_set1_ps( -0.0f ); - __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); - - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); - max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); - max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); - const float max_scalar = _mm_cvtss_f32( max4 ); - - // Quantize these floats - const float d = max_scalar / 127.f; - y[i].d = GGML_FP32_TO_FP16(d); - const float id = ( max_scalar != 0.0f ) ? 
127.f / max_scalar : 0.0f; - const __m256 mul = _mm256_set1_ps( id ); - - // Apply the multiplier - v0 = _mm256_mul_ps( v0, mul ); - v1 = _mm256_mul_ps( v1, mul ); - v2 = _mm256_mul_ps( v2, mul ); - v3 = _mm256_mul_ps( v3, mul ); - - // Round to nearest integer - v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); - v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); - v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); - v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); - - // Convert floats to integers - __m256i i0 = _mm256_cvtps_epi32( v0 ); - __m256i i1 = _mm256_cvtps_epi32( v1 ); - __m256i i2 = _mm256_cvtps_epi32( v2 ); - __m256i i3 = _mm256_cvtps_epi32( v3 ); - -#if defined(__AVX2__) - // Compute the sum of the quants and set y[i].s - y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)))); - - // Convert int32 to int16 - i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 - i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 - // Convert int16 to int8 - i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 - - // We got our precious signed bytes, but the order is now wrong - // These AVX2 pack instructions process 16-byte pieces independently - // The following instruction is fixing the order - const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); - i0 = _mm256_permutevar8x32_epi32( i0, perm ); - - _mm256_storeu_si256((__m256i *)y[i].qs, i0); -#else - // Since we don't have in AVX some necessary functions, - // we split the registers in half and call AVX2 analogs from SSE - __m128i ni0 = _mm256_castsi256_si128( i0 ); - __m128i ni1 = _mm256_extractf128_si256( i0, 1); - __m128i ni2 = _mm256_castsi256_si128( i1 ); - __m128i ni3 = _mm256_extractf128_si256( i1, 1); - __m128i ni4 = _mm256_castsi256_si128( i2 ); - __m128i ni5 = _mm256_extractf128_si256( i2, 1); - __m128i ni6 = _mm256_castsi256_si128( i3 ); - __m128i ni7 = _mm256_extractf128_si256( i3, 1); - - // Compute the sum of the quants and set y[i].s - const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); - const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); - y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1))); - - // Convert int32 to int16 - ni0 = _mm_packs_epi32( ni0, ni1 ); - ni2 = _mm_packs_epi32( ni2, ni3 ); - ni4 = _mm_packs_epi32( ni4, ni5 ); - ni6 = _mm_packs_epi32( ni6, ni7 ); - // Convert int16 to int8 - ni0 = _mm_packs_epi16( ni0, ni2 ); - ni4 = _mm_packs_epi16( ni4, ni6 ); - - _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); - _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); -#endif - } -#elif defined(__riscv_v) - - size_t vl = QK8_1; - - for (int i = 0; i < nb; i++) { - // load elements - vfloat32m8_t v_x = __riscv_vle32_v_f32m8(x+i*QK8_1, vl); - - vfloat32m8_t vfabs = __riscv_vfabs_v_f32m8(v_x, vl); - vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0, vl); - vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m8_f32m1(vfabs, tmp, vl); - float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 
1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl); - - // convert to integer - vint16m4_t vi = __riscv_vfncvt_x_f_w_i16m4(x0, vl); - vint8m2_t vs = __riscv_vncvt_x_x_w_i8m2(vi, vl); - - // store result - __riscv_vse8_v_i8m2(y[i].qs , vs, vl); - - // compute sum for y[i].s - vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl); - vint16m1_t vwrs = __riscv_vwredsum_vs_i8m2_i16m1(vs, tmp2, vl); - - // set y[i].s - int sum = __riscv_vmv_x_s_i16m1_i16(vwrs); - y[i].s = GGML_FP32_TO_FP16(sum*d); - } - -#elif defined(__POWER9_VECTOR__) - for (int i = 0; i < nb; i++) { - vector float srcv [8]; - vector float asrcv[8]; - vector float amaxv[8]; - vector signed int vi[8]; - - for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]); - - for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]); - - const float amax = MAX(MAX(vec_extract(amaxv[0], 0), - vec_extract(amaxv[0], 1)), - MAX(vec_extract(amaxv[0], 2), - vec_extract(amaxv[0], 3))); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - const vector float vid = vec_splats(id); - - y[i].d = GGML_FP32_TO_FP16(d); - - vector int accv = vec_splats(0); - - for (int j = 0; j < 8; j++) { - const vector float v = vec_round(vec_mul(srcv[j], vid)); - vi[j] = vec_cts(v, 0); - - accv = vec_add(accv, vi[j]); - } - vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]); - vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]); - - accv = vec_add(accv, vec_sld(accv, accv, 4)); - accv = vec_add(accv, vec_sld(accv, accv, 8)); - y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0)); - } - -#elif defined(__loongarch_asx) - for (int i = 0; i < nb; i++) { - __m256 v0 = (__m256)__lasx_xvld( x , 0 ); - __m256 v1 = (__m256)__lasx_xvld( x , 32 ); - __m256 v2 = (__m256)__lasx_xvld( x , 64 ); - __m256 v3 = (__m256)__lasx_xvld( x , 96 ); - x += 32; - - // Compute max(abs(e)) for the block - const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f ); - __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 ); - max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) ); - max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) ); - max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) ); - - __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs, 0) ); - max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) ); - __m128 tmp = max4; - max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 )); - const float max_scalar = ((v4f32)max4)[0]; - - // Quantize these floats - const float d = max_scalar / 127.f; - y[i].d = GGML_FP32_TO_FP16(d); - const float id = ( max_scalar != 0.0f ) ? 
127.f / max_scalar : 0.0f; - const __m256 mul = __lasx_xvreplfr2vr_s( id ); - - // Apply the multiplier - v0 = __lasx_xvfmul_s( v0, mul ); - v1 = __lasx_xvfmul_s( v1, mul ); - v2 = __lasx_xvfmul_s( v2, mul ); - v3 = __lasx_xvfmul_s( v3, mul ); - - // Round to nearest integer - __m256i i0 = __lasx_xvftintrne_w_s( v0 ); - __m256i i1 = __lasx_xvftintrne_w_s( v1 ); - __m256i i2 = __lasx_xvftintrne_w_s( v2 ); - __m256i i3 = __lasx_xvftintrne_w_s( v3 ); - - __m128i ni0 = lasx_extracti128(i0, 0); - __m128i ni1 = lasx_extracti128( i0, 1); - __m128i ni2 = lasx_extracti128( i1, 0); - __m128i ni3 = lasx_extracti128( i1, 1); - __m128i ni4 = lasx_extracti128( i2, 0 ); - __m128i ni5 = lasx_extracti128( i2, 1); - __m128i ni6 = lasx_extracti128( i3, 0); - __m128i ni7 = lasx_extracti128( i3, 1); - - // Compute the sum of the quants and set y[i].s - const __m128i s0 = __lsx_vadd_w(__lsx_vadd_w(ni0, ni1), __lsx_vadd_w(ni2, ni3)); - const __m128i s1 = __lsx_vadd_w(__lsx_vadd_w(ni4, ni5), __lsx_vadd_w(ni6, ni7)); - y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(__lsx_vadd_w(s0, s1))); - - // Convert int32 to int16 - ni0 = lsx_packs_w( ni0, ni1 ); - ni2 = lsx_packs_w( ni2, ni3 ); - ni4 = lsx_packs_w( ni4, ni5 ); - ni6 = lsx_packs_w( ni6, ni7 ); - // Convert int16 to int8 - ni0 = lsx_packs_h( ni0, ni2 ); - ni4 = lsx_packs_h( ni4, ni6 ); - - __lsx_vst(ni0, (__m128i *)(y[i].qs + 0), 0); - __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0); - } -#elif defined(__VXE__) || defined(__VXE2__) - for (int i = 0; i < nb; i++) { - __vector float srcv [8]; - __vector float asrcv[8]; - __vector float amaxv[8]; - - for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j); - for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]); - for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]); - for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]); - for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]); - - const float amax = MAX(MAX(vec_extract(amaxv[0], 0), - vec_extract(amaxv[0], 1)), - MAX(vec_extract(amaxv[0], 2), - vec_extract(amaxv[0], 3))); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 
1.0f / d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - __vector int32_t acc = vec_splats(0); - - for (int j = 0; j < 8; j++) { - const __vector float v = vec_mul(srcv[j], vec_splats(id)); - const __vector int32_t vi = vec_signed(v); - - y[i].qs[4*j + 0] = vec_extract(vi, 0); - y[i].qs[4*j + 1] = vec_extract(vi, 1); - y[i].qs[4*j + 2] = vec_extract(vi, 2); - y[i].qs[4*j + 3] = vec_extract(vi, 3); - - acc = vec_add(acc, vi); - } - - y[i].s = GGML_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3])); - } -#else - GGML_UNUSED(nb); - // scalar - quantize_row_q8_1_ref(x, y, k); -#endif -} - -// -// 2-6 bit quantization in super-blocks -// - -// -// ===================== Helper functions -// -static inline int nearest_int(float fval) { - assert(fabsf(fval) <= 4194303.f); - float val = fval + 12582912.f; - int i; memcpy(&i, &val, sizeof(int)); - return (i & 0x007fffff) - 0x00400000; -} - -static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type, - const float * GGML_RESTRICT qw) { - float max = 0; - float amax = 0; - for (int i = 0; i < n; ++i) { - float ax = fabsf(x[i]); - if (ax > amax) { amax = ax; max = x[i]; } - } - if (amax < GROUP_MAX_EPS) { // all zero - for (int i = 0; i < n; ++i) { - L[i] = 0; - } - return 0.f; - } - float iscale = -nmax / max; - if (rmse_type == 0) { - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); - } - return 1/iscale; - } - bool return_early = false; - if (rmse_type < 0) { - rmse_type = -rmse_type; - return_early = true; - } - float sumlx = 0; - float suml2 = 0; -#ifdef HAVE_BUGGY_APPLE_LINKER - // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7 - for (volatile int i = 0; i < n; ++i) { -#else - for (int i = 0; i < n; ++i) { -#endif - int l = nearest_int(iscale * x[i]); - l = MAX(-nmax, MIN(nmax-1, l)); - L[i] = l + nmax; - float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i])); - sumlx += w*x[i]*l; - suml2 += w*l*l; - } - float scale = suml2 ? sumlx/suml2 : 0.0f; - if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale; - float best = scale * sumlx; - for (int is = -9; is <= 9; ++is) { - if (is == 0) { - continue; - } - iscale = -(nmax + 0.1f*is) / max; - sumlx = suml2 = 0; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - l = MAX(-nmax, MIN(nmax-1, l)); - float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? 
fabsf(x[i]) : sqrtf(fabsf(x[i])); - sumlx += w*x[i]*l; - suml2 += w*l*l; - } - if (suml2 > 0 && sumlx*sumlx > best*suml2) { - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); - } - scale = sumlx/suml2; best = scale*sumlx; - } - } - return scale; -} - -static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) { - float max = 0; - float amax = 0; - for (int i = 0; i < n; ++i) { - float ax = fabsf(x[i]); - if (ax > amax) { amax = ax; max = x[i]; } - } - if (amax < GROUP_MAX_EPS) { // all zero - for (int i = 0; i < n; ++i) { L[i] = 0; } - return 0.f; - } - float iscale = -nmax / max; - if (do_rmse) { - float sumlx = 0; - float suml2 = 0; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - l = MAX(-nmax, MIN(nmax-1, l)); - L[i] = l; - float w = x[i]*x[i]; - sumlx += w*x[i]*l; - suml2 += w*l*l; - } - for (int itry = 0; itry < 5; ++itry) { - int n_changed = 0; - for (int i = 0; i < n; ++i) { - float w = x[i]*x[i]; - float slx = sumlx - w*x[i]*L[i]; - if (slx > 0) { - float sl2 = suml2 - w*L[i]*L[i]; - int new_l = nearest_int(x[i] * sl2 / slx); - new_l = MAX(-nmax, MIN(nmax-1, new_l)); - if (new_l != L[i]) { - slx += w*x[i]*new_l; - sl2 += w*new_l*new_l; - if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) { - L[i] = new_l; sumlx = slx; suml2 = sl2; - ++n_changed; - } - } - } - } - if (!n_changed) { - break; - } - } - for (int i = 0; i < n; ++i) { - L[i] += nmax; - } - return sumlx / suml2; - } - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - l = MAX(-nmax, MIN(nmax-1, l)); - L[i] = l + nmax; - } - return 1/iscale; -} - -static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, - int ntry, float alpha) { - float min = x[0]; - float max = x[0]; - for (int i = 1; i < n; ++i) { - if (x[i] < min) min = x[i]; - if (x[i] > max) max = x[i]; - } - if (max == min) { - for (int i = 0; i < n; ++i) L[i] = 0; - *the_min = 0; - return 0.f; - } - if (min > 0) min = 0; - float iscale = nmax/(max - min); - float scale = 1/iscale; - for (int itry = 0; itry < ntry; ++itry) { - float sumlx = 0; int suml2 = 0; - bool did_change = false; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale*(x[i] - min)); - l = MAX(0, MIN(nmax, l)); - if (l != L[i]) { - L[i] = l; - did_change = true; - } - sumlx += (x[i] - min)*l; - suml2 += l*l; - } - scale = sumlx/suml2; - float sum = 0; - for (int i = 0; i < n; ++i) { - sum += x[i] - scale*L[i]; - } - min = alpha*min + (1 - alpha)*sum/n; - if (min > 0) min = 0; - iscale = 1/scale; - if (!did_change) break; - } - *the_min = -min; - return scale; -} - -static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights, - uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux, - float rmin, float rdelta, int nstep, bool use_mad) { - float min = x[0]; - float max = x[0]; - float sum_w = weights[0]; - float sum_x = sum_w * x[0]; -#ifdef HAVE_BUGGY_APPLE_LINKER - // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7 - for (volatile int i = 1; i < n; ++i) { -#else - for (int i = 1; i < n; ++i) { -#endif - if (x[i] < min) min = x[i]; - if (x[i] > max) max = x[i]; - float w = weights[i]; - sum_w += w; - sum_x += w * x[i]; - } - if (min > 0) min = 0; - if (max == min) { - for (int i = 0; i < n; ++i) L[i] = 0; - *the_min = -min; - return 0.f; - } - 
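The `nstep` search that follows re-fits `(scale, min)` for every candidate `iscale` by solving the weighted least-squares problem min over (scale, min) of sum_i w_i*(scale*l_i + min - x_i)^2 in closed form via its 2x2 normal equations. A minimal scalar sketch of that step, with an illustrative helper name (`ls_fit_scale_min` is not a ggml symbol):

```c
#include <stdint.h>

// Closed-form weighted least-squares fit of scale and min, as performed per
// candidate iscale by make_qkx2_quants. Returns 0 when the 2x2 system is
// degenerate (D <= 0), in which case the caller keeps its previous best fit.
static int ls_fit_scale_min(int n, const float * x, const float * w,
                            const uint8_t * l, float * scale, float * min) {
    float sum_w = 0, sum_x = 0, sum_l = 0, sum_l2 = 0, sum_xl = 0;
    for (int i = 0; i < n; ++i) {
        sum_w  += w[i];
        sum_x  += w[i]*x[i];
        sum_l  += w[i]*l[i];
        sum_l2 += w[i]*l[i]*l[i];
        sum_xl += w[i]*l[i]*x[i];
    }
    const float D = sum_w*sum_l2 - sum_l*sum_l; // determinant of the normal equations
    if (D <= 0) {
        return 0;
    }
    *scale = (sum_w*sum_xl - sum_x*sum_l)/D;
    *min   = (sum_l2*sum_x - sum_l*sum_xl)/D;
    return 1;
}
```

As in the loop below, a positive fitted minimum is then clamped to zero and the scale re-derived as `sum_xl/sum_l2`, since the k-quant formats store the minimum negated.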
float iscale = nmax/(max - min); - float scale = 1/iscale; - float best_mad = 0; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale*(x[i] - min)); - L[i] = MAX(0, MIN(nmax, l)); - float diff = scale * L[i] + min - x[i]; - diff = use_mad ? fabsf(diff) : diff * diff; - float w = weights[i]; - best_mad += w * diff; - } - if (nstep < 1) { - *the_min = -min; - return scale; - } - for (int is = 0; is <= nstep; ++is) { - iscale = (rmin + rdelta*is + nmax)/(max - min); - float sum_l = 0, sum_l2 = 0, sum_xl = 0; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale*(x[i] - min)); - l = MAX(0, MIN(nmax, l)); - Laux[i] = l; - float w = weights[i]; - sum_l += w*l; - sum_l2 += w*l*l; - sum_xl += w*l*x[i]; - } - float D = sum_w * sum_l2 - sum_l * sum_l; - if (D > 0) { - float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D; - float this_min = (sum_l2 * sum_x - sum_l * sum_xl)/D; - if (this_min > 0) { - this_min = 0; - this_scale = sum_xl / sum_l2; - } - float mad = 0; - for (int i = 0; i < n; ++i) { - float diff = this_scale * Laux[i] + this_min - x[i]; - diff = use_mad ? fabsf(diff) : diff * diff; - float w = weights[i]; - mad += w * diff; - } - if (mad < best_mad) { - for (int i = 0; i < n; ++i) { - L[i] = Laux[i]; - } - best_mad = mad; - scale = this_scale; - min = this_min; - } - } - } - *the_min = -min; - return scale; -} - -static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) { - if (j < 4) { - *d = q[j] & 63; *m = q[j + 4] & 63; - } else { - *d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); - *m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); - } -} - -//========================- 2-bit (de)-quantization - -void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - quantize_row_q2_K_ref(x, vy, k); -} - -//========================= 3-bit (de)-quantization - -void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - quantize_row_q3_K_ref(x, vy, k); -} - -// ====================== 4-bit (de)-quantization - -void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - assert(k % QK_K == 0); - block_q4_K * GGML_RESTRICT y = vy; - quantize_row_q4_K_ref(x, y, k); -} - -// ====================== 5-bit (de)-quantization - -void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - assert(k % QK_K == 0); - block_q5_K * GGML_RESTRICT y = vy; - quantize_row_q5_K_ref(x, y, k); -} - -// ====================== 6-bit (de)-quantization - -void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - assert(k % QK_K == 0); - block_q6_K * GGML_RESTRICT y = vy; - quantize_row_q6_K_ref(x, y, k); -} - -// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs) - -void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - assert(k % QK_K == 0); - block_tq1_0 * GGML_RESTRICT y = vy; - quantize_row_tq1_0_ref(x, y, k); -} - -void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - assert(k % QK_K == 0); - block_tq2_0 * GGML_RESTRICT y = vy; - quantize_row_tq2_0_ref(x, y, k); -} - -static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; - -//===================================== Q8_K ============================================== - -void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { 
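The `__wasm_simd128__` path below implements, per super-block of `QK_K` values, the same scheme as `quantize_row_q8_K_ref`: pick the signed extreme, fold its sign into `iscale = -127/max`, quantize, and record 16-element partial sums for later dot products. A hedged scalar sketch, assuming the usual `block_q8_K` layout of one `float d`, `QK_K` int8 quants, and `QK_K/16` int16 `bsums` (the helper name is illustrative):

```c
#include <math.h>
#include <stdint.h>
#include <string.h>

// Scalar sketch of Q8_K quantization for one super-block of 256 values.
static void q8_K_block_sketch(const float * x, float * d,
                              int8_t qs[256], int16_t bsums[16]) {
    float amax = 0.0f, max = 0.0f; // largest magnitude, keeping its sign in max
    for (int j = 0; j < 256; ++j) {
        const float ax = fabsf(x[j]);
        if (ax > amax) { amax = ax; max = x[j]; }
    }
    if (amax == 0.0f) { // all-zero block: zero scale and quants
        *d = 0.0f;
        memset(qs, 0, 256);
        memset(bsums, 0, 16*sizeof(int16_t));
        return;
    }
    const float iscale = -127.f/max; // sign of the extreme folded into the scale
    for (int j = 0; j < 256; ++j) {
        int v = (int) lrintf(iscale*x[j]);    // round to nearest, like nearest_int above
        qs[j] = (int8_t) (v > 127 ? 127 : v); // defensive clamp
    }
    for (int j = 0; j < 16; ++j) { // partial sums over groups of 16 quants
        int sum = 0;
        for (int k = 0; k < 16; ++k) {
            sum += qs[16*j + k];
        }
        bsums[j] = (int16_t) sum;
    }
    *d = 1/iscale;
}
```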
-#ifdef __wasm_simd128__ - assert(k % QK_K == 0); - const int64_t nb = k / QK_K; - block_q8_K * GGML_RESTRICT yc = y; // Cast to proper type - - for (int i = 0; i < nb; i++) { - const float * x_block = x + i * QK_K; - - v128_t min_vec = wasm_v128_load(x_block); - v128_t max_vec = min_vec; - - for (int j = 4; j < QK_K; j += 4) { - v128_t x_vec = wasm_v128_load(x_block + j); - max_vec = wasm_f32x4_pmax(max_vec, x_vec); - min_vec = wasm_f32x4_pmin(min_vec, x_vec); - } - max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 2, 3, 0, 1)); - max_vec = wasm_f32x4_pmax(max_vec, wasm_i32x4_shuffle(max_vec, max_vec, 1, 0, 3, 2)); - min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 2, 3, 0, 1)); - min_vec = wasm_f32x4_pmin(min_vec, wasm_i32x4_shuffle(min_vec, min_vec, 1, 0, 3, 2)); - float max = wasm_f32x4_extract_lane(max_vec, 0); - float min = wasm_f32x4_extract_lane(min_vec, 0); - float amax = -min > max ? min : max; - - if (amax == 0.0f) { - yc[i].d = 0.0f; - const v128_t zero = wasm_i8x16_splat(0); - for (int j = 0; j < QK_K; j += 16) { - wasm_v128_store(yc[i].qs + j, zero); - } - continue; - } - - const float iscale = -127.0f / amax; - const v128_t scale_vec = wasm_f32x4_splat(iscale); - - // Process 16 elements per iteration - for (int j = 0, jb = 0; j < QK_K; j += 16, jb++) { - // Load and quantize 16 floats - v128_t x0 = wasm_v128_load(x_block + j); - v128_t x1 = wasm_v128_load(x_block + j + 4); - v128_t x2 = wasm_v128_load(x_block + j + 8); - v128_t x3 = wasm_v128_load(x_block + j + 12); - - v128_t q0 = wasm_f32x4_nearest(wasm_f32x4_mul(x0, scale_vec)); - v128_t q1 = wasm_f32x4_nearest(wasm_f32x4_mul(x1, scale_vec)); - v128_t q2 = wasm_f32x4_nearest(wasm_f32x4_mul(x2, scale_vec)); - v128_t q3 = wasm_f32x4_nearest(wasm_f32x4_mul(x3, scale_vec)); - - // Convert to i32 with saturation - v128_t i0 = wasm_i32x4_trunc_sat_f32x4(q0); - v128_t i1 = wasm_i32x4_trunc_sat_f32x4(q1); - v128_t i2 = wasm_i32x4_trunc_sat_f32x4(q2); - v128_t i3 = wasm_i32x4_trunc_sat_f32x4(q3); - - // Pack into 16 i8 values - v128_t i8 = wasm_i8x16_narrow_i16x8( - wasm_i16x8_narrow_i32x4(i0, i1), - wasm_i16x8_narrow_i32x4(i2, i3) - ); - wasm_v128_store(yc[i].qs + j, i8); - - // Calculate bsums using SIMD - v128_t sum16 = wasm_i16x8_add( - wasm_i16x8_extend_low_i8x16(i8), - wasm_i16x8_extend_high_i8x16(i8) - ); - v128_t sum32 = wasm_i32x4_add( - wasm_i32x4_extend_low_i16x8(sum16), - wasm_i32x4_extend_high_i16x8(sum16) - ); - sum32 = wasm_i32x4_add(sum32, wasm_i32x4_shuffle(sum32, sum32, 2, 3, 0, 1)); - sum32 = wasm_i32x4_add(sum32, wasm_i32x4_shuffle(sum32, sum32, 1, 0, 3, 2)); - yc[i].bsums[jb] = wasm_i32x4_extract_lane(sum32, 0); - } - - yc[i].d = 1.0f / iscale; - } -#else - quantize_row_q8_K_ref(x, y, k); -#endif -} - -//===================================== Dot products ================================= - -// -// Helper functions -// -#if __AVX__ || __AVX2__ || __AVX512F__ - -// shuffles to pick the required scales in dot products -static inline __m256i get_scale_shuffle_q3k(int i) { - static const uint8_t k_shuffle[128] = { - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, - 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, - 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, - }; - return _mm256_loadu_si256((const __m256i*)k_shuffle + i); -} 
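Indexing note for these tables: `(const __m256i *)k_shuffle + i` advances in 32-byte units, so each `i` selects one full row. In the q3k table just above, row `i` replicates 16-bit scale entry `2i` eight times in the low 16-byte lane and entry `2i+1` eight times in the high lane, so one byte shuffle of a loaded row broadcasts two consecutive scales across the two lanes. A small generator that reproduces the 128-byte table (illustrative only, not ggml code):

```c
#include <stdint.h>

// Rebuild the q3k scale-shuffle table: 4 rows of 32 bytes, each 16-byte
// half replicating one 16-bit scale index eight times.
static void gen_scale_shuffle_q3k(uint8_t table[128]) {
    for (int row = 0; row < 4; ++row) {        // one row per 32-byte load
        for (int half = 0; half < 2; ++half) { // low/high 16-byte lane
            const int pair = 2*row + half;     // which 16-bit scale to broadcast
            for (int k = 0; k < 8; ++k) {
                table[32*row + 16*half + 2*k + 0] = (uint8_t)(2*pair + 0);
                table[32*row + 16*half + 2*k + 1] = (uint8_t)(2*pair + 1);
            }
        }
    }
}
```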
-static inline __m256i get_scale_shuffle_k4(int i) { - static const uint8_t k_shuffle[256] = { - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, - 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, - 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, - 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, - 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, - 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15 - }; - return _mm256_loadu_si256((const __m256i*)k_shuffle + i); -} -static inline __m128i get_scale_shuffle(int i) { - static const uint8_t k_shuffle[128] = { - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, - 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, - 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, - 10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11, - 12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13, - 14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15 - }; - return _mm_loadu_si128((const __m128i*)k_shuffle + i); -} -#elif defined(__loongarch_asx) -// shuffles to pick the required scales in dot products -static inline __m256i get_scale_shuffle_q3k(int i) { - static const uint8_t k_shuffle[128] = { - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, - 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, - 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, - }; - return __lasx_xvld((const __m256i*)k_shuffle + i, 0); -} -static inline __m256i get_scale_shuffle_k4(int i) { - static const uint8_t k_shuffle[256] = { - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, - 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, - 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, - 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, - 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, - 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15 - }; - return __lasx_xvld((const __m256i*)k_shuffle + i, 0); -} -static inline __m128i get_scale_shuffle(int i) { - static const uint8_t k_shuffle[128] = { - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, - 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, - 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, - 10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11, - 12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13, - 
14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15 - }; - return __lsx_vld((const __m128i*)k_shuffle + i, 0); -} -#endif - -void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - const int qk = QK8_0; - const int nb = n / qk; - - assert(n % qk == 0); -#if defined(__ARM_FEATURE_MATMUL_INT8) - assert((nrc == 2) || (nrc == 1)); -#else - assert(nrc == 1); -#endif - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q4_0 * GGML_RESTRICT x = vx; - const block_q8_0 * GGML_RESTRICT y = vy; - -#if defined(__ARM_FEATURE_MATMUL_INT8) - if (nrc == 2) { - const block_q4_0 * GGML_RESTRICT vx0 = vx; - const block_q4_0 * GGML_RESTRICT vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx); - const block_q8_0 * GGML_RESTRICT vy0 = vy; - const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); - - float32x4_t sumv0 = vdupq_n_f32(0.0f); - - for (int i = 0; i < nb; i++) { - const block_q4_0 * GGML_RESTRICT b_x0 = &vx0[i]; - const block_q4_0 * GGML_RESTRICT b_x1 = &vx1[i]; - const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i]; - const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i]; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - const int8x16_t s8b = vdupq_n_s8(0x8); - - const uint8x16_t v0_0 = vld1q_u8(b_x0->qs); - const uint8x16_t v0_1 = vld1q_u8(b_x1->qs); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // sub 8 - const int8x16_t x0_l = vsubq_s8(v0_0l, s8b); - const int8x16_t x0_h = vsubq_s8(v0_0h, s8b); - const int8x16_t x1_l = vsubq_s8(v0_1l, s8b); - const int8x16_t x1_h = vsubq_s8(v0_1h, s8b); - - // load y - const int8x16_t y0_l = vld1q_s8(b_y0->qs); - const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); - const int8x16_t y1_l = vld1q_s8(b_y1->qs); - const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); - - float32_t _scale[4] = { - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d) - }; - float32x4_t scale = vld1q_f32(_scale); - - int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); - int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); - - int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); - int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); - - int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); - int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); - - int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); - int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); - - sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), - l1, r1)), l2, r2)), l3, r3))), scale); - } - - float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2); - float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); - - vst1_f32(s, 
vget_low_f32 (sumv2)); - vst1_f32(s + bs, vget_high_f32(sumv2)); - - return; - } -#endif - - int ib = 0; - float sumf = 0; - -#if defined(__ARM_FEATURE_SVE) - svfloat32_t sumv0 = svdup_n_f32(0.0f); - svfloat32_t sumv1 = svdup_n_f32(0.0f); - - const int vector_length = ggml_cpu_get_sve_cnt()*8; - - // VLA Implementation using switch case - switch (vector_length) { - case 128: - { - // predicate for activating higher lanes for 4 float32 elements - const svbool_t ph4 = svptrue_pat_b32(SV_VL4); - - for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; - const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; - const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - - // load x - const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); - const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); - - // 4-bit -> 8-bit - const svint8_t qx0l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx0r, 0x0F)); - const svint8_t qx0h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx0r, 0x04)); - const svint8_t qx1l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx1r, 0x0F)); - const svint8_t qx1h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx1r, 0x04)); - - // sub 8 - const svint8_t qx0ls = svsub_n_s8_x(svptrue_b8(), qx0h, 8); - const svint8_t qx0hs = svsub_n_s8_x(svptrue_b8(), qx0l, 8); - const svint8_t qx1ls = svsub_n_s8_x(svptrue_b8(), qx1h, 8); - const svint8_t qx1hs = svsub_n_s8_x(svptrue_b8(), qx1l, 8); - - // load y - const svint8_t qy0h = svld1_s8(svptrue_b8(), y0->qs); - const svint8_t qy0l = svld1_s8(svptrue_b8(), y0->qs + 16); - const svint8_t qy1h = svld1_s8(svptrue_b8(), y1->qs); - const svint8_t qy1l = svld1_s8(svptrue_b8(), y1->qs + 16); - - // dot product - sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4, - svdot_s32(svdup_n_s32(0), qx0ls, qy0l), - svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4, - svdot_s32(svdup_n_s32(0), qx1ls, qy1l), - svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); - } break; - case 256: - { - // predicate for activating higher lanes for 16 int8 elements - const svbool_t ph16 = svptrue_pat_b8(SV_VL16); - // predicate for activating lower lanes for 16 int8 elements - const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16); - - for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; - const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; - const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - - // load x - const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); - const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); - - // 4-bit -> 8-bit - const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04)); - const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04)); - - // sub 8 - const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8); - const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8); - - // load y - const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); - const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); - - // dot product - sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), - svdot_s32(svdup_n_s32(0), qx0s, qy0)), 
GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), - svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); - } break; - case 512: - { - // predicate for activating higher lanes for 32 int8 elements - const svbool_t ph32 = svptrue_pat_b8(SV_VL32); - - // predicate for activating higher lanes for 16 int8 elements - const svbool_t ph16 = svptrue_pat_b8(SV_VL16); - // predicate for activating lower lanes for 16 int8 elements from first 32 int8 activated lanes - const svbool_t pl16 = svnot_b_z(ph32, ph16); - - for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; - const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; - const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - - // load x - const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs); - const svuint8_t qx1r = svld1rq_u8(ph32, x1->qs); - - // 4-bit -> 8-bit - const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04)); - const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04)); - - // sub 8 - const svint8_t qx0s = svsub_n_s8_x(ph32, qx0, 8); - const svint8_t qx1s = svsub_n_s8_x(ph32, qx1, 8); - - // load y - const svint8_t qy0 = svld1_s8(ph32, y0->qs); - const svint8_t qy1 = svld1_s8(ph32, y1->qs); - - // dot product - sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32, - svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32, - svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1)); - } break; - default: - assert(false && "Unsupported vector length"); - break; - } - -#elif defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; - const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; - const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - const int8x16_t s8b = vdupq_n_s8(0x8); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // sub 8 - const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); - const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); - const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); - const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - - // dot product into int32x4_t - const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); - const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), 
GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); -#elif defined __wasm_simd128__ - v128_t sumv = wasm_f32x4_splat(0.0f); - - const v128_t m4b = wasm_i8x16_splat(0x0F); - const v128_t s8b = wasm_i8x16_splat(0x8); - - for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * GGML_RESTRICT x0 = &x[ib]; - const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; - const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - - // Load and process x0 - v128_t v0_0 = wasm_v128_load(x0->qs); - v128_t v0_0l = wasm_v128_and(v0_0, m4b); - v128_t v0_0h = wasm_u8x16_shr(v0_0, 4); - v128_t v0_0ls = wasm_i8x16_sub(v0_0l, s8b); - v128_t v0_0hs = wasm_i8x16_sub(v0_0h, s8b); - - // Load y0 vectors - v128_t y0_l = wasm_v128_load(y0->qs); - v128_t y0_h = wasm_v128_load(y0->qs + 16); - - // Extend to i16x8 and compute dot products - v128_t dx0l = wasm_i16x8_extend_low_i8x16(v0_0ls); - v128_t dx0h = wasm_i16x8_extend_high_i8x16(v0_0ls); - v128_t dx0hl = wasm_i16x8_extend_low_i8x16(v0_0hs); - v128_t dx0hh = wasm_i16x8_extend_high_i8x16(v0_0hs); - - v128_t dy0ll = wasm_i16x8_extend_low_i8x16(y0_l); - v128_t dy0lh = wasm_i16x8_extend_high_i8x16(y0_l); - v128_t dy0hl = wasm_i16x8_extend_low_i8x16(y0_h); - v128_t dy0hh = wasm_i16x8_extend_high_i8x16(y0_h); - - v128_t dp0 = wasm_i32x4_add( - wasm_i32x4_add( - wasm_i32x4_dot_i16x8(dx0l, dy0ll), - wasm_i32x4_dot_i16x8(dx0h, dy0lh) - ), - wasm_i32x4_add( - wasm_i32x4_dot_i16x8(dx0hl, dy0hl), - wasm_i32x4_dot_i16x8(dx0hh, dy0hh) - ) - ); - - // Load and process x1 - v128_t v0_1 = wasm_v128_load(x1->qs); - v128_t v0_1l = wasm_v128_and(v0_1, m4b); - v128_t v0_1h = wasm_u8x16_shr(v0_1, 4); - v128_t v0_1ls = wasm_i8x16_sub(v0_1l, s8b); - v128_t v0_1hs = wasm_i8x16_sub(v0_1h, s8b); - - // Load y1 vectors - v128_t y1_l = wasm_v128_load(y1->qs); - v128_t y1_h = wasm_v128_load(y1->qs + 16); - - // Extend to i16x8 and compute dot products - v128_t dx1l = wasm_i16x8_extend_low_i8x16(v0_1ls); - v128_t dx1h = wasm_i16x8_extend_high_i8x16(v0_1ls); - v128_t dx1hl = wasm_i16x8_extend_low_i8x16(v0_1hs); - v128_t dx1hh = wasm_i16x8_extend_high_i8x16(v0_1hs); - - v128_t dy1ll = wasm_i16x8_extend_low_i8x16(y1_l); - v128_t dy1lh = wasm_i16x8_extend_high_i8x16(y1_l); - v128_t dy1hl = wasm_i16x8_extend_low_i8x16(y1_h); - v128_t dy1hh = wasm_i16x8_extend_high_i8x16(y1_h); - - v128_t dp1 = wasm_i32x4_add( - wasm_i32x4_add( - wasm_i32x4_dot_i16x8(dx1l, dy1ll), - wasm_i32x4_dot_i16x8(dx1h, dy1lh) - ), - wasm_i32x4_add( - wasm_i32x4_dot_i16x8(dx1hl, dy1hl), - wasm_i32x4_dot_i16x8(dx1hh, dy1hh) - ) - ); - - // Accumulate results with scaling - float scale0 = GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d); - float scale1 = GGML_FP16_TO_FP32(x1->d) * GGML_FP16_TO_FP32(y1->d); - - sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp0), wasm_f32x4_splat(scale0))); - sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp1), wasm_f32x4_splat(scale1))); - } - - sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + - wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); -#elif defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (; ib < nb; ++ib) { - /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) ); - - 
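The next statement expands the block with `bytes_from_nibbles_32`, which widens the 16 packed q4_0 bytes into 32 unsigned values in [0, 15]; in the q4_0 layout the low nibbles hold elements 0..15 and the high nibbles elements 16..31, exactly as the scalar fallback at the end of this function pairs `qs[j] & 0x0F` with `y->qs[j]` and `qs[j] >> 4` with `y->qs[j + qk/2]`. A scalar equivalent, as a hedged sketch rather than the intrinsic helper itself:

```c
#include <stdint.h>

// Scalar equivalent of bytes_from_nibbles_32: expand 16 packed bytes into
// 32 values in [0, 15], low nibbles first, matching the q4_0 element order.
static void bytes_from_nibbles_32_scalar(const uint8_t * qs, uint8_t out[32]) {
    for (int j = 0; j < 16; ++j) {
        out[j]      = qs[j] & 0x0F; // elements 0..15
        out[j + 16] = qs[j] >> 4;   // elements 16..31
    }
}
```

The subsequent `_mm256_sub_epi8(qx, off)` then recenters all 32 lanes from [0, 15] to [-8, 7] with a single instruction.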
__m256i qx = bytes_from_nibbles_32(x[ib].qs); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. - const __m256i off = _mm256_set1_epi8( 8 ); - qx = _mm256_sub_epi8( qx, off ); - - __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs); - - const __m256 q = mul_sum_i8_pairs_float(qx, qy); - - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps( d, q, acc ); - } - - sumf = hsum_float_8(acc); -#elif defined(__AVX__) - __m256 accum = _mm256_setzero_ps(); - for (; ib + 1 < nb; ib += 2) { - const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs); - const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); - const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs); - const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1); - const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); - const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1); - - const __m128i q4b_1_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_1), _mm_set1_epi8(8)); - const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8)); - const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8)); - const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8)); - - const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0); - const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1); - const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0); - const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1); - const __m128i p_1 = _mm_add_epi16(p16_1_0, p16_1_1); - const __m128i p_2 = _mm_add_epi16(p16_2_0, p16_2_1); - const __m256 p = sum_i16_pairs_float(p_2, p_1); - - const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d); - accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum); - } - - sumf = hsum_float_8(accum); -#elif defined(__SSSE3__) - // set constants - const __m128i lowMask = _mm_set1_epi8(0xF); - const __m128i off = _mm_set1_epi8(8); - - // Initialize accumulator with zeros - __m128 acc_0 = _mm_setzero_ps(); - __m128 acc_1 = _mm_setzero_ps(); - __m128 acc_2 = _mm_setzero_ps(); - __m128 acc_3 = _mm_setzero_ps(); - - for (; ib + 1 < nb; ib += 2) { - _mm_prefetch(&x[ib] + sizeof(block_q4_0), _MM_HINT_T0); - _mm_prefetch(&y[ib] + sizeof(block_q8_0), _MM_HINT_T0); - - // Compute combined scale for the block 0 and 1 - const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) ); - - const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[ib].qs); - - __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); - __m128i by_0 = _mm_loadu_si128((const __m128i *)y[ib].qs); - bx_0 = _mm_sub_epi8(bx_0, off); - const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); - - __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); - __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[ib].qs + 16)); - bx_1 = _mm_sub_epi8(bx_1, off); - const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); - - _mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0); - _mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0); - - // Compute combined scale for the block 2 and 3 - const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) ); - - const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); - - __m128i bx_2 = 
_mm_and_si128(lowMask, tmp_2_3); - __m128i by_2 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); - bx_2 = _mm_sub_epi8(bx_2, off); - const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); - - __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); - __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[ib + 1].qs + 16)); - bx_3 = _mm_sub_epi8(bx_3, off); - const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); - - // Convert int32_t to float - __m128 p0 = _mm_cvtepi32_ps(i32_0); - __m128 p1 = _mm_cvtepi32_ps(i32_1); - __m128 p2 = _mm_cvtepi32_ps(i32_2); - __m128 p3 = _mm_cvtepi32_ps(i32_3); - - // Apply the scale - __m128 p0_d = _mm_mul_ps( d_0_1, p0 ); - __m128 p1_d = _mm_mul_ps( d_0_1, p1 ); - __m128 p2_d = _mm_mul_ps( d_2_3, p2 ); - __m128 p3_d = _mm_mul_ps( d_2_3, p3 ); - - // Acummulate - acc_0 = _mm_add_ps(p0_d, acc_0); - acc_1 = _mm_add_ps(p1_d, acc_1); - acc_2 = _mm_add_ps(p2_d, acc_2); - acc_3 = _mm_add_ps(p3_d, acc_3); - } - - sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); -#elif defined(__riscv_v) - size_t vl = qk / 2; - - for (; ib < nb; ++ib) { - // load elements - vuint8m1_t tx = __riscv_vle8_v_u8m1(x[ib].qs, vl); - - vint8m1_t y0 = __riscv_vle8_v_i8m1(y[ib].qs, vl); - vint8m1_t y1 = __riscv_vle8_v_i8m1(y[ib].qs+16, vl); - - // mask and store lower part of x, and then upper part - vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl); - vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); - - vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a); - vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l); - - // subtract offset - vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl); - vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl); - - vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); - vint16m2_t vec_mul2 = __riscv_vwmacc_vv_i16m2(vec_mul1, v1, y1, vl); - - vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); - vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); - - int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); - } - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0xF); - const vector signed int v0 = vec_splats((int32_t)0); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - const vector signed char v8 = vec_splats((signed char)0x8); - - vector float vsumf0 = vec_splats(0.0f); - -#pragma GCC unroll 8 - for (; ib < nb; ++ib) { - __builtin_prefetch(x[ib].qs, 0, 1); - __builtin_prefetch(y[ib].qs, 0, 1); - - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); - vector float vd = vec_mul(vxd, vyd); - - vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); - vector signed char q8y0 = vec_xl( 0, y[ib].qs); - vector signed char q8y1 = vec_xl(16, y[ib].qs); - - vector signed char q4x0 = vec_and(qxs, lowMask); - vector signed char q4x1 = vec_sr(qxs, v4); - - q4x0 = vec_sub(q4x0, v8); - q4x1 = vec_sub(q4x1, v8); - - vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0)); - vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1)); - - vector signed int vsumi0 = v0; - - vsumi0 = vec_sum4s(qv0, vsumi0); - vsumi0 = vec_sum4s(qv1, vsumi0); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - } - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - sumf = vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - // Initialize accumulator 
with zeros - __m256 acc = (__m256)__lasx_xvldi(0); - - // Main loop - for (; ib < nb; ++ib) { - /* Compute combined scale for the block */ - const __m256 d = __lasx_xvreplfr2vr_s( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) ); - - __m256i qx = bytes_from_nibbles_32(x[ib].qs); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. - const __m256i off = __lasx_xvreplgr2vr_b( 8 ); - qx = __lasx_xvsub_b( qx, off ); - - __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0); - - const __m256 q = mul_sum_i8_pairs_float(qx, qy); - - /* Multiply q with scale and accumulate */ - acc = __lasx_xvfmadd_s( d, q, acc ); - } - - sumf = hsum_float_8(acc); - -#elif defined(__loongarch_sx) - // set constants - const __m128i low_mask = __lsx_vreplgr2vr_b(0xF); - const __m128i off = __lsx_vreplgr2vr_b(8); - - // Initialize accumulator with zeros - __m128 acc_0 = (__m128)__lsx_vldi(0); - __m128 acc_1 = (__m128)__lsx_vldi(0); - __m128 acc_2 = (__m128)__lsx_vldi(0); - __m128 acc_3 = (__m128)__lsx_vldi(0); - - for (; ib + 1 < nb; ib += 2) { - - // Compute combined scale for the block 0 and 1 - const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) ); - - const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0); - - __m128i bx_0 = __lsx_vand_v(low_mask, tmp_0_1); - __m128i by_0 = __lsx_vld((const __m128i *)y[ib].qs, 0); - bx_0 = __lsx_vsub_b(bx_0, off); - const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); - - __m128i bx_1 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_0_1, 4)); - __m128i by_1 = __lsx_vld((const __m128i *)(y[ib].qs + 16), 0); - bx_1 = __lsx_vsub_b(bx_1, off); - const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); - - //_mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0); - //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0); - - // Compute combined scale for the block 2 and 3 - const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) ); - - const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0); - - __m128i bx_2 = __lsx_vand_v(low_mask, tmp_2_3); - __m128i by_2 = __lsx_vld((const __m128i *)y[ib + 1].qs, 0); - bx_2 = __lsx_vsub_b(bx_2, off); - const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); - - __m128i bx_3 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_2_3, 4)); - __m128i by_3 = __lsx_vld((const __m128i *)(y[ib + 1].qs + 16), 0); - bx_3 = __lsx_vsub_b(bx_3, off); - const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); - - // Convert int32_t to float - __m128 p0 = __lsx_vffint_s_w(i32_0); - __m128 p1 = __lsx_vffint_s_w(i32_1); - __m128 p2 = __lsx_vffint_s_w(i32_2); - __m128 p3 = __lsx_vffint_s_w(i32_3); - - // Apply the scale - __m128 p0_d = __lsx_vfmul_s( d_0_1, p0 ); - __m128 p1_d = __lsx_vfmul_s( d_0_1, p1 ); - __m128 p2_d = __lsx_vfmul_s( d_2_3, p2 ); - __m128 p3_d = __lsx_vfmul_s( d_2_3, p3 ); - - // Acummulate - acc_0 = __lsx_vfadd_s(p0_d, acc_0); - acc_1 = __lsx_vfadd_s(p1_d, acc_1); - acc_2 = __lsx_vfadd_s(p2_d, acc_2); - acc_3 = __lsx_vfadd_s(p3_d, acc_3); - } - - sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); -#elif defined(__VXE__) || defined(__VXE2__) - __vector float acc = vec_splats(0.0f); - - const __vector uint8_t v_m = vec_splats((const uint8_t)0x0F); - const __vector int8_t v_s = vec_splats( (const int8_t)0x08); - - for (; ib < nb; ++ib) { - const __vector uint8_t v_x = vec_xl(0, x[ib].qs); - const __vector int8_t v_xl = (const __vector int8_t)(v_x & v_m); - const __vector int8_t v_xh 
= (const __vector int8_t)(v_x >> 4); - - const __vector int8_t v_xls = vec_sub(v_xl, v_s); - const __vector int8_t v_xhs = vec_sub(v_xh, v_s); - - const __vector int8_t v_yl = vec_xl(0 , y[ib].qs); - const __vector int8_t v_yh = vec_xl(QK8_0/2, y[ib].qs); - - const __vector int16_t v_xylso = vec_mulo(v_xls, v_yl); - const __vector int16_t v_xylse = vec_mule(v_xls, v_yl); - const __vector int16_t v_xyhso = vec_mulo(v_xhs, v_yh); - const __vector int16_t v_xyhse = vec_mule(v_xhs, v_yh); - - __vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_); - - const __vector float v_xy = vec_float(vec_unpackh(v_xy_)); - const __vector float v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); - - acc = vec_madd(v_xy, v_d, acc); - } - - sumf = acc[0] + acc[1] + acc[2] + acc[3]; -#endif - for (; ib < nb; ++ib) { - int sumi0 = 0; - int sumi1 = 0; - - for (int j = 0; j < qk/2; ++j) { - const int v0 = (x[ib].qs[j] & 0x0F) - 8; - const int v1 = (x[ib].qs[j] >> 4) - 8; - - sumi0 += (v0 * y[ib].qs[j]); - sumi1 += (v1 * y[ib].qs[j + qk/2]); - } - - int sumi = sumi0 + sumi1; - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); - } - - *s = sumf; -} - -void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - const int qk = QK8_1; - const int nb = n / qk; - - assert(n % qk == 0); -#if defined(__ARM_FEATURE_MATMUL_INT8) - assert((nrc == 2) || (nrc == 1)); -#else - assert(nrc == 1); -#endif - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q4_1 * GGML_RESTRICT x = vx; - const block_q8_1 * GGML_RESTRICT y = vy; - -#if defined(__ARM_FEATURE_MATMUL_INT8) - if (nrc == 2) { - const block_q4_1 * GGML_RESTRICT vx0 = vx; - const block_q4_1 * GGML_RESTRICT vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx); - const block_q8_1 * GGML_RESTRICT vy0 = vy; - const block_q8_1 * GGML_RESTRICT vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by); - - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t summs0 = vdupq_n_f32(0.0f); - - for (int i = 0; i < nb; i++) { - const block_q4_1 * GGML_RESTRICT b_x0 = &vx0[i]; - const block_q4_1 * GGML_RESTRICT b_x1 = &vx1[i]; - const block_q8_1 * GGML_RESTRICT b_y0 = &vy0[i]; - const block_q8_1 * GGML_RESTRICT b_y1 = &vy1[i]; - - float32_t summs_t[4] = { - GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s), - GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s), - GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s), - GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s) - }; - summs0 = vaddq_f32(summs0, vld1q_f32(summs_t)); - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - - const uint8x16_t v0_0 = vld1q_u8(b_x0->qs); - const uint8x16_t v0_1 = vld1q_u8(b_x1->qs); - - // 4-bit -> 8-bit - const int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // load y - const int8x16_t y0_l = vld1q_s8(b_y0->qs); - const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); - const int8x16_t y1_l = vld1q_s8(b_y1->qs); - const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); - - // mmla into int32x4_t - float32_t _scale[4] = { - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), - 
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d) - }; - float32x4_t scale = vld1q_f32(_scale); - - int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); - int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); - - int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); - int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); - - int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); - int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); - - int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); - int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); - sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), - l1, r1)), l2, r2)), l3, r3))), scale); - } - - float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2); - float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); - - sumv2 = vaddq_f32(sumv2, summs0); - - vst1_f32(s, vget_low_f32 (sumv2)); - vst1_f32(s + bs, vget_high_f32(sumv2)); - - return; - } -#endif - - int ib = 0; - float sumf = 0; - - // TODO: add WASM SIMD -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - float summs = 0; - - for (; ib + 1 < nb; ib += 2) { - const block_q4_1 * GGML_RESTRICT x0 = &x[ib + 0]; - const block_q4_1 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0]; - const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1]; - - summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s); - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - - // dot product into int32x4_t - const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); - const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; -#elif defined(__AVX2__) || defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - float summs = 0; - - // Main loop - for (; ib < nb; ++ib) { - const float d0 = GGML_FP16_TO_FP32(x[ib].d); - const float d1 = GGML_FP16_TO_FP32(y[ib].d); - - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); - - const __m256 d0v = _mm256_set1_ps( d0 ); - const __m256 d1v = _mm256_set1_ps( d1 ); - - // Compute combined scales - const __m256 d0d1 = _mm256_mul_ps( 
d0v, d1v ); - - // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes - const __m256i qx = bytes_from_nibbles_32(x[ib].qs); - const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[ib].qs ); - - const __m256 xy = mul_sum_us8_pairs_float(qx, qy); - - // Accumulate d0*d1*x*y -#if defined(__AVX2__) - acc = _mm256_fmadd_ps( d0d1, xy, acc ); -#else - acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc ); -#endif - } - - sumf = hsum_float_8(acc) + summs; -#elif defined(__riscv_v) - size_t vl = qk / 2; - - for (; ib < nb; ++ib) { - // load elements - vuint8m1_t tx = __riscv_vle8_v_u8m1(x[ib].qs, vl); - - vint8m1_t y0 = __riscv_vle8_v_i8m1(y[ib].qs, vl); - vint8m1_t y1 = __riscv_vle8_v_i8m1(y[ib].qs+16, vl); - - // mask and store lower part of x, and then upper part - vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl); - vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); - - vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a); - vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l); - - vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); - vint16m2_t vec_mul2 = __riscv_vwmacc_vv_i16m2(vec_mul1, v1, y1, vl); - - vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); - vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); - - int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); - } - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0xF); - const vector signed int v0 = vec_splats((int32_t)0); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - - vector float vsumf0 = vec_splats(0.0f); - -#pragma GCC unroll 4 - for (; ib < nb; ++ib) { - __builtin_prefetch(x[ib].qs, 0, 1); - __builtin_prefetch(y[ib].qs, 0, 1); - - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); - vector float vd = vec_mul(vxd, vyd); - - vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m)); - vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f}; - vsumf0 = vec_madd(vxmin, vys, vsumf0); - - vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); - vector signed char q8y0 = vec_xl( 0, y[ib].qs); - vector signed char q8y1 = vec_xl(16, y[ib].qs); - - vector unsigned char q4x0 = (vector unsigned char)vec_and(qxs, lowMask); - vector unsigned char q4x1 = (vector unsigned char)vec_sr(qxs, v4); - - vector signed int vsumi0 = v0; - - vsumi0 = vec_msum(q8y0, q4x0, vsumi0); - vsumi0 = vec_msum(q8y1, q4x1, vsumi0); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - } - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - sumf = vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - // Initialize accumulator with zeros - __m256 acc = (__m256)__lasx_xvldi(0); - - float summs = 0; - - // Main loop - for (; ib < nb; ++ib) { - const float d0 = GGML_FP16_TO_FP32(x[ib].d); - const float d1 = GGML_FP16_TO_FP32(y[ib].d); - - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); - - const __m256 d0v = __lasx_xvreplfr2vr_s( d0 ); - const __m256 d1v = __lasx_xvreplfr2vr_s( d1 ); - - // Compute combined scales - const __m256 d0d1 = __lasx_xvfmul_s( d0v, d1v ); - - // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes - const __m256i qx = bytes_from_nibbles_32(x[ib].qs); - const __m256i qy = __lasx_xvld( (const __m256i *)y[ib].qs, 0); - - 
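Because q4_1 stores unsigned nibbles plus an explicit per-block minimum `m`, the block product factors as `d_x*d_y * sum_j q4[j]*q8[j] + m_x*s_y`, where `s_y = d_y * sum_j q8[j]` was precomputed during q8_1 quantization; this is why `mul_sum_us8_pairs_float` below needs no per-lane offset subtraction, unlike the q4_0 path. A scalar sketch of one block (the helper name is illustrative):

```c
#include <stdint.h>

// One q4_1 x q8_1 block: dx*dy * sum(q4*q8) + mx*sy, where sy = dy*sum(q8)
// is carried in the q8_1 block so the +m offset folds into one multiply-add.
static float q4_1_q8_1_block_sketch(const uint8_t * q4, const int8_t * q8,
                                    float dx, float mx, float dy, float sy) {
    int sumi = 0;
    for (int j = 0; j < 16; ++j) {
        sumi += (q4[j] & 0x0F) * q8[j];      // low nibbles vs first 16 int8
        sumi += (q4[j] >>   4) * q8[j + 16]; // high nibbles vs last 16 int8
    }
    return dx*dy*(float)sumi + mx*sy;
}
```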
const __m256 xy = mul_sum_us8_pairs_float(qx, qy); - - // Accumulate d0*d1*x*y - acc = __lasx_xvfmadd_s( d0d1, xy, acc ); - } - - sumf = hsum_float_8(acc) + summs; -#elif defined(__VXE__) || defined(__VXE2__) - float summs = 0; - float32x4_t acc = vec_splats(0.0f); - - const uint8x16_t v_m = vec_splat_u8(0x0F); - -#pragma GCC unroll 4 - for (; ib < nb; ++ib) { - __builtin_prefetch(x[ib].qs, 0, 1); - __builtin_prefetch(y[ib].qs, 0, 1); - - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); - - const uint8x16_t v_x = vec_xl(0, x[ib].qs); - const int8x16_t v_xl = (const int8x16_t)(v_x & v_m); - const int8x16_t v_xh = (const int8x16_t)(v_x >> 4); - - const int8x16_t v_yl = vec_xl(0 , y[ib].qs); - const int8x16_t v_yh = vec_xl(QK8_1/2, y[ib].qs); - - const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); - const float32x4_t v_xy = vec_float(v_xy_); - - const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); - - acc = vec_madd(v_xy, v_d, acc); - } - - sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs; -#endif - for (; ib < nb; ++ib) { - int sumi0 = 0; - int sumi1 = 0; - - for (int j = 0; j < qk/2; ++j) { - const int v0 = (x[ib].qs[j] & 0x0F); - const int v1 = (x[ib].qs[j] >> 4); - - sumi0 += (v0 * y[ib].qs[j]); - sumi1 += (v1 * y[ib].qs[j + qk/2]); - } - - int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); - } - - *s = sumf; -} - -void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - const int qk = QK8_0; - const int nb = n / qk; - - int ib = 0; - float sumf = 0; - - assert(n % qk == 0); - assert(qk == QK5_0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q5_0 * GGML_RESTRICT x = vx; - const block_q8_0 * GGML_RESTRICT y = vy; - -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - uint32_t qh0; - uint32_t qh1; - - uint64_t tmp0[4]; - uint64_t tmp1[4]; - - for (; ib + 1 < nb; ib += 2) { - const block_q5_0 * GGML_RESTRICT x0 = &x[ib]; - const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; - const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - - // extract the 5th bit via lookup table ((!b) << 4) - memcpy(&qh0, x0->qh, sizeof(qh0)); - memcpy(&qh1, x1->qh, sizeof(qh1)); - - tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; - tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF]; - tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; - tmp0[3] = table_b2b_1[(qh0 >> 24) ]; - - tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; - tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; - tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; - tmp1[3] = table_b2b_1[(qh1 >> 24) ]; - - const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); - const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); - const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); - const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 
4)); - - // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) - const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0); - const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0); - const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1); - const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); -#elif defined __wasm_simd128__ - v128_t sumv = wasm_f32x4_splat(0.0f); - - uint32_t qh_; - uint64_t tmp[4]; - - // TODO: check if unrolling this is better - for (; ib < nb; ++ib) { - const block_q5_0 * GGML_RESTRICT x0 = &x[ib]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; - - const v128_t m4b = wasm_i8x16_splat(0x0F); - - // extract the 5th bit - memcpy(&qh_, x0->qh, sizeof(qh_)); - - tmp[0] = table_b2b_1[(qh_ >> 0) & 0xFF]; - tmp[1] = table_b2b_1[(qh_ >> 8) & 0xFF]; - tmp[2] = table_b2b_1[(qh_ >> 16) & 0xFF]; - tmp[3] = table_b2b_1[(qh_ >> 24) ]; - - const v128_t qhl = wasm_v128_load(tmp + 0); - const v128_t qhh = wasm_v128_load(tmp + 2); - - const v128_t v0 = wasm_v128_load(x0->qs); - - // 4-bit -> 8-bit - const v128_t v0l = wasm_v128_and (v0, m4b); - const v128_t v0h = wasm_u8x16_shr(v0, 4); - - // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) - const v128_t v0lf = wasm_i8x16_sub(v0l, qhl); - const v128_t v0hf = wasm_i8x16_sub(v0h, qhh); - - // load y - const v128_t v1l = wasm_v128_load(y0->qs); - const v128_t v1h = wasm_v128_load(y0->qs + 16); - - // int8x16 -> int16x8 - const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); - const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); - const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); - const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); - - const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); - const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); - const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); - const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); - - // dot product - sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( - wasm_i32x4_add( - wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), - wasm_i32x4_dot_i16x8(v0lfh, v1lh)), - wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), - wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), - wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d)))); - } - - sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + - wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); -#elif defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (; ib < nb; ++ib) { - /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); - - __m256i qx = bytes_from_nibbles_32(x[ib].qs); - __m256i bxhi = bytes_from_bits_32(x[ib].qh); - bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0)); - qx = _mm256_or_si256(qx, bxhi); - - __m256i qy = 
_mm256_loadu_si256((const __m256i *)y[ib].qs); - - const __m256 q = mul_sum_i8_pairs_float(qx, qy); - - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps(d, q, acc); - } - - sumf = hsum_float_8(acc); -#elif defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - __m128i mask = _mm_set1_epi8((char)0xF0); - - // Main loop - for (; ib < nb; ++ib) { - /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); - - __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs); - const __m256i bxhi = bytes_from_bits_32(x[ib].qh); - __m128i bxhil = _mm256_castsi256_si128(bxhi); - __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); - bxhil = _mm_andnot_si128(bxhil, mask); - bxhih = _mm_andnot_si128(bxhih, mask); - __m128i bxl = _mm256_castsi256_si128(bx_0); - __m128i bxh = _mm256_extractf128_si256(bx_0, 1); - bxl = _mm_or_si128(bxl, bxhil); - bxh = _mm_or_si128(bxh, bxhih); - bx_0 = MM256_SET_M128I(bxh, bxl); - - const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0); - - /* Multiply q with scale and accumulate */ - acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc); - } - - sumf = hsum_float_8(acc); -#elif defined(__riscv_v) - size_t vl; - size_t vlenb = __riscv_vlenb(); - - for (; ib < nb; ++ib) { - vl = qk / 2; - vuint8m1_t v0 = __riscv_vle8_v_u8m1(x[ib].qs, vl); - vint8m1_t v0l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(v0, 0x0F, vl)); - vint8m1_t v0h = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(v0, 4, vl)); - vint8m2_t v0c; - if (vlenb == 16) { - v0c = __riscv_vcreate_v_i8m1_i8m2(v0l, v0h); - } else { - v0l = __riscv_vslideup_vx_i8m1(v0l, v0h, 16, 32); - v0c = __riscv_vlmul_ext_v_i8m1_i8m2(v0l); - } - - vl = qk; - vbool4_t qh = __riscv_vlm_v_b4(x[ib].qh, vl); - qh = __riscv_vmnand_mm_b4(qh, qh, vl); - vint8m2_t v0f = __riscv_vsub_vx_i8m2_mu(qh, v0c, v0c, 0x10, vl); - vint8m2_t v1 = __riscv_vle8_v_i8m2(y[ib].qs, vl); - vint16m4_t mul = __riscv_vwmul_vv_i16m4(v0f, v1, vl); - vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, vl); - vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl); - int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum); - - sumf += (GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)) * sumi; - } - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0xF); - const vector unsigned char v4 = vec_splats((unsigned char)4); - - vector float vsumf0 = vec_splats(0.0f); - -#pragma GCC unroll 4 - for (; ib < nb; ++ib) { - __builtin_prefetch(x[ib].qs, 0, 1); - __builtin_prefetch(y[ib].qs, 0, 1); - - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); - vector float vd = vec_mul(vxd, vyd); - - vector signed long long aux64x2_0 = {(uint64_t)(table_b2b_1[x[ib].qh[0]]), (uint64_t)(table_b2b_1[x[ib].qh[1]])}; - vector signed long long aux64x2_1 = {(uint64_t)(table_b2b_1[x[ib].qh[2]]), (uint64_t)(table_b2b_1[x[ib].qh[3]])}; - - vector signed char qh0 = (vector signed char)aux64x2_0; - vector signed char qh1 = (vector signed char)aux64x2_1; - - vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); - - vector signed char q5x0 = vec_sub(vec_and (qxs, lowMask), qh0); - vector signed char q5x1 = vec_sub(vec_sr(qxs, v4), qh1); - - vector signed char q8y0 = vec_xl( 0, y[ib].qs); - vector signed char q8y1 = vec_xl( 16, y[ib].qs); - - vector signed short qv0 = 
vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0)); - vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1)); - - qv0 = vec_add(qv0, qv1); - - vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0)); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - } - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - sumf = vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - // Initialize accumulator with zeros - __m256 acc = (__m256)__lasx_xvldi(0); - - // Main loop - for (; ib < nb; ++ib) { - /* Compute combined scale for the block */ - const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); //FIXME - - __m256i qx = bytes_from_nibbles_32(x[ib].qs); - __m256i bxhi = bytes_from_bits_32(x[ib].qh); - bxhi = __lasx_xvandn_v(bxhi, __lasx_xvreplgr2vr_b((char)0xF0)); - qx = __lasx_xvor_v(qx, bxhi); - - __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0); - - const __m256 q = mul_sum_i8_pairs_float(qx, qy); - - /* Multiply q with scale and accumulate */ - acc = __lasx_xvfmadd_s(d, q, acc); - } - - sumf = hsum_float_8(acc); -#endif - for (; ib < nb; ++ib) { - uint32_t qh; - memcpy(&qh, x[ib].qh, sizeof(qh)); - - int sumi0 = 0; - int sumi1 = 0; - - for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - - const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16); - const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16); - - sumi0 += (x0 * y[ib].qs[j]); - sumi1 += (x1 * y[ib].qs[j + qk/2]); - } - - int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi; - } - - *s = sumf; -} - -void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - const int qk = QK8_1; - const int nb = n / qk; - - int ib = 0; - float sumf = 0; - - assert(n % qk == 0); - assert(qk == QK5_1); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q5_1 * GGML_RESTRICT x = vx; - const block_q8_1 * GGML_RESTRICT y = vy; - -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - float summs0 = 0.0f; - float summs1 = 0.0f; - - uint32_t qh0; - uint32_t qh1; - - uint64_t tmp0[4]; - uint64_t tmp1[4]; - - for (; ib + 1 < nb; ib += 2) { - const block_q5_1 * GGML_RESTRICT x0 = &x[ib]; - const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; - const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1]; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - - summs0 += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s); - summs1 += GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s); - - // extract the 5th bit via lookup table ((b) << 4) - memcpy(&qh0, x0->qh, sizeof(qh0)); - memcpy(&qh1, x1->qh, sizeof(qh1)); - - tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF]; - tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF]; - tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF]; - tmp0[3] = table_b2b_0[(qh0 >> 24) ]; - - tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF]; - tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF]; - tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF]; - tmp1[3] = table_b2b_0[(qh1 >> 24) ]; - - const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); - const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); - const int8x16_t qhl1 = 
vld1q_s8((const int8_t *)(tmp1 + 0)); - const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // add high bit - const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0); - const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0); - const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1); - const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; -#elif defined __wasm_simd128__ - v128_t sumv = wasm_f32x4_splat(0.0f); - - float summs = 0.0f; - - uint32_t qh_; - uint64_t tmp[4]; - - // TODO: check if unrolling this is better - for (; ib < nb; ++ib) { - const block_q5_1 * GGML_RESTRICT x0 = &x[ib]; - const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; - - summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s); - - const v128_t m4b = wasm_i8x16_splat(0x0F); - - // extract the 5th bit - memcpy(&qh_, x0->qh, sizeof(qh_)); - - tmp[0] = table_b2b_0[(qh_ >> 0) & 0xFF]; - tmp[1] = table_b2b_0[(qh_ >> 8) & 0xFF]; - tmp[2] = table_b2b_0[(qh_ >> 16) & 0xFF]; - tmp[3] = table_b2b_0[(qh_ >> 24) ]; - - const v128_t qhl = wasm_v128_load(tmp + 0); - const v128_t qhh = wasm_v128_load(tmp + 2); - - const v128_t v0 = wasm_v128_load(x0->qs); - - // 4-bit -> 8-bit - const v128_t v0l = wasm_v128_and (v0, m4b); - const v128_t v0h = wasm_u8x16_shr(v0, 4); - - // add high bit - const v128_t v0lf = wasm_v128_or(v0l, qhl); - const v128_t v0hf = wasm_v128_or(v0h, qhh); - - // load y - const v128_t v1l = wasm_v128_load(y0->qs); - const v128_t v1h = wasm_v128_load(y0->qs + 16); - - // int8x16 -> int16x8 - const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); - const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); - const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); - const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); - - const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); - const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); - const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); - const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); - - // dot product - sumv = wasm_f32x4_add(sumv, - wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add( - wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), - wasm_i32x4_dot_i16x8(v0lfh, v1lh)), - wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), - wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), - wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d)))); - } - - sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + - wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs; -#elif 
defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - float summs = 0.0f; - - // Main loop - for (; ib < nb; ++ib) { - const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d)); - - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); - - __m256i qx = bytes_from_nibbles_32(x[ib].qs); - __m256i bxhi = bytes_from_bits_32(x[ib].qh); - bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); - qx = _mm256_or_si256(qx, bxhi); - - const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d)); - const __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs); - - const __m256 q = mul_sum_us8_pairs_float(qx, qy); - - acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc); - } - - sumf = hsum_float_8(acc) + summs; -#elif defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - __m128i mask = _mm_set1_epi8(0x10); - - float summs = 0.0f; - - // Main loop - for (; ib < nb; ++ib) { - const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d)); - - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); - - __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs); - const __m256i bxhi = bytes_from_bits_32(x[ib].qh); - __m128i bxhil = _mm256_castsi256_si128(bxhi); - __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); - bxhil = _mm_and_si128(bxhil, mask); - bxhih = _mm_and_si128(bxhih, mask); - __m128i bxl = _mm256_castsi256_si128(bx_0); - __m128i bxh = _mm256_extractf128_si256(bx_0, 1); - bxl = _mm_or_si128(bxl, bxhil); - bxh = _mm_or_si128(bxh, bxhih); - bx_0 = MM256_SET_M128I(bxh, bxl); - - const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d)); - const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs); - - const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0); - - acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc); - } - - sumf = hsum_float_8(acc) + summs; -#elif defined(__riscv_v) - size_t vl; - size_t vlenb = __riscv_vlenb(); - - for (; ib < nb; ++ib) { - vl = qk / 2; - vuint8m1_t v0 = __riscv_vle8_v_u8m1(x[ib].qs, vl); - vint8m1_t v0l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(v0, 0x0F, vl)); - vint8m1_t v0h = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(v0, 4, vl)); - vint8m2_t v0c; - if (vlenb == 16) { - v0c = __riscv_vcreate_v_i8m1_i8m2(v0l, v0h); - } else { - v0l = __riscv_vslideup_vx_i8m1(v0l, v0h, 16, 32); - v0c = __riscv_vlmul_ext_v_i8m1_i8m2(v0l); - } - - vl = qk; - vbool4_t qh = __riscv_vlm_v_b4(x[ib].qh, vl); - vint8m2_t v0f = __riscv_vor_vx_i8m2_mu(qh, v0c, v0c, 0x10, vl); - vint8m2_t v1 = __riscv_vle8_v_i8m2(y[ib].qs, vl); - vint16m4_t mul = __riscv_vwmul_vv_i16m4(v0f, v1, vl); - vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, vl); - vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl); - int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum); - - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); - } - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0xF); - const vector signed int v0 = vec_splats((int32_t)0); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - - vector float vsumf0 = vec_splats(0.0f); - -#pragma GCC unroll 4 - for (; ib < nb; ++ib) { - __builtin_prefetch(x[ib].qs, 0, 1); - __builtin_prefetch(y[ib].qs, 0, 1); - - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); - vector float vd = vec_mul(vxd, vyd); - - vector float vxmin = 
vec_splats(GGML_FP16_TO_FP32(x[ib].m)); - vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.f, 0.f, 0.f}; - vsumf0 = vec_madd(vxmin, vys, vsumf0); - - vector unsigned long long aux64x2_0 = {(uint64_t)(table_b2b_0[x[ib].qh[0]]), (uint64_t)(table_b2b_0[x[ib].qh[1]])}; - vector unsigned long long aux64x2_1 = {(uint64_t)(table_b2b_0[x[ib].qh[2]]), (uint64_t)(table_b2b_0[x[ib].qh[3]])}; - - vector signed char qh0 = (vector signed char)aux64x2_0; - vector signed char qh1 = (vector signed char)aux64x2_1; - - vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); - - vector unsigned char q5x0 = (vector unsigned char)vec_or(vec_and(qxs, lowMask), qh0); - vector unsigned char q5x1 = (vector unsigned char)vec_or(vec_sr(qxs, v4), qh1); - - vector signed char q8y0 = vec_xl( 0, y[ib].qs); - vector signed char q8y1 = vec_xl( 16, y[ib].qs); - - vector signed int vsumi0 = v0; - - vsumi0 = vec_msum(q8y0, q5x0, vsumi0); - vsumi0 = vec_msum(q8y1, q5x1, vsumi0); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - } - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - sumf = vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - // Initialize accumulator with zeros - __m256 acc = (__m256)__lasx_xvldi(0); - - float summs = 0.0f; - - // Main loop - for (; ib < nb; ++ib) { - const __m256 dx = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d)); - - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); - - __m256i qx = bytes_from_nibbles_32(x[ib].qs); - __m256i bxhi = bytes_from_bits_32(x[ib].qh); - bxhi = __lasx_xvand_v(bxhi, __lasx_xvreplgr2vr_b(0x10)); - qx = __lasx_xvor_v(qx, bxhi); - - const __m256 dy = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib].d)); - const __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0); - - const __m256 q = mul_sum_us8_pairs_float(qx, qy); - - acc = __lasx_xvfmadd_s(q, __lasx_xvfmul_s(dx, dy), acc); - } - - sumf = hsum_float_8(acc) + summs; -#endif - for (; ib < nb; ++ib) { - uint32_t qh; - memcpy(&qh, x[ib].qh, sizeof(qh)); - - int sumi0 = 0; - int sumi1 = 0; - - for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; - - const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0; - const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1; - - sumi0 += (x0 * y[ib].qs[j]); - sumi1 += (x1 * y[ib].qs[j + qk/2]); - } - - int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); - } - - *s = sumf; -} - -void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - const int qk = QK8_0; - const int nb = n / qk; - - assert(n % qk == 0); -#if defined(__ARM_FEATURE_MATMUL_INT8) - assert((nrc == 2) || (nrc == 1)); -#else - assert(nrc == 1); -#endif - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q8_0 * GGML_RESTRICT x = vx; - const block_q8_0 * GGML_RESTRICT y = vy; - -#if defined(__ARM_FEATURE_MATMUL_INT8) - if (nrc == 2) { - const block_q8_0 * GGML_RESTRICT vx0 = vx; - const block_q8_0 * GGML_RESTRICT vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx); - const block_q8_0 * GGML_RESTRICT vy0 = vy; - const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); - - float32x4_t sumv0 = vdupq_n_f32(0.0f); - - for (int i = 0; i < nb; i++) { - const block_q8_0 * GGML_RESTRICT b_x0 = &vx0[i]; - 
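// For reference: this nrc == 2 path computes a 2x2 tile of block dot products
// per iteration. vzip1q_s64/vzip2q_s64 pair up the 64-bit halves of the two
// x rows and the two y rows so that each vmmlaq_s32 (the int8 matrix
// multiply-accumulate) produces all four pairings at once; the int32 lanes
// come out in the order
//     { x0.y0, x0.y1, x1.y0, x1.y1 }
// which matches the order of the per-pair scales in `_scale` below. The
// vextq/vzip1q shuffle after the loop then stores one column of the tile to
// s and the other column to s + bs.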
const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i]; - - const block_q8_0 * GGML_RESTRICT b_x1 = &vx1[i]; - const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i]; - - const int8x16_t x0_l = vld1q_s8(b_x0->qs); - const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16); - const int8x16_t x1_l = vld1q_s8(b_x1->qs); - const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16); - - // load y - const int8x16_t y0_l = vld1q_s8(b_y0->qs); - const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); - const int8x16_t y1_l = vld1q_s8(b_y1->qs); - const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); - - float32_t _scale[4] = { - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d) - }; - float32x4_t scale = vld1q_f32(_scale); - - int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); - int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); - - int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); - int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); - - int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); - int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); - - int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); - int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); - - sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), - l1, r1)), l2, r2)), l3, r3))), scale); - } - - float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2); - float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); - - vst1_f32(s, vget_low_f32 (sumv2)); - vst1_f32(s + bs, vget_high_f32(sumv2)); - - return; - } -#endif - - int ib = 0; - float sumf = 0; - -#if defined(__ARM_FEATURE_SVE) - svfloat32_t sumv0 = svdup_n_f32(0.0f); - svfloat32_t sumv1 = svdup_n_f32(0.0f); - - const int vector_length = ggml_cpu_get_sve_cnt()*8; - - //VLA Implementation for SVE - switch (vector_length) { - case 128: - { - // predicate for activating lanes for 16 Int8 elements - const svbool_t ph16 = svptrue_pat_b8 (SV_VL16); - const svbool_t pl16 = svptrue_pat_b32(SV_VL4); - - for (; ib + 1 < nb; ib += 2) { - const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; - const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; - const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - - // load x - const svint8_t qx0_0 = svld1_s8(ph16, x0->qs); - const svint8_t qx0_1 = svld1_s8(ph16, x0->qs+16); - const svint8_t qx1_0 = svld1_s8(ph16, x1->qs); - const svint8_t qx1_1 = svld1_s8(ph16, x1->qs+16); - - // load y - const svint8_t qy0_0 = svld1_s8(ph16, y0->qs); - const svint8_t qy0_1 = svld1_s8(ph16, y0->qs+16); - const svint8_t qy1_0 = svld1_s8(ph16, y1->qs); - const svint8_t qy1_1 = svld1_s8(ph16, y1->qs+16); - - sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16, - svdot_s32(svdup_n_s32(0), qx0_0, qy0_0), - svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16, - svdot_s32(svdup_n_s32(0), qx1_0, qy1_0), - svdot_s32(svdup_n_s32(0), 
qx1_1, qy1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1)); - } break; - case 256: - { - //printf("sve256"); - for (; ib + 1 < nb; ib += 2) { - const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; - const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; - const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - - // load x - const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); - const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs); - - // load y - const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); - const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); - - sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), - svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), - svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); - } break; - case 512: - { - // predicate for activating high 256 bit - const svbool_t ph32 = svptrue_pat_b8(SV_VL32); - // predicate for activating low 256 bit - const svbool_t pl32 = svnot_b_z(svptrue_b8(), ph32); - - // predicate for activating high lanes for 8 float32 elements - const svbool_t ph8 = svptrue_pat_b32(SV_VL8); - // predicate for activating low lanes for 8 float32 elements - const svbool_t pl8 = svnot_b_z(svptrue_b32(), ph8); - - svfloat32_t sumv00 = svdup_n_f32(0.0f); - - for (; ib + 1 < nb; ib += 2) { - const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; - const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; - const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - - //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits - // and add them to make one 64 element vector - // load x - const svint8_t qx_32 = svld1_s8(ph32, x0->qs); - svint8_t qx_64 = svld1_s8(pl32, x0->qs + 2); - - qx_64 = svadd_s8_x(svptrue_b8(), qx_32, qx_64); - - // load y - const svint8_t qy_32 = svld1_s8(ph32, y0->qs); - svint8_t qy_64 = svld1_s8(pl32, y0->qs + 2); - - qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64); - - // scale creation - const float32_t deq1 = GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d); - const float32_t deq2 = GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d); - - // duplicate deq1 in first half of vector and deq2 in second half of vector - const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2); - - const svfloat32_t sumvt = svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx_64, qy_64)); - - sumv00 = svmla_f32_m(svptrue_b32(), sumv00, sumvt, temp); - } - - sumf = svaddv_f32(svptrue_b32(), sumv00); - break; - } - default: - assert(false && "Unsupported vector length"); - break; - } -#elif defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - for (; ib + 1 < nb; ib += 2) { - const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; - const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; - const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; - - const int8x16_t x0_0 = vld1q_s8(x0->qs); - const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); - const int8x16_t x1_0 = vld1q_s8(x1->qs); - const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); - - // load y - const int8x16_t y0_0 = vld1q_s8(y0->qs); - const int8x16_t y0_1 = 
vld1q_s8(y0->qs + 16); - const int8x16_t y1_0 = vld1q_s8(y1->qs); - const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), - ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), - ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } - - sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); -#elif defined __wasm_simd128__ - v128_t sumv = wasm_f32x4_splat(0.0f); - - for (; ib < nb; ++ib) { - const block_q8_0 * GGML_RESTRICT x0 = &x[ib]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; - - const v128_t x0_0 = wasm_v128_load(x0->qs); - const v128_t x0_1 = wasm_v128_load(x0->qs + 16); - const v128_t y0_0 = wasm_v128_load(y0->qs); - const v128_t y0_1 = wasm_v128_load(y0->qs + 16); - - // Extend 8-bit to 16-bit - const v128_t x0_0l = wasm_i16x8_extend_low_i8x16(x0_0); - const v128_t x0_0h = wasm_i16x8_extend_high_i8x16(x0_0); - const v128_t x0_1l = wasm_i16x8_extend_low_i8x16(x0_1); - const v128_t x0_1h = wasm_i16x8_extend_high_i8x16(x0_1); - - const v128_t y0_0l = wasm_i16x8_extend_low_i8x16(y0_0); - const v128_t y0_0h = wasm_i16x8_extend_high_i8x16(y0_0); - const v128_t y0_1l = wasm_i16x8_extend_low_i8x16(y0_1); - const v128_t y0_1h = wasm_i16x8_extend_high_i8x16(y0_1); - - // Compute dot products - const v128_t dx0_0 = wasm_i32x4_dot_i16x8(x0_0l, y0_0l); - const v128_t dx0_1 = wasm_i32x4_dot_i16x8(x0_0h, y0_0h); - const v128_t dx1_0 = wasm_i32x4_dot_i16x8(x0_1l, y0_1l); - const v128_t dx1_1 = wasm_i32x4_dot_i16x8(x0_1h, y0_1h); - - // Sum all dot products - const v128_t sum_dots = wasm_i32x4_add(wasm_i32x4_add(dx0_0, dx0_1), wasm_i32x4_add(dx1_0, dx1_1)); - - // Convert to float and accumulate - const float scale = GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d); - sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(sum_dots), wasm_f32x4_splat(scale))); - } - - sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + - wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); -#elif defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (; ib < nb; ++ib) { - // Compute combined scale for the block - const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); - __m256i qx = _mm256_loadu_si256((const __m256i *)x[ib].qs); - __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs); - - const __m256 q = mul_sum_i8_pairs_float(qx, qy); - - // Multiply q with scale and accumulate - acc = _mm256_fmadd_ps( d, q, acc ); - } - - sumf = hsum_float_8(acc); -#elif defined(__AVX__) - __m256 accum = _mm256_setzero_ps(); - - for (; ib + 1 < nb; ib += 2) { - const __m128i qx_1_0 = _mm_loadu_si128((const __m128i *)x[ib].qs); - const __m128i qx_1_1 = _mm_loadu_si128((const __m128i *)x[ib].qs + 1); - const __m128i qx_2_0 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); - const __m128i qx_2_1 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs + 1); - const __m128i qy_1_0 = _mm_loadu_si128((const __m128i *)y[ib].qs); - const __m128i qy_1_1 = _mm_loadu_si128((const __m128i *)y[ib].qs + 1); - const __m128i qy_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); - const __m128i qy_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1); - - const __m256 p = mul_sum_i8_quad_float(qx_1_0, qx_1_1, qx_2_0, 
qx_2_1, qy_1_0, qy_1_1, qy_2_0, qy_2_1); - const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d); - accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum); - } - - sumf = hsum_float_8(accum); -#elif defined(__riscv_v) - size_t vl = qk; - - for (; ib < nb; ++ib) { - // load elements - vint8m2_t bx_0 = __riscv_vle8_v_i8m2(x[ib].qs, vl); - vint8m2_t by_0 = __riscv_vle8_v_i8m2(y[ib].qs, vl); - - vint16m4_t vw_mul = __riscv_vwmul_vv_i16m4(bx_0, by_0, vl); - - vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); - vint32m1_t v_sum = __riscv_vwredsum_vs_i16m4_i32m1(vw_mul, v_zero, vl); - - int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); - - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); - } -#elif defined(__POWER9_VECTOR__) - const vector signed int v0 = vec_splats((int32_t)0); - vector float vsumf0 = vec_splats(0.0f); - -#pragma GCC unroll 8 - for (; ib < nb; ++ib) { - __builtin_prefetch(x[ib].qs, 0, 1); - __builtin_prefetch(y[ib].qs, 0, 1); - - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); - vector float vd = vec_mul(vxd, vyd); - - vector signed char q8x0 = vec_xl( 0, x[ib].qs); - vector signed char q8x1 = vec_xl(16, x[ib].qs); - vector signed char q8y0 = vec_xl( 0, y[ib].qs); - vector signed char q8y1 = vec_xl(16, y[ib].qs); - - vector signed short qv0 = vec_mule(q8x0, q8y0); - vector signed short qv1 = vec_mulo(q8x0, q8y0); - vector signed short qv2 = vec_mule(q8x1, q8y1); - vector signed short qv3 = vec_mulo(q8x1, q8y1); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - - vsumi0 = vec_sum4s(qv0, vsumi0); - vsumi1 = vec_sum4s(qv1, vsumi1); - vsumi0 = vec_sum4s(qv2, vsumi0); - vsumi1 = vec_sum4s(qv3, vsumi1); - - vsumi0 = vec_add(vsumi0, vsumi1); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - } - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - sumf = vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - // Initialize accumulator with zeros - __m256 acc = (__m256)__lasx_xvldi(0); - - // Main loop - for (; ib < nb; ++ib) { - // Compute combined scale for the block - const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); - __m256i qx = __lasx_xvld((const __m256i *)x[ib].qs, 0); - __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0); - - const __m256 q = mul_sum_i8_pairs_float(qx, qy); - - // Multiply q with scale and accumulate - acc = __lasx_xvfmadd_s( d, q, acc ); - } - - sumf = hsum_float_8(acc); -#elif defined(__VXE__) || defined(__VXE2__) - __vector float acc = vec_splats(0.0f); - -#pragma GCC unroll 8 - for (; ib < nb; ++ib) { - __builtin_prefetch(x[ib].qs, 0, 1); - __builtin_prefetch(y[ib].qs, 0, 1); - - const int8x16_t v_xl = vec_xl(0 , x[ib].qs); - const int8x16_t v_xh = vec_xl(QK8_0/2, x[ib].qs); - const int8x16_t v_yl = vec_xl(0 , y[ib].qs); - const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs); - - const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); - const float32x4_t v_xy = vec_float(v_xy_); - const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); - - acc = vec_madd(v_xy, v_d, acc); - } - - sumf = acc[0] + acc[1] + acc[2] + acc[3]; -#endif - for (; ib < nb; ++ib) { - int sumi = 0; - - for (int j = 0; j < qk; j++) { - sumi += x[ib].qs[j]*y[ib].qs[j]; - } - - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); - } - - *s = 
sumf; -} - -void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_tq1_0 * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#if defined(__ARM_NEON) - float sumf = 0.0f; - - uint8_t k_shift[16] = {1, 1, 1, 1, 3, 3, 3, 3, 9, 9, 9, 9, 27, 27, 27, 27}; - - const uint8x16_t shift = vld1q_u8(k_shift); - - for (int i = 0; i < nb; ++i) { -#if defined(__ARM_FEATURE_DOTPROD) - int32x4_t sumi0 = vdupq_n_s32(0); - int32x4_t sumi1 = vdupq_n_s32(0); -#else - int16x8_t sumi0 = vdupq_n_s16(0); - int16x8_t sumi1 = vdupq_n_s16(0); -#endif - - // first 32 bytes of 5 elements - { - uint8x16_t qx0 = vld1q_u8(x[i].qs + 0); - uint8x16_t qx1 = vld1q_u8(x[i].qs + 16); - uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(3)); - uint8x16_t qx3 = vmulq_u8(qx1, vdupq_n_u8(3)); - uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(9)); - uint8x16_t qx5 = vmulq_u8(qx1, vdupq_n_u8(9)); - uint8x16_t qx6 = vmulq_u8(qx0, vdupq_n_u8(27)); - uint8x16_t qx7 = vmulq_u8(qx1, vdupq_n_u8(27)); - uint8x16_t qx8 = vmulq_u8(qx0, vdupq_n_u8(81)); - uint8x16_t qx9 = vmulq_u8(qx1, vdupq_n_u8(81)); - - // multiply by 3 and keep the 2 bits above 8 bits - int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6)); - int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6)); - int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6)); - int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6)); - int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6)); - int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6)); - int8x16_t sqx6 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx6, vshrq_n_u8(qx6, 1)), 6)); - int8x16_t sqx7 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx7, vshrq_n_u8(qx7, 1)), 6)); - int8x16_t sqx8 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx8, vshrq_n_u8(qx8, 1)), 6)); - int8x16_t sqx9 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx9, vshrq_n_u8(qx9, 1)), 6)); - - const int8x16_t qy0 = vld1q_s8(y[i].qs + 0); - const int8x16_t qy1 = vld1q_s8(y[i].qs + 16); - const int8x16_t qy2 = vld1q_s8(y[i].qs + 32); - const int8x16_t qy3 = vld1q_s8(y[i].qs + 48); - const int8x16_t qy4 = vld1q_s8(y[i].qs + 64); - const int8x16_t qy5 = vld1q_s8(y[i].qs + 80); - const int8x16_t qy6 = vld1q_s8(y[i].qs + 96); - const int8x16_t qy7 = vld1q_s8(y[i].qs + 112); - const int8x16_t qy8 = vld1q_s8(y[i].qs + 128); - const int8x16_t qy9 = vld1q_s8(y[i].qs + 144); - -#if defined(__ARM_FEATURE_DOTPROD) - sumi0 = vdotq_s32(sumi0, sqx0, qy0); - sumi1 = vdotq_s32(sumi1, sqx1, qy1); - sumi0 = vdotq_s32(sumi0, sqx2, qy2); - sumi1 = vdotq_s32(sumi1, sqx3, qy3); - sumi0 = vdotq_s32(sumi0, sqx4, qy4); - sumi1 = vdotq_s32(sumi1, sqx5, qy5); - sumi0 = vdotq_s32(sumi0, sqx6, qy6); - sumi1 = vdotq_s32(sumi1, sqx7, qy7); - sumi0 = vdotq_s32(sumi0, sqx8, qy8); - sumi1 = vdotq_s32(sumi1, sqx9, qy9); -#else - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), 
vget_high_s8(qy2)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx6), vget_high_s8(qy6)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx7), vget_high_s8(qy7)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx8), vget_low_s8(qy8)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx8), vget_high_s8(qy8)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx9), vget_low_s8(qy9)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx9), vget_high_s8(qy9)); -#endif - } - - // last 16 bytes of 5-element, along with the 4 bytes of 4 elements - { - uint8x16_t qx0 = vld1q_u8(x[i].qs + 32); - uint8x16_t qx1 = vmulq_u8(qx0, vdupq_n_u8(3)); - uint8x16_t qx2 = vmulq_u8(qx0, vdupq_n_u8(9)); - uint8x16_t qx3 = vmulq_u8(qx0, vdupq_n_u8(27)); - uint8x16_t qx4 = vmulq_u8(qx0, vdupq_n_u8(81)); - uint32_t qh; - memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned - uint8x16_t qx5 = vreinterpretq_u8_u32(vdupq_n_u32(qh)); - qx5 = vmulq_u8(qx5, shift); - - // multiply by 3 and keep the 2 bits above 8 bits - int8x16_t sqx0 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx0, vshrq_n_u8(qx0, 1)), 6)); - int8x16_t sqx1 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx1, vshrq_n_u8(qx1, 1)), 6)); - int8x16_t sqx2 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx2, vshrq_n_u8(qx2, 1)), 6)); - int8x16_t sqx3 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx3, vshrq_n_u8(qx3, 1)), 6)); - int8x16_t sqx4 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx4, vshrq_n_u8(qx4, 1)), 6)); - int8x16_t sqx5 = vreinterpretq_s8_u8(vshrq_n_u8(vhaddq_u8(qx5, vshrq_n_u8(qx5, 1)), 6)); - - const int8x16_t qy0 = vld1q_s8(y[i].qs + 160); - const int8x16_t qy1 = vld1q_s8(y[i].qs + 176); - const int8x16_t qy2 = vld1q_s8(y[i].qs + 192); - const int8x16_t qy3 = vld1q_s8(y[i].qs + 208); - const int8x16_t qy4 = vld1q_s8(y[i].qs + 224); - const int8x16_t qy5 = vld1q_s8(y[i].qs + 240); - -#if defined(__ARM_FEATURE_DOTPROD) - sumi0 = vdotq_s32(sumi0, sqx0, qy0); - sumi1 = vdotq_s32(sumi1, sqx1, qy1); - sumi0 = vdotq_s32(sumi0, sqx2, qy2); - sumi1 = vdotq_s32(sumi1, sqx3, qy3); - sumi0 = vdotq_s32(sumi0, sqx4, qy4); - sumi1 = vdotq_s32(sumi1, sqx5, qy5); -#else - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5)); -#endif - } - - const int16x8_t ysum0 = vld1q_s16(y[i].bsums); - const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8); - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - 
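// For reference: tq1_0 packs five ternary digits {0,1,2} into one byte,
// scaled so that multiplying the byte by 3^k rotates digit k into the top
// bits (hence the pow3 multiplies here and in the scalar fallback below).
// The extraction relies on the identity, exact for q in [0,255]:
//     ((uint16_t) q * 3) >> 8  ==  ((q + (q >> 1)) >> 1) >> 6
// so the vhaddq_u8/vshrq_n_u8 sequence above computes (3*q) >> 8 without
// leaving 8 bits. The -1 offset that maps {0,1,2} onto {-1,0,1} is not
// applied per element; instead sum(y) is subtracted once per block via
// y[i].bsums just below.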
-#if defined(__ARM_FEATURE_DOTPROD) - sumi0 = vaddq_s32(sumi0, sumi1); - sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1))); - - sumf += d * (float) vaddvq_s32(sumi0); -#else - sumi0 = vaddq_s16(sumi0, sumi1); - sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1)); - - sumf += d * (float) vaddlvq_s16(sumi0); -#endif - } - - *s = sumf; - -#elif defined(__AVX2__) - __m256 sumf = _mm256_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - // 16-bit sums - __m256i sumi0 = _mm256_setzero_si256(); - __m256i sumi1 = _mm256_setzero_si256(); - __m256i sumi2 = _mm256_setzero_si256(); - - // first 32 bytes of 5 elements - { - __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs)); - // 8-bit multiplies with shifts, masks and adds - __m256i qx1 = _mm256_add_epi8(qx0, _mm256_add_epi8(qx0, qx0)); // 1 * 3 - __m256i qx2 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx0, 3), _mm256_set1_epi8(-8)), qx0); // 1 * 9 - __m256i qx3 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx1, 3), _mm256_set1_epi8(-8)), qx1); // 3 * 9 - __m256i qx4 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx2, 3), _mm256_set1_epi8(-8)), qx2); // 9 * 9 - - // TODO: can _mm256_mulhi_epu16 be faster even if 16-bits? - - // Cancel the +1 from avg so that it behaves like a halving add - qx0 = _mm256_subs_epu8(qx0, _mm256_set1_epi8(1)); - qx1 = _mm256_subs_epu8(qx1, _mm256_set1_epi8(1)); - qx2 = _mm256_subs_epu8(qx2, _mm256_set1_epi8(1)); - qx3 = _mm256_subs_epu8(qx3, _mm256_set1_epi8(1)); - qx4 = _mm256_subs_epu8(qx4, _mm256_set1_epi8(1)); - // Multiply by 3 and get the top 2 bits - qx0 = _mm256_avg_epu8(qx0, _mm256_avg_epu8(qx0, _mm256_setzero_si256())); - qx1 = _mm256_avg_epu8(qx1, _mm256_avg_epu8(qx1, _mm256_setzero_si256())); - qx2 = _mm256_avg_epu8(qx2, _mm256_avg_epu8(qx2, _mm256_setzero_si256())); - qx3 = _mm256_avg_epu8(qx3, _mm256_avg_epu8(qx3, _mm256_setzero_si256())); - qx4 = _mm256_avg_epu8(qx4, _mm256_avg_epu8(qx4, _mm256_setzero_si256())); - qx0 = _mm256_and_si256(_mm256_srli_epi16(qx0, 6), _mm256_set1_epi8(3)); - qx1 = _mm256_and_si256(_mm256_srli_epi16(qx1, 6), _mm256_set1_epi8(3)); - qx2 = _mm256_and_si256(_mm256_srli_epi16(qx2, 6), _mm256_set1_epi8(3)); - qx3 = _mm256_and_si256(_mm256_srli_epi16(qx3, 6), _mm256_set1_epi8(3)); - qx4 = _mm256_and_si256(_mm256_srli_epi16(qx4, 6), _mm256_set1_epi8(3)); - - const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 0)); - const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 32)); - const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 64)); - const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 96)); - const __m256i qy4 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 128)); - - qx0 = _mm256_maddubs_epi16(qx0, qy0); - qx1 = _mm256_maddubs_epi16(qx1, qy1); - qx2 = _mm256_maddubs_epi16(qx2, qy2); - qx3 = _mm256_maddubs_epi16(qx3, qy3); - qx4 = _mm256_maddubs_epi16(qx4, qy4); - - sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1)); - sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3)); - sumi2 = _mm256_add_epi16(sumi2, qx4); - } - - // last 16 bytes of 5-element, along with the 4 bytes of 4 elements - { - __m128i qx0 = _mm_loadu_si128((const __m128i *) (x[i].qs + 32)); - uint32_t qh; - memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned - __m256i qx5_l = _mm256_cvtepu8_epi16(_mm_set1_epi32(qh)); - __m128i qx1 = _mm_add_epi8(qx0, _mm_add_epi8(qx0, qx0)); // 1 * 3 - __m128i qx2 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx0, 3), _mm_set1_epi8(-8)), qx0); // 1 * 9 - 
__m128i qx3 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx1, 3), _mm_set1_epi8(-8)), qx1); // 3 * 9 - __m128i qx4 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx2, 3), _mm_set1_epi8(-8)), qx2); // 9 * 9 - __m256i qx01 = MM256_SET_M128I(qx1, qx0); - __m256i qx23 = MM256_SET_M128I(qx3, qx2); - - // avx2 does not have 8-bit multiplies, so 16-bit it is. - qx5_l = _mm256_mullo_epi16(qx5_l, _mm256_set_epi16(27, 27, 27, 27, 9, 9, 9, 9, 3, 3, 3, 3, 1, 1, 1, 1)); - qx5_l = _mm256_and_si256(qx5_l, _mm256_set1_epi16(0xFF)); - __m128i qx5 = _mm_packus_epi16(_mm256_castsi256_si128(qx5_l), _mm256_extracti128_si256(qx5_l, 1)); - - __m256i qx45 = MM256_SET_M128I(qx5, qx4); - - // Cancel the +1 from avg so that it behaves like a halving add - qx01 = _mm256_subs_epu8(qx01, _mm256_set1_epi8(1)); - qx23 = _mm256_subs_epu8(qx23, _mm256_set1_epi8(1)); - qx45 = _mm256_subs_epu8(qx45, _mm256_set1_epi8(1)); - // Multiply by 3 and get the top 2 bits - qx01 = _mm256_avg_epu8(qx01, _mm256_avg_epu8(qx01, _mm256_setzero_si256())); - qx23 = _mm256_avg_epu8(qx23, _mm256_avg_epu8(qx23, _mm256_setzero_si256())); - qx45 = _mm256_avg_epu8(qx45, _mm256_avg_epu8(qx45, _mm256_setzero_si256())); - qx01 = _mm256_and_si256(_mm256_srli_epi16(qx01, 6), _mm256_set1_epi8(3)); - qx23 = _mm256_and_si256(_mm256_srli_epi16(qx23, 6), _mm256_set1_epi8(3)); - qx45 = _mm256_and_si256(_mm256_srli_epi16(qx45, 6), _mm256_set1_epi8(3)); - - const __m256i qy01 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 160)); - const __m256i qy23 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 192)); - const __m256i qy45 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 224)); - - qx01 = _mm256_maddubs_epi16(qx01, qy01); - qx23 = _mm256_maddubs_epi16(qx23, qy23); - qx45 = _mm256_maddubs_epi16(qx45, qy45); - - sumi0 = _mm256_add_epi16(sumi0, qx01); - sumi1 = _mm256_add_epi16(sumi1, qx23); - sumi2 = _mm256_add_epi16(sumi2, qx45); - } - - const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums); - const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)); - - sumi0 = _mm256_sub_epi16(sumi0, ysum); - sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2)); - sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1)); - - sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf); - } - - *s = hsum_float_8(sumf); - -#else - const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243}; - - float sumf = 0.0f; - - for (int i = 0; i < nb; ++i) { - int sum = 0; - - for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) { - for (size_t l = 0; l < 5; ++l) { - for (size_t m = 0; m < 32; ++m) { - uint8_t q = x[i].qs[j + m] * pow3[l]; - uint16_t xi = ((uint16_t) q * 3) >> 8; - sum += (xi - 1) * y[i].qs[j*5 + l*32 + m]; - } - } - } - for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) { - for (size_t l = 0; l < 5; ++l) { - for (size_t m = 0; m < 16; ++m) { - uint8_t q = x[i].qs[j + m] * pow3[l]; - uint16_t xi = ((uint16_t) q * 3) >> 8; - sum += (xi - 1) * y[i].qs[j*5 + l*16 + m]; - } - } - } - - for (size_t l = 0; l < 4; ++l) { - for (size_t j = 0; j < sizeof(x->qh); ++j) { - uint8_t q = x[i].qh[j] * pow3[l]; - uint16_t xi = ((uint16_t) q * 3) >> 8; - sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j]; - } - } - - sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d); - } - - *s = sumf; -#endif -} - -void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(nrc == 1); - 
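// For reference: tq2_0 packs four 2-bit values per byte (only 0..2 are used,
// never 3), so the 256 elements of a block fit in 64 bytes, and value l of
// byte k is (qs[k] >> (2*l)) & 3, as in the scalar fallback below. Like
// tq1_0, the stored values are offset by +1 relative to the signed digits,
// and the correction uses y[i].bsums once per block:
//     sum_i (q_i - 1) * y_i  ==  sum_i q_i * y_i  -  sum_i y_i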
UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_tq2_0 * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#if defined(__ARM_NEON) - float sumf = 0.0f; - - const uint8x16_t m3 = vdupq_n_u8(3); - - for (int i = 0; i < nb; ++i) { -#if defined(__ARM_FEATURE_DOTPROD) - int32x4_t sumi0 = vdupq_n_s32(0); - int32x4_t sumi1 = vdupq_n_s32(0); -#else - int16x8_t sumi0 = vdupq_n_s16(0); - int16x8_t sumi1 = vdupq_n_s16(0); -#endif - - for (size_t j = 0; j < sizeof(x->qs); j += 32) { - uint8x16_t qx0 = vld1q_u8(x[i].qs + j); - uint8x16_t qx1 = vld1q_u8(x[i].qs + j + 16); - uint8x16_t qx2 = vshrq_n_u8(qx0, 2); - uint8x16_t qx3 = vshrq_n_u8(qx1, 2); - uint8x16_t qx4 = vshrq_n_u8(qx0, 4); - uint8x16_t qx5 = vshrq_n_u8(qx1, 4); - uint8x16_t qx6 = vshrq_n_u8(qx0, 6); - uint8x16_t qx7 = vshrq_n_u8(qx1, 6); - - int8x16_t sqx0 = vreinterpretq_s8_u8(vandq_u8(qx0, m3)); - int8x16_t sqx1 = vreinterpretq_s8_u8(vandq_u8(qx1, m3)); - int8x16_t sqx2 = vreinterpretq_s8_u8(vandq_u8(qx2, m3)); - int8x16_t sqx3 = vreinterpretq_s8_u8(vandq_u8(qx3, m3)); - int8x16_t sqx4 = vreinterpretq_s8_u8(vandq_u8(qx4, m3)); - int8x16_t sqx5 = vreinterpretq_s8_u8(vandq_u8(qx5, m3)); - int8x16_t sqx6 = vreinterpretq_s8_u8(vandq_u8(qx6, m3)); - int8x16_t sqx7 = vreinterpretq_s8_u8(vandq_u8(qx7, m3)); - - const int8x16_t qy0 = vld1q_s8(y[i].qs + j*4 + 0); - const int8x16_t qy1 = vld1q_s8(y[i].qs + j*4 + 16); - const int8x16_t qy2 = vld1q_s8(y[i].qs + j*4 + 32); - const int8x16_t qy3 = vld1q_s8(y[i].qs + j*4 + 48); - const int8x16_t qy4 = vld1q_s8(y[i].qs + j*4 + 64); - const int8x16_t qy5 = vld1q_s8(y[i].qs + j*4 + 80); - const int8x16_t qy6 = vld1q_s8(y[i].qs + j*4 + 96); - const int8x16_t qy7 = vld1q_s8(y[i].qs + j*4 + 112); - -#if defined(__ARM_FEATURE_DOTPROD) - sumi0 = vdotq_s32(sumi0, sqx0, qy0); - sumi1 = vdotq_s32(sumi1, sqx1, qy1); - sumi0 = vdotq_s32(sumi0, sqx2, qy2); - sumi1 = vdotq_s32(sumi1, sqx3, qy3); - sumi0 = vdotq_s32(sumi0, sqx4, qy4); - sumi1 = vdotq_s32(sumi1, sqx5, qy5); - sumi0 = vdotq_s32(sumi0, sqx6, qy6); - sumi1 = vdotq_s32(sumi1, sqx7, qy7); -#else - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx0), vget_low_s8(qy0)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx0), vget_high_s8(qy0)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx1), vget_low_s8(qy1)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx1), vget_high_s8(qy1)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx2), vget_low_s8(qy2)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx2), vget_high_s8(qy2)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx3), vget_low_s8(qy3)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx3), vget_high_s8(qy3)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx4), vget_low_s8(qy4)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx4), vget_high_s8(qy4)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx5), vget_low_s8(qy5)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx5), vget_high_s8(qy5)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx6), vget_low_s8(qy6)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx6), vget_high_s8(qy6)); - sumi0 = vmlal_s8(sumi0, vget_low_s8(sqx7), vget_low_s8(qy7)); - sumi1 = vmlal_s8(sumi1, vget_high_s8(sqx7), vget_high_s8(qy7)); -#endif - } - - const int16x8_t ysum0 = vld1q_s16(y[i].bsums); - const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8); - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - -#if defined(__ARM_FEATURE_DOTPROD) - sumi0 = vaddq_s32(sumi0, sumi1); - sumi0 = vsubq_s32(sumi0, vpaddlq_s16(vaddq_s16(ysum0, ysum1))); - - sumf += d * (float) vaddvq_s32(sumi0); -#else - sumi0 = 
vaddq_s16(sumi0, sumi1); - sumi0 = vsubq_s16(sumi0, vaddq_s16(ysum0, ysum1)); - - sumf += d * (float) vaddlvq_s16(sumi0); -#endif - } - - *s = sumf; - -#elif defined(__AVX2__) - __m256 sumf = _mm256_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - // 16-bit sums, because 256*127 still fits - __m256i sumi0 = _mm256_setzero_si256(); - __m256i sumi1 = _mm256_setzero_si256(); - - for (size_t j = 0; j < sizeof(x->qs); j += 32) { - __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs + j)); - __m256i qx1 = _mm256_srli_epi16(qx0, 2); - __m256i qx2 = _mm256_srli_epi16(qx0, 4); - __m256i qx3 = _mm256_srli_epi16(qx0, 6); - - // 0, 1, 2 (should not be 3) - qx0 = _mm256_and_si256(qx0, _mm256_set1_epi8(3)); - qx1 = _mm256_and_si256(qx1, _mm256_set1_epi8(3)); - qx2 = _mm256_and_si256(qx2, _mm256_set1_epi8(3)); - qx3 = _mm256_and_si256(qx3, _mm256_set1_epi8(3)); - - const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 0)); - const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 32)); - const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 64)); - const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 96)); - - qx0 = _mm256_maddubs_epi16(qx0, qy0); - qx1 = _mm256_maddubs_epi16(qx1, qy1); - qx2 = _mm256_maddubs_epi16(qx2, qy2); - qx3 = _mm256_maddubs_epi16(qx3, qy3); - - sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1)); - sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3)); - } - - const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums); - const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)); - - sumi0 = _mm256_add_epi16(sumi0, sumi1); - sumi0 = _mm256_sub_epi16(sumi0, ysum); - sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1)); - - sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf); - } - - *s = hsum_float_8(sumf); - -#else - float sumf = 0.0f; - - for (int i = 0; i < nb; ++i) { - int32_t sumi = 0; - - for (size_t j = 0; j < sizeof(x->qs); j += 32) { - for (size_t l = 0; l < 4; ++l) { - for (size_t k = 0; k < 32; ++k) { - sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1); - } - } - } - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - sumf += (float) sumi * d; - } - - *s = sumf; -#endif -} - -void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q2_K * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#ifdef __ARM_FEATURE_SVE - const int vector_length = svcntb()*8; - const svuint8_t m3s = svdup_n_u8(0x3); - const svuint32_t m4s = svdup_n_u32(0xF); - const svint32_t vzero_sv = svdup_n_s32(0); - svfloat32_t acc_sum = svdup_n_f32(0); - svbool_t pred_s32 = svptrue_pat_b32(SV_VL4); - - switch (vector_length) { - case 128: - for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - svfloat32_t d_broad = svdup_n_f32((float32_t)d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin); - - const uint8_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8_sv = y[i].qs; - const uint8_t * GGML_RESTRICT sc = x[i].scales; - - svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc); - const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), 
mins_and_scales_sve, 4)); - - mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+4); - const svint32_t mins_sv_2 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4)); - - svint32_t q8sums_sv_1 = svld1sh_s32(svptrue_b32(), y[i].bsums); - svint32_t q8sums_sv_2 = svld1sh_s32(svptrue_b32(), y[i].bsums+4); - - const svint32_t s0 = svadd_s32_x(svptrue_b32(), svmul_s32_x(svptrue_b32(), mins_sv_1, q8sums_sv_1), svmul_s32_x(svptrue_b32(), mins_sv_2, q8sums_sv_2)); - - mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+8); - const svint32_t mins_sv_3 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4)); - - mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc+12); - const svint32_t mins_sv_4 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4)); - - q8sums_sv_1 = svld1sh_s32(svptrue_b32(), y[i].bsums+8); - q8sums_sv_2 = svld1sh_s32(svptrue_b32(), y[i].bsums+12); - - svint32_t s1 = svadd_s32_x(svptrue_b32(), svmul_s32_x(svptrue_b32(), mins_sv_3, q8sums_sv_1), svmul_s32_x(svptrue_b32(), mins_sv_4, q8sums_sv_2)); - - svfloat32_t temp = svcvt_f32_s32_x(svptrue_b32(), svadd_s32_x(svptrue_b32(), s0, s1)); - - acc_sum = svmla_f32_m(svptrue_b32(), acc_sum, temp, dmin_broad); - - svint32_t sumi1 = svdup_n_s32(0); - - { - const svuint8_t q2bits_1 = svld1_u8(svptrue_b8(), q2); - svint8_t q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_1, m3s)); - svint8_t q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc), m4s)); - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 0)); - - const svuint8_t q2bits_3 = svld1_u8(svptrue_b8(), q2+16); - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_3, m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 1)); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 2), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 2)); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 2), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv, 3)); - - - const svint32_t scales_sv_1 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+4), m4s)); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 4), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 0)); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 4), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 1)); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_1, 6), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = 
svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 2)); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_3, 6), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_1, 3)); - - //------------------------------- - - q2 += 32; - const svint32_t scales_sv_2 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+8), m4s)); - const svuint8_t q2bits_2 = svld1_u8(svptrue_b8(), q2); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_2, m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 0)); - - const svuint8_t q2bits_4 = svld1_u8(svptrue_b8(), q2+16); - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), q2bits_4, m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 1)); - - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 2), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 2)); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 2), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_2, 3)); - - - const svint32_t scales_sv_3 = svreinterpret_s32_u32(svand_u32_m(svptrue_b32(), svld1ub_u32(svptrue_b32(), sc+12), m4s)); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 4), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 0)); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 4), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 1)); - - - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_2, 6), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 2)); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q2bits_4, 6), m3s)); - q8bytes_sv = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - sumi1 = svmla_s32_m(svptrue_b32(), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), svdup_lane_s32(scales_sv_3, 3)); - } - acc_sum = svmla_f32_m(svptrue_b32(), acc_sum, svcvt_f32_s32_x(svptrue_b32(), sumi1), d_broad); - } - *s = svaddv_f32(svptrue_b32(), acc_sum); - break; - - case 256: - case 512: - for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - svfloat32_t d_broad = svdup_n_f32((float32_t)d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin); - - 
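// The 128-bit and 256/512-bit SVE paths above (and every other SIMD branch
// below) rely on the same layout fact: each byte of x[i].qs packs four 2-bit
// quants, and the shift amounts 0/2/4/6 peel them off one bit-plane at a time
// before the dot products. A minimal scalar sketch of that unpacking follows;
// unpack_q2_bitplanes is a hypothetical helper for illustration only, not
// part of this patch:
#include <stdint.h>

static inline void unpack_q2_bitplanes(const uint8_t * qs, int n,
                                       uint8_t * p0, uint8_t * p1,
                                       uint8_t * p2, uint8_t * p3) {
    for (int k = 0; k < n; ++k) {
        const uint8_t b = qs[k];
        p0[k] = (b >> 0) & 3; // bits 0..1, matches the unshifted svand with m3s
        p1[k] = (b >> 2) & 3; // bits 2..3, matches svlsr_n_u8_x(..., 2)
        p2[k] = (b >> 4) & 3; // bits 4..5, matches svlsr_n_u8_x(..., 4)
        p3[k] = (b >> 6) & 3; // bits 6..7, matches svlsr_n_u8_x(..., 6)
    }
}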
const uint8_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8_sv = y[i].qs; - const uint8_t * GGML_RESTRICT sc = x[i].scales; - - const svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc); sc += 8; - const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, m4s)); - const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, 4)); - svint32_t q8sums_sv_1 = svld1sh_s32(svptrue_pat_b32(SV_VL8), y[i].bsums); - - const svuint32_t mins_and_scales_sve_1 = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc); - const svint32_t scales_sv_1 = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve_1, m4s)); - const svint32_t mins_sv_2 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_pat_b32(SV_VL8), mins_and_scales_sve_1, 4)); - - svint32_t q8sums_sv_2 = svld1sh_s32(svptrue_pat_b32(SV_VL8), y[i].bsums+8); - - svfloat32_t temp = svcvt_f32_s32_x(svptrue_pat_b32(SV_VL8), svadd_s32_x(svptrue_pat_b32(SV_VL8), svmul_s32_x(svptrue_pat_b32(SV_VL8), mins_sv_1, q8sums_sv_1), svmul_s32_x(svptrue_pat_b32(SV_VL8), mins_sv_2, q8sums_sv_2))); - - acc_sum = svmla_f32_m(svptrue_pat_b32(SV_VL8), acc_sum, temp, dmin_broad); - - svint32_t sumi1 = svdup_n_s32(0); - - { - const svuint8_t q2bits_1 = svld1_u8(svptrue_pat_b8(SV_VL32), q2); - svint8_t q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q2bits_1, m3s)); - svint8_t q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - - svint32_t scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv, 0), svdup_lane_s32(scales_sv, 1)); - sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 2), m3s)); - q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - - svint32_t scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv, 2), svdup_lane_s32(scales_sv, 3)); - sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(svdup_n_s32(0), q2bytes_sv, q8bytes_sv), scale_2); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 4), m3s)); - q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - - scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv, 4), svdup_lane_s32(scales_sv, 5)); - sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_1, 6), m3s)); - q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - - scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv, 6), svdup_lane_s32(scales_sv, 7)); - sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2); - - q2 += 32; - - const svuint8_t q2bits_2 = svld1_u8(svptrue_pat_b8(SV_VL32), q2); - q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q2bits_2, m3s)); - q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - - scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 0), svdup_lane_s32(scales_sv_1, 1)); - sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 2), 
m3s)); - q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - - scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 2), svdup_lane_s32(scales_sv_1, 3)); - sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 4), m3s)); - q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - - scale_1 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 4), svdup_lane_s32(scales_sv_1, 5)); - sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_1); - - q2bytes_sv = svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q2bits_2, 6), m3s)); - q8bytes_sv = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - - scale_2 = svsel(pred_s32, svdup_lane_s32(scales_sv_1, 6), svdup_lane_s32(scales_sv_1, 7)); - sumi1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(vzero_sv, q2bytes_sv, q8bytes_sv), scale_2); - } - acc_sum = svmla_f32_m(svptrue_pat_b32(SV_VL8), acc_sum, svcvt_f32_s32_x(svptrue_pat_b32(SV_VL8), sumi1), d_broad); - } - *s = svaddv_f32(svptrue_pat_b32(SV_VL8), acc_sum); - break; - - default: - assert(false && "Unsupported vector length"); - break; - } - -#elif __ARM_NEON - const uint8x16_t m3 = vdupq_n_u8(0x3); - const uint8x16_t m4 = vdupq_n_u8(0xF); - - const int32x4_t vzero = vdupq_n_s32(0); - - ggml_int8x16x2_t q2bytes; - uint8_t aux[16]; - - float sum = 0; - - for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const uint8_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - const uint8_t * GGML_RESTRICT sc = x[i].scales; - - const uint8x16_t mins_and_scales = vld1q_u8(sc); - const uint8x16_t scales = vandq_u8(mins_and_scales, m4); - vst1q_u8(aux, scales); - - const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4); - const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); - const ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}}; - const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])), - vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0]))); - const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])), - vmull_s16(vget_high_s16(mins16.val[1]), vget_high_s16(q8sums.val[1]))); - sum += dmin * vaddvq_s32(vaddq_s32(s0, s1)); - - int isum = 0; - int is = 0; - -// We use this macro instead of a function call because for some reason -// the code runs 2-3% slower, even if the function is declared inline -#define MULTIPLY_ACCUM_WITH_SCALE(index)\ - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\ - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)]; - -#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\ - q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\ - q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[0], (shift)), m3));\ - q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\ - MULTIPLY_ACCUM_WITH_SCALE((index)); - - for (int j = 0; j < QK_K/128; ++j) { - const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32; - - ggml_int8x16x2_t q8bytes = 
ggml_vld1q_s8_x2(q8); q8 += 32; - q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3)); - q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3)); - - MULTIPLY_ACCUM_WITH_SCALE(0); - - SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2); - SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4); - SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6); - - is += 8; - } - - sum += d * isum; - } - - *s = sum; - -#elif defined __AVX2__ - - const __m256i m3 = _mm256_set1_epi8(3); - const __m128i m4 = _mm_set1_epi8(0xF); - - __m256 acc = _mm256_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const uint8_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); - const __m128i scales8 = _mm_and_si128(mins_and_scales, m4); - const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); - const __m256i mins = _mm256_cvtepi8_epi16(mins8); - const __m256i prod = _mm256_madd_epi16(mins, _mm256_loadu_si256((const __m256i*)y[i].bsums)); - - acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(prod), acc); - - const __m256i all_scales = _mm256_cvtepi8_epi16(scales8); - const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0); - const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1); - const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)}; - - __m256i sumi = _mm256_setzero_si256(); - - for (int j = 0; j < QK_K/128; ++j) { - - const __m256i q2bits = _mm256_loadu_si256((const __m256i*)q2); q2 += 32; - - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - - const __m256i q2_0 = _mm256_and_si256(q2bits, m3); - const __m256i q2_1 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), m3); - const __m256i q2_2 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), m3); - const __m256i q2_3 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), m3); - - __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0); - __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1); - __m256i p2 = _mm256_maddubs_epi16(q2_2, q8_2); - __m256i p3 = _mm256_maddubs_epi16(q2_3, q8_3); - - p0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(0)), p0); - p1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(1)), p1); - p2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(2)), p2); - p3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(3)), p3); - - p0 = _mm256_add_epi32(p0, p1); - p2 = _mm256_add_epi32(p2, p3); - - sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p0, p2)); - } - - acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); - - } - - *s = hsum_float_8(acc); - -#elif defined __AVX__ - - const __m128i m3 = _mm_set1_epi8(0x3); - const __m128i m4 = _mm_set1_epi8(0xF); - const __m128i m2 = _mm_set1_epi8(0x2); - - __m256 acc = _mm256_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const uint8_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - // load mins and scales from 
block_q2_K.scales[QK_K/16] - const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); - const __m128i scales16 = _mm_and_si128(mins_and_scales, m4); - const __m128i mins16 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); - const __m128i mins_0 = _mm_cvtepi8_epi16(mins16); - const __m128i mins_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(mins16, mins16)); - - // summs = y[i].bsums * (x[i].scales >> 4) in 16bits*8*2 to 32bits*4*2 - const __m128i summs_0 = _mm_madd_epi16(mins_0, _mm_loadu_si128((const __m128i*)&y[i].bsums[0])); - const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8])); - - // sumf += -dmin * summs in 32bits*8 - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc); - - const __m128i scales_0 = _mm_cvtepi8_epi16(scales16); - const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16)); - const __m128i scales[2] = { scales_0, scales_1 }; - - __m128i sumi_0 = _mm_setzero_si128(); - __m128i sumi_1 = _mm_setzero_si128(); - - for (int j = 0; j < QK_K/128; ++j) { - - // load Q8 quants int8*16*8 from block_q8_K.qs[QK_K] - const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - - // load 2bits*16*8 from block_q2_K.qs[QK_K/4] - __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16; - const __m128i q2_0 = _mm_and_si128(q2bits, m3); - const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3); - const __m128i q2_4 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3); - const __m128i q2_6 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3); - q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16; - const __m128i q2_1 = _mm_and_si128(q2bits, m3); - const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3); - const __m128i q2_5 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3); - const __m128i q2_7 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3); - - // isuml = q8[l] * ((q2[l] >> shift) & 3) in 8bits*16*8 to 16bits*8*8 - __m128i p0 = _mm_maddubs_epi16(q2_0, q8_0); - __m128i p1 = _mm_maddubs_epi16(q2_1, q8_1); - __m128i p2 = _mm_maddubs_epi16(q2_2, q8_2); - __m128i p3 = _mm_maddubs_epi16(q2_3, q8_3); - __m128i p4 = _mm_maddubs_epi16(q2_4, q8_4); - __m128i p5 = _mm_maddubs_epi16(q2_5, q8_5); - __m128i p6 = _mm_maddubs_epi16(q2_6, q8_6); - __m128i p7 = _mm_maddubs_epi16(q2_7, q8_7); - - // isum += (x[i].scales[is++] & 0xF) * isuml in 16bits*8*8 to 32bits*4*8 - __m128i shuffle = _mm_set1_epi16(0x0100); - p0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p0); - shuffle = _mm_add_epi16(shuffle, m2); - p1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p1); - shuffle = _mm_add_epi16(shuffle, m2); - p2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p2); - shuffle = _mm_add_epi16(shuffle, m2); - p3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p3); - shuffle = _mm_add_epi16(shuffle, m2); - p4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p4); - shuffle = _mm_add_epi16(shuffle, m2); - p5 = 
_mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p5); - shuffle = _mm_add_epi16(shuffle, m2); - p6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p6); - shuffle = _mm_add_epi16(shuffle, m2); - p7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p7); - - p0 = _mm_add_epi32(p0, p1); - p2 = _mm_add_epi32(p2, p3); - p4 = _mm_add_epi32(p4, p5); - p6 = _mm_add_epi32(p6, p7); - - // isum in 32bits*4*2 - sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p0, p2)); - sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p4, p6)); - } - - // sumf += dall * isum - dmin * summs in 32bits - __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc); - } - - *s = hsum_float_8(acc); - -#elif defined __wasm_simd128__ - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - const uint8_t * q2 = x[i].qs; - const int8_t * q8 = y[i].qs; - const uint8_t * sc = x[i].scales; - - // Vectorized summs calculation - v128_t summs_vec = wasm_i32x4_splat(0); - { - v128_t sc_vec = wasm_v128_load(sc); - v128_t sc_upper = wasm_u8x16_shr(sc_vec, 4); - - v128_t sc_low = wasm_u16x8_extend_low_u8x16(sc_upper); - v128_t sc_high = wasm_u16x8_extend_high_u8x16(sc_upper); - - v128_t bsums1 = wasm_v128_load(&y[i].bsums[0]); - v128_t bsums2 = wasm_v128_load(&y[i].bsums[8]); - - summs_vec = wasm_i32x4_add( - wasm_i32x4_add(wasm_i32x4_dot_i16x8(sc_low, bsums1), - wasm_i32x4_dot_i16x8(sc_high, bsums2)), - summs_vec - ); - - summs_vec = wasm_i32x4_add(summs_vec, wasm_i32x4_shuffle(summs_vec, summs_vec, 2, 3, 0, 1)); - summs_vec = wasm_i32x4_add(summs_vec, wasm_i32x4_shuffle(summs_vec, summs_vec, 1, 0, 3, 2)); - } - int32_t summs = wasm_i32x4_extract_lane(summs_vec, 0); - - // Vectorized isum calculation - int32_t isum = 0; - const uint8_t * sc_ptr = sc; - const int k_iters = QK_K/128; - - for (int k = 0; k < k_iters; ++k) { - v128_t isum_vec = wasm_i32x4_splat(0); - int shift = 0; - - for (int j = 0; j < 4; ++j) { - const int d0 = (sc_ptr[0] & 0xF); - const int d1 = (sc_ptr[1] & 0xF); - sc_ptr += 2; - - // Process first 16 elements - v128_t q2_0 = wasm_v128_load(q2); - v128_t q8_0 = wasm_v128_load(q8); - v128_t q2_shift_0 = wasm_u8x16_shr(q2_0, shift); - v128_t q2_bits_0 = wasm_v128_and(q2_shift_0, wasm_i8x16_splat(0x03)); - - // Process next 16 elements - v128_t q2_1 = wasm_v128_load(q2 + 16); - v128_t q8_1 = wasm_v128_load(q8 + 16); - v128_t q2_shift_1 = wasm_u8x16_shr(q2_1, shift); - v128_t q2_bits_1 = wasm_v128_and(q2_shift_1, wasm_i8x16_splat(0x03)); - - // Calculate dot products - v128_t p0 = wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_low_i8x16(q8_0), - wasm_i16x8_extend_low_i8x16(q2_bits_0) - ); - v128_t p1 = wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_high_i8x16(q8_0), - wasm_i16x8_extend_high_i8x16(q2_bits_0) - ); - v128_t p2 = wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_low_i8x16(q8_1), - wasm_i16x8_extend_low_i8x16(q2_bits_1) - ); - v128_t p3 = wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_high_i8x16(q8_1), - wasm_i16x8_extend_high_i8x16(q2_bits_1) - ); - - // Accumulate scaled results - v128_t scaled = wasm_i32x4_add( - wasm_i32x4_mul(wasm_i32x4_add(p0, p1), wasm_i32x4_splat(d0)), - wasm_i32x4_mul(wasm_i32x4_add(p2, p3), wasm_i32x4_splat(d1)) - ); - - isum_vec = wasm_i32x4_add(isum_vec, scaled); - q8 += 32; - shift += 2; - } - q2 += 32; - - // Horizontal sum of isum_vec - isum_vec = wasm_i32x4_add(isum_vec, wasm_i32x4_shuffle(isum_vec, isum_vec, 2, 3, 0, 1)); - isum_vec = wasm_i32x4_add(isum_vec, wasm_i32x4_shuffle(isum_vec, isum_vec, 1, 0, 3, 2)); 
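// The wasm_i32x4_shuffle/add pairs above implement a two-step butterfly
// reduction: the first shuffle swaps the two 64-bit halves, the second swaps
// adjacent 32-bit lanes, so after both adds every lane holds the full sum and
// wasm_i32x4_extract_lane(..., 0) reads it out. Scalar equivalent of the same
// reduction (sketch only, hypothetical helper):
static inline int32_t hsum_i32x4_scalar(const int32_t v[4]) {
    // step 1 gives {v0+v2, v1+v3, v2+v0, v3+v1};
    // step 2 leaves v0+v1+v2+v3 in every lane
    return (v[0] + v[2]) + (v[1] + v[3]);
}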
- isum += wasm_i32x4_extract_lane(isum_vec, 0); - } - - const float dall = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; - sumf += dall * isum - dmin * summs; - } - - *s = sumf; - -#elif defined __riscv_xtheadvector - - float sumf = 0; - uint8_t atmp[16]; - - for (int i = 0; i < nb; ++i) { - const uint8_t * q2 = x[i].qs; - const int8_t * q8 = y[i].qs; - const uint8_t * sc = x[i].scales; - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - uint8_t *patmp = atmp; - int vsums; - int tmp; - __asm__ __volatile__( - "th.vsetvli zero, %[vl16], e8, m1\n\t" - "th.vmv.v.x v8, zero\n\t" - "th.vlb.v v1, (%[sc])\n\t" - "th.vand.vi v0, v1, 0xF\n\t" - "th.vsrl.vi v1, v1, 4\n\t" - "th.vsb.v v0, (%[scale])\n\t" - "th.vwaddu.vx v16, v1, zero\n\t" - "th.vsetvli zero, %[vl16], e16, m2\n\t" - "th.vlh.v v2, (%[bsums])\n\t" - "th.vwmul.vv v4, v16, v2\n\t" - "th.vsetvli zero, %[vl16], e32, m4\n\t" - "th.vredsum.vs v8, v4, v8\n\t" - "th.vmv.x.s %[vsums], v8" - : [tmp] "=&r" (tmp), [vsums] "=&r" (vsums) - : [sc] "r" (sc), [scale] "r" (atmp), [bsums] "r" (y[i].bsums) - , [vl16] "r" (16) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - sumf += dmin * vsums; - int isum = 0; - - for (int j = 0; j < QK_K/128; ++j) { - __asm__ __volatile__( - "th.vsetvli zero, %[vl32], e8, m2\n\t" - "th.vlb.v v0, (%[q2])\n\t" - "th.vsrl.vi v2, v0, 2\n\t" - "th.vsrl.vi v4, v0, 4\n\t" - "th.vsrl.vi v6, v0, 6\n\t" - "th.vand.vi v0, v0, 0x3\n\t" - "th.vand.vi v2, v2, 0x3\n\t" - "th.vand.vi v4, v4, 0x3\n\t" - "th.vsetvli zero, %[vl128], e8, m8\n\t" - "th.vlb.v v8, (%[q8])\n\t" - "th.vsetvli zero, %[vl64], e8, m4\n\t" - "th.vwmul.vv v16, v0, v8\n\t" - "th.vwmul.vv v24, v4, v12\n\t" - "th.vsetvli zero, %[vl16], e16, m2\n\t" - "th.vmv.v.x v0, zero\n\t" - "th.vwredsum.vs v10, v16, v0\n\t" - "th.vwredsum.vs v9, v18, v0\n\t" - "th.vwredsum.vs v8, v20, v0\n\t" - "th.vwredsum.vs v7, v22, v0\n\t" - "th.vwredsum.vs v11, v24, v0\n\t" - "th.vwredsum.vs v12, v26, v0\n\t" - "th.vwredsum.vs v13, v28, v0\n\t" - "th.vwredsum.vs v14, v30, v0\n\t" - "li %[tmp], 4\n\t" - "th.vsetvli zero, %[tmp], e32, m1\n\t" - "th.vslideup.vi v10, v9, 1\n\t" - "th.vslideup.vi v8, v7, 1\n\t" - "th.vslideup.vi v11, v12, 1\n\t" - "th.vslideup.vi v13, v14, 1\n\t" - "th.vslideup.vi v10, v8, 2\n\t" - "th.vslideup.vi v11, v13, 2\n\t" - "li %[tmp], 8\n\t" - "th.vsetvli zero, %[tmp], e32, m2\n\t" - "th.vlbu.v v12, (%[scale])\n\t" - "th.vmul.vv v10, v10, v12\n\t" - "th.vredsum.vs v0, v10, v0\n\t" - "th.vmv.x.s %[tmp], v0\n\t" - "add %[isum], %[isum], %[tmp]" - : [tmp] "=&r" (tmp), [isum] "+&r" (isum) - : [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8) - , [vl16] "r" (16), [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - q2 += 32; q8 += 128; patmp += 8; - } - - sumf += dall * isum; - } - - *s = sumf; - -#elif defined __riscv_v - - float sumf = 0; - uint8_t atmp[16]; - - const int vector_length = __riscv_vlenb() * 8; - uint8_t temp_01[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1 }; - - switch (vector_length) { - case 256: - for (int i = 0; i < nb; ++i) { - const uint8_t * q2 = x[i].qs; - const int8_t * q8 = y[i].qs; - const uint8_t * sc = x[i].scales; - - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - size_t vl = 16; - - vuint8m1_t scales = __riscv_vle8_v_u8m1(sc, vl); - vuint8m1_t aux = __riscv_vand_vx_u8m1(scales, 0x0F, vl); - - vint16m1_t q8sums = __riscv_vle16_v_i16m1(y[i].bsums, vl); - - vuint8mf2_t scales_2 = __riscv_vle8_v_u8mf2(sc, vl); - vuint8mf2_t mins8 = __riscv_vsrl_vx_u8mf2(scales_2, 0x4, vl); - vint16m1_t mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl)); - vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, mins, vl); - vint32m1_t vsums = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); - - sumf += dmin * __riscv_vmv_x_s_i32m1_i32(vsums); - - vl = 32; - - vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); - vuint8m1_t v_b = __riscv_vle8_v_u8m1(temp_01, vl); - - uint8_t is = 0; - int isum = 0; - - for (int j = 0; j < QK_K / 128; ++j) { - // load Q2 - vuint8m1_t q2_x = __riscv_vle8_v_u8m1(q2, vl); - - vuint8m1_t q2_0 = __riscv_vand_vx_u8m1(q2_x, 0x03, vl); - vuint8m1_t q2_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x2, vl), 0x03, vl); - vuint8m1_t q2_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x4, vl), 0x03, vl); - vuint8m1_t q2_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x6, vl), 0x03, vl); - - // duplicate scale elements for product - vuint8m1_t sc0 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 0 + is, vl), vl); - vuint8m1_t sc1 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 2 + is, vl), vl); - vuint8m1_t sc2 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 4 + is, vl), vl); - vuint8m1_t sc3 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 6 + is, vl), vl); - - vint16m2_t p0 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_0, sc0, vl)); - vint16m2_t p1 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_1, sc1, vl)); - vint16m2_t p2 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_2, sc2, vl)); - vint16m2_t p3 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_3, sc3, vl)); - - // load Q8 - vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl); - vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8 + 32, vl); - vint8m1_t q8_2 = __riscv_vle8_v_i8m1(q8 + 64, vl); - vint8m1_t q8_3 = __riscv_vle8_v_i8m1(q8 + 96, vl); - - vint32m4_t s0 = __riscv_vwmul_vv_i32m4(p0, __riscv_vwcvt_x_x_v_i16m2(q8_0, vl), vl); - vint32m4_t s1 = __riscv_vwmul_vv_i32m4(p1, __riscv_vwcvt_x_x_v_i16m2(q8_1, vl), vl); - vint32m4_t s2 = __riscv_vwmul_vv_i32m4(p2, __riscv_vwcvt_x_x_v_i16m2(q8_2, vl), vl); - vint32m4_t s3 = __riscv_vwmul_vv_i32m4(p3, __riscv_vwcvt_x_x_v_i16m2(q8_3, vl), vl); - - vint32m1_t isum0 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s0, s1, vl), vzero, vl); - vint32m1_t isum1 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s2, s3, vl), isum0, vl); - - isum += __riscv_vmv_x_s_i32m1_i32(isum1); - - q2 += 32; - q8 += 128; - is = 8; - } - - sumf += dall * isum; - } - break; - case 128: - for (int i = 0; i < nb; ++i) { - const uint8_t * q2 = x[i].qs; - const int8_t * q8 = y[i].qs; - const uint8_t * sc = x[i].scales; - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - uint8_t *patmp = atmp; - int vsums; - int tmp; - __asm__ __volatile__( - "vsetivli zero, 16, e8, 
m1\n\t" - "vmv.v.x v8, zero\n\t" - "vle8.v v1, (%[sc])\n\t" - "vand.vi v0, v1, 0xF\n\t" - "vsrl.vi v1, v1, 4\n\t" - "vse8.v v0, (%[scale])\n\t" - "vsetivli zero, 16, e16, m2\n\t" - "vle16.v v2, (%[bsums])\n\t" - "vzext.vf2 v0, v1\n\t" - "vwmul.vv v4, v0, v2\n\t" - "vsetivli zero, 16, e32, m4\n\t" - "vredsum.vs v8, v4, v8\n\t" - "vmv.x.s %[vsums], v8" - : [tmp] "=&r" (tmp), [vsums] "=&r" (vsums) - : [sc] "r" (sc), [scale] "r" (atmp), [bsums] "r" (y[i].bsums) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - sumf += dmin * vsums; - int isum = 0; - - for (int j = 0; j < QK_K/128; ++j) { - __asm__ __volatile__( - "vsetvli zero, %[vl32], e8, m2\n\t" - "vle8.v v0, (%[q2])\n\t" - "vsrl.vi v2, v0, 2\n\t" - "vsrl.vi v4, v0, 4\n\t" - "vsrl.vi v6, v0, 6\n\t" - "vand.vi v0, v0, 0x3\n\t" - "vand.vi v2, v2, 0x3\n\t" - "vand.vi v4, v4, 0x3\n\t" - "vsetvli zero, %[vl128], e8, m8\n\t" - "vle8.v v8, (%[q8])\n\t" - "vsetvli zero, %[vl64], e8, m4\n\t" - "vwmul.vv v16, v0, v8\n\t" - "vwmul.vv v24, v4, v12\n\t" - "vsetivli zero, 16, e16, m2\n\t" - "vmv.v.x v0, zero\n\t" - "vwredsum.vs v10, v16, v0\n\t" - "vwredsum.vs v9, v18, v0\n\t" - "vwredsum.vs v8, v20, v0\n\t" - "vwredsum.vs v7, v22, v0\n\t" - "vwredsum.vs v11, v24, v0\n\t" - "vwredsum.vs v12, v26, v0\n\t" - "vwredsum.vs v13, v28, v0\n\t" - "vwredsum.vs v14, v30, v0\n\t" - "vsetivli zero, 4, e32, m1\n\t" - "vslideup.vi v10, v9, 1\n\t" - "vslideup.vi v8, v7, 1\n\t" - "vslideup.vi v11, v12, 1\n\t" - "vslideup.vi v13, v14, 1\n\t" - "vslideup.vi v10, v8, 2\n\t" - "vslideup.vi v11, v13, 2\n\t" - "vsetivli zero, 8, e32, m2\n\t" - "vle8.v v15, (%[scale])\n\t" - "vzext.vf4 v12, v15\n\t" - "vmul.vv v10, v10, v12\n\t" - "vredsum.vs v0, v10, v0\n\t" - "vmv.x.s %[tmp], v0\n\t" - "add %[isum], %[isum], %[tmp]" - : [tmp] "=&r" (tmp), [isum] "+&r" (isum) - : [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8) - , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - q2 += 32; q8 += 128; patmp += 8; - } - - sumf += dall * isum; - } - break; - default: - assert(false && "Unsupported vector length"); - break; - } - - *s = sumf; - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0x3); - const vector signed char lowScaleMask = vec_splats((signed char)0xF); - const vector int v0 = vec_splats((int32_t)0); - const vector unsigned char v2 = vec_splats((unsigned char)0x2); - const vector unsigned char v6 = vec_splats((unsigned char)0x6); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - - for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin)); - vector float vdmin = vec_mul(vxmin, vyd); - - vector signed short q8ysums0 = vec_xl( 0, y[i].bsums); - vector signed short q8ysums1 = vec_xl(16, y[i].bsums); - - vector signed char q2xmins = (vector 
signed char)vec_xl( 0, x[i].scales); - vector signed char vscales = vec_and(q2xmins, lowScaleMask); - - q2xmins = vec_sr(q2xmins, v4); - vector signed short q2xmins0 = vec_unpackh(q2xmins); - vector signed short q2xmins1 = vec_unpackl(q2xmins); - - vector signed int prod0 = vec_mule(q2xmins0, q8ysums0); - vector signed int prod1 = vec_mulo(q2xmins0, q8ysums0); - vector signed int prod2 = vec_mule(q2xmins1, q8ysums1); - vector signed int prod3 = vec_mulo(q2xmins1, q8ysums1); - - vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0); - vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1); - vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2); - vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - vector signed int vsumi2 = v0; - vector signed int vsumi3 = v0; - vector signed int vsumi4 = v0; - vector signed int vsumi5 = v0; - vector signed int vsumi6 = v0; - vector signed int vsumi7 = v0; - - const uint8_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - for (int j = 0; j < QK_K/128; ++j) { - __builtin_prefetch(q2, 0, 1); - __builtin_prefetch(q8, 0, 1); - - vector signed char qxs0 = (vector signed char)vec_xl( 0, q2); - vector signed char qxs1 = (vector signed char)vec_xl(16, q2); - q2 += 32; - - vector unsigned char q2x00 = (vector unsigned char)vec_and(qxs0, lowMask); - vector unsigned char q2x01 = (vector unsigned char)vec_and(vec_sr(qxs0, v2), lowMask); - vector unsigned char q2x02 = (vector unsigned char)vec_and(vec_sr(qxs0, v4), lowMask); - vector unsigned char q2x03 = (vector unsigned char)vec_and(vec_sr(qxs0, v6), lowMask); - vector unsigned char q2x10 = (vector unsigned char)vec_and(qxs1, lowMask); - vector unsigned char q2x11 = (vector unsigned char)vec_and(vec_sr(qxs1, v2), lowMask); - vector unsigned char q2x12 = (vector unsigned char)vec_and(vec_sr(qxs1, v4), lowMask); - vector unsigned char q2x13 = (vector unsigned char)vec_and(vec_sr(qxs1, v6), lowMask); - - vector signed char q8y00 = vec_xl( 0, q8); - vector signed char q8y10 = vec_xl( 16, q8); - vector signed char q8y01 = vec_xl( 32, q8); - vector signed char q8y11 = vec_xl( 48, q8); - vector signed char q8y02 = vec_xl( 64, q8); - vector signed char q8y12 = vec_xl( 80, q8); - vector signed char q8y03 = vec_xl( 96, q8); - vector signed char q8y13 = vec_xl(112, q8); - q8 += 128; - - vector signed int qv0 = vec_msum(q8y00, q2x00, v0); - vector signed int qv1 = vec_msum(q8y01, q2x01, v0); - vector signed int qv2 = vec_msum(q8y02, q2x02, v0); - vector signed int qv3 = vec_msum(q8y03, q2x03, v0); - vector signed int qv4 = vec_msum(q8y10, q2x10, v0); - vector signed int qv5 = vec_msum(q8y11, q2x11, v0); - vector signed int qv6 = vec_msum(q8y12, q2x12, v0); - vector signed int qv7 = vec_msum(q8y13, q2x13, v0); - - vector signed short vscales_07 = vec_unpackh(vscales); - vector signed int vscales_03 = vec_unpackh(vscales_07); - vector signed int vscales_47 = vec_unpackl(vscales_07); - vector signed int vs0 = vec_splat(vscales_03, 0); - vector signed int vs1 = vec_splat(vscales_03, 1); - vector signed int vs2 = vec_splat(vscales_03, 2); - vector signed int vs3 = vec_splat(vscales_03, 3); - vector signed int vs4 = vec_splat(vscales_47, 0); - vector signed int vs5 = vec_splat(vscales_47, 1); - vector signed int vs6 = vec_splat(vscales_47, 2); - vector signed int vs7 = vec_splat(vscales_47, 3); - vscales = vec_sld(vscales, vscales, 8); - - vsumi0 = vec_add(vec_mul(qv0, vs0), vsumi0); - vsumi1 = vec_add(vec_mul(qv1, vs2), vsumi1); - vsumi2 
= vec_add(vec_mul(qv2, vs4), vsumi2); - vsumi3 = vec_add(vec_mul(qv3, vs6), vsumi3); - vsumi4 = vec_add(vec_mul(qv4, vs1), vsumi4); - vsumi5 = vec_add(vec_mul(qv5, vs3), vsumi5); - vsumi6 = vec_add(vec_mul(qv6, vs5), vsumi6); - vsumi7 = vec_add(vec_mul(qv7, vs7), vsumi7); - } - - vsumi0 = vec_add(vsumi0, vsumi4); - vsumi1 = vec_add(vsumi1, vsumi5); - vsumi2 = vec_add(vsumi2, vsumi6); - vsumi3 = vec_add(vsumi3, vsumi7); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = vec_extract(vsumf0, 0); - -#elif defined __loongarch_asx - - __m256 acc = (__m256)__lasx_xvldi(0); - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const uint8_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - const __m128i mins_and_scales128 = __lsx_vld((const __m128i*)x[i].scales, 0); - const __m128i scales128 = __lsx_vandi_b(mins_and_scales128, 0xf); - const __m256i mins = lasx_ext8_16(__lsx_vsrli_b(mins_and_scales128, 4)); - const __m256i prod = lasx_madd_h(mins, __lasx_xvld((const __m256i*)y[i].bsums, 0)); - - acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(dmin), __lasx_xvffint_s_w(prod), acc); - - const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; - const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask)); - - __m256i sumi = __lasx_xvldi(0); - - for (int j = 0; j < QK_K/128; ++j) { - - const __m256i q2bits = __lasx_xvld((const __m256i*)q2, 0); q2 += 32; - - const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - - const __m256i q2_0 = __lasx_xvandi_b(q2bits, 3); - const __m256i q2_1 = __lasx_xvandi_b(__lasx_xvsrli_b(q2bits, 2), 3); - const __m256i q2_2 = __lasx_xvandi_b(__lasx_xvsrli_b(q2bits, 4), 3); - const __m256i q2_3 = __lasx_xvsrli_b(q2bits, 6); - - __m256i p0 = lasx_madd_h_b(q2_0, q8_0); - __m256i p1 = lasx_madd_h_b(q2_1, q8_1); - __m256i p2 = lasx_madd_h_b(q2_2, q8_2); - __m256i p3 = lasx_madd_h_b(q2_3, q8_3); - - p0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p0); - p1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p1); - p2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p2); - p3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p3); - - p0 = __lasx_xvadd_w(p0, p1); - p2 = __lasx_xvadd_w(p2, p3); - - sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p0, p2)); - } - - acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc); - - } - - *s = hsum_float_8(acc); - -#else - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - - const uint8_t * q2 = x[i].qs; - const int8_t * q8 = y[i].qs; - const uint8_t * sc = x[i].scales; - - int summs = 0; - for (int j = 0; j < 16; ++j) { - summs += y[i].bsums[j] * (sc[j] >> 4); - } - - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * 
GGML_FP16_TO_FP32(x[i].dmin); - - int isum = 0; - int is = 0; - int d; - for (int k = 0; k < QK_K/128; ++k) { - int shift = 0; - for (int j = 0; j < 4; ++j) { - d = sc[is++] & 0xF; - int isuml = 0; - for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); - isum += d * isuml; - d = sc[is++] & 0xF; - isuml = 0; - for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); - isum += d * isuml; - shift += 2; - q8 += 32; - } - q2 += 32; - } - sumf += dall * isum - dmin * summs; - } - *s = sumf; -#endif -} - -void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const uint32_t kmask1 = 0x03030303; - const uint32_t kmask2 = 0x0f0f0f0f; - - const block_q3_K * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#if defined(__ARM_FEATURE_SVE) - - uint32_t aux[3]; - uint32_t utmp[4]; - - const int8_t m32 = 32; - const int vector_length = svcntb()*8; - const svuint8_t m3b_sv = svdup_n_u8(0x3); - const svint32_t vzero_sv = svdup_n_s32(0); - - const svuint8_t m0_sv = svdup_n_u8(1); - const svuint8_t m1_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 1); - const svuint8_t m2_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 2); - const svuint8_t m3_sv = svlsl_n_u8_x(svptrue_b8(), m0_sv, 3); - - float sum = 0; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * GGML_RESTRICT q3_sv = x[i].qs; - const uint8_t * GGML_RESTRICT qh_sv = x[i].hmask; - const int8_t * GGML_RESTRICT q8_sv = y[i].qs; - - // Set up scales - memcpy(aux, x[i].scales, 12); - utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); - utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); - utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); - utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); - - int8_t * scale = (int8_t *)utmp; - - for (int j = 0; j < 16; ++j) scale[j] -= m32; - - switch (vector_length) { - case 128: - { - svuint8_t qhbits_sv_1 = svld1_u8(svptrue_b8(), qh_sv); - svuint8_t qhbits_sv_2 = svld1_u8(svptrue_b8(), qh_sv+16); - svuint8_t q3h_sv; - - svint32_t sumi1_1 = svdup_n_s32(0); - svint8_t q3bytes_sv; - - for (int j = 0; j < QK_K/128; ++j) { - - const svuint8_t q3bits_sv = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16; - const svuint8_t q3bits_sv_1 = svld1_u8(svptrue_b8(), q3_sv); q3_sv += 16; - svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_1), 2); - q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0])); - - q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m0_sv, qhbits_sv_2), 2); - q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), q3bits_sv_1, m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1])); - - q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); 
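// The aux/utmp bit surgery above expands block_q3_K's 12 packed scale bytes
// into sixteen 6-bit scales and recenters them by -32 so they become signed.
// The same expansion as a standalone sketch (hypothetical helper; the masks
// are kmask2 = 0x0f0f0f0f and kmask1 = 0x03030303 as defined at the top of
// the function):
#include <stdint.h>
#include <string.h>

static void q3k_unpack_scales(const uint8_t sc[12], int8_t scale[16]) {
    uint32_t aux[3], utmp[4];
    memcpy(aux, sc, 12);
    // low 4 bits of each scale come from aux[0..1], high 2 bits from aux[2]
    utmp[0] = (aux[0]      & 0x0f0f0f0f) | (((aux[2] >> 0) & 0x03030303) << 4);
    utmp[1] = (aux[1]      & 0x0f0f0f0f) | (((aux[2] >> 2) & 0x03030303) << 4);
    utmp[2] = ((aux[0] >> 4) & 0x0f0f0f0f) | (((aux[2] >> 4) & 0x03030303) << 4);
    utmp[3] = ((aux[1] >> 4) & 0x0f0f0f0f) | (((aux[2] >> 6) & 0x03030303) << 4);
    memcpy(scale, utmp, 16);
    for (int j = 0; j < 16; ++j) scale[j] -= 32; // 6-bit unsigned -> signed
}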
q8_sv += 16; - - q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_1), 1); - q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2])); - - q3h_sv = svlsl_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m1_sv, qhbits_sv_2), 1); - q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3])); - - - scale += 4; - q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_1); - q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[0])); - - q3h_sv = svbic_u8_x(svptrue_b8(), m2_sv, qhbits_sv_2); - q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[1])); - - - q8bytes_1_sv_1 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - q8bytes_1_sv_2 = svld1_s8(svptrue_b8(), q8_sv); q8_sv += 16; - - q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_1), 1); - q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), svdup_n_s32((int32_t)scale[2])); - - q3h_sv = svlsr_n_u8_x(svptrue_b8(), svbic_u8_x(svptrue_b8(), m3_sv, qhbits_sv_2), 1); - q3bytes_sv = svsub_s8_x(svptrue_b8(), svreinterpret_s8_u8(svand_u8_m(svptrue_b8(), svlsr_n_u8_x(svptrue_b8(), q3bits_sv_1, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - sumi1_1 = svmla_s32_m(svptrue_b32(), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), svdup_n_s32((int32_t)scale[3])); - - if (j == 0) { - qhbits_sv_1 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_1, 4); - qhbits_sv_2 = svlsr_n_u8_x(svptrue_b8(), qhbits_sv_2, 4); - } - - scale += 4; - } - - sum += d * (svaddv_s32(svptrue_b32(), sumi1_1)); - } break; - case 256: - case 512: - { - svuint8_t qhbits_sv = svld1_u8(svptrue_pat_b8(SV_VL32), qh_sv); - svuint8_t q3h_sv; - - svint32_t sumi1_1 = svdup_n_s32(0); - svint8_t q3bytes_sv; - - for (int j = 0; j < QK_K/128; ++j) { - - const svuint8_t q3bits_sv = svld1_u8(svptrue_pat_b8(SV_VL32), q3_sv); q3_sv += 32; - svint8_t q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - svint8_t q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - - q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m0_sv, qhbits_sv), 2); - q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), q3bits_sv, m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - - svint32_t scale_1 
= svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1])); - sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1); - - q3h_sv = svlsl_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m1_sv, qhbits_sv), 1); - q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 2), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3])); - sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1); - - scale += 4; - q8bytes_1_sv_1 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - q8bytes_1_sv_2 = svld1_s8(svptrue_pat_b8(SV_VL32), q8_sv); q8_sv += 32; - - q3h_sv = svbic_u8_x(svptrue_pat_b8(SV_VL32), m2_sv, qhbits_sv); - q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 4), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[0]), svdup_n_s32((int32_t)scale[1])); - sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_1), scale_1); - - q3h_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), svbic_u8_x(svptrue_pat_b8(SV_VL32), m3_sv, qhbits_sv), 1); - q3bytes_sv = svsub_s8_x(svptrue_pat_b8(SV_VL32), svreinterpret_s8_u8(svand_u8_m(svptrue_pat_b8(SV_VL32), svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q3bits_sv, 6), m3b_sv)), svreinterpret_s8_u8(q3h_sv)); - - scale_1 = svsel_s32(svptrue_pat_b32(SV_VL4), svdup_n_s32((int32_t)scale[2]), svdup_n_s32((int32_t)scale[3])); - sumi1_1 = svmla_s32_m(svptrue_pat_b32(SV_VL8), sumi1_1, svdot_s32(vzero_sv, q3bytes_sv, q8bytes_1_sv_2), scale_1); - - if (j == 0) { - qhbits_sv = svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), qhbits_sv, 4); - } - - scale += 4; - } - - sum += d * (svaddv_s32(svptrue_pat_b32(SV_VL8), sumi1_1)); - } break; - default: - assert(false && "Unsupported vector length"); - break; - } - } - *s = sum; - -#elif __ARM_NEON - - uint32_t aux[3]; - uint32_t utmp[4]; - - const uint8x16_t m3b = vdupq_n_u8(0x3); - const int32x4_t vzero = vdupq_n_s32(0); - - const uint8x16_t m0 = vdupq_n_u8(1); - const uint8x16_t m1 = vshlq_n_u8(m0, 1); - const uint8x16_t m2 = vshlq_n_u8(m0, 2); - const uint8x16_t m3 = vshlq_n_u8(m0, 3); - const int8_t m32 = 32; - - ggml_int8x16x4_t q3bytes; - - float sum = 0; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].hmask; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); - - ggml_uint8x16x4_t q3h; - - int32_t isum = 0; - - // Set up scales - memcpy(aux, x[i].scales, 12); - utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); - utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); - utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); - utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); - - int8_t * scale = (int8_t *)utmp; - for (int j = 0; j < 16; ++j) scale[j] -= m32; - - for (int j = 0; j < QK_K/128; ++j) { - - const ggml_uint8x16x2_t q3bits = ggml_vld1q_u8_x2(q3); q3 += 32; - const ggml_int8x16x4_t q8bytes_1 = ggml_vld1q_s8_x4(q8); q8 += 64; - 
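// In block_q3_K each quant is split across two arrays: its 2 low bits live in
// x[i].qs and its high bit in x[i].hmask, and the logical value is
// (low2 | hbit << 2) - 4. The vbicq_u8/vshlq_n_u8 sequence just below realizes
// the identical result as low2 - 4*(1 - hbit): q3h is 4 exactly when the
// hmask bit is clear, so the 3-bit quant is never materialized. One-lane
// scalar sketch (hypothetical helper, for illustration only):
static inline int8_t q3k_dequant_lane(uint8_t low2, int hbit) {
    // identical to (int8_t)((low2 | hbit << 2) - 4)
    return (int8_t)(low2 - (hbit ? 0 : 4));
}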
const ggml_int8x16x4_t q8bytes_2 = ggml_vld1q_s8_x4(q8); q8 += 64; - - q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2); - q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2); - q3h.val[2] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[0]), 1); - q3h.val[3] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[1]), 1); - - q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[0], m3b)), vreinterpretq_s8_u8(q3h.val[0])); - q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[1], m3b)), vreinterpretq_s8_u8(q3h.val[1])); - q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2])); - q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3])); - - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0]; - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1]; - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2]; - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3]; - - scale += 4; - - q3h.val[0] = vbicq_u8(m2, qhbits.val[0]); - q3h.val[1] = vbicq_u8(m2, qhbits.val[1]); - q3h.val[2] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[0]), 1); - q3h.val[3] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[1]), 1); - - q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 4), m3b)), vreinterpretq_s8_u8(q3h.val[0])); - q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 4), m3b)), vreinterpretq_s8_u8(q3h.val[1])); - q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2])); - q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3])); - - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0]; - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1]; - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2]; - isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3]; - - scale += 4; - - if (j == 0) { - qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 4); - qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 4); - } - - } - sum += d * isum; - - } - - *s = sum; - -#elif defined __AVX2__ - - const __m256i m3 = _mm256_set1_epi8(3); - const __m256i mone = _mm256_set1_epi8(1); - const __m128i m32 = _mm_set1_epi8(32); - - __m256 acc = _mm256_setzero_ps(); - - uint32_t aux[3]; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - // Set up scales - memcpy(aux, x[i].scales, 12); - __m128i scales128 = _mm_set_epi32( - ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), - ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), - (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), - (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); - scales128 = _mm_sub_epi8(scales128, m32); - const __m256i all_scales = _mm256_cvtepi8_epi16(scales128); - const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0); - const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1); - const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)}; - - // high bit - const __m256i hbits = 
_mm256_loadu_si256((const __m256i*)x[i].hmask); - - // integer accumulator - __m256i sumi = _mm256_setzero_si256(); - - int bit = 0; - int is = 0; - - for (int j = 0; j < QK_K/128; ++j) { - // load low 2 bits - const __m256i q3bits = _mm256_loadu_si256((const __m256i*)q3); q3 += 32; - - // prepare low and high bits - const __m256i q3l_0 = _mm256_and_si256(q3bits, m3); - const __m256i q3h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); - ++bit; - - const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 2), m3); - const __m256i q3h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); - ++bit; - - const __m256i q3l_2 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 4), m3); - const __m256i q3h_2 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); - ++bit; - - const __m256i q3l_3 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 6), m3); - const __m256i q3h_3 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); - ++bit; - - // load Q8 quants - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - - // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16, - // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, - // and 2 if the high bit was set) - __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0); - __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1); - __m256i q8s_2 = _mm256_maddubs_epi16(q3h_2, q8_2); - __m256i q8s_3 = _mm256_maddubs_epi16(q3h_3, q8_3); - - __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0); - __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1); - __m256i p16_2 = _mm256_maddubs_epi16(q3l_2, q8_2); - __m256i p16_3 = _mm256_maddubs_epi16(q3l_3, q8_3); - - p16_0 = _mm256_sub_epi16(p16_0, q8s_0); - p16_1 = _mm256_sub_epi16(p16_1, q8s_1); - p16_2 = _mm256_sub_epi16(p16_2, q8s_2); - p16_3 = _mm256_sub_epi16(p16_3, q8s_3); - - // multiply with scales - p16_0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0); - p16_1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1); - p16_2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2); - p16_3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3); - - // accumulate - p16_0 = _mm256_add_epi32(p16_0, p16_1); - p16_2 = _mm256_add_epi32(p16_2, p16_3); - sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_2)); - - } - - // multiply with block scale and accumulate - acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); - - } - - *s = hsum_float_8(acc); - -#elif defined __AVX__ - - const __m128i m3 = _mm_set1_epi8(3); - const __m128i mone = _mm_set1_epi8(1); - const __m128i m32 = _mm_set1_epi8(32); - const __m128i m2 = _mm_set1_epi8(2); - - __m256 acc = _mm256_setzero_ps(); - - const uint32_t *aux; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - // Set up scales - aux 
= (const uint32_t *)x[i].scales; - __m128i scales128 = _mm_set_epi32( - ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), - ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), - (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), - (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); - scales128 = _mm_sub_epi8(scales128, m32); - const __m128i scales_0 = _mm_cvtepi8_epi16(scales128); - const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales128, scales128)); - const __m128i scales[2] = { scales_0, scales_1 }; - - // high bit *128*2 from block_q3_K.hmask[QK_K/8] - const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].hmask[0]); - const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].hmask[16]); - - // integer accumulator - __m128i sumi_0 = _mm_setzero_si128(); - __m128i sumi_1 = _mm_setzero_si128(); - - for (int j = 0; j < QK_K/128; ++j) { - // load low 2 bits *64*2 from block_q3_K.qs[QK_K/4] - const __m128i q3bits_0 = _mm_loadu_si128((const __m128i*)q3); q3 += 16; - const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16; - - // prepare low and high bits - const int bit = j << 2; - - const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3); - const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3); - const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2); - const __m128i q3h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit)), bit), 2); - - const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 2), m3); - const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 2), m3); - const __m128i q3h_2 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+1)), bit+1), 2); - const __m128i q3h_3 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+1)), bit+1), 2); - - const __m128i q3l_4 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 4), m3); - const __m128i q3l_5 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 4), m3); - const __m128i q3h_4 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+2)), bit+2), 2); - const __m128i q3h_5 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+2)), bit+2), 2); - - const __m128i q3l_6 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 6), m3); - const __m128i q3l_7 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 6), m3); - const __m128i q3h_6 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+3)), bit+3), 2); - const __m128i q3h_7 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+3)), bit+3), 2); - - // load Q8 quants from block_q8_K.qs[QK_K] - const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - - // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16, - // and then subtract. 
The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, - // and 2 if the high bit was set) - __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, q8_0); - __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, q8_1); - __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, q8_2); - __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, q8_3); - __m128i q8s_4 = _mm_maddubs_epi16(q3h_4, q8_4); - __m128i q8s_5 = _mm_maddubs_epi16(q3h_5, q8_5); - __m128i q8s_6 = _mm_maddubs_epi16(q3h_6, q8_6); - __m128i q8s_7 = _mm_maddubs_epi16(q3h_7, q8_7); - - __m128i p16_0 = _mm_maddubs_epi16(q3l_0, q8_0); - __m128i p16_1 = _mm_maddubs_epi16(q3l_1, q8_1); - __m128i p16_2 = _mm_maddubs_epi16(q3l_2, q8_2); - __m128i p16_3 = _mm_maddubs_epi16(q3l_3, q8_3); - __m128i p16_4 = _mm_maddubs_epi16(q3l_4, q8_4); - __m128i p16_5 = _mm_maddubs_epi16(q3l_5, q8_5); - __m128i p16_6 = _mm_maddubs_epi16(q3l_6, q8_6); - __m128i p16_7 = _mm_maddubs_epi16(q3l_7, q8_7); - - p16_0 = _mm_sub_epi16(p16_0, q8s_0); - p16_1 = _mm_sub_epi16(p16_1, q8s_1); - p16_2 = _mm_sub_epi16(p16_2, q8s_2); - p16_3 = _mm_sub_epi16(p16_3, q8s_3); - p16_4 = _mm_sub_epi16(p16_4, q8s_4); - p16_5 = _mm_sub_epi16(p16_5, q8s_5); - p16_6 = _mm_sub_epi16(p16_6, q8s_6); - p16_7 = _mm_sub_epi16(p16_7, q8s_7); - - // multiply with scales - __m128i shuffle = _mm_set1_epi16(0x0100); - p16_0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_0); - shuffle = _mm_add_epi16(shuffle, m2); - p16_1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_1); - shuffle = _mm_add_epi16(shuffle, m2); - p16_2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_2); - shuffle = _mm_add_epi16(shuffle, m2); - p16_3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_3); - shuffle = _mm_add_epi16(shuffle, m2); - p16_4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_4); - shuffle = _mm_add_epi16(shuffle, m2); - p16_5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_5); - shuffle = _mm_add_epi16(shuffle, m2); - p16_6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_6); - shuffle = _mm_add_epi16(shuffle, m2); - p16_7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_7); - - // accumulate - p16_0 = _mm_add_epi32(p16_0, p16_1); - p16_2 = _mm_add_epi32(p16_2, p16_3); - p16_4 = _mm_add_epi32(p16_4, p16_5); - p16_6 = _mm_add_epi32(p16_6, p16_7); - sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); - sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_4, p16_6)); - - } - - // multiply with block scale and accumulate - __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc); - - } - - *s = hsum_float_8(acc); - -#elif defined __wasm_simd128__ - int8_t aux8[QK_K]; - float sums[8] = {0}; - uint32_t auxs[4]; - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const uint8_t * GGML_RESTRICT hm = x[i].hmask; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - // Process blocks with SIMD - int8_t * a = aux8; - uint8_t m = 1; - for (int j = 0; j < QK_K; j += 128) { - for (int shift = 0; shift <= 6; shift += 2) { - v128_t v_m = wasm_i8x16_splat(m); - for (int l = 0; l < 32; l += 16) { - v128_t v_q3 = wasm_v128_load(q3 + l); - v128_t v_shift = wasm_i8x16_shr(v_q3, shift); - v128_t v_low2 = wasm_v128_and(v_shift, wasm_i8x16_splat(0x03)); - - v128_t v_hm = wasm_v128_load(hm + l); - v128_t v_mask = wasm_v128_and(v_hm, v_m); - v_mask = wasm_i8x16_ne(v_mask, wasm_i8x16_splat(0)); - - v_low2 = wasm_i8x16_sub(v_low2, 
wasm_v128_and(wasm_i8x16_splat(4), wasm_v128_not(v_mask))); - wasm_v128_store(a + l, v_low2); - } - a += 32; - m <<= 1; - } - q3 += 32; - } - - // Extract scales - memcpy(auxs, x[i].scales, 12); - uint32_t tmp = auxs[2]; - auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); - auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); - auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); - auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); - const int8_t * scales = (const int8_t *)auxs; - - // SIMD dot product with register accumulators - v128_t v_acc0 = wasm_i32x4_splat(0); - v128_t v_acc1 = wasm_i32x4_splat(0); - a = aux8; - for (int j = 0; j < QK_K/16; ++j) { - const v128_t v_scale = wasm_i16x8_splat(scales[j] - 32); - - // Process 16 elements per iteration - for (int k = 0; k < 2; ++k) { - const v128_t v_q8 = wasm_i16x8_load8x8(q8); - const v128_t v_a = wasm_i16x8_load8x8(a); - - v128_t v_prod = wasm_i16x8_mul(v_q8, v_a); - v_prod = wasm_i16x8_mul(v_prod, v_scale); - - v_acc0 = wasm_i32x4_add(v_acc0, wasm_i32x4_extend_low_i16x8(v_prod)); - v_acc1 = wasm_i32x4_add(v_acc1, wasm_i32x4_extend_high_i16x8(v_prod)); - - q8 += 8; - a += 8; - } - } - - // Accumulate results - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const v128_t v_d = wasm_f32x4_splat(d); - v128_t v_sum = wasm_f32x4_add( - wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc0), v_d), - wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc1), v_d) - ); - - // Accumulate into sums vector - wasm_v128_store(sums, wasm_f32x4_add(wasm_v128_load(sums), v_sum)); - } - - // Horizontal sum - v128_t v_sum = wasm_f32x4_add(wasm_v128_load(sums), wasm_v128_load(sums + 4)); - sumf = wasm_f32x4_extract_lane(v_sum, 0) + - wasm_f32x4_extract_lane(v_sum, 1) + - wasm_f32x4_extract_lane(v_sum, 2) + - wasm_f32x4_extract_lane(v_sum, 3); - - *s = sumf; - -#elif defined __riscv_xtheadvector - - uint32_t utmp[4]; - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; - - int8_t * scale = (int8_t *)utmp; - int tmp; - __asm__ __volatile__( - "li %[tmp], 12\n\t" - "th.vsetvli zero, %[tmp], e8, m1\n\t" - "th.vlb.v v0, (%[s6b])\n\t" - "th.vmv.v.v v2, v0\n\t" - "li %[tmp], 2\n\t" - "th.vsetvli zero, %[tmp], e64, m1\n\t" - "th.vmv.v.x v9, %[sh]\n\t"\ - "th.vslidedown.vi v1, v0, 1\n\t" - "th.vslide1up.vx v8, v9, zero\n\t" // {0, 0, 4, 4} - "th.vslideup.vi v0, v2, 1\n\t" // {aux[0], aux[1], aux[0], aux[1]} - "li %[tmp], 4\n\t" - "th.vsetvli zero, %[tmp], e32, m1\n\t" - "th.vid.v v9\n\t" - "th.vmv.x.s %[tmp], v1\n\t" - "th.vsll.vi v9, v9, 1\n\t" // {0, 2, 4, 6} - "th.vmv.v.x v1, %[tmp]\n\t" // {aux[2], aux[2], aux[2], aux[2]} - "th.vsrl.vv v4, v1, v9\n\t" - "th.vsrl.vv v2, v0, v8\n\t" - "th.vand.vx v5, v4, %[kmask1]\n\t" - "th.vand.vx v3, v2, %[kmask2]\n\t" - "th.vsll.vi v6, v5, 4\n\t" - "th.vor.vv v7, v6, v3\n\t" - "li %[tmp], 16\n\t" - "th.vsetvli zero, %[tmp], e8, m1\n\t" - "th.vsub.vx v0, v7, %[c]\n\t" - "th.vsb.v v0, (%[scale])" - : [tmp] "=&r" (tmp) - : [sh] "r" (0x0000000400000004), [s6b] "r" (x[i].scales), [c] "r" (32) - , [scale] "r" (scale), [kmask1] "r" (kmask1), [kmask2] "r" (kmask2) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - - uint8_t m = 1; - int isum = 0; - for (int j = 0; j < QK_K; j += 
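// The 12 scale bytes of a q3_K block pack 16 six-bit scales; this is a scalar
// sketch of the kmask1/kmask2 unpacking used above (assumes <string.h>; for
// q3_K, kmask1 = 0x03030303 and kmask2 = 0x0f0f0f0f, inlined here as literals).
static void q3k_unpack_scales(const uint8_t * packed12, int8_t * scale16) {
    uint32_t aux[4];
    memcpy(aux, packed12, 12);
    const uint32_t tmp = aux[2];
    aux[2] = ((aux[0] >> 4) & 0x0f0f0f0f) | (((tmp >> 4) & 0x03030303) << 4);
    aux[3] = ((aux[1] >> 4) & 0x0f0f0f0f) | (((tmp >> 6) & 0x03030303) << 4);
    aux[0] = (aux[0] & 0x0f0f0f0f) | (((tmp >> 0) & 0x03030303) << 4);
    aux[1] = (aux[1] & 0x0f0f0f0f) | (((tmp >> 2) & 0x03030303) << 4);
    memcpy(scale16, aux, 16);
    for (int j = 0; j < 16; ++j) scale16[j] -= 32; // scales carry a +32 bias
}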
128) { - __asm__ __volatile__( - // fixme: use v0p7 mask layout directly - "th.vsetvli zero, %[vl32], e8, m2\n\t" - "th.vlb.v v8, (%[q3])\n\t" - "th.vsrl.vi v10, v8, 2\n\t" - "th.vsrl.vi v12, v8, 4\n\t" - "th.vsrl.vi v14, v8, 6\n\t" - "th.vand.vi v8, v8, 3\n\t" - "th.vand.vi v10, v10, 3\n\t" - "th.vand.vi v12, v12, 3\n\t" - "th.vlb.v v2, (%[qh])\n\t" - "th.vand.vx v4, v2, %[m]\n\t" - "slli %[m], %[m], 1\n\t" - "th.vmseq.vx v0, v4, zero\n\t" - "th.vadd.vi v8, v8, -4, v0.t\n\t" - "th.vand.vx v4, v2, %[m]\n\t" - "slli %[m], %[m], 1\n\t" - "th.vmseq.vx v0, v4, zero\n\t" - "th.vadd.vi v10, v10, -4, v0.t\n\t" - "th.vand.vx v4, v2, %[m]\n\t" - "slli %[m], %[m], 1\n\t" - "th.vmseq.vx v0, v4, zero\n\t" - "th.vadd.vi v12, v12, -4, v0.t\n\t" - "th.vand.vx v4, v2, %[m]\n\t" - "slli %[m], %[m], 1\n\t" - "th.vmseq.vx v0, v4, zero\n\t" - "th.vadd.vi v14, v14, -4, v0.t\n\t" - "th.vsetvli zero, %[vl128], e8, m8\n\t" - "th.vlb.v v0, (%[q8])\n\t" - "th.vsetvli zero, %[vl64], e8, m4\n\t" - "th.vwmul.vv v16, v0, v8\n\t" - "th.vwmul.vv v24, v4, v12\n\t" - "li %[tmp], 16\n\t" - "th.vsetvli zero, %[tmp], e16, m2\n\t" - "th.vmv.v.x v0, zero\n\t" - "th.vwredsum.vs v10, v16, v0\n\t" - "th.vwredsum.vs v9, v18, v0\n\t" - "th.vwredsum.vs v8, v20, v0\n\t" - "th.vwredsum.vs v7, v22, v0\n\t" - "th.vwredsum.vs v11, v24, v0\n\t" - "th.vwredsum.vs v12, v26, v0\n\t" - "th.vwredsum.vs v13, v28, v0\n\t" - "th.vwredsum.vs v14, v30, v0\n\t" - "li %[tmp], 4\n\t" - "th.vsetvli zero, %[tmp], e32, m1\n\t" - "th.vslideup.vi v10, v9, 1\n\t" - "th.vslideup.vi v8, v7, 1\n\t" - "th.vslideup.vi v11, v12, 1\n\t" - "th.vslideup.vi v13, v14, 1\n\t" - "th.vslideup.vi v10, v8, 2\n\t" - "th.vslideup.vi v11, v13, 2\n\t" - "li %[tmp], 8\n\t" - "th.vsetvli zero, %[tmp], e32, m2\n\t" - "th.vlb.v v12, (%[scale])\n\t" - "th.vmul.vv v10, v10, v12\n\t" - "th.vredsum.vs v0, v10, v0\n\t" - "th.vmv.x.s %[tmp], v0\n\t" - "add %[isum], %[isum], %[tmp]" - : [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum) - : [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32) - , [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - q3 += 32; q8 += 128; scale += 8; - } - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - sumf += d * isum; - } - - *s = sumf; - -#elif defined __riscv_v - - uint32_t utmp[4]; - float sumf = 0; - uint32_t aux[3]; - const int vector_length = __riscv_vlenb() * 8; - - switch (vector_length) { - case 256: - for (int i = 0; i < nb; ++i) { - - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].hmask; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - memcpy(aux, x[i].scales, 12); - utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); - utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); - utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); - utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); - - int8_t * scale = (int8_t *)utmp; - for (int j = 0; j < 16; ++j) scale[j] -= 32; - - - size_t vl = 32; - uint8_t m = 1; - - vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); - vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl); - - int sum_t = 0; - - for (int j = 0; j < QK_K; j += 128) { - - vl = 32; - - // load Q3 - vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl); - - vint8m1_t q3_0 = 
__riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl)); - vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl)); - vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl)); - vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl)); - - // compute mask for subtraction - vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl); - vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl); - vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl); - m <<= 1; - - vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl); - vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl); - vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl); - m <<= 1; - - vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl); - vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl); - vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl); - m <<= 1; - - vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl); - vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl); - vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl); - m <<= 1; - - // load Q8 and take product with Q3 - vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl); - vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl); - vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl); - vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl); - - vl = 16; - - // retrieve lane to multiply with scale - vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl); - vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl); - vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl); - vint32m2_t aux1_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 1), (scale[3]), vl); - vint32m2_t aux2_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 0), (scale[4]), vl); - vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl); - vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl); - vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl); - - vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl); - vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl); - vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl); - vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl); - - sum_t += __riscv_vmv_x_s_i32m1_i32(isum3); - - q3 += 32; q8 += 128; scale += 8; - - } - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - - sumf += d*sum_t; - - } - break; - case 128: - for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; - - int8_t * scale = (int8_t *)utmp; - int tmp; - __asm__ __volatile__( - "vsetivli zero, 12, e8, m1\n\t" - "vle8.v v0, (%[s6b])\n\t" - "vmv1r.v v2, v0\n\t" - "vsetivli zero, 2, e64, m1\n\t" - "vmv.v.x v9, %[sh]\n\t"\ - "vslidedown.vi v1, v0, 1\n\t" - "vslide1up.vx v8, 
v9, zero\n\t" // {0, 0, 4, 4} - "vslideup.vi v0, v2, 1\n\t" // {aux[0], aux[1], aux[0], aux[1]} - "vsetivli zero, 4, e32, m1\n\t" - "vid.v v9\n\t" - "vmv.x.s %[tmp], v1\n\t" - "vsll.vi v9, v9, 1\n\t" // {0, 2, 4, 6} - "vmv.v.x v1, %[tmp]\n\t" // {aux[2], aux[2], aux[2], aux[2]} - "vsrl.vv v4, v1, v9\n\t" - "vsrl.vv v2, v0, v8\n\t" - "vand.vx v5, v4, %[kmask1]\n\t" - "vand.vx v3, v2, %[kmask2]\n\t" - "vsll.vi v6, v5, 4\n\t" - "vor.vv v7, v6, v3\n\t" - "vsetivli zero, 16, e8, m1\n\t" - "vsub.vx v0, v7, %[c]\n\t" - "vse8.v v0, (%[scale])" - : [tmp] "=&r" (tmp) - : [sh] "r" (0x0000000400000004), [s6b] "r" (x[i].scales), [c] "r" (32) - , [scale] "r" (scale), [kmask1] "r" (kmask1), [kmask2] "r" (kmask2) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - - uint8_t m = 1; - int isum = 0; - for (int j = 0; j < QK_K; j += 128) { - __asm__ __volatile__( - "vsetvli zero, %[vl32], e8, m2, ta, mu\n\t" - "vle8.v v8, (%[q3])\n\t" - "vsrl.vi v10, v8, 2\n\t" - "vsrl.vi v12, v8, 4\n\t" - "vsrl.vi v14, v8, 6\n\t" - "vand.vi v8, v8, 3\n\t" - "vand.vi v10, v10, 3\n\t" - "vand.vi v12, v12, 3\n\t" - "vle8.v v2, (%[qh])\n\t" - "vand.vx v4, v2, %[m]\n\t" - "slli %[m], %[m], 1\n\t" - "vmseq.vx v0, v4, zero\n\t" - "vadd.vi v8, v8, -4, v0.t\n\t" - "vand.vx v4, v2, %[m]\n\t" - "slli %[m], %[m], 1\n\t" - "vmseq.vx v0, v4, zero\n\t" - "vadd.vi v10, v10, -4, v0.t\n\t" - "vand.vx v4, v2, %[m]\n\t" - "slli %[m], %[m], 1\n\t" - "vmseq.vx v0, v4, zero\n\t" - "vadd.vi v12, v12, -4, v0.t\n\t" - "vand.vx v4, v2, %[m]\n\t" - "slli %[m], %[m], 1\n\t" - "vmseq.vx v0, v4, zero\n\t" - "vadd.vi v14, v14, -4, v0.t\n\t" - "vsetvli zero, %[vl128], e8, m8\n\t" - "vle8.v v0, (%[q8])\n\t" - "vsetvli zero, %[vl64], e8, m4\n\t" - "vwmul.vv v16, v0, v8\n\t" - "vwmul.vv v24, v4, v12\n\t" - "vsetivli zero, 16, e16, m2\n\t" - "vmv.v.x v0, zero\n\t" - "vwredsum.vs v10, v16, v0\n\t" - "vwredsum.vs v9, v18, v0\n\t" - "vwredsum.vs v8, v20, v0\n\t" - "vwredsum.vs v7, v22, v0\n\t" - "vwredsum.vs v11, v24, v0\n\t" - "vwredsum.vs v12, v26, v0\n\t" - "vwredsum.vs v13, v28, v0\n\t" - "vwredsum.vs v14, v30, v0\n\t" - "vsetivli zero, 4, e32, m1\n\t" - "vslideup.vi v10, v9, 1\n\t" - "vslideup.vi v8, v7, 1\n\t" - "vslideup.vi v11, v12, 1\n\t" - "vslideup.vi v13, v14, 1\n\t" - "vslideup.vi v10, v8, 2\n\t" - "vslideup.vi v11, v13, 2\n\t" - "vsetivli zero, 8, e32, m2\n\t" - "vle8.v v15, (%[scale])\n\t" - "vsext.vf4 v12, v15\n\t" - "vmul.vv v10, v10, v12\n\t" - "vredsum.vs v0, v10, v0\n\t" - "vmv.x.s %[tmp], v0\n\t" - "add %[isum], %[isum], %[tmp]" - : [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum) - : [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32) - , [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - q3 += 32; q8 += 128; scale += 8; - } - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - sumf += d * isum; - } - break; - default: - assert(false && "Unsupported vector length"); - break; - } - - *s = sumf; - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0x3); - const vector signed char lowMask1 = vec_splats((int8_t)0xf); - const vector signed char 
lowMask2 = vec_splats((int8_t)0x30); - const vector int v0 = vec_splats((int32_t)0); - const vector signed char v1 = vec_splats((signed char)0x1); - const vector unsigned char v2 = vec_splats((unsigned char)0x2); - const vector unsigned char v3 = vec_splats((unsigned char)0x3); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - const vector unsigned char v6 = vec_splats((unsigned char)0x6); - const vector signed char off = vec_splats((signed char)0x20); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - - for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - UNUSED(kmask1); - UNUSED(kmask2); - - vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8); - vector signed char u1 = vec_and(u0, lowMask1); - vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4); - vector signed char u3 = (vector signed char)vec_mergeh((vector signed int)u2, (vector signed int)vec_sr(u2, v2)); - vector signed char u30 = vec_sl(vec_and(u3, lowMask), v4); - vector signed char u31 = vec_and(u3, lowMask2); - - u1 = vec_or(u1, u30); - u2 = vec_or(vec_sr(u0, v4), u31); - - vector signed char vscales = (vector signed char)vec_mergeh((vector signed long long)u1, (vector signed long long)u2); - vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask); - vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask); - - vscales = vec_sub(vscales, off); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - vector signed int vsumi2 = v0; - vector signed int vsumi3 = v0; - vector signed int vsumi4 = v0; - vector signed int vsumi5 = v0; - vector signed int vsumi6 = v0; - vector signed int vsumi7 = v0; - - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - for (int j = 0; j < QK_K/128; ++j) { - __builtin_prefetch(q3, 0, 1); - __builtin_prefetch(q8, 0, 1); - - vector signed char qxs0 = (vector signed char)vec_xl( 0, q3); - vector signed char qxs1 = (vector signed char)vec_xl(16, q3); - q3 += 32; - - //the low 2 bits - vector signed char qxs00 = vec_and(qxs0, lowMask); - vector signed char qxs01 = vec_and(vec_sr(qxs0, v2), lowMask); - vector signed char qxs02 = vec_and(vec_sr(qxs0, v4), lowMask); - vector signed char qxs03 = vec_and(vec_sr(qxs0, v6), lowMask); - vector signed char qxs10 = vec_and(qxs1, lowMask); - vector signed char qxs11 = vec_and(vec_sr(qxs1, v2), lowMask); - vector signed char qxs12 = vec_and(vec_sr(qxs1, v4), lowMask); - vector signed char qxs13 = vec_and(vec_sr(qxs1, v6), lowMask); - - //the 3rd bit - vector signed char qxh00 = vec_sl(vec_andc(v1, qxhs0), v2); - vector signed char qxh01 = vec_sl(vec_andc(v1, vec_sr(qxhs0, (vector unsigned char)v1)), v2); - vector signed char qxh02 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v2)), v2); - vector signed char qxh03 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v3)), v2); - vector signed char qxh10 = vec_sl(vec_andc(v1, qxhs1), v2); - vector signed char qxh11 = vec_sl(vec_andc(v1, vec_sr(qxhs1, (vector unsigned char)v1)), v2); - vector signed char qxh12 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v2)), v2); - vector signed char qxh13 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v3)), v2); - qxhs0 = vec_sr(qxhs0, v4); - qxhs1 = vec_sr(qxhs1, v4); - - vector signed char q3x00 = vec_sub(qxs00, qxh00); - vector signed char q3x01 = 
vec_sub(qxs01, qxh01); - vector signed char q3x02 = vec_sub(qxs02, qxh02); - vector signed char q3x03 = vec_sub(qxs03, qxh03); - vector signed char q3x10 = vec_sub(qxs10, qxh10); - vector signed char q3x11 = vec_sub(qxs11, qxh11); - vector signed char q3x12 = vec_sub(qxs12, qxh12); - vector signed char q3x13 = vec_sub(qxs13, qxh13); - - vector signed char q8y00 = vec_xl( 0, q8); - vector signed char q8y10 = vec_xl( 16, q8); - vector signed char q8y01 = vec_xl( 32, q8); - vector signed char q8y11 = vec_xl( 48, q8); - vector signed char q8y02 = vec_xl( 64, q8); - vector signed char q8y12 = vec_xl( 80, q8); - vector signed char q8y03 = vec_xl( 96, q8); - vector signed char q8y13 = vec_xl(112, q8); - q8 += 128; - - vector signed short vscales_h = vec_unpackh(vscales); - vector signed short vs0 = vec_splat(vscales_h, 0); - vector signed short vs1 = vec_splat(vscales_h, 1); - vector signed short vs2 = vec_splat(vscales_h, 2); - vector signed short vs3 = vec_splat(vscales_h, 3); - vector signed short vs4 = vec_splat(vscales_h, 4); - vector signed short vs5 = vec_splat(vscales_h, 5); - vector signed short vs6 = vec_splat(vscales_h, 6); - vector signed short vs7 = vec_splat(vscales_h, 7); - vscales = vec_sld(vscales, vscales, 8); - - vector signed short qv00 = vec_add(vec_mule(q3x00, q8y00), vec_mulo(q3x00, q8y00)); - vector signed short qv01 = vec_add(vec_mule(q3x01, q8y01), vec_mulo(q3x01, q8y01)); - vector signed short qv02 = vec_add(vec_mule(q3x02, q8y02), vec_mulo(q3x02, q8y02)); - vector signed short qv03 = vec_add(vec_mule(q3x03, q8y03), vec_mulo(q3x03, q8y03)); - vector signed short qv10 = vec_add(vec_mule(q3x10, q8y10), vec_mulo(q3x10, q8y10)); - vector signed short qv11 = vec_add(vec_mule(q3x11, q8y11), vec_mulo(q3x11, q8y11)); - vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12)); - vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13)); - - vsumi0 = vec_msum(qv00, vs0, vsumi0); - vsumi1 = vec_msum(qv01, vs2, vsumi1); - vsumi2 = vec_msum(qv02, vs4, vsumi2); - vsumi3 = vec_msum(qv03, vs6, vsumi3); - vsumi4 = vec_msum(qv10, vs1, vsumi4); - vsumi5 = vec_msum(qv11, vs3, vsumi5); - vsumi6 = vec_msum(qv12, vs5, vsumi6); - vsumi7 = vec_msum(qv13, vs7, vsumi7); - } - - vsumi0 = vec_add(vsumi0, vsumi4); - vsumi1 = vec_add(vsumi1, vsumi5); - vsumi2 = vec_add(vsumi2, vsumi6); - vsumi3 = vec_add(vsumi3, vsumi7); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = vec_extract(vsumf0, 0); - -#elif defined __loongarch_asx - - const __m128i m32 = __lsx_vreplgr2vr_b(32); - - __m256 acc = (__m256)__lasx_xvldi(0); - - uint32_t aux[3]; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - // Set up scales - memcpy(aux, x[i].scales, 12); - __m128i scales128 = lsx_set_w( - ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), - ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), - (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), - (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); - scales128 = 
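// What the vec_mule/vec_mulo + vec_msum sequence above computes per 16-byte
// chunk, in scalar form (a model, not the actual POWER9 code path): a plain
// int8 dot product scaled by the block's 6-bit scale.
static int32_t p9_msum_model(const int8_t * q3x, const int8_t * q8y, int16_t sc) {
    int32_t acc = 0;
    for (int k = 0; k < 16; ++k) {
        acc += (int32_t)q3x[k] * q8y[k]; // mule/mulo split even/odd lanes
    }
    return sc * acc; // vec_msum folds the int16 pairs into the int32 lanes
}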
__lsx_vsub_b(scales128, m32); - - const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; - const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask)); - - // high bit - const __m256i hbits = __lasx_xvld((const __m256i*)x[i].hmask, 0); - - // integer accumulator - __m256i sumi = __lasx_xvldi(0); - - for (int j = 0; j < QK_K/128; ++j) { - // load low 2 bits - const __m256i q3bits = __lasx_xvld((const __m256i*)q3, 0); q3 += 32; - - // prepare low and high bits - const __m256i q3l_0 = __lasx_xvandi_b(q3bits, 3); - const __m256i q3l_1 = __lasx_xvandi_b(__lasx_xvsrli_b(q3bits, 2), 3); - const __m256i q3l_2 = __lasx_xvandi_b(__lasx_xvsrli_b(q3bits, 4), 3); - const __m256i q3l_3 = __lasx_xvsrli_b(q3bits, 6); - const __m256i q3h_0 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 0), 0), 2); - const __m256i q3h_1 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 1), 0), 2); - const __m256i q3h_2 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 2), 0), 2); - const __m256i q3h_3 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 3), 0), 2); - const __m256i q3_0 = __lasx_xvor_v(q3h_0, q3l_0); - const __m256i q3_1 = __lasx_xvor_v(q3h_1, q3l_1); - const __m256i q3_2 = __lasx_xvor_v(q3h_2, q3l_2); - const __m256i q3_3 = __lasx_xvor_v(q3h_3, q3l_3); - - // load Q8 quants - const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - - __m256i p16_0 = lasx_madd_h_b(q8_0, q3_0); - __m256i p16_1 = lasx_madd_h_b(q8_1, q3_1); - __m256i p16_2 = lasx_madd_h_b(q8_2, q3_2); - __m256i p16_3 = lasx_madd_h_b(q8_3, q3_3); - - // multiply with scales - p16_0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p16_0); - p16_1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p16_1); - p16_2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p16_2); - p16_3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p16_3); - - // accumulate - p16_0 = __lasx_xvadd_w(p16_0, p16_1); - p16_2 = __lasx_xvadd_w(p16_2, p16_3); - sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_2)); - } - // multiply with block scale and accumulate - acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc); - } - - *s = hsum_float_8(acc); -#elif defined(__VXE__) || defined(__VXE2__) - uint32_t aux[3]; - uint32_t utmp[4]; - - const int32x4_t v_z = vec_splat_s32(0); - const uint8x16_t v_3m = vec_splat_u8(0x03); - - const uint8x16_t v_0c = vec_splat_u8(1); - const uint8x16_t v_1c = vec_sl(v_0c, 1); - const uint8x16_t v_2c = vec_sl(v_0c, 2); - const uint8x16_t v_3c = vec_sl(v_0c, 3); - - uint8x16_t q3h[4]; - uint8x16_t q3b[2]; - int8x16_t q3bytes[4]; - int8x16_t q8bytes[4]; - uint8x16_t qhbits[2]; - - float sum = 0; - - for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * restrict x0l = x[i].qs; - const uint8_t * restrict x0h = x[i].hmask; - const int8_t * restrict y0 = y[i].qs; - - qhbits[0] = vec_xl(0 , x0h); - qhbits[1] = vec_xl(16, x0h); - - int32_t isum = 0; - - memcpy(aux, x[i].scales, 12); - utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); - utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); - utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 
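// The LASX path above forms the signed weight directly instead of subtracting:
// where the high bit is clear, xvseqi yields 0xFF, shifting left by 2 gives
// 0xFC (== -4), and OR-ing in the 2 low bits equals adding them, since -4 has
// zeros in bits 0..1. A scalar model of that identity (hypothetical helper):
static inline int8_t q3k_decode_or(uint8_t low2, int hbit_set) {
    const uint8_t corr = hbit_set ? 0x00 : 0xfc;
    return (int8_t)(corr | (low2 & 3)); // same result as (low2 & 3) - 4*!hbit_set
}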
2) & kmask1) << 4);
-        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
-
-        int8_t * scale = (int8_t *)utmp;
-        for (int j = 0; j < 16; ++j) scale[j] -= 32;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-            int32x4_t isum0, isum1, isum2, isum3;
-
-            q3b[0] = vec_xl(0 , x0l);
-            q3b[1] = vec_xl(16, x0l);
-            x0l += 32;
-
-            q8bytes[0] = vec_xl(0  , y0);
-            q8bytes[1] = vec_xl(16 , y0);
-            q8bytes[2] = vec_xl(32 , y0);
-            q8bytes[3] = vec_xl(48 , y0);
-            q8bytes[4] = vec_xl(64 , y0);
-            q8bytes[5] = vec_xl(80 , y0);
-            q8bytes[6] = vec_xl(96 , y0);
-            q8bytes[7] = vec_xl(112, y0);
-            y0 += 128;
-
-            q3h[0] = vec_sl(vec_andc(v_0c, qhbits[0]), 2);
-            q3h[1] = vec_sl(vec_andc(v_0c, qhbits[1]), 2);
-            q3h[2] = vec_sl(vec_andc(v_1c, qhbits[0]), 1);
-            q3h[3] = vec_sl(vec_andc(v_1c, qhbits[1]), 1);
-
-            q3bytes[0] = vec_sub((int8x16_t)vec_and(q3b[0], v_3m), (int8x16_t)q3h[0]);
-            q3bytes[1] = vec_sub((int8x16_t)vec_and(q3b[1], v_3m), (int8x16_t)q3h[1]);
-            q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 2), v_3m), (int8x16_t)q3h[2]);
-            q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 2), v_3m), (int8x16_t)q3h[3]);
-
-            isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[0]);
-            isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[1]);
-            isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[2]);
-            isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[3]);
-
-            isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
-            isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
-            isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
-            isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
-
-            scale += 4;
-
-            q3h[0] = vec_andc(v_2c, qhbits[0]);
-            q3h[1] = vec_andc(v_2c, qhbits[1]);
-            q3h[2] = vec_sr(vec_andc(v_3c, qhbits[0]), 1);
-            q3h[3] = vec_sr(vec_andc(v_3c, qhbits[1]), 1);
-
-            q3bytes[0] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 4), v_3m), (int8x16_t)q3h[0]);
-            q3bytes[1] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 4), v_3m), (int8x16_t)q3h[1]);
-            q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 6), v_3m), (int8x16_t)q3h[2]);
-            q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 6), v_3m), (int8x16_t)q3h[3]);
-
-            isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[4]);
-            isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[5]);
-            isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
-            isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);
-
-            isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
-            isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
-            isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
-            isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
-
-            scale += 4;
-
-            if (j == 0) {
-                qhbits[0] = vec_sr(qhbits[0], 4);
-                qhbits[1] = vec_sr(qhbits[1], 4);
-            }
-        }
-
-        sum += d * isum;
-    }
-
-    *s = sum;
-#else
-    // scalar version
-    // This function is written like this so the compiler can manage to vectorize most of it
-    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
-    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
-    // The ideal situation would be if we could just write the code once, and the compiler would
-    // automatically produce the best possible set of machine instructions, instead of us having to manually
-    // write vectorized versions for AVX, ARM_NEON, etc.
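// Block layouts implied by the accesses in the scalar code below, for QK_K = 256
// (a sketch; see ggml's block definitions for the authoritative declarations):
//
//   typedef struct {
//       uint8_t   hmask[QK_K/8]; // 1 high bit per weight
//       uint8_t   qs[QK_K/4];    // 2 low bits per weight
//       uint8_t   scales[12];    // 16 packed 6-bit scales
//       ggml_half d;             // super-block scale
//   } block_q3_K;
//
//   typedef struct {
//       float   d;               // delta
//       int8_t  qs[QK_K];        // quants
//       int16_t bsums[QK_K/16];  // sums of each group of 16 quants
//   } block_q8_K;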
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    uint32_t auxs[4];
-    const int8_t * scales = (const int8_t*)auxs;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            q3 += 32;
-        }
-        a = aux8;
-
-        memcpy(auxs, x[i].scales, 12);
-        uint32_t tmp = auxs[2];
-        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
-        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
-        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
-        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
-        for (int j = 0; j < QK_K/16; ++j) {
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-
-#endif
-
-}
-
-void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-#ifdef __ARM_FEATURE_MATMUL_INT8
-    assert((nrc == 2) || (nrc == 1));
-#else
-    assert(nrc == 1);
-#endif
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q4_K * GGML_RESTRICT x = vx;
-    const block_q8_K * GGML_RESTRICT y = vy;
-
-    const int nb = n / QK_K;
-
-    static const uint32_t kmask1 = 0x3f3f3f3f;
-    static const uint32_t kmask2 = 0x0f0f0f0f;
-    static const uint32_t kmask3 = 0x03030303;
-
-    uint32_t utmp[4];
-
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    if (nrc == 2) {
-        const block_q4_K * GGML_RESTRICT x0 = x;
-        const block_q4_K * GGML_RESTRICT x1 = (const block_q4_K *) ((const uint8_t *)vx + bx);
-        const block_q8_K * GGML_RESTRICT y0 = y;
-        const block_q8_K * GGML_RESTRICT y1 = (const block_q8_K *) ((const uint8_t *)vy + by);
-
-        const uint8x16_t m4b = vdupq_n_u8(0x0f);
-
-        float32x4_t vfsum = vdupq_n_f32(0.0f);
-
-        for (int i = 0; i < nb; ++i, ++x0, ++x1, ++y0, ++y1) {
-            const uint8_t * GGML_RESTRICT qx0 = x0->qs;
-            const uint8_t * GGML_RESTRICT qx1 = x1->qs;
-            const  int8_t * GGML_RESTRICT qy0 = y0->qs;
-            const  int8_t * GGML_RESTRICT qy1 = y1->qs;
-
-            // decode scales and mins
-            int8_t x0_scales[8], x1_scales[8];
-            int16x8_t x0_mins, x1_mins;
-            {
-                uint32_t scales_mins[3];
-                memcpy(scales_mins, x0->scales, 12);
-                const uint32_t mins_0_3 = scales_mins[1] & kmask1;
-                const uint32_t mins_4_7 = ((scales_mins[2] >> 4) & kmask2) | (((scales_mins[1] >> 6) & kmask3) << 4);
-                const uint32x2_t mins
= {mins_0_3, mins_4_7}; - x0_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins))); - uint32_t scales[2]; - scales[0] = scales_mins[0] & kmask1; // scales 0~3 - scales[1] = (scales_mins[2] & kmask2) | (((scales_mins[0] >> 6) & kmask3) << 4); // scales 4~7 - memcpy(x0_scales, scales, 8); - } - { - uint32_t scales_mins[3]; - memcpy(scales_mins, x1->scales, 12); - const uint32_t mins_0_3 = scales_mins[1] & kmask1; - const uint32_t mins_4_7 = ((scales_mins[2] >> 4) & kmask2) | (((scales_mins[1] >> 6) & kmask3) << 4); - const uint32x2_t mins = {mins_0_3, mins_4_7}; - x1_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins))); - uint32_t scales[2]; - scales[0] = scales_mins[0] & kmask1; // scales 0~3 - scales[1] = (scales_mins[2] & kmask2) | (((scales_mins[0] >> 6) & kmask3) << 4); // scales 4~7 - memcpy(x1_scales, scales, 8); - } - - int32x4_t visum = {0}; - - // process 64 data points per iteration, totally 256 data points - for (int j = 0; j < QK_K / 64; ++j, qx0 += 32, qx1 += 32, qy0 += 64, qy1 += 64) { - const int8x16x4_t vy0 = vld1q_s8_x4(qy0); - const int8x16x4_t vy1 = vld1q_s8_x4(qy1); - - int8x16_t vx0[4], vx1[4]; - { - const uint8x16x2_t vv = vld1q_u8_x2(qx0); - vx0[0] = vreinterpretq_s8_u8(vandq_u8(vv.val[0], m4b)); - vx0[1] = vreinterpretq_s8_u8(vandq_u8(vv.val[1], m4b)); - vx0[2] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[0], 4)); - vx0[3] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[1], 4)); - } - { - const uint8x16x2_t vv = vld1q_u8_x2(qx1); - vx1[0] = vreinterpretq_s8_u8(vandq_u8(vv.val[0], m4b)); - vx1[1] = vreinterpretq_s8_u8(vandq_u8(vv.val[1], m4b)); - vx1[2] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[0], 4)); - vx1[3] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[1], 4)); - } - - // process 32 data points (share same block scale) per iteration - for (int k = 0; k < 2; ++k) { - const int blk = j * 2 + k; - const int32x4_t block_scale = { - x0_scales[blk], - x0_scales[blk], - x1_scales[blk], - x1_scales[blk], - }; - - int32x4_t vr = {0}; - for (int l = 0; l < 2; ++l) { - const int idx = k * 2 + l; - const int64x2_t vx0_s64 = vreinterpretq_s64_s8(vx0[idx]); - const int64x2_t vx1_s64 = vreinterpretq_s64_s8(vx1[idx]); - const int64x2_t vy0_s64 = vreinterpretq_s64_s8(vy0.val[idx]); - const int64x2_t vy1_s64 = vreinterpretq_s64_s8(vy1.val[idx]); - const int8x16_t vx_l = vreinterpretq_s8_s64(vzip1q_s64(vx0_s64, vx1_s64)); - const int8x16_t vx_h = vreinterpretq_s8_s64(vzip2q_s64(vx0_s64, vx1_s64)); - const int8x16_t vy_l = vreinterpretq_s8_s64(vzip1q_s64(vy0_s64, vy1_s64)); - const int8x16_t vy_h = vreinterpretq_s8_s64(vzip2q_s64(vy0_s64, vy1_s64)); - vr = vmmlaq_s32(vr, vx_l, vy_l); - vr = vmmlaq_s32(vr, vx_h, vy_h); - } - // apply block scale, will NOT overflow - // block_scale * sum_256(int4*int8) <= 2^(8+8+4+8) = 28 bits - visum = vmlaq_s32(visum, vr, block_scale); - } - } - - // adjust bias, apply superblock scale - { - int32_t bias[4]; - // no obvious uplift from sve sdot-16, just use neon mul add - const int16x8_t y0_sums = vpaddq_s16(vld1q_s16(y0->bsums), vld1q_s16(y0->bsums+8)); - const int16x8_t y1_sums = vpaddq_s16(vld1q_s16(y1->bsums), vld1q_s16(y1->bsums+8)); - bias[0] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y0_sums), vget_low_s16(x0_mins)), - vmull_s16(vget_high_s16(y0_sums), vget_high_s16(x0_mins)))); - bias[1] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x0_mins)), - vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x0_mins)))); - bias[2] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y0_sums), vget_low_s16(x1_mins)), - 
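// The kmask1/kmask2/kmask3 arithmetic above undoes q4_K's packing of 8 six-bit
// scales and 8 six-bit mins into 12 bytes. Scalar sketch of the same decode
// (hypothetical helper; j indexes the 8 groups of 32 weights):
static void q4k_get_scale_min(int j, const uint8_t * q, uint8_t * sc, uint8_t * mn) {
    if (j < 4) {
        *sc = q[j] & 63;
        *mn = q[j + 4] & 63;
    } else {
        *sc = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
        *mn = (q[j + 4] >>  4) | ((q[j    ] >> 6) << 4);
    }
}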
vmull_s16(vget_high_s16(y0_sums), vget_high_s16(x1_mins)))); - bias[3] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x1_mins)), - vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x1_mins)))); - const float32x4_t dmins = { - GGML_FP16_TO_FP32(x0->dmin) * y0->d, - GGML_FP16_TO_FP32(x0->dmin) * y1->d, - GGML_FP16_TO_FP32(x1->dmin) * y0->d, - GGML_FP16_TO_FP32(x1->dmin) * y1->d, - }; - vfsum = vmlsq_f32(vfsum, vcvtq_f32_s32(vld1q_s32(bias)), dmins); - - const float32x4_t superblock_scale = { - GGML_FP16_TO_FP32(x0->d) * y0->d, - GGML_FP16_TO_FP32(x0->d) * y1->d, - GGML_FP16_TO_FP32(x1->d) * y0->d, - GGML_FP16_TO_FP32(x1->d) * y1->d, - }; - vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale); - } - } - - // vfsum = ABCD -> ACBD - // AC -> s, BD -> (s+bs) - vfsum = vzip1q_f32(vfsum, vextq_f32(vfsum, vfsum, 2)); - vst1_f32(s, vget_low_f32 (vfsum)); - vst1_f32(s + bs, vget_high_f32(vfsum)); - - return; - } -#endif - -#ifdef __ARM_FEATURE_SVE - float sumf = 0; - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); - - memcpy(utmp, x[i].scales, K_SCALE_SIZE); - - uint32x2_t mins8 = { 0 }; - mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0); - mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1); - - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[0] &= kmask1; - - const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8))); - const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)), - vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins))); - sumf -= dmin * vaddvq_s32(prod); - - const uint8_t * scales = (const uint8_t *)utmp; - - const uint8_t * GGML_RESTRICT q4 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - const int vector_length = ggml_cpu_get_sve_cnt()*8; - const svuint8_t m4b = svdup_n_u8(0xf); - const svint32_t mzero = svdup_n_s32(0); - svint32_t sumi1 = svdup_n_s32(0); - svint32_t sumi1_1 = svdup_n_s32(0); - svint32_t sumi1_2 = svdup_n_s32(0); - svint32_t sumi2 = svdup_n_s32(0); - svint32_t sumi2_1 = svdup_n_s32(0); - svint32_t sumi2_2 = svdup_n_s32(0); - switch (vector_length) { - case 128: - { - for (int j = 0; j < QK_K/64; ++j) { - svint8_t q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4), m4b)); - svint8_t q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16; - sumi1_1 = svmla_n_s32_x(svptrue_b32(), sumi1_1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]); - q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4+16), m4b)); - q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16; - sumi1_2 = svmla_n_s32_x(svptrue_b32(), sumi1_2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]); - - q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4), 4)); - q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16; - sumi2_1 = svmla_n_s32_x(svptrue_b32(), sumi2_1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]); - q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4+16), 4)); - q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16; - sumi2_2 = svmla_n_s32_x(svptrue_b32(), sumi2_2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]); - q4 += 32; - } - sumi1 = svadd_s32_x(svptrue_b32(), sumi1_1, sumi1_2); - sumi2 = svadd_s32_x(svptrue_b32(), sumi2_1, sumi2_2); - sumf 
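// Scalar model of the vmmlaq_s32 step in the nrc == 2 path above: SMMLA treats
// each 16-byte register as a 2x8 int8 matrix and accumulates a 2x2 int32 tile,
// r += a * b^T, which is why the rows of x0/x1 and y0/y1 are zipped together.
static void smmla_model(int32_t r[2][2], const int8_t a[16], const int8_t b[16]) {
    for (int i = 0; i < 2; ++i)
        for (int j = 0; j < 2; ++j)
            for (int k = 0; k < 8; ++k)
                r[i][j] += (int32_t)a[8*i + k] * b[8*j + k];
}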
+= d * (svaddv_s32(svptrue_b32(), svadd_s32_x(svptrue_b32(), sumi1, sumi2))); - } break; - case 256: - case 512: - { - for (int j = 0; j < QK_K/64; ++j) { - const svuint8_t q4bits = svld1_u8(svptrue_pat_b8(SV_VL32), q4); q4 += 32; - svint8_t q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_pat_b8(SV_VL32), q4bits, m4b)); - svint8_t q8bytes = svld1_s8(svptrue_pat_b8(SV_VL32), q8); q8 += 32; - sumi1 = svmla_n_s32_x(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]); - - q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q4bits, 4)); - q8bytes = svld1_s8(svptrue_pat_b8(SV_VL32), q8); q8 += 32; - sumi2 = svmla_n_s32_x(svptrue_pat_b32(SV_VL8), sumi2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]); - } - sumf += d * (svaddv_s32(svptrue_pat_b32(SV_VL8), svadd_s32_x(svptrue_pat_b32(SV_VL8), sumi1, sumi2))); - } break; - default: - assert(false && "Unsupported vector length"); - break; - } - } - *s = sumf; -#elif defined __ARM_NEON - const uint8x16_t m4b = vdupq_n_u8(0xf); - const int32x4_t mzero = vdupq_n_s32(0); - - ggml_int8x16x2_t q4bytes; - ggml_int8x16x2_t q8bytes; - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); - - memcpy(utmp, x[i].scales, 12); - - uint32x2_t mins8 = { 0 }; - mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0); - mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1); - - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[0] &= kmask1; - - const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8))); - const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)), - vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins))); - sumf -= dmin * vaddvq_s32(prod); - - const uint8_t * scales = (const uint8_t *)utmp; - - const uint8_t * GGML_RESTRICT q4 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - int32_t sumi1 = 0; - int32_t sumi2 = 0; - - for (int j = 0; j < QK_K/64; ++j) { - const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32; - - q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32; - q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); - q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); - - const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); - sumi1 += vaddvq_s32(p1) * scales[2*j+0]; - - q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32; - q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); - q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); - - const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); - - sumi2 += vaddvq_s32(p2) * scales[2*j+1]; - } - - sumf += d * (sumi1 + sumi2); - - } - - *s = sumf; - -#elif defined __wasm_simd128__ - const uint8_t * scales = (const uint8_t*)&utmp[0]; - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Corrected sign - - const uint8_t * GGML_RESTRICT q4 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - // Process scales and mins - memcpy(utmp, x[i].scales, 12); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); - const 
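// Scalar model of the sdot/svdot step used by the SVE and NEON paths above:
// every 32-bit lane accumulates the dot product of its 4 int8 pairs (shown
// here for one 128-bit vector; a sketch, not the intrinsic itself).
static void sdot_model(int32_t acc[4], const int8_t a[16], const int8_t b[16]) {
    for (int i = 0; i < 4; ++i)
        for (int k = 0; k < 4; ++k)
            acc[i] += (int32_t)a[4*i + k] * b[4*i + k];
}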
uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; - - // Sum mins * q8sums - int32_t sumi = 0; - const int16_t * GGML_RESTRICT q8sums = y[i].bsums; - const uint8_t * m = (const uint8_t *)&utmp[2]; - for (int j = 0; j < 16; j += 2) { - sumi += (q8sums[j] + q8sums[j+1]) * m[j/2]; - } - sumf -= dmin * sumi; - - int32_t sumi1 = 0; - int32_t sumi2 = 0; - - for (int j = 0; j < QK_K/64; ++j) { - // Load 64 4-bit weights (32 bytes) - const v128_t q4x0 = wasm_v128_load(q4); - const v128_t q4x1 = wasm_v128_load(q4 + 16); - q4 += 32; - - // Split into low/high nibbles - const v128_t q4l0 = wasm_v128_and(q4x0, wasm_i8x16_splat(0x0F)); - const v128_t q4h0 = wasm_u8x16_shr(q4x0, 4); - const v128_t q4l1 = wasm_v128_and(q4x1, wasm_i8x16_splat(0x0F)); - const v128_t q4h1 = wasm_u8x16_shr(q4x1, 4); - - // Load 64 8-bit values (64 bytes) - const v128_t q8x0 = wasm_v128_load(q8); - const v128_t q8x1 = wasm_v128_load(q8 + 16); - const v128_t q8x2 = wasm_v128_load(q8 + 32); - const v128_t q8x3 = wasm_v128_load(q8 + 48); - q8 += 64; - - // Low nibble products - v128_t vacc1 = wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_low_i8x16(q4l0), - wasm_i16x8_extend_low_i8x16(q8x0) - ); - vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_high_i8x16(q4l0), - wasm_i16x8_extend_high_i8x16(q8x0) - )); - vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_low_i8x16(q4l1), - wasm_i16x8_extend_low_i8x16(q8x1) - )); - vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_high_i8x16(q4l1), - wasm_i16x8_extend_high_i8x16(q8x1) - )); - - // High nibble products - v128_t vacc2 = wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_low_i8x16(q4h0), - wasm_i16x8_extend_low_i8x16(q8x2) - ); - vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_high_i8x16(q4h0), - wasm_i16x8_extend_high_i8x16(q8x2) - )); - vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_low_i8x16(q4h1), - wasm_i16x8_extend_low_i8x16(q8x3) - )); - vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8( - wasm_i16x8_extend_high_i8x16(q4h1), - wasm_i16x8_extend_high_i8x16(q8x3) - )); - - // Accumulate scaled results - int32_t vacc1_sum = wasm_i32x4_extract_lane(vacc1, 0) + wasm_i32x4_extract_lane(vacc1, 1) + - wasm_i32x4_extract_lane(vacc1, 2) + wasm_i32x4_extract_lane(vacc1, 3); - sumi1 += vacc1_sum * scales[2*j]; - - int32_t vacc2_sum = wasm_i32x4_extract_lane(vacc2, 0) + wasm_i32x4_extract_lane(vacc2, 1) + - wasm_i32x4_extract_lane(vacc2, 2) + wasm_i32x4_extract_lane(vacc2, 3); - sumi2 += vacc2_sum * scales[2*j+1]; - } - - sumf += d * (sumi1 + sumi2); - } - - *s = sumf; - -#elif defined __AVX2__ - - const __m256i m4 = _mm256_set1_epi8(0xF); - - __m256 acc = _mm256_setzero_ps(); - __m128 acc_m = _mm_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - memcpy(utmp, x[i].scales, 12); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); - const uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; - - const uint8_t * GGML_RESTRICT q4 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); - - const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums); - const 
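// Why bsums lets the mins be handled up front: each q4_K weight decodes as
// d*sc*q - dmin*mn per group of 32, so the whole dot product splits into
//   sumf = d * sum_j sc[j]*dot(q4_j, q8_j)  -  dmin * sum_j mn[j]*sum(q8_j),
// and sum(q8_j) comes precomputed in bsums. Scalar model of the bias term
// (hypothetical helper; bsums holds 16-wide sums, mins apply per 32):
static int32_t q4k_min_bias(const uint8_t * mn8, const int16_t * bsums) {
    int32_t sumi = 0;
    for (int j = 0; j < 8; ++j) {
        sumi += (int32_t)mn8[j] * (bsums[2*j] + bsums[2*j + 1]);
    }
    return sumi; // caller subtracts dmin * sumi from the accumulator
}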
__m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1)); - const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s); - acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m); - - const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0); - const __m256i scales = MM256_SET_M128I(sc128, sc128); - - __m256i sumi = _mm256_setzero_si256(); - - for (int j = 0; j < QK_K/64; ++j) { - - const __m256i scale_l = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0)); - const __m256i scale_h = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1)); - - const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); q4 += 32; - const __m256i q4l = _mm256_and_si256(q4bits, m4); - const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4); - - const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - __m256i p16l = _mm256_maddubs_epi16(q4l, q8l); - p16l = _mm256_madd_epi16(scale_l, p16l); - - const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - __m256i p16h = _mm256_maddubs_epi16(q4h, q8h); - p16h = _mm256_madd_epi16(scale_h, p16h); - const __m256i sumj = _mm256_add_epi32(p16l, p16h); - - sumi = _mm256_add_epi32(sumi, sumj); - } - - __m256 vd = _mm256_set1_ps(d); - acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc); - - } - - acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m)); - acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m)); - - *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m); - -#elif defined __AVX__ - - const __m128i m4 = _mm_set1_epi8(0xF); - const __m128i m2 = _mm_set1_epi8(0x2); - - __m256 acc = _mm256_setzero_ps(); - __m128 acc_m = _mm_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const uint8_t * GGML_RESTRICT q4 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - memcpy(utmp, x[i].scales, 12); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); - const uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; - - const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]); - const __m128i scales = _mm_cvtepu8_epi16(utmps); - const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps)); - - const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]); - const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]); - const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1); - const __m128i prod = _mm_madd_epi16(mins, q8s); - acc_m = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod)), acc_m); - - __m128i sumi_0 = _mm_setzero_si128(); - __m128i sumi_1 = _mm_setzero_si128(); - - __m128i shuffle = _mm_set1_epi16(0x0100); - for (int j = 0; j < QK_K/64; ++j) { - - const __m128i scale_l = _mm_shuffle_epi8(scales, shuffle); - shuffle = _mm_add_epi16(shuffle, m2); - const __m128i scale_h = _mm_shuffle_epi8(scales, shuffle); - shuffle = _mm_add_epi16(shuffle, m2); - - __m128i q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16; - const __m128i q4l_0 = _mm_and_si128(q4bits, m4); - const __m128i q4h_0 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4); - q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16; - const __m128i q4l_1 = _mm_and_si128(q4bits, m4); - const __m128i q4h_1 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4); - - const __m128i q8l_0 = 
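// The AVX2/SSE q4_K inner loops split each byte into two unsigned nibbles, so
// maddubs needs no sign fixup (q4 weights are 0..15 before scaling). Scalar
// model of one 32-byte chunk, covering a 64-weight group (hypothetical helper):
static void q4k_split_nibbles(const uint8_t * q4, uint8_t * lo, uint8_t * hi) {
    for (int k = 0; k < 32; ++k) {
        lo[k] = q4[k] & 0x0F; // weights 0..31 of the group
        hi[k] = q4[k] >> 4;   // weights 32..63 of the group
    }
}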
_mm_loadu_si128((const __m128i*)q8); q8 += 16; - __m128i p16l = _mm_maddubs_epi16(q4l_0, q8l_0); - p16l = _mm_madd_epi16(scale_l, p16l); - sumi_0 = _mm_add_epi32(sumi_0, p16l); - const __m128i q8l_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - p16l = _mm_maddubs_epi16(q4l_1, q8l_1); - p16l = _mm_madd_epi16(scale_l, p16l); - sumi_1 = _mm_add_epi32(sumi_1, p16l); - - const __m128i q8h_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - __m128i p16h = _mm_maddubs_epi16(q4h_0, q8h_0); - p16h = _mm_madd_epi16(scale_h, p16h); - sumi_0 = _mm_add_epi32(sumi_0, p16h); - const __m128i q8h_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - p16h = _mm_maddubs_epi16(q4h_1, q8h_1); - p16h = _mm_madd_epi16(scale_h, p16h); - sumi_1 = _mm_add_epi32(sumi_1, p16h); - - } - - __m256 vd = _mm256_set1_ps(d); - __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); - acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc); - - } - - acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m)); - acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m)); - - *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m); - -#elif defined __riscv_xtheadvector - - const uint8_t * scales = (const uint8_t*)&utmp[0]; - const uint8_t * mins = (const uint8_t*)&utmp[2]; - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - int tmp, tmp2, sumi; - __asm__ __volatile__( - "li %[t1], 12\n\t" - "th.vsetvli zero, %[t1], e8, m1\n\t" - "th.vlb.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]} - "li %[t1], 4\n\t" - "th.vsetvli zero, %[t1], e32, m1\n\t" - "th.vslidedown.vi v2, v1, 2\n\t" - "th.vmv.v.v v3, v2\n\t" - "th.vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]} - "li %[t1], 2\n\t" - "th.vsetvli zero, %[t1], e32, m1\n\t" - "th.vmv.v.i v4, 4\n\t" - "th.vand.vx v8, v1, %[kmask1]\n\t" - "th.vslide1up.vx v5, v4, zero\n\t" // {0, 4} - "th.vsrl.vi v6, v1, 6\n\t" - "th.vsrl.vv v7, v2, v5\n\t" - "th.vand.vx v0, v6, %[kmask3]\n\t" - "th.vand.vx v2, v7, %[kmask2]\n\t" - "th.vsll.vi v6, v0, 4\n\t" - "li %[t2], 8\n\t" - "addi %[t1], %[utmp], 4\n\t" - "th.vor.vv v1, v6, v2\n\t" - "th.vssw.v v8, (%[utmp]), %[t2]\n\t" - "th.vssw.v v1, (%[t1]), %[t2]\n\t" - "th.vsetvli zero, zero, e32, m2\n\t" // vl == 8 - "th.vlw.v v2, (%[bsums])\n\t" - "th.vsetvli zero, %[t2], e16, m1\n\t" - "th.vnsrl.vi v0, v2, 0\n\t" - "th.vnsrl.vi v1, v2, 16\n\t" - "th.vadd.vv v2, v0, v1\n\t" - "th.vlbu.v v4, (%[mins])\n\t" - "th.vwmul.vv v6, v4, v2\n\t" - "th.vmv.v.x v0, zero\n\t" - "th.vsetvli zero, %[t2], e32, m2\n\t" - "th.vredsum.vs v0, v6, v0\n\t" - "th.vmv.x.s %[sumi], v0" - : [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi) - : [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp) - , [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1) - , [kmask2] "r" (kmask2), [kmask3] "r" (kmask3) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - sumf -= dmin * sumi; - - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - sumi = 0; - const uint8_t * scale = scales; - - for (int j = 0; j < QK_K/128; ++j) { - int vl128 = 128, vl64 = 64, vl32 = 32; - __asm__ __volatile__( - "th.vsetvli zero, %[vl128], e8, m8\n\t" - "th.vlb.v v8, (%[q8])\n\t" - "th.vsetvli zero, %[vl64], e8, m4\n\t" - "th.vlb.v v0, (%[q4])\n\t" - "th.vsrl.vi v4, v0, 4\n\t" - 
"th.vand.vi v0, v0, 0xF\n\t" - "th.vsetvli zero, %[vl32], e8, m2\n\t" - "th.vwmul.vv v28, v6, v14\n\t" - "th.vwmul.vv v20, v4, v10\n\t" - "th.vwmul.vv v24, v2, v12\n\t" - "th.vwmul.vv v16, v0, v8\n\t" - "li %[tmp], 4\n\t" - "th.vsetvli zero, %[tmp], e32, m1\n\t" - "th.vlbu.v v1, (%[scale])\n\t" - "th.vmv.v.x v0, zero\n\t" - "th.vsetvli zero, %[vl32], e16, m4\n\t" - "th.vwredsum.vs v6, v24, v0\n\t" - "th.vwredsum.vs v7, v28, v0\n\t" - "th.vwredsum.vs v4, v16, v0\n\t" - "th.vwredsum.vs v5, v20, v0\n\t" - "th.vsetvli zero, %[tmp], e32, m1\n\t" - "th.vslideup.vi v6, v7, 1\n\t" - "th.vslideup.vi v4, v5, 1\n\t" - "th.vslideup.vi v4, v6, 2\n\t" - "th.vmul.vv v8, v4, v1\n\t" - "th.vredsum.vs v0, v8, v0\n\t" - "th.vmv.x.s %[tmp], v0\n\t" - "add %[sumi], %[sumi], %[tmp]" - : [tmp] "=&r" (tmp), [sumi] "+&r" (sumi) - : [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32) - , [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - - q4 += 64; q8 += 128; scale += 4; - } - - sumf += d * sumi; - - } - - *s = sumf; - -#elif defined __riscv_v - - const uint8_t * scales = (const uint8_t*)&utmp[0]; - const uint8_t * mins = (const uint8_t*)&utmp[2]; - - float sumf = 0; - const int vector_length = __riscv_vlenb() * 8; - - switch (vector_length) { - case 256: - for (int i = 0; i < nb; ++i) { - - size_t vl = 8; - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl); - vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl); - vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl); - - memcpy(utmp, x[i].scales, 12); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); - const uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; - - vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl); - vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl)); - vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl); - - vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); - sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi); - - const uint8_t * GGML_RESTRICT q4 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - vl = 32; - - int32_t sum_1 = 0; - int32_t sum_2 = 0; - - vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1); - - for (int j = 0; j < QK_K/64; ++j) { - // load Q4 - vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl); - - // load Q8 and multiply it with lower Q4 nibble - vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl); - vint8m1_t q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl)); - vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl); - vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl); - - sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0]; - - // load Q8 and multiply it with upper Q4 nibble - vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl); - vint8m1_t q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl)); - vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl); - vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl); - - sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) 
* scales[2*j+1]; - - q4 += 32; q8 += 64; - - } - - sumf += d*(sum_1 + sum_2); - - } - break; - case 128: - for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - int tmp, tmp2, sumi; - __asm__ __volatile__( - "vsetivli zero, 12, e8, m1\n\t" - "vle8.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]} - "vsetivli zero, 4, e32, m1\n\t" - "vslidedown.vi v2, v1, 2\n\t" - "vmv1r.v v3, v2\n\t" - "vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]} - "vsetivli zero, 2, e32, m1\n\t" - "vmv.v.i v4, 4\n\t" - "vand.vx v8, v1, %[kmask1]\n\t" - "vslide1up.vx v5, v4, zero\n\t" // {0, 4} - "vsrl.vi v6, v1, 6\n\t" - "vsrl.vv v7, v2, v5\n\t" - "vand.vx v0, v6, %[kmask3]\n\t" - "vand.vx v2, v7, %[kmask2]\n\t" - "vsll.vi v6, v0, 4\n\t" - "li %[t2], 8\n\t" - "addi %[t1], %[utmp], 4\n\t" - "vor.vv v1, v6, v2\n\t" - "vsse32.v v8, (%[utmp]), %[t2]\n\t" - "vsse32.v v1, (%[t1]), %[t2]\n\t" - "vsetivli zero, 8, e16, m1\n\t" - "vle32.v v2, (%[bsums])\n\t" - "vnsrl.wi v0, v2, 0\n\t" - "vnsrl.wi v1, v2, 16\n\t" - "vadd.vv v2, v0, v1\n\t" - "vle8.v v3, (%[mins])\n\t" - "vzext.vf2 v4, v3\n\t" - "vwmul.vv v6, v4, v2\n\t" - "vmv.v.x v0, zero\n\t" - "vsetivli zero, 8, e32, m2\n\t" - "vredsum.vs v0, v6, v0\n\t" - "vmv.x.s %[sumi], v0" - : [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi) - : [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp) - , [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1) - , [kmask2] "r" (kmask2), [kmask3] "r" (kmask3) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - sumf -= dmin * sumi; - - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - - sumi = 0; - const uint8_t * scale = scales; - - for (int j = 0; j < QK_K/128; ++j) { - int vl128 = 128, vl64 = 64, vl32 = 32; - __asm__ __volatile__( - "vsetvli zero, %[vl128], e8, m8\n\t" - "vle8.v v8, (%[q8])\n\t" - "vsetvli zero, %[vl64], e8, m4\n\t" - "vle8.v v0, (%[q4])\n\t" - "vsrl.vi v4, v0, 4\n\t" - "vand.vi v0, v0, 0xF\n\t" - "vsetvli zero, %[vl32], e8, m2\n\t" - "vwmul.vv v28, v6, v14\n\t" - "vwmul.vv v20, v4, v10\n\t" - "vwmul.vv v24, v2, v12\n\t" - "vwmul.vv v16, v0, v8\n\t" - "vsetivli zero, 4, e32, m1\n\t" - "vle8.v v2, (%[scale])\n\t" - "vmv.v.x v0, zero\n\t" - "vzext.vf4 v1, v2\n\t" - "vsetvli zero, %[vl32], e16, m4\n\t" - "vwredsum.vs v6, v24, v0\n\t" - "vwredsum.vs v7, v28, v0\n\t" - "vwredsum.vs v4, v16, v0\n\t" - "vwredsum.vs v5, v20, v0\n\t" - "vsetivli zero, 4, e32, m1\n\t" - "vslideup.vi v6, v7, 1\n\t" - "vslideup.vi v4, v5, 1\n\t" - "vslideup.vi v4, v6, 2\n\t" - "vmul.vv v8, v4, v1\n\t" - "vredsum.vs v0, v8, v0\n\t" - "vmv.x.s %[tmp], v0\n\t" - "add %[sumi], %[sumi], %[tmp]" - : [tmp] "=&r" (tmp), [sumi] "+&r" (sumi) - : [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32) - , [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - - q4 += 64; q8 += 128; scale += 4; - } - - sumf += d * sumi; - } - break; - default: - assert(false && "Unsupported vector length"); - break; - } - - *s = sumf; - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = 
vec_splats((signed char)0xF); - const vector signed char lowMask1 = vec_splats((int8_t)0x3f); - const vector signed char lowMask2 = vec_splats((int8_t)0x30); - const vector int v0 = vec_splats((int32_t)0); - const vector unsigned char v2 = vec_splats((uint8_t)2); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - - for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin)); - vector float vdmin = vec_mul(vxmin, vyd); - - vector signed short q8ysums0 = vec_xl( 0, y[i].bsums); - vector signed short q8ysums1 = vec_xl(16, y[i].bsums); - - UNUSED(kmask1); - UNUSED(kmask2); - UNUSED(kmask3); - UNUSED(utmp); - - vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8); - vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2); - vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4); - vector signed char u3 = vec_sr(u2, v4); - - vector signed char u30 = u1; - vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3); - - u1 = vec_and(u0, lowMask1); - u2 = vec_or(u30, u31); - - vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2); - - vector signed short vscales = vec_unpackh(utmps); - vector signed short q4xmins = vec_unpackl(utmps); - vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins); - vector signed short q4xmins1 = vec_mergel(q4xmins, q4xmins); - - vector signed int prod0 = vec_mule(q4xmins0, q8ysums0); - vector signed int prod1 = vec_mule(q4xmins1, q8ysums1); - vector signed int prod2 = vec_mulo(q4xmins0, q8ysums0); - vector signed int prod3 = vec_mulo(q4xmins1, q8ysums1); - - vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0); - vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1); - vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2); - vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - vector signed int vsumi2 = v0; - vector signed int vsumi3 = v0; - - const uint8_t * GGML_RESTRICT q4 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - for (int j = 0; j < QK_K/64; j+=2) { - __builtin_prefetch(q4, 0, 1); - __builtin_prefetch(q8, 0, 1); - - vector signed char qxs0 = (vector signed char)vec_xl( 0, q4); - vector signed char qxs1 = (vector signed char)vec_xl(16, q4); - vector signed char qxs2 = (vector signed char)vec_xl(32, q4); - vector signed char qxs3 = (vector signed char)vec_xl(48, q4); - q4 += 64; - - vector unsigned char q4x00 = (vector unsigned char)vec_and(qxs0, lowMask); - vector unsigned char q4x01 = (vector unsigned char)vec_sr(qxs0, v4); - vector unsigned char q4x10 = (vector unsigned char)vec_and(qxs1, lowMask); - vector unsigned char q4x11 = (vector unsigned char)vec_sr(qxs1, v4); - vector unsigned char q4x20 = (vector unsigned char)vec_and(qxs2, lowMask); - vector unsigned char q4x21 = (vector unsigned char)vec_sr(qxs2, v4); - vector unsigned char q4x30 = (vector unsigned char)vec_and(qxs3, lowMask); - vector unsigned char q4x31 = (vector unsigned char)vec_sr(qxs3, v4); - - vector signed char q8y00 = vec_xl( 0, q8); - vector signed char q8y10 = vec_xl( 16, q8); - vector signed char 
q8y01 = vec_xl( 32, q8); - vector signed char q8y11 = vec_xl( 48, q8); - vector signed char q8y20 = vec_xl( 64, q8); - vector signed char q8y30 = vec_xl( 80, q8); - vector signed char q8y21 = vec_xl( 96, q8); - vector signed char q8y31 = vec_xl(112, q8); - q8 += 128; - - vector signed int qv00 = vec_msum(q8y00, q4x00, v0); - vector signed int qv01 = vec_msum(q8y01, q4x01, v0); - vector signed int qv10 = vec_msum(q8y10, q4x10, v0); - vector signed int qv11 = vec_msum(q8y11, q4x11, v0); - vector signed int qv20 = vec_msum(q8y20, q4x20, v0); - vector signed int qv21 = vec_msum(q8y21, q4x21, v0); - vector signed int qv30 = vec_msum(q8y30, q4x30, v0); - vector signed int qv31 = vec_msum(q8y31, q4x31, v0); - - vector signed int vscales_h = vec_unpackh(vscales); - vector signed int vs0 = vec_splat(vscales_h, 0); - vector signed int vs1 = vec_splat(vscales_h, 1); - vector signed int vs2 = vec_splat(vscales_h, 2); - vector signed int vs3 = vec_splat(vscales_h, 3); - vscales = vec_sld(vscales, vscales, 8); - - vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0); - vsumi1 = vec_add(vec_mul(qv01, vs1), vsumi1); - vsumi2 = vec_add(vec_mul(qv20, vs2), vsumi2); - vsumi3 = vec_add(vec_mul(qv21, vs3), vsumi3); - - vsumi0 = vec_add(vec_mul(qv10, vs0), vsumi0); - vsumi1 = vec_add(vec_mul(qv11, vs1), vsumi1); - vsumi2 = vec_add(vec_mul(qv30, vs2), vsumi2); - vsumi3 = vec_add(vec_mul(qv31, vs3), vsumi3); - } - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = vec_extract(vsumf0, 0); - -#elif defined __loongarch_asx - - __m256 acc = (__m256)__lasx_xvldi(0); - __m128 acc_m = (__m128)__lsx_vldi(0); - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - memcpy(utmp, x[i].scales, 12); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); - const uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; - - const uint8_t * GGML_RESTRICT q4 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]); - const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128); - const __m128i scales128 = __lsx_vsllwil_h_b(mins_and_scales128, 0); - - const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0); - const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1)); - const __m128i prod = lsx_madd_h(mins128, q8s); - acc_m = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(dmin), __lsx_vffint_s_w(prod), acc_m); - - const __m256i scales = lasx_insertf128(scales128, scales128); - - __m256i sumi = __lasx_xvldi(0); - - for (int j = 0; j < QK_K/64; ++j) { - - const __m256i scale_l = lasx_xvrepl128vei_h(scales, 2 * j + 0); - const __m256i scale_h = lasx_xvrepl128vei_h(scales, 2 * j + 1); - - const __m256i q4bits = __lasx_xvld((const __m256i*)q4, 0); q4 += 32; - const __m256i q4l = __lasx_xvandi_b(q4bits, 0xf); - const __m256i q4h = __lasx_xvsrli_b(q4bits, 4); - - const __m256i q8l = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - __m256i p16l = 
lasx_madd_h_b(q4l, q8l); - p16l = lasx_madd_h(scale_l, p16l); - - const __m256i q8h = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - __m256i p16h = lasx_madd_h_b(q4h, q8h); - p16h = lasx_madd_h(scale_h, p16h); - const __m256i sumj = __lasx_xvadd_w(p16l, p16h); - - sumi = __lasx_xvadd_w(sumi, sumj); - } - - __m256 vd = __lasx_xvreplfr2vr_s(d); - acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc); - - } - - acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vpermi_w((__m128i)acc_m, (__m128i)acc_m, 0xee)); - __m128i tmp1 = __lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w((__m128i)acc_m, 1), 0); - acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1); - - - *s = hsum_float_8(acc) + ((v4f32)acc_m)[0]; -#elif defined(__VXE__) || defined(__VXE2__) - const uint8x16_t v_lm = vec_splat_u8(0x0F); - const int32x4_t v_z = vec_splat_s32(0); - - uint8x16_t v_x[2]; - int8x16_t v_xl[2]; - int8x16_t v_y[2]; - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums); - const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums); - const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh); - - memcpy(utmp, x[i].scales, 12); - - uint32x4_t v_mins8 = { 0 }; - v_mins8 = vec_insert(utmp[1] & kmask1, v_mins8, 0); - v_mins8 = vec_insert(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), v_mins8, 1); - - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[0] &= kmask1; - - const int16x8_t v_minsh = (int16x8_t)vec_unpackh((uint8x16_t)v_mins8); - - const int32x4_t v_minso = vec_mulo(v_ysums, v_minsh); - const int32x4_t v_minse = vec_mule(v_ysums, v_minsh); - const int32x4_t v_mins = v_minso + v_minse; - sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]); - - const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * GGML_RESTRICT x0 = x[i].qs; - const int8_t * GGML_RESTRICT y0 = y[i].qs; - - int32_t sumi1 = 0; - int32_t sumi2 = 0; - - for (int j = 0; j < QK_K/64; ++j) { - v_x[0] = vec_xl(0 , x0); - v_x[1] = vec_xl(16, x0); - x0 += 32; - - v_y[0] = vec_xl(0 , y0); - v_y[1] = vec_xl(16, y0); - y0 += 32; - - v_xl[0] = (int8x16_t)vec_and(v_x[0], v_lm); - v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm); - - const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]); - sumi1 += (p1[0] + p1[1] + p1[2] + p1[3]) * scales[2*j+0]; - - v_y[0] = vec_xl(0 , y0); - v_y[1] = vec_xl(16, y0); - y0 += 32; - - v_xl[0] = (int8x16_t)vec_sr(v_x[0], 4); - v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4); - - const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]); - sumi2 += (p2[0] + p2[1] + p2[2] + p2[3]) * scales[2*j+1]; - } - - sumf += d * (sumi1 + sumi2); - } - - *s = sumf; -#else - - const uint8_t * scales = (const uint8_t*)&utmp[0]; - const uint8_t * mins = (const uint8_t*)&utmp[2]; - - int8_t aux8[QK_K]; - int16_t aux16[8]; - float sums [8]; - int32_t aux32[8]; - memset(sums, 0, 8*sizeof(float)); - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const uint8_t * GGML_RESTRICT q4 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * GGML_RESTRICT a = aux8; - for (int j = 0; j < QK_K/64; ++j) { - for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); - a += 32; - for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); - a += 32; q4 += 32; - } - memcpy(utmp, x[i].scales, 12); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & 
kmask3) << 4); - const uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; - - int sumi = 0; - for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; - a = aux8; - int is = 0; - for (int j = 0; j < QK_K/32; ++j) { - int32_t scale = scales[is++]; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; - sumf -= dmin * sumi; - } - for (int l = 0; l < 8; ++l) sumf += sums[l]; - *s = sumf; -#endif -} - -void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q5_K * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - - static const uint32_t kmask1 = 0x3f3f3f3f; - static const uint32_t kmask2 = 0x0f0f0f0f; - static const uint32_t kmask3 = 0x03030303; - - uint32_t utmp[4]; - -#ifdef __ARM_NEON - const uint8x16_t m4b = vdupq_n_u8(0xf); - const uint8x16_t mone = vdupq_n_u8(1); - const uint8x16_t mtwo = vdupq_n_u8(2); - const int32x4_t mzero = vdupq_n_s32(0); - - ggml_int8x16x4_t q5bytes; - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - - const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); - - memcpy(utmp, x[i].scales, 12); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); - const uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; - - const uint8x8_t mins8 = vld1_u8((const uint8_t*)utmp + 8); - const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(mins8)); - const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)), - vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins))); - int32_t sumi_mins = vaddvq_s32(prod); - - const uint8_t * scales = (const uint8_t *)utmp; - - const uint8_t * GGML_RESTRICT q5 = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); - - ggml_uint8x16x4_t q5h; - - int32_t sumi = 0; - - for (int j = 0; j < QK_K/64; ++j) { - - const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5); q5 += 32; - const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64; - - q5h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4); - q5h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4); - q5h.val[2] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[0]), 3); - q5h.val[3] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[1]), 3); - qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 2); - qhbits.val[1] = 
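// ---------------------------------------------------------------------------
// [editor's note, not part of the diff] Every backend branch of
// ggml_vec_dot_q4_K_q8_K above repeats the same 12-byte scale/min unpacking
// through kmask1/kmask2/kmask3. A minimal scalar sketch of that step, with
// the masks and shifts copied verbatim from the kernels; the helper name
// unpack_q4K_scales_mins is hypothetical.
#include <stdint.h>
#include <string.h>

static void unpack_q4K_scales_mins(uint32_t utmp[4], const uint8_t packed[12]) {
    const uint32_t kmask1 = 0x3f3f3f3f;
    const uint32_t kmask2 = 0x0f0f0f0f;
    const uint32_t kmask3 = 0x03030303;

    memcpy(utmp, packed, 12);
    utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
    const uint32_t uaux = utmp[1] & kmask1;
    utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
    utmp[2] = uaux;
    utmp[0] &= kmask1;
    // afterwards ((const uint8_t *)utmp)[0..7] hold the eight 6-bit
    // sub-block scales and [8..15] the eight 6-bit sub-block mins
}
// ---------------------------------------------------------------------------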
-void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q5_K * GGML_RESTRICT x = vx;
-    const block_q8_K * GGML_RESTRICT y = vy;
-
-    const int nb = n / QK_K;
-
-    static const uint32_t kmask1 = 0x3f3f3f3f;
-    static const uint32_t kmask2 = 0x0f0f0f0f;
-    static const uint32_t kmask3 = 0x03030303;
-
-    uint32_t utmp[4];
-
-#ifdef __ARM_NEON
-    const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const uint8x16_t mone = vdupq_n_u8(1);
-    const uint8x16_t mtwo = vdupq_n_u8(2);
-    const int32x4_t mzero = vdupq_n_s32(0);
-
-    ggml_int8x16x4_t q5bytes;
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const uint8x8_t mins8 = vld1_u8((const uint8_t*)utmp + 8);
-        const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(mins8));
-        const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)),
-                                         vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)));
-        int32_t sumi_mins = vaddvq_s32(prod);
-
-        const uint8_t * scales = (const uint8_t *)utmp;
-
-        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
-        ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
-
-        ggml_uint8x16x4_t q5h;
-
-        int32_t sumi = 0;
-
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5); q5 += 32;
-            const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            q5h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
-            q5h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
-            q5h.val[2] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[0]), 3);
-            q5h.val[3] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[1]), 3);
-            qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 2);
-            qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 2);
-
-            q5bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.val[0], m4b), q5h.val[0]));
-            q5bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.val[1], m4b), q5h.val[1]));
-            q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
-            q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));
-
-            sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
-            sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
-        }
-
-        sumf += d * sumi - dmin * sumi_mins;
-    }
-
-    *s = sumf;
-
-#elif defined __AVX2__
-
-    const __m256i m4 = _mm256_set1_epi8(0xF);
-    const __m128i mzero = _mm_setzero_si128();
-    const __m256i mone = _mm256_set1_epi8(1);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0.f;
-
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
-
-        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
-        const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
-        const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
-        const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
-        summs += dmin * _mm_extract_epi32(hsum, 0);
-
-        const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
-        const __m256i scales = MM256_SET_M128I(sc128, sc128);
-
-        const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
-        __m256i hmask = mone;
-
-        __m256i sumi = _mm256_setzero_si256();
-
-        int bit = 0;
-
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const __m256i scale_0 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
-            const __m256i scale_1 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));
-
-            const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); q5 += 32;
-
-            const __m256i q5l_0 = _mm256_and_si256(q5bits, m4);
-            const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
-            const __m256i q5_0 = _mm256_add_epi8(q5l_0, q5h_0);
-            hmask = _mm256_slli_epi16(hmask, 1);
-
-            const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4);
-            const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
-            const __m256i q5_1 = _mm256_add_epi8(q5l_1, q5h_1);
-            hmask = _mm256_slli_epi16(hmask, 1);
-
-            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
-            __m256i p16_0 = _mm256_maddubs_epi16(q5_0, q8_0);
-            __m256i p16_1 = _mm256_maddubs_epi16(q5_1, q8_1);
-
-            p16_0 = _mm256_madd_epi16(scale_0, p16_0);
-            p16_1 = _mm256_madd_epi16(scale_1, p16_1);
-
-            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
-
-        }
-
-        __m256 vd = _mm256_set1_ps(d);
-        acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
-
-    }
-
-    *s = hsum_float_8(acc) + summs;
-
-#elif defined __AVX__
-
-    const __m128i m4 = _mm_set1_epi8(0xF);
-    const __m128i mzero = _mm_setzero_si128();
-    const __m128i mone = _mm_set1_epi8(1);
-    const __m128i m2 = _mm_set1_epi8(2);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0.f;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
-        const __m128i scales = _mm_cvtepu8_epi16(utmps);
-        const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));
-
-        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
-        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
-        const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
-        const __m128i prod = _mm_madd_epi16(mins, q8s);
-        const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
-        summs += dmin * _mm_extract_epi32(hsum, 0);
-
-        const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].qh[0]);
-        const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].qh[16]);
-        __m128i hmask = mone;
-
-        __m128i sumi_0 = _mm_setzero_si128();
-        __m128i sumi_1 = _mm_setzero_si128();
-
-        int bit = 0;
-
-        __m128i shuffle = _mm_set1_epi16(0x0100);
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi16(shuffle, m2);
-
-            const __m128i q5bits_0 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
-            const __m128i q5bits_1 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
-
-            __m128i q5l_0 = _mm_and_si128(q5bits_0, m4);
-            __m128i q5l_1 = _mm_and_si128(q5bits_1, m4);
-            __m128i q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
-            __m128i q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
-            __m128i q5_0 = _mm_add_epi8(q5l_0, q5h_0);
-            __m128i q5_1 = _mm_add_epi8(q5l_1, q5h_1);
-            hmask = _mm_slli_epi16(hmask, 1);
-
-            __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i p16_0 = _mm_maddubs_epi16(q5_0, q8_0);
-            __m128i p16_1 = _mm_maddubs_epi16(q5_1, q8_1);
-            p16_0 = _mm_madd_epi16(scale_0, p16_0);
-            p16_1 = _mm_madd_epi16(scale_0, p16_1);
-
-            q5l_0 = _mm_and_si128(_mm_srli_epi16(q5bits_0, 4), m4);
-            q5l_1 = _mm_and_si128(_mm_srli_epi16(q5bits_1, 4), m4);
-            q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
-            q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
-            q5_0 = _mm_add_epi8(q5l_0, q5h_0);
-            q5_1 = _mm_add_epi8(q5l_1, q5h_1);
-            hmask = _mm_slli_epi16(hmask, 1);
-
-            q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i p16_2 = _mm_maddubs_epi16(q5_0, q8_0);
-            __m128i p16_3 = _mm_maddubs_epi16(q5_1, q8_1);
-            p16_2 = _mm_madd_epi16(scale_1, p16_2);
-            p16_3 = _mm_madd_epi16(scale_1, p16_3);
-
-            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
-            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
-
-        }
-
-        __m256 vd = _mm256_set1_ps(d);
-        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
-
-    }
-
-    *s = hsum_float_8(acc) + summs;
-
-#elif defined __wasm_simd128__
-    //const uint8_t * scales = (const uint8_t*)&utmp[0];
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Fixed sign
-
-        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
-        // Process scales and mins
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        // Sum mins * q8sums
-        int32_t sumi_mins = 0;
-        const int16_t * GGML_RESTRICT q8sums = y[i].bsums;
-        const uint8_t * m = (const uint8_t *)&utmp[2];
-        for (int j = 0; j < 16; j += 2) {
-            sumi_mins += (q8sums[j] + q8sums[j+1]) * m[j/2];
-        }
-        sumf -= dmin * sumi_mins; // Correct subtraction
-
-        v128_t qh0 = wasm_v128_load(qh);
-        v128_t qh1 = wasm_v128_load(qh + 16);
-        const uint8_t * sc = (const uint8_t *)utmp;
-
-        int32_t sumi = 0;
-
-        for (int j = 0; j < QK_K/64; ++j) {
-            const int shift = j * 2;
-            v128_t qh_shift0 = wasm_u8x16_shr(qh0, shift);
-            v128_t qh_shift1 = wasm_u8x16_shr(qh1, shift);
-
-            v128_t qh_low0 = wasm_i8x16_shl(wasm_v128_and(qh_shift0, wasm_i8x16_splat(0x01)), 4);
-            v128_t qh_high0 = wasm_i8x16_shl(wasm_v128_and(qh_shift0, wasm_i8x16_splat(0x02)), 3);
-            v128_t qh_low1 = wasm_i8x16_shl(wasm_v128_and(qh_shift1, wasm_i8x16_splat(0x01)), 4);
-            v128_t qh_high1 = wasm_i8x16_shl(wasm_v128_and(qh_shift1, wasm_i8x16_splat(0x02)), 3);
-
-            v128_t q5_0 = wasm_v128_load(q5);
-            v128_t q5_1 = wasm_v128_load(q5 + 16);
-            q5 += 32;
-
-            v128_t q5l_0 = wasm_v128_or(wasm_v128_and(q5_0, wasm_i8x16_splat(0x0F)), qh_low0);
-            v128_t q5h_0 = wasm_v128_or(wasm_u8x16_shr(q5_0, 4), qh_high0);
-            v128_t q5l_1 = wasm_v128_or(wasm_v128_and(q5_1, wasm_i8x16_splat(0x0F)), qh_low1);
-            v128_t q5h_1 = wasm_v128_or(wasm_u8x16_shr(q5_1, 4), qh_high1);
-
-            v128_t q8_0 = wasm_v128_load(q8);
-            v128_t q8_1 = wasm_v128_load(q8 + 16);
-            v128_t q8_2 = wasm_v128_load(q8 + 32);
-            v128_t q8_3 = wasm_v128_load(q8 + 48);
-            q8 += 64;
-
-            // Process low quants
-            v128_t pl0 = wasm_i32x4_dot_i16x8(
-                wasm_i16x8_extend_low_i8x16(q5l_0),
-                wasm_i16x8_extend_low_i8x16(q8_0)
-            );
-            pl0 = wasm_i32x4_add(pl0, wasm_i32x4_dot_i16x8(
-                wasm_i16x8_extend_high_i8x16(q5l_0),
-                wasm_i16x8_extend_high_i8x16(q8_0)
-            ));
-            v128_t pl1 = wasm_i32x4_dot_i16x8(
-                wasm_i16x8_extend_low_i8x16(q5l_1),
-                wasm_i16x8_extend_low_i8x16(q8_1)
-            );
-            pl1 = wasm_i32x4_add(pl1, wasm_i32x4_dot_i16x8(
-                wasm_i16x8_extend_high_i8x16(q5l_1),
-                wasm_i16x8_extend_high_i8x16(q8_1)
-            ));
-            v128_t sum_low = wasm_i32x4_add(pl0, pl1);
-
-            // Process high quants
-            v128_t ph0 = wasm_i32x4_dot_i16x8(
-                wasm_i16x8_extend_low_i8x16(q5h_0),
-                wasm_i16x8_extend_low_i8x16(q8_2)
-            );
-            ph0 = wasm_i32x4_add(ph0, wasm_i32x4_dot_i16x8(
-                wasm_i16x8_extend_high_i8x16(q5h_0),
-                wasm_i16x8_extend_high_i8x16(q8_2)
-            ));
-            v128_t ph1 = wasm_i32x4_dot_i16x8(
-                wasm_i16x8_extend_low_i8x16(q5h_1),
-                wasm_i16x8_extend_low_i8x16(q8_3)
-            );
-            ph1 = wasm_i32x4_add(ph1, wasm_i32x4_dot_i16x8(
-                wasm_i16x8_extend_high_i8x16(q5h_1),
-                wasm_i16x8_extend_high_i8x16(q8_3)
-            ));
-            v128_t sum_high = wasm_i32x4_add(ph0, ph1);
-
-            // Accumulate with scale factors
-            int32_t sl = wasm_i32x4_extract_lane(sum_low, 0) + wasm_i32x4_extract_lane(sum_low, 1) +
-                         wasm_i32x4_extract_lane(sum_low, 2) + wasm_i32x4_extract_lane(sum_low, 3);
-            int32_t sh = wasm_i32x4_extract_lane(sum_high, 0) + wasm_i32x4_extract_lane(sum_high, 1) +
-                         wasm_i32x4_extract_lane(sum_high, 2) + wasm_i32x4_extract_lane(sum_high, 3);
-
-            sumi += sl * sc[2*j] + sh * sc[2*j+1];
-        }
-
-        sumf += d * sumi;
-    }
-
-    *s = sumf;
-
-#elif defined __riscv_v
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins = (const uint8_t*)&utmp[2];
-
-    float sumf = 0;
-    float sums = 0.0;
-
-    size_t vl;
-
-    for (int i = 0; i < nb; ++i) {
-
-        vl = 8;
-
-        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
-
-        vint16m1_t q8sums_0 = __riscv_vlse16_v_i16m1(y[i].bsums, 4, vl);
-        vint16m1_t q8sums_1 = __riscv_vlse16_v_i16m1(y[i].bsums+1, 4, vl);
-        vint16m1_t q8sums = __riscv_vadd_vv_i16m1(q8sums_0, q8sums_1, vl);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        vuint8mf2_t mins8 = __riscv_vle8_v_u8mf2(mins, vl);
-        vint16m1_t v_mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl));
-        vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, v_mins, vl);
-
-        vint32m1_t sumi = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
-        sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
-
-        vl = 32;
-        int32_t aux32 = 0;
-        int is = 0;
-
-        uint8_t m = 1;
-        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-        vuint8m2_t vqh = __riscv_vle8_v_u8m2(hm, vl);
-
-        for (int j = 0; j < QK_K/64; ++j) {
-            // load Q5 and Q8
-            vuint8m2_t q5_x = __riscv_vle8_v_u8m2(q5, vl);
-            vint8m2_t q8_y1 = __riscv_vle8_v_i8m2(q8, vl);
-            vint8m2_t q8_y2 = __riscv_vle8_v_i8m2(q8+32, vl);
-
-            // compute mask for addition
-            vint8m2_t q5_a = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vand_vx_u8m2(q5_x, 0x0F, vl));
-            vuint8m2_t qh_m1 = __riscv_vand_vx_u8m2(vqh, m, vl);
-            vbool4_t vmask_1 = __riscv_vmsne_vx_u8m2_b4(qh_m1, 0, vl);
-            vint8m2_t q5_m1 = __riscv_vadd_vx_i8m2_mu(vmask_1, q5_a, q5_a, 16, vl);
-            m <<= 1;
-
-            vint8m2_t q5_l = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vsrl_vx_u8m2(q5_x, 0x04, vl));
-            vuint8m2_t qh_m2 = __riscv_vand_vx_u8m2(vqh, m, vl);
-            vbool4_t vmask_2 = __riscv_vmsne_vx_u8m2_b4(qh_m2, 0, vl);
-            vint8m2_t q5_m2 = __riscv_vadd_vx_i8m2_mu(vmask_2, q5_l, q5_l, 16, vl);
-            m <<= 1;
-
-            vint16m4_t v0 = __riscv_vwmul_vv_i16m4(q5_m1, q8_y1, vl);
-            vint16m4_t v1 = __riscv_vwmul_vv_i16m4(q5_m2, q8_y2, vl);
-
-            vint32m8_t vs1 = __riscv_vwmul_vx_i32m8(v0, scales[is++], vl);
-            vint32m8_t vs2 = __riscv_vwmul_vx_i32m8(v1, scales[is++], vl);
-
-            vint32m1_t vacc1 = __riscv_vredsum_vs_i32m8_i32m1(vs1, vzero, vl);
-            vint32m1_t vacc2 = __riscv_vredsum_vs_i32m8_i32m1(vs2, vacc1, vl);
-
-            aux32 += __riscv_vmv_x_s_i32m1_i32(vacc2);
-            q5 += 32; q8 += 64;
-
-        }
-
-        sums += aux32 * d;
-
-    }
-
-    *s = sumf+sums;
-
-#elif defined(__POWER9_VECTOR__)
-    const vector signed char lowMask = vec_splats((signed char)0xF);
-    const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
-    const vector signed char lowMask2 = vec_splats((int8_t)0x30);
-    const vector int v0 = vec_splats((int32_t)0);
-    const vector unsigned char v1 = vec_splats((unsigned char)0x1);
-    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
-    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
-    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
-
-    vector float vsumf0 = vec_splats(0.0f);
-    vector float vsumf1 = vec_splats(0.0f);
-    vector float vsumf2 = vec_splats(0.0f);
-    vector float vsumf3 = vec_splats(0.0f);
-
-    for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
-        vector float vyd = vec_splats(y[i].d);
-        vector float vd = vec_mul(vxd, vyd);
-
-        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
-        vector float vdmin = vec_mul(vxmin, vyd);
-
-        UNUSED(kmask1);
-        UNUSED(kmask2);
-        UNUSED(kmask3);
-        UNUSED(utmp);
-
-        vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
-        vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
-        vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
-        vector signed char u3 = vec_sr(u2, v4);
-
-        vector signed char u30 = u1;
-        vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);
-
-        u1 = vec_and(u0, lowMask1);
-        u2 = vec_or(u30, u31);
-
-        vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);
-
-        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
-        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
-
-        vector signed short vscales = vec_unpackh(utmps);
-
-        vector signed short q5xmins = vec_unpackl(utmps);
-        vector signed short q5xmins0 = vec_mergeh(q5xmins, q5xmins);
-        vector signed short q5xmins1 = vec_mergel(q5xmins, q5xmins);
-
-        vector signed int prod0 = vec_mule(q5xmins0, q8ysums0);
-        vector signed int prod1 = vec_mule(q5xmins1, q8ysums1);
-        vector signed int prod2 = vec_mulo(q5xmins0, q8ysums0);
-        vector signed int prod3 = vec_mulo(q5xmins1, q8ysums1);
-
-        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
-        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
-        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
-        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
-
-        vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
-        vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh);
-
-        vector signed int vsumi0 = v0;
-        vector signed int vsumi1 = v0;
-        vector signed int vsumi2 = v0;
-        vector signed int vsumi3 = v0;
-
-        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
-        for (int j = 0; j < QK_K/64; ++j) {
-            __builtin_prefetch(q5, 0, 1);
-            __builtin_prefetch(q8, 0, 1);
-
-            vector signed char qxs0 = (vector signed char)vec_xl( 0, q5);
-            vector signed char qxs1 = (vector signed char)vec_xl(16, q5);
-            q5 += 32;
-
-            vector signed char qxs00 = vec_and(qxs0, lowMask);
-            vector signed char qxs01 = vec_sr(qxs0, v4);
-            vector signed char qxs10 = vec_and(qxs1, lowMask);
-            vector signed char qxs11 = vec_sr(qxs1, v4);
-
-            vector signed char q5h00 = vec_sl(vec_and((vector signed char)v1, qxhs0), v4);
-            vector signed char q5h01 = vec_sl(vec_and((vector signed char)v2, qxhs0), v3);
-            vector signed char q5h10 = vec_sl(vec_and((vector signed char)v1, qxhs1), v4);
-            vector signed char q5h11 = vec_sl(vec_and((vector signed char)v2, qxhs1), v3);
-            qxhs0 = vec_sr(qxhs0, v2);
-            qxhs1 = vec_sr(qxhs1, v2);
-
-            vector unsigned char q5x00 = (vector unsigned char)vec_or(q5h00, qxs00);
-            vector unsigned char q5x01 = (vector unsigned char)vec_or(q5h01, qxs01);
-            vector unsigned char q5x10 = (vector unsigned char)vec_or(q5h10, qxs10);
-            vector unsigned char q5x11 = (vector unsigned char)vec_or(q5h11, qxs11);
-
-            vector signed char q8y00 = vec_xl( 0, q8);
-            vector signed char q8y10 = vec_xl(16, q8);
-            vector signed char q8y01 = vec_xl(32, q8);
-            vector signed char q8y11 = vec_xl(48, q8);
-            q8 += 64;
-
-            vector signed int qv00 = vec_msum(q8y00, q5x00, v0);
-            vector signed int qv01 = vec_msum(q8y01, q5x01, v0);
-            vector signed int qv10 = vec_msum(q8y10, q5x10, v0);
-            vector signed int qv11 = vec_msum(q8y11, q5x11, v0);
-
-            vector signed int vscales_h = vec_unpackh(vscales);
-            vector signed int vs0 = vec_splat(vscales_h, 0);
-            vector signed int vs1 = vec_splat(vscales_h, 1);
-            vscales = vec_sld(vscales, vscales, 12);
-
-            vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
-            vsumi1 = vec_add(vec_mul(qv10, vs0), vsumi1);
-            vsumi2 = vec_add(vec_mul(qv01, vs1), vsumi2);
-            vsumi3 = vec_add(vec_mul(qv11, vs1), vsumi3);
-        }
-
-        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
-        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
-        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
-        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
-    }
-
-    vsumf0 = vec_add(vsumf0, vsumf2);
-    vsumf1 = vec_add(vsumf1, vsumf3);
-
-    vsumf0 = vec_add(vsumf0, vsumf1);
-
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
-    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
-
-    *s = vec_extract(vsumf0, 0);
-
-#elif defined __loongarch_asx
-
-    __m256 acc = (__m256)__lasx_xvldi(0);
-    __m128 acc_m = (__m128)__lsx_vldi(0);
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]);
-        const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128);
-        const __m128i scales128 = __lsx_vsllwil_h_b(mins_and_scales128, 0);
-
-        const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0);
-        const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1));
-        const __m128i prod = lsx_madd_h(mins128, q8s);
-        acc_m = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(dmin), __lsx_vffint_s_w(prod), acc_m);
-
-        const __m256i scales = lasx_insertf128(scales128, scales128);
-
-        const __m256i hbits = __lasx_xvld((const __m256i*)x[i].qh, 0);
-
-        __m256i sumi = __lasx_xvldi(0);
-
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const __m256i scale_0 = lasx_xvrepl128vei_h(scales, 2 * j + 0);
-            const __m256i scale_1 = lasx_xvrepl128vei_h(scales, 2 * j + 1);
-
-            const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); q5 += 32;
-
-            const __m256i q5l_0 = __lasx_xvandi_b(q5bits, 0xf);
-            const __m256i q5l_1 = __lasx_xvsrli_b(q5bits, 4);
-            const __m256i q5h_0 = __lasx_xvnori_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 2 * j + 0), 0), 0xef);
-            const __m256i q5h_1 = __lasx_xvnori_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 2 * j + 1), 0), 0xef);
-            const __m256i q5_0 = __lasx_xvor_v(q5l_0, q5h_0);
-            const __m256i q5_1 = __lasx_xvor_v(q5l_1, q5h_1);
-
-            const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-            const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
-
-            __m256i p16_0 = lasx_madd_h_b(q5_0, q8_0);
-            __m256i p16_1 = lasx_madd_h_b(q5_1, q8_1);
-
-            p16_0 = lasx_madd_h(scale_0, p16_0);
-            p16_1 = lasx_madd_h(scale_1, p16_1);
-
-            sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));
-
-        }
-
-        __m256 vd = __lasx_xvreplfr2vr_s(d);
-        acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);
-
-    }
-
-    acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vbsrl_v(acc_m, 8));
-    acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vbsrl_v(acc_m, 4));
-
-    *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
-#elif defined(__VXE__) || defined(__VXE2__)
-    const uint8x16_t v_lm = vec_splat_u8(0x0F);
-    const uint8x16_t v_1m = vec_splat_u8(0x01);
-    const uint8x16_t v_2m = vec_splat_u8(0x02);
-
-    const int32x4_t v_z = vec_splat_s32(0);
-
-    const uchar8x16_t v_minsm = {
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
-        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
-    };
-
-    int8x16_t q5b[4];
-    uint8x16_t q5h[4];
-
-    uint8x16_t v_xl[2];
-    uint8x16_t v_xh[2];
-    int8x16_t v_y[4];
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
-        const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
-        const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const uint8x16_t v_mins16 = vec_xl(0, (const uint8_t *)utmp);
-        const uint8x16_t v_mins8 = vec_perm(v_mins16, v_mins16, v_minsm);
-        const int16x8_t v_minsh = (int16x8_t)vec_unpackh(v_mins8);
-
-        const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
-        const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
-        const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
-        const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
-
-        const uint8_t * scales = (const uint8_t *)utmp;
-        const uint8_t * GGML_RESTRICT x0l = x[i].qs;
-        const uint8_t * GGML_RESTRICT x0h = x[i].qh;
-        const int8_t * GGML_RESTRICT y0 = y[i].qs;
-
-        v_xh[0] = vec_xl(0 , x0h);
-        v_xh[1] = vec_xl(16, x0h);
-
-        int32_t sumi = 0;
-        for (int j = 0; j < QK_K/64; ++j) {
-            v_xl[0] = vec_xl(0 , x0l);
-            v_xl[1] = vec_xl(16, x0l);
-            x0l += 32;
-
-            v_y[0] = vec_xl(0 , y0);
-            v_y[1] = vec_xl(16, y0);
-            v_y[2] = vec_xl(32, y0);
-            v_y[3] = vec_xl(48, y0);
-            y0 += 64;
-
-            q5h[0] = vec_sl(vec_and(v_1m, v_xh[0]), 4);
-            q5h[1] = vec_sl(vec_and(v_1m, v_xh[1]), 4);
-            q5h[2] = vec_sl(vec_and(v_2m, v_xh[0]), 3);
-            q5h[3] = vec_sl(vec_and(v_2m, v_xh[1]), 3);
-            v_xh[0] = vec_sr(v_xh[0], 2);
-            v_xh[1] = vec_sr(v_xh[1], 2);
-
-            q5b[0] = (int8x16_t)vec_or(vec_and(v_xl[0], v_lm), q5h[0]);
-            q5b[1] = (int8x16_t)vec_or(vec_and(v_xl[1], v_lm), q5h[1]);
-            q5b[2] = (int8x16_t)vec_or(vec_sr(v_xl[0], 4), q5h[2]);
-            q5b[3] = (int8x16_t)vec_or(vec_sr(v_xl[1], 4), q5h[3]);
-
-            int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
-            int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);
-
-            sumi += (sumi0[0] + sumi0[1] + sumi0[2] + sumi0[3]) * *scales++;
-            sumi += (sumi1[0] + sumi1[1] + sumi1[2] + sumi1[3]) * *scales++;
-        }
-
-        sumf += d * sumi - dmin * mins;
-    }
-
-    *s = sumf;
-#else
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins = (const uint8_t*)&utmp[2];
-
-    int8_t aux8[QK_K];
-    int16_t aux16[8];
-    float sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-#endif
-}
-
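// ---------------------------------------------------------------------------
// [editor's note, not part of the diff] As in the scalar fallback above, each
// q5_K path folds the per-sub-block minimums into one bias term using the
// precomputed bsums of the q8_K operand, instead of subtracting the min from
// every element. A scalar sketch under the same layout; the helper name
// q5K_min_bias is hypothetical.
#include <stdint.h>

static float q5K_min_bias(const int16_t bsums[16], const uint8_t mins[8],
                          float dmin) {
    // bsums[j] is the sum of 16 consecutive q8 values; mins[j/2] is the
    // 6-bit min of the matching 32-element sub-block, so the correction
    // collapses to a single small integer dot product per super-block
    int32_t sumi = 0;
    for (int j = 0; j < 16; ++j) {
        sumi += bsums[j] * mins[j/2];
    }
    return dmin * (float) sumi; // the kernels subtract this from sumf
}
// ---------------------------------------------------------------------------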
-void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
-    assert(n % QK_K == 0);
-#ifdef __ARM_FEATURE_MATMUL_INT8
-    assert((nrc == 2) || (nrc == 1));
-#else
-    assert(nrc == 1);
-#endif
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    const block_q6_K * GGML_RESTRICT x = vx;
-    const block_q8_K * GGML_RESTRICT y = vy;
-
-    const int nb = n / QK_K;
-
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    if (nrc == 2) {
-        const block_q6_K * GGML_RESTRICT x0 = x;
-        const block_q6_K * GGML_RESTRICT x1 = (const block_q6_K *) ((const uint8_t *)vx + bx);
-        const block_q8_K * GGML_RESTRICT y0 = y;
-        const block_q8_K * GGML_RESTRICT y1 = (const block_q8_K *) ((const uint8_t *)vy + by);
-
-        float32x4_t vfsum = vdupq_n_f32(0.0f);
-
-        for (int i = 0; i < nb; ++i, ++x0, ++x1, ++y0, ++y1) {
-            const uint8_t * GGML_RESTRICT ql0 = x0->ql;
-            const uint8_t * GGML_RESTRICT ql1 = x1->ql;
-            const uint8_t * GGML_RESTRICT qh0 = x0->qh;
-            const uint8_t * GGML_RESTRICT qh1 = x1->qh;
-            const int8_t * GGML_RESTRICT qy0 = y0->qs;
-            const int8_t * GGML_RESTRICT qy1 = y1->qs;
-
-            const uint8x16_t mone = vdupq_n_u8(0x30);
-            const uint8x16_t m4b = vdupq_n_u8(0x0f);
-
-            int32x4_t visum = vdupq_n_s32(0);
-
-            // process 8 blocks per iteration, totally 16 blocks
-            for (int j = 0; j < 2; ++j, qh0 += 32, ql0 += 64, qh1 += 32, ql1 += 64) {
-                int8x16_t vx0[8], vx1[8];
-
-                // de-quantize vx0[8]
-                {
-                    const uint8x16x2_t qh_bits = vld1q_u8_x2(qh0);
-                    const uint8x16x4_t ql_bits = vld1q_u8_x4(ql0);
-
-                    uint8x16_t q6h_0 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 4));
-                    uint8x16_t q6h_1 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 4));
-                    uint8x16_t q6h_2 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 2));
-                    uint8x16_t q6h_3 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 2));
-
-                    vx0[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[0], m4b), q6h_0));
-                    vx0[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[1], m4b), q6h_1));
-                    vx0[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[2], m4b), q6h_2));
-                    vx0[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[3], m4b), q6h_3));
-
-                    q6h_0 = vandq_u8(mone, qh_bits.val[0]);
-                    q6h_1 = vandq_u8(mone, qh_bits.val[1]);
-                    q6h_2 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[0], 2));
-                    q6h_3 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[1], 2));
-
-                    vx0[4] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[0], 4), q6h_0));
-                    vx0[5] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[1], 4), q6h_1));
-                    vx0[6] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[2], 4), q6h_2));
-                    vx0[7] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[3], 4), q6h_3));
-                }
-
-                // de-quantize vx1[8]
-                {
-                    const uint8x16x2_t qh_bits = vld1q_u8_x2(qh1);
-                    const uint8x16x4_t ql_bits = vld1q_u8_x4(ql1);
-
-                    uint8x16_t q6h_0 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 4));
-                    uint8x16_t q6h_1 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 4));
-                    uint8x16_t q6h_2 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 2));
-                    uint8x16_t q6h_3 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 2));
-
-                    vx1[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[0], m4b), q6h_0));
-                    vx1[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[1], m4b), q6h_1));
-                    vx1[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[2], m4b), q6h_2));
-                    vx1[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[3], m4b), q6h_3));
-
-                    q6h_0 = vandq_u8(mone, qh_bits.val[0]);
-                    q6h_1 = vandq_u8(mone, qh_bits.val[1]);
-                    q6h_2 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[0], 2));
-                    q6h_3 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[1], 2));
-
-                    vx1[4] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[0], 4), q6h_0));
-                    vx1[5] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[1], 4), q6h_1));
-                    vx1[6] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[2], 4), q6h_2));
-                    vx1[7] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[3], 4), q6h_3));
-                }
-
-                // process 16 elements (one block with same scale) per iteration
-                // - vx = concat(ql, qh) - 32
-                // - r1,r2,r3,r4 = smmla(vx, vy)
-                for (int k = 0; k < 8; ++k) {
-                    const int blk = j * 8 + k;
-
-                    const int8x16_t vy0 = vld1q_s8(qy0);
-                    const int8x16_t vy1 = vld1q_s8(qy1);
-                    qy0 += 16;
-                    qy1 += 16;
-
-                    const int32x4_t block_scale = {
-                        x0->scales[blk],
-                        x0->scales[blk],
-                        x1->scales[blk],
-                        x1->scales[blk],
-                    };
-
-                    // calculate four results at once with outer product
-                    const int8x16_t vx_l = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(vx0[k]), vreinterpretq_s64_s8(vx1[k])));
-                    const int8x16_t vx_h = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(vx0[k]), vreinterpretq_s64_s8(vx1[k])));
-                    const int8x16_t vy_l = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(vy0), vreinterpretq_s64_s8(vy1)));
-                    const int8x16_t vy_h = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(vy0), vreinterpretq_s64_s8(vy1)));
-                    int32x4_t vr = vdupq_n_s32(0);
-                    vr = vmmlaq_s32(vr, vx_l, vy_l);
-                    vr = vmmlaq_s32(vr, vx_h, vy_h);
-
-                    // apply block scale, will NOT overflow
-                    // block_scale * sum_256(int6*int8) <= 2^(8+8+6+8) = 30 bits
-                    visum = vmlaq_s32(visum, vr, block_scale);
-                }
-            }
-
-            // adjust bias, apply superblock scale
-            {
-                int32_t bias[4];
-#ifdef __ARM_FEATURE_SVE
-                const svbool_t pg16_8 = svptrue_pat_b16(SV_VL8);
-                const svbool_t pg8_8 = svptrue_pat_b8(SV_VL8);
-                const svint16_t y0_q8sums_0 = svld1_s16(pg16_8, y0->bsums);
-                const svint16_t y0_q8sums_1 = svld1_s16(pg16_8, y0->bsums + 8);
-                const svint16_t y1_q8sums_0 = svld1_s16(pg16_8, y1->bsums);
-                const svint16_t y1_q8sums_1 = svld1_s16(pg16_8, y1->bsums + 8);
-                const svint16_t x0_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x0->scales));
-                const svint16_t x0_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x0->scales + 8));
-                const svint16_t x1_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x1->scales));
-                const svint16_t x1_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x1->scales + 8));
-                const svint64_t zero = svdup_n_s64(0);
-                bias[0] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x0_q6scales_0),
-                                                                svdot_s64(zero, y0_q8sums_1, x0_q6scales_1)));
-                bias[1] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x0_q6scales_0),
-                                                                svdot_s64(zero, y1_q8sums_1, x0_q6scales_1)));
-                bias[2] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x1_q6scales_0),
-                                                                svdot_s64(zero, y0_q8sums_1, x1_q6scales_1)));
-                bias[3] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x1_q6scales_0),
-                                                                svdot_s64(zero, y1_q8sums_1, x1_q6scales_1)));
-#else
-                // NEON doesn't support int16 dot product, fallback to separated mul and add
-                const int16x8x2_t q8sums0 = vld1q_s16_x2(y0->bsums);
-                const int16x8x2_t q8sums1 = vld1q_s16_x2(y1->bsums);
-
-                int8x16_t scales_s8 = vld1q_s8(x0->scales);
-                const int16x8x2_t q6scales0 = {{vmovl_s8(vget_low_s8(scales_s8)), vmovl_s8(vget_high_s8(scales_s8))}};
-                scales_s8 = vld1q_s8(x1->scales);
-                const int16x8x2_t q6scales1 = {{vmovl_s8(vget_low_s8(scales_s8)), vmovl_s8(vget_high_s8(scales_s8))}};
-
-                int32x4_t prod;
-                prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[0]), vget_low_s16 (q6scales0.val[0])),
-                                           vmull_s16(vget_high_s16(q8sums0.val[0]), vget_high_s16(q6scales0.val[0]))),
-                                 vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[1]), vget_low_s16 (q6scales0.val[1])),
-                                           vmull_s16(vget_high_s16(q8sums0.val[1]), vget_high_s16(q6scales0.val[1]))));
-                bias[0] = vaddvq_s32(prod);
-                prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[0]), vget_low_s16 (q6scales0.val[0])),
-                                           vmull_s16(vget_high_s16(q8sums1.val[0]), vget_high_s16(q6scales0.val[0]))),
-                                 vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[1]), vget_low_s16 (q6scales0.val[1])),
-                                           vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales0.val[1]))));
-                bias[1] = vaddvq_s32(prod);
-                prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[0]), vget_low_s16 (q6scales1.val[0])),
-                                           vmull_s16(vget_high_s16(q8sums0.val[0]), vget_high_s16(q6scales1.val[0]))),
-                                 vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[1]), vget_low_s16 (q6scales1.val[1])),
-                                           vmull_s16(vget_high_s16(q8sums0.val[1]), vget_high_s16(q6scales1.val[1]))));
-                bias[2] = vaddvq_s32(prod);
-                prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[0]), vget_low_s16 (q6scales1.val[0])),
-                                           vmull_s16(vget_high_s16(q8sums1.val[0]), vget_high_s16(q6scales1.val[0]))),
-                                 vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[1]), vget_low_s16 (q6scales1.val[1])),
-                                           vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales1.val[1]))));
-                bias[3] = vaddvq_s32(prod);
-
-#endif
-                const int32x4_t vibias = vmulq_n_s32(vld1q_s32(bias), 32);
-
-                const float32x4_t superblock_scale = {
-                    GGML_FP16_TO_FP32(x0->d) * y0->d,
-                    GGML_FP16_TO_FP32(x0->d) * y1->d,
-                    GGML_FP16_TO_FP32(x1->d) * y0->d,
-                    GGML_FP16_TO_FP32(x1->d) * y1->d,
-                };
-
-                visum = vsubq_s32(visum, vibias);
-                vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale);
-            }
-        }
-
-        // vfsum = ABCD -> ACBD
-        // AC -> s, BD -> (s+bs)
-        vfsum = vzip1q_f32(vfsum, vextq_f32(vfsum, vfsum, 2));
-        vst1_f32(s, vget_low_f32 (vfsum));
-        vst1_f32(s + bs, vget_high_f32(vfsum));
-
-        return;
-    }
-#endif
-
-#ifdef __ARM_FEATURE_SVE
-    const int vector_length = ggml_cpu_get_sve_cnt()*8;
-    float sum = 0;
-    svuint8_t m4b = svdup_n_u8(0xf);
-    svint32_t vzero = svdup_n_s32(0);
-    svuint8_t mone = svdup_n_u8(0x30);
-    svint8_t q6bytes_1, q6bytes_2, q6bytes_3, q6bytes_4;
-    svuint8_t q6h_1, q6h_2, q6h_3, q6h_4;
-
-    for (int i = 0; i < nb; ++i) {
-        const float d_all = GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * GGML_RESTRICT q6 = x[i].ql;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
-        const int8_t * GGML_RESTRICT scale = x[i].scales;
-
-        const svbool_t pg16_8 = svptrue_pat_b16(SV_VL8);
-        const svint16_t q8sums_1 = svld1_s16(pg16_8, y[i].bsums);
-        const svint16_t q8sums_2 = svld1_s16(pg16_8, y[i].bsums + 8);
-        const svint16_t q6scales_1 = svunpklo_s16(svld1_s8(svptrue_pat_b8(SV_VL8), scale));
-        const svint16_t q6scales_2 = svunpklo_s16(svld1_s8(svptrue_pat_b8(SV_VL8), scale + 8));
-        const svint64_t prod = svdup_n_s64(0);
-        int32_t isum_mins = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(prod, q8sums_1, q6scales_1),
-                                                                  svdot_s64(prod, q8sums_2, q6scales_2)));
-        int32_t isum = 0;
-
-        switch (vector_length) {
-            case 128:
-                {
-                    const svbool_t pg32_4 = svptrue_pat_b32(SV_VL4);
-                    const svbool_t pg8_16 = svptrue_pat_b8(SV_VL16);
-                    svint32_t isum_tmp = svdup_n_s32(0);
-                    for (int j = 0; j < QK_K/128; ++j) {
-                        svuint8_t qhbits_1 = svld1_u8(pg8_16, qh);
-                        svuint8_t qhbits_2 = svld1_u8(pg8_16, qh+16);
-                        qh += 32;
-                        svuint8_t q6bits_1 = svld1_u8(pg8_16, q6);
-                        svuint8_t q6bits_2 = svld1_u8(pg8_16, q6+16);
-                        svuint8_t q6bits_3 = svld1_u8(pg8_16, q6+32);
-                        svuint8_t q6bits_4 = svld1_u8(pg8_16, q6+48);
-                        q6 += 64;
-                        svint8_t q8bytes_1 = svld1_s8(pg8_16, q8);
-                        svint8_t q8bytes_2 = svld1_s8(pg8_16, q8+16);
-                        svint8_t q8bytes_3 = svld1_s8(pg8_16, q8+32);
-                        svint8_t q8bytes_4 = svld1_s8(pg8_16, q8+48);
-                        q8 += 64;
-
-                        q6h_1 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_1, 4));
-                        q6h_2 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_2, 4));
-                        q6h_3 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_1, 2));
-                        q6h_4 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_2, 2));
-                        q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_1, m4b), q6h_1));
-                        q6bytes_2 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_2, m4b), q6h_2));
-                        q6bytes_3 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_3, m4b), q6h_3));
-                        q6bytes_4 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_4, m4b), q6h_4));
-                        isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_1, q8bytes_1), scale[0]);
-                        isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_2, q8bytes_2), scale[1]);
-                        isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_3, q8bytes_3), scale[2]);
-                        isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_4, q8bytes_4), scale[3]);
-
-                        scale += 4;
-                        q8bytes_1 = svld1_s8(pg8_16, q8);
-                        q8bytes_2 = svld1_s8(pg8_16, q8+16);
-                        q8bytes_3 = svld1_s8(pg8_16, q8+32);
-                        q8bytes_4 = svld1_s8(pg8_16, q8+48);
-                        q8 += 64;
-
-                        q6h_1 = svand_u8_x(pg16_8, mone, qhbits_1);
-                        q6h_2 = svand_u8_x(pg16_8, mone, qhbits_2);
-                        q6h_3 = svand_u8_x(pg16_8, mone, svlsr_n_u8_x(pg16_8, qhbits_1, 2));
-                        q6h_4 = svand_u8_x(pg16_8, mone, svlsr_n_u8_x(pg16_8, qhbits_2, 2));
-                        q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_1, 4), q6h_1));
-                        q6bytes_2 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_2, 4), q6h_2));
-                        q6bytes_3 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_3, 4), q6h_3));
-                        q6bytes_4 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_4, 4), q6h_4));
-                        isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_1, q8bytes_1), scale[0]);
-                        isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_2, q8bytes_2), scale[1]);
-                        isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_3, q8bytes_3), scale[2]);
-                        isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_4, q8bytes_4), scale[3]);
-                        scale += 4;
-                    }
-                    isum += svaddv_s32(pg32_4, isum_tmp);
-                    sum += d_all * y[i].d * (isum - 32 * isum_mins);
-                }
-                break;
-            case 256:
-            case 512:
-                {
-                    const svbool_t pg8_2 = svptrue_pat_b8(SV_VL2);
-                    const svbool_t pg32_8 = svptrue_pat_b32(SV_VL8);
-                    const svbool_t pg8_32 = svptrue_pat_b8(SV_VL32);
-                    svint32_t isum_tmp = svdup_n_s32(0);
-                    for (int j = 0; j < QK_K/128; j++) {
-                        svuint8_t qhbits_1 = svld1_u8(pg8_32, qh);
-                        qh += 32;
-                        svuint8_t q6bits_1 = svld1_u8(pg8_32, q6);
-                        svuint8_t q6bits_2 = svld1_u8(pg8_32, q6+32);
-                        q6 += 64;
-                        svint8_t q8bytes_1 = svld1_s8(pg8_32, q8);
-                        svint8_t q8bytes_2 = svld1_s8(pg8_32, q8+32);
-                        svint8_t q8bytes_3 = svld1_s8(pg8_32, q8+64);
-                        svint8_t q8bytes_4 = svld1_s8(pg8_32, q8+96);
-                        q8 += 128;
-                        q6h_1 = svand_u8_x(pg8_32, mone, svlsl_n_u8_x(pg8_32, qhbits_1, 4));
-                        q6h_2 = svand_u8_x(pg8_32, mone, svlsl_n_u8_x(pg8_32, qhbits_1, 2));
-                        q6h_3 = svand_u8_x(pg8_32, mone, qhbits_1);
-                        q6h_4 = svand_u8_x(pg8_32, mone, svlsr_n_u8_x(pg8_32, qhbits_1, 2));
-                        q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svand_u8_x(pg8_32, q6bits_1, m4b), q6h_1));
-                        q6bytes_2 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svand_u8_x(pg8_32, q6bits_2, m4b), q6h_2));
-                        q6bytes_3 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svlsr_n_u8_x(pg8_32, q6bits_1, 4), q6h_3));
-                        q6bytes_4 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svlsr_n_u8_x(pg8_32, q6bits_2, 4), q6h_4));
-
-                        svint8_t scale_lane_1_tmp = svld1_s8(pg8_2, scale);
-                        scale_lane_1_tmp = svzip1_s8(scale_lane_1_tmp, scale_lane_1_tmp);
-                        scale_lane_1_tmp = svzip1_s8(scale_lane_1_tmp, scale_lane_1_tmp);
-                        svint8_t scale_lane_2_tmp = svld1_s8(pg8_2, scale+2);
-                        scale_lane_2_tmp = svzip1_s8(scale_lane_2_tmp, scale_lane_2_tmp);
-                        scale_lane_2_tmp = svzip1_s8(scale_lane_2_tmp, scale_lane_2_tmp);
-                        svint8_t scale_lane_3_tmp = svld1_s8(pg8_2, scale+4);
-                        scale_lane_3_tmp = svzip1_s8(scale_lane_3_tmp, scale_lane_3_tmp);
-                        scale_lane_3_tmp = svzip1_s8(scale_lane_3_tmp, scale_lane_3_tmp);
-                        svint8_t scale_lane_4_tmp = svld1_s8(pg8_2, scale+6);
-                        scale_lane_4_tmp = svzip1_s8(scale_lane_4_tmp, scale_lane_4_tmp);
-                        scale_lane_4_tmp = svzip1_s8(scale_lane_4_tmp, scale_lane_4_tmp);
-                        svint32_t scale_lane_1 = svunpklo_s32(svunpklo_s16(scale_lane_1_tmp));
-                        svint32_t scale_lane_2 = svunpklo_s32(svunpklo_s16(scale_lane_2_tmp));
-                        svint32_t scale_lane_3 = svunpklo_s32(svunpklo_s16(scale_lane_3_tmp));
-                        svint32_t scale_lane_4 = svunpklo_s32(svunpklo_s16(scale_lane_4_tmp));
-
-                        isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_1, q8bytes_1), scale_lane_1);
-                        isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_2, q8bytes_2), scale_lane_2);
-                        isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_3, q8bytes_3), scale_lane_3);
-                        isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_4, q8bytes_4), scale_lane_4);
-                        scale += 8;
-                    }
-                    isum += svaddv_s32(pg32_8, isum_tmp);
-                    sum += d_all * y[i].d * (isum - 32 * isum_mins);
-                }
-                break;
-            default:
-                assert(false && "Unsupported vector length");
-                break;
-        }
-    }
-
-    *s = sum;
-
-#elif __ARM_NEON
-    float sum = 0;
-
-    const uint8x16_t m4b = vdupq_n_u8(0xF);
-    const int32x4_t vzero = vdupq_n_s32(0);
-    //const int8x16_t m32s = vdupq_n_s8(32);
-
-    const uint8x16_t mone = vdupq_n_u8(3);
-
-    ggml_int8x16x4_t q6bytes;
-    ggml_uint8x16x4_t q6h;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d_all = GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * GGML_RESTRICT q6 = x[i].ql;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
-        const int8_t * GGML_RESTRICT scale = x[i].scales;
-
-        const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
-        const int8x16_t scales = vld1q_s8(scale);
-        const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}};
-
-        const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
-                                                   vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
-                                         vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[1]), vget_low_s16 (q6scales.val[1])),
-                                                   vmull_s16(vget_high_s16(q8sums.val[1]), vget_high_s16(q6scales.val[1]))));
-        int32_t isum_mins = vaddvq_s32(prod);
-
-        int32_t isum = 0;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); qh += 32;
-            ggml_uint8x16x4_t q6bits = ggml_vld1q_u8_x4(q6); q6 += 64;
-            ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
-            q6h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
-            uint8x16_t shifted = vshrq_n_u8(qhbits.val[0], 2);
-            q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-            shifted = vshrq_n_u8(qhbits.val[1], 2);
-            q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-
-            //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])), m32s);
-            //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])), m32s);
-            //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2])), m32s);
-            //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3])), m32s);
-            q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0]));
-            q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1]));
-            q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
-            q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));
-
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
-
-            scale += 4;
-
-            q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            shifted = vshrq_n_u8(qhbits.val[0], 4);
-            q6h.val[0] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-            shifted = vshrq_n_u8(qhbits.val[1], 4);
-            q6h.val[1] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-            shifted = vshrq_n_u8(qhbits.val[0], 6);
-            q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-            shifted = vshrq_n_u8(qhbits.val[1], 6);
-            q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-
-            //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0])), m32s);
-            //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1])), m32s);
-            //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2])), m32s);
-            //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3])), m32s);
-            q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0]));
-            q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1]));
-            q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
-            q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));
-
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
-            scale += 4;
-        }
-        //sum += isum * d_all * y[i].d;
-        sum += d_all * y[i].d * (isum - 32 * isum_mins);
-
-    }
-    *s = sum;
-
-#elif defined __AVX2__
-
-    const __m256i m4 = _mm256_set1_epi8(0xF);
-    const __m256i m2 = _mm256_set1_epi8(3);
-    const __m256i m32s = _mm256_set1_epi8(32);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-
-        const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
-
-        __m256i sumi = _mm256_setzero_si256();
-
-        int is = 0;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
-            const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
-            const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
-            const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
-            is += 4;
-
-            const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
-            const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
-            const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32;
-
-            const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4);
-            const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4);
-            const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4);
-            const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4);
-
-            const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
-            const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1);
-            const __m256i q4_2 =
_mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2); - const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3); - - const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - - __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0); - __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1); - __m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2); - __m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3); - - __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0); - __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1); - __m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2); - __m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3); - - p16_0 = _mm256_sub_epi16(p16_0, q8s_0); - p16_1 = _mm256_sub_epi16(p16_1, q8s_1); - p16_2 = _mm256_sub_epi16(p16_2, q8s_2); - p16_3 = _mm256_sub_epi16(p16_3, q8s_3); - - p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0); - p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1); - p16_2 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_2), p16_2); - p16_3 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_3), p16_3); - - sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1)); - sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_2, p16_3)); - - } - - acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); - } - - *s = hsum_float_8(acc); - -#elif defined __AVX__ - - const __m128i m3 = _mm_set1_epi8(3); - const __m128i m15 = _mm_set1_epi8(15); - - __m256 acc = _mm256_setzero_ps(); - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * GGML_RESTRICT q4 = x[i].ql; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - // handle the q6_k -32 offset separately using bsums - const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums); - const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)y[i].bsums + 1); - const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); - const __m128i scales_16_0 = _mm_cvtepi8_epi16(scales); - const __m128i scales_16_1 = _mm_cvtepi8_epi16(_mm_bsrli_si128(scales, 8)); - const __m128i q8sclsub_0 = _mm_slli_epi32(_mm_madd_epi16(q8sums_0, scales_16_0), 5); - const __m128i q8sclsub_1 = _mm_slli_epi32(_mm_madd_epi16(q8sums_1, scales_16_1), 5); - - __m128i sumi_0 = _mm_setzero_si128(); - __m128i sumi_1 = _mm_setzero_si128(); - - int is = 0; - - for (int j = 0; j < QK_K/128; ++j) { - - const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16; - const __m128i q4bitsH_1 = _mm_loadu_si128((const __m128i*)qh); qh += 16; - - const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4); - const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4); - const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(12)), 2); - const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(12)), 2); - const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(48)); - const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(48)); - const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(-64)), 2); - const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(-64)), 2); - - const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; - 
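// These four loads (one above, three below) fetch the 64 bytes of packed low
- // nibbles for this 128-weight chunk. Scalar equivalent of the reconstruction
- // done below (cf. the generic fallback at the end of this function):
- //     w = (int8_t)((ql & 0xF) | (((qh >> shift) & 3) << 4)) - 32;
- // except that, instead of subtracting the offset of 32 per weight, q8sclsub_*
- // above precomputes 32 * bsum * scale (hence the << 5), which is subtracted
- // once after the loop.
-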
const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; - const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; - const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; - - const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0); - const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1); - const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2); - const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3); - const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4); - const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5); - const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6); - const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7); - - const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; - - __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0); - __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1); - __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2); - __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3); - __m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4); - __m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5); - __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6); - __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7); - - const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0)); - const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1)); - const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2)); - const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3)); - is += 4; - - p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0); - p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_0, 8)), p16_1); - p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2); - p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_1, 8)), p16_3); - p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4); - p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_2, 8)), p16_5); - p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6); - p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_3, 8)), p16_7); - - sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); - sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3)); - sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6)); - sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7)); - - } - - sumi_0 = _mm_sub_epi32(sumi_0, q8sclsub_0); - sumi_1 = _mm_sub_epi32(sumi_1, q8sclsub_1); - const __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); - acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)), acc); - } - - *s = hsum_float_8(acc); - -#elif defined __wasm_simd128__ - int8_t aux8[QK_K] __attribute__((aligned(16))); - int32_t aux32[8] __attribute__((aligned(16))) = {0}; - float sums[8] __attribute__((aligned(16))) = {0}; - - for (int i = 0; i < nb; ++i) { - // Unpack 6-bit quantized data into aux8 
(unchanged) - const uint8_t * GGML_RESTRICT q4 = x[i].ql; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - int8_t * a = aux8; - for (int j = 0; j < QK_K; j += 128) { - for (int l = 0; l < 32; ++l) { - a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; - a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; - a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; - a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; - } - a += 128; - q4 += 64; - qh += 32; - } - - const int8_t * GGML_RESTRICT a_ptr = aux8; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - v128_t acc0 = wasm_i32x4_splat(0); - v128_t acc1 = wasm_i32x4_splat(0); - - for (int j = 0; j < QK_K/16; ++j) { - const int scale = x[i].scales[j]; - const v128_t vscale = wasm_i32x4_splat(scale); - - // Load 16 elements from a and q8 - const v128_t a_vec = wasm_v128_load(a_ptr); - const v128_t q8_vec = wasm_v128_load(q8); - - // Process low 8 elements - v128_t a_low = wasm_i16x8_extend_low_i8x16(a_vec); - v128_t q8_low = wasm_i16x8_extend_low_i8x16(q8_vec); - v128_t prod_low = wasm_i16x8_mul(a_low, q8_low); - v128_t prod_lo_lo = wasm_i32x4_extend_low_i16x8(prod_low); - v128_t prod_lo_hi = wasm_i32x4_extend_high_i16x8(prod_low); - - // Process high 8 elements - v128_t a_high = wasm_i16x8_extend_high_i8x16(a_vec); - v128_t q8_high = wasm_i16x8_extend_high_i8x16(q8_vec); - v128_t prod_high = wasm_i16x8_mul(a_high, q8_high); - v128_t prod_hi_lo = wasm_i32x4_extend_low_i16x8(prod_high); - v128_t prod_hi_hi = wasm_i32x4_extend_high_i16x8(prod_high); - - // Scale and accumulate - prod_lo_lo = wasm_i32x4_mul(prod_lo_lo, vscale); - prod_lo_hi = wasm_i32x4_mul(prod_lo_hi, vscale); - prod_hi_lo = wasm_i32x4_mul(prod_hi_lo, vscale); - prod_hi_hi = wasm_i32x4_mul(prod_hi_hi, vscale); - - acc0 = wasm_i32x4_add(acc0, wasm_i32x4_add(prod_lo_lo, prod_hi_lo)); - acc1 = wasm_i32x4_add(acc1, wasm_i32x4_add(prod_lo_hi, prod_hi_hi)); - - a_ptr += 16; - q8 += 16; - } - - // Store accumulated results - wasm_v128_store(&aux32[0], acc0); - wasm_v128_store(&aux32[4], acc1); - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < 8; ++l) { - sums[l] += d * aux32[l]; - } - } - - // Sum final results - float sumf = 0; - for (int l = 0; l < 8; ++l) { - sumf += sums[l]; - } - *s = sumf; - -#elif defined __riscv_xtheadvector - - float sumf = 0; - - for (int i = 0; i < nb; ++i) { - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - - const int8_t * restrict scale = x[i].scales; - - int sum_t = 0; - int t0; - - for (int j = 0; j < QK_K/128; ++j) { - __asm__ __volatile__( - "th.vsetvli zero, %[vl32], e8, m2\n\t" // vl == 32 - "th.vlb.v v4, (%[qh])\n\t" - "th.vsll.vi v0, v4, 4\n\t" - "th.vsll.vi v2, v4, 2\n\t" - "th.vsrl.vi v6, v4, 2\n\t" - "th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64 - "th.vlb.v v8, (%[q6])\n\t" - "th.vsrl.vi v12, v8, 4\n\t" - "th.vand.vi v8, v8, 0xF\n\t" - "th.vsetvli zero, %[vl128], e8, m8\n\t" // vl == 128 - "th.vand.vx v0, v0, %[mask]\n\t" - "th.vor.vv v8, v8, v0\n\t" - "th.vlb.v v0, (%[q8])\n\t" - "th.vsub.vx v8, v8, %[vl32]\n\t" - "th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64 - "th.vwmul.vv v16, v0, v8\n\t" - "th.vwmul.vv v24, v4, v12\n\t" - "li %[t0], 16\n\t" - "th.vsetvli zero, %[t0], e16, m2\n\t" // vl == 16 - "th.vmv.v.x v0, zero\n\t" - "th.vwredsum.vs v10, v16, v0\n\t" - "th.vwredsum.vs v9, v18, v0\n\t" - 
"th.vwredsum.vs v8, v20, v0\n\t" - "th.vwredsum.vs v7, v22, v0\n\t" - "th.vwredsum.vs v11, v24, v0\n\t" - "th.vwredsum.vs v12, v26, v0\n\t" - "th.vwredsum.vs v13, v28, v0\n\t" - "th.vwredsum.vs v14, v30, v0\n\t" - "li %[t0], 4\n\t" - "th.vsetvli zero, %[t0], e32, m1\n\t" // vl == 4 - "th.vslideup.vi v10, v9, 1\n\t" - "th.vslideup.vi v8, v7, 1\n\t" - "th.vslideup.vi v11, v12, 1\n\t" - "th.vslideup.vi v13, v14, 1\n\t" - "th.vslideup.vi v10, v8, 2\n\t" - "th.vslideup.vi v11, v13, 2\n\t" - "li %[t0], 8\n\t" - "th.vsetvli zero, %[t0], e32, m2\n\t" // vl == 8 - "th.vlb.v v4, (%[scale])\n\t" - "th.vmul.vv v2, v4, v10\n\t" - "th.vredsum.vs v0, v2, v0\n\t" - "th.vmv.x.s %[t0], v0\n\t" - "add %[sumi], %[sumi], %[t0]" - : [sumi] "+&r" (sum_t), [t0] "=&r" (t0) - : [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale) - , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128) - , [mask] "r" (0x30) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - q6 += 64; qh += 32; q8 += 128; scale += 8; - } - - sumf += d * sum_t; - - } - - *s = sumf; - -#elif defined __riscv_v - - float sumf = 0; - const int vector_length = __riscv_vlenb() * 8; - - switch (vector_length) { - case 256: - for (int i = 0; i < nb; ++i) { - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - - const uint8_t * GGML_RESTRICT q6 = x[i].ql; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - const int8_t * GGML_RESTRICT scale = x[i].scales; - - size_t vl; - - vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); - - int sum_t = 0; - int is = 0; - - for (int j = 0; j < QK_K/128; ++j) { - - vl = 32; - - // load qh - vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl); - - // load Q6 - vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl); - vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl); - - vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl); - vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl); - vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl); - vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl); - - vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl); - vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl); - vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl); - vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl); - - vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl); - vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl); - vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl); - vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl); - - vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl); - vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl); - vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl); - vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl); - - // load Q8 and take product - vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl); - vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl); - vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, 
__riscv_vle8_v_i8m1(q8+64, vl), vl); - vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl); - - vl = 16; - - vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl); - vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl); - vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl); - vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl); - vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl); - vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl); - vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl); - vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl); - - vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl); - vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl); - vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl); - vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl); - - sum_t += __riscv_vmv_x_s_i32m1_i32(isum3); - - q6 += 64; qh += 32; q8 += 128; is=8; - - } - - sumf += d * sum_t; - - } - break; - case 128: - for (int i = 0; i < nb; ++i) { - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - - const int8_t * restrict scale = x[i].scales; - - int sum_t = 0; - int t0; - - for (int j = 0; j < QK_K/128; ++j) { - __asm__ __volatile__( - "vsetvli zero, %[vl32], e8, m2\n\t" - "vle8.v v4, (%[qh])\n\t" - "vsll.vi v0, v4, 4\n\t" - "vsll.vi v2, v4, 2\n\t" - "vsrl.vi v6, v4, 2\n\t" - "vsetvli zero, %[vl64], e8, m4\n\t" - "vle8.v v8, (%[q6])\n\t" - "vsrl.vi v12, v8, 4\n\t" - "vand.vi v8, v8, 0xF\n\t" - "vsetvli zero, %[vl128], e8, m8\n\t" - "vand.vx v0, v0, %[mask]\n\t" - "vor.vv v8, v8, v0\n\t" - "vle8.v v0, (%[q8])\n\t" - "vsub.vx v8, v8, %[vl32]\n\t" - "vsetvli zero, %[vl64], e8, m4\n\t" - "vwmul.vv v16, v0, v8\n\t" - "vwmul.vv v24, v4, v12\n\t" - "vsetivli zero, 16, e16, m2\n\t" - "vmv.v.x v0, zero\n\t" - "vwredsum.vs v10, v16, v0\n\t" - "vwredsum.vs v9, v18, v0\n\t" - "vwredsum.vs v8, v20, v0\n\t" - "vwredsum.vs v7, v22, v0\n\t" - "vwredsum.vs v11, v24, v0\n\t" - "vwredsum.vs v12, v26, v0\n\t" - "vwredsum.vs v13, v28, v0\n\t" - "vwredsum.vs v14, v30, v0\n\t" - "vsetivli zero, 4, e32, m1\n\t" - "vslideup.vi v10, v9, 1\n\t" - "vslideup.vi v8, v7, 1\n\t" - "vslideup.vi v11, v12, 1\n\t" - "vslideup.vi v13, v14, 1\n\t" - "vslideup.vi v10, v8, 2\n\t" - "vslideup.vi v11, v13, 2\n\t" - "vsetivli zero, 8, e32, m2\n\t" - "vle8.v v2, (%[scale])\n\t" - "vsext.vf4 v4, v2\n\t" - "vmul.vv v2, v4, v10\n\t" - "vredsum.vs v0, v2, v0\n\t" - "vmv.x.s %[t0], v0\n\t" - "add %[sumi], %[sumi], %[t0]" - : [sumi] "+&r" (sum_t), [t0] "=&r" (t0) - : [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale) - , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128) - , [mask] "r" (0x30) - : "memory" - , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" - , "v24", "v25", "v26", "v27", 
"v28", "v29", "v30", "v31" - ); - q6 += 64; qh += 32; q8 += 128; scale += 8; - } - - sumf += d * sum_t; - - } - break; - default: - assert(false && "Unsupported vector length"); - break; - } - - *s = sumf; - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0xF); - const vector int v0 = vec_splats((int32_t)0); - const vector unsigned char v2 = vec_splats((unsigned char)0x2); - const vector unsigned char v3 = vec_splats((unsigned char)0x3); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - const vector unsigned char v6 = vec_splats((unsigned char)0x6); - const vector signed char off = vec_splats((signed char)0x20); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - - for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - vector signed int vsumi2 = v0; - vector signed int vsumi3 = v0; - vector signed int vsumi4 = v0; - vector signed int vsumi5 = v0; - vector signed int vsumi6 = v0; - vector signed int vsumi7 = v0; - - const uint8_t * GGML_RESTRICT q6 = x[i].ql; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const int8_t * GGML_RESTRICT qs = x[i].scales; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - for (int j = 0; j < QK_K/128; ++j) { - __builtin_prefetch(q6, 0, 0); - __builtin_prefetch(qh, 0, 0); - __builtin_prefetch(q8, 0, 0); - - vector signed char qxs0 = (vector signed char)vec_xl( 0, q6); - vector signed char qxs1 = (vector signed char)vec_xl(16, q6); - vector signed char qxs2 = (vector signed char)vec_xl(32, q6); - vector signed char qxs3 = (vector signed char)vec_xl(48, q6); - q6 += 64; - - vector signed char qxs00 = vec_and(qxs0, lowMask); - vector signed char qxs01 = vec_sr(qxs0, v4); - vector signed char qxs10 = vec_and(qxs1, lowMask); - vector signed char qxs11 = vec_sr(qxs1, v4); - vector signed char qxs20 = vec_and(qxs2, lowMask); - vector signed char qxs21 = vec_sr(qxs2, v4); - vector signed char qxs30 = vec_and(qxs3, lowMask); - vector signed char qxs31 = vec_sr(qxs3, v4); - - vector signed char qxhs0 = (vector signed char)vec_xl( 0, qh); - vector signed char qxhs1 = (vector signed char)vec_xl(16, qh); - qh += 32; - - vector signed char qxh00 = vec_sl(vec_and((vector signed char)v3, qxhs0), v4); - vector signed char qxh01 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v4)), v4); - vector signed char qxh10 = vec_sl(vec_and((vector signed char)v3, qxhs1), v4); - vector signed char qxh11 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v4)), v4); - vector signed char qxh20 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v2)), v4); - vector signed char qxh21 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v6)), v4); - vector signed char qxh30 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v2)), v4); - vector signed char qxh31 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v6)), v4); - - vector signed char q6x00 = vec_sub(vec_or(qxh00, qxs00), off); - vector signed char q6x01 = vec_sub(vec_or(qxh01, qxs01), off); - vector signed char q6x10 = vec_sub(vec_or(qxh10, qxs10), off); - vector signed char q6x11 = vec_sub(vec_or(qxh11, qxs11), off); - vector signed char q6x20 = vec_sub(vec_or(qxh20, qxs20), off); - vector signed char q6x21 = vec_sub(vec_or(qxh21, qxs21), off); - 
vector signed char q6x30 = vec_sub(vec_or(qxh30, qxs30), off); - vector signed char q6x31 = vec_sub(vec_or(qxh31, qxs31), off); - - vector signed char q8y00 = vec_xl( 0, q8); - vector signed char q8y10 = vec_xl( 16, q8); - vector signed char q8y20 = vec_xl( 32, q8); - vector signed char q8y30 = vec_xl( 48, q8); - vector signed char q8y01 = vec_xl( 64, q8); - vector signed char q8y11 = vec_xl( 80, q8); - vector signed char q8y21 = vec_xl( 96, q8); - vector signed char q8y31 = vec_xl(112, q8); - q8 += 128; - - vector signed short qv00 = vec_add(vec_mule(q6x00, q8y00), vec_mulo(q6x00, q8y00)); - vector signed short qv10 = vec_add(vec_mule(q6x10, q8y10), vec_mulo(q6x10, q8y10)); - vector signed short qv20 = vec_add(vec_mule(q6x20, q8y20), vec_mulo(q6x20, q8y20)); - vector signed short qv30 = vec_add(vec_mule(q6x30, q8y30), vec_mulo(q6x30, q8y30)); - vector signed short qv01 = vec_add(vec_mule(q6x01, q8y01), vec_mulo(q6x01, q8y01)); - vector signed short qv11 = vec_add(vec_mule(q6x11, q8y11), vec_mulo(q6x11, q8y11)); - vector signed short qv21 = vec_add(vec_mule(q6x21, q8y21), vec_mulo(q6x21, q8y21)); - vector signed short qv31 = vec_add(vec_mule(q6x31, q8y31), vec_mulo(q6x31, q8y31)); - - vector signed short vscales = vec_unpackh(vec_xl_len(qs, 8)); - qs += 8; - - vector signed short vs0 = vec_splat(vscales, 0); - vector signed short vs1 = vec_splat(vscales, 1); - vector signed short vs2 = vec_splat(vscales, 2); - vector signed short vs3 = vec_splat(vscales, 3); - vector signed short vs4 = vec_splat(vscales, 4); - vector signed short vs5 = vec_splat(vscales, 5); - vector signed short vs6 = vec_splat(vscales, 6); - vector signed short vs7 = vec_splat(vscales, 7); - - vsumi0 = vec_msum(qv00, vs0, vsumi0); - vsumi1 = vec_msum(qv01, vs4, vsumi1); - vsumi2 = vec_msum(qv10, vs1, vsumi2); - vsumi3 = vec_msum(qv11, vs5, vsumi3); - vsumi4 = vec_msum(qv20, vs2, vsumi4); - vsumi5 = vec_msum(qv21, vs6, vsumi5); - vsumi6 = vec_msum(qv30, vs3, vsumi6); - vsumi7 = vec_msum(qv31, vs7, vsumi7); - } - - vsumi0 = vec_add(vsumi0, vsumi4); - vsumi1 = vec_add(vsumi1, vsumi5); - vsumi2 = vec_add(vsumi2, vsumi6); - vsumi3 = vec_add(vsumi3, vsumi7); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = vec_extract(vsumf0, 0); - -#elif defined __loongarch_asx - - const __m256i m32s = __lasx_xvreplgr2vr_b(32); - - __m256 acc = (__m256)__lasx_xvldi(0); - - for (int i = 0; i < nb; ++i) { - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * GGML_RESTRICT q4 = x[i].ql; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - const __m128i scales128 = __lsx_vld((const __m128i*)x[i].scales, 0); - const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; - const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask)); - - __m256i sumi = __lasx_xvldi(0); - - for (int j = 0; j < QK_K/128; ++j) { - - const __m256i q4bits1 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32; - const __m256i q4bits2 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32; - const __m256i q4bitsH = __lasx_xvld((const __m256i*)qh, 0); qh += 32; - 
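// qh packs four 2-bit planes per byte; each q4h_* below masks one plane with
- // 3 << (2*k) and shifts it so the two high bits land at bit positions 4..5,
- // ready to be OR-ed with the low nibbles of q4bits1/q4bits2.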
- const __m256i q4h_0 = __lasx_xvslli_b(__lasx_xvandi_b(q4bitsH, 3), 4); - const __m256i q4h_1 = __lasx_xvslli_b(__lasx_xvandi_b(q4bitsH, 3 << 2), 2); - const __m256i q4h_2 = __lasx_xvandi_b(q4bitsH, 3 << 4); - const __m256i q4h_3 = __lasx_xvsrli_b(__lasx_xvandi_b(q4bitsH, 3 << 6), 2); - - const __m256i q4_0 = __lasx_xvor_v(__lasx_xvandi_b(q4bits1, 0xf), q4h_0); - const __m256i q4_1 = __lasx_xvor_v(__lasx_xvandi_b(q4bits2, 0xf), q4h_1); - const __m256i q4_2 = __lasx_xvor_v(__lasx_xvsrli_b(q4bits1, 4), q4h_2); - const __m256i q4_3 = __lasx_xvor_v(__lasx_xvsrli_b(q4bits2, 4), q4h_3); - - const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - - __m256i p16_0 = lasx_madd_h_b(__lasx_xvsub_b(q4_0, m32s), q8_0); - __m256i p16_1 = lasx_madd_h_b(__lasx_xvsub_b(q4_1, m32s), q8_1); - __m256i p16_2 = lasx_madd_h_b(__lasx_xvsub_b(q4_2, m32s), q8_2); - __m256i p16_3 = lasx_madd_h_b(__lasx_xvsub_b(q4_3, m32s), q8_3); - - p16_0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p16_0); - p16_1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p16_1); - p16_2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p16_2); - p16_3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p16_3); - - sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1)); - sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_2, p16_3)); - } - - acc = __lasx_xvfmadd_s((__m256)__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc); - } - - *s = hsum_float_8(acc); -#elif defined(__VXE__) || defined(__VXE2__) - float sum = 0; - - // Lower 4-bit and upper 2-bit masks - const uint8x16_t v_lm = vec_splat_u8(0x0F); - const uint8x16_t v_um = vec_splat_u8(0x03); - - const int32x4_t v_z = vec_splat_s32(0); - - int8x16_t q6b[4]; - uint8x16_t q6h[4]; - - uint8x16_t v_xl[4]; - uint8x16_t v_xh[2]; - int8x16_t v_y[4]; - - for (int i = 0; i < nb; ++i) { - const float d_all = GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * GGML_RESTRICT x0l = x[i].ql; - const uint8_t * GGML_RESTRICT x0h = x[i].qh; - const int8_t * GGML_RESTRICT y0 = y[i].qs; - - const int8_t * GGML_RESTRICT scale = x[i].scales; - - const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums); - const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums); - - const int8x16_t v_scale = vec_xl(0, scale); - const int16x8_t v_scalel = vec_unpackh(v_scale); - const int16x8_t v_scaleh = vec_unpackl(v_scale); - - const int32x4_t v_minslo = vec_mulo(v_ysumsl, v_scalel); - const int32x4_t v_minsle = vec_mule(v_ysumsl, v_scalel); - const int32x4_t v_minsho = vec_mulo(v_ysumsh, v_scaleh); - const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh); - const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe; - - const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]; - - int32_t isum = 0; - for (int j = 0; j < QK_K/128; ++j) { - // Load model upper 2 bits - v_xh[0] = vec_xl(0 , x0h); - v_xh[1] = vec_xl(16, x0h); - x0h += 32; - - // Load model lower 4 bits - v_xl[0] = vec_xl(0 , x0l); - v_xl[1] = vec_xl(16, x0l); - v_xl[2] = vec_xl(32, x0l); - v_xl[3] = vec_xl(48, x0l); - x0l += 64; - - // Load activation quants - v_y[0] = vec_xl(0 , y0); - v_y[1] = vec_xl(16, y0); - v_y[2] = vec_xl(32, y0); - v_y[3] = vec_xl(48, y0); - y0 += 64; - - q6h[0] = vec_sl(vec_and(v_um, v_xh[0]), 4); - q6h[1] = vec_sl(vec_and(v_um, v_xh[1]), 4); - uint8x16_t 
shifted = vec_sr(v_xh[0], 2); - q6h[2] = vec_sl(vec_and(v_um, shifted), 4); - shifted = vec_sr(v_xh[1], 2); - q6h[3] = vec_sl(vec_and(v_um, shifted), 4); - - q6b[0] = (int8x16_t)(vec_or(vec_and(v_xl[0], v_lm), q6h[0])); - q6b[1] = (int8x16_t)(vec_or(vec_and(v_xl[1], v_lm), q6h[1])); - q6b[2] = (int8x16_t)(vec_or(vec_and(v_xl[2], v_lm), q6h[2])); - q6b[3] = (int8x16_t)(vec_or(vec_and(v_xl[3], v_lm), q6h[3])); - - int32x4_t summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]); - int32x4_t summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]); - int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]); - int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]); - - isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] + - (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] + - (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] + - (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3]; - - scale += 4; - - - // Load activation quants - v_y[0] = vec_xl(0 , y0); - v_y[1] = vec_xl(16, y0); - v_y[2] = vec_xl(32, y0); - v_y[3] = vec_xl(48, y0); - y0 += 64; - - shifted = vec_sr(v_xh[0], 4); - q6h[0] = vec_sl(vec_and(v_um, shifted), 4); - shifted = vec_sr(v_xh[1], 4); - q6h[1] = vec_sl(vec_and(v_um, shifted), 4); - shifted = vec_sr(v_xh[0], 6); - q6h[2] = vec_sl(vec_and(v_um, shifted), 4); - shifted = vec_sr(v_xh[1], 6); - q6h[3] = vec_sl(vec_and(v_um, shifted), 4); - - q6b[0] = (int8x16_t)(vec_or(vec_sr(v_xl[0], 4), q6h[0])); - q6b[1] = (int8x16_t)(vec_or(vec_sr(v_xl[1], 4), q6h[1])); - q6b[2] = (int8x16_t)(vec_or(vec_sr(v_xl[2], 4), q6h[2])); - q6b[3] = (int8x16_t)(vec_or(vec_sr(v_xl[3], 4), q6h[3])); - - summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]); - summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]); - summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]); - summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]); - - isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] + - (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] + - (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] + - (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3]; - - scale += 4; - } - - sum += d_all * y[i].d * (isum - 32 * mins); - } - - *s = sum; -#else - - int8_t aux8[QK_K]; - int16_t aux16[8]; - float sums [8]; - int32_t aux32[8]; - memset(sums, 0, 8*sizeof(float)); - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const uint8_t * GGML_RESTRICT q4 = x[i].ql; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * GGML_RESTRICT a = aux8; - for (int j = 0; j < QK_K; j += 128) { - for (int l = 0; l < 32; ++l) { - a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; - a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; - a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; - a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; - } - a += 128; - q4 += 64; - qh += 32; - } - a = aux8; - int is = 0; - for (int j = 0; j < QK_K/16; ++j) { - int scale = x[i].scales[is++]; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - } - for (int l = 0; l < 8; ++l) sumf += sums[l]; - *s = sumf; -#endif -} - -#if defined (__AVX__) || defined 
(__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx) -static const int8_t keven_signs_q2xs[1024] = { - 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, - 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, - 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1, - 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1, - 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, - 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1, - 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1, - 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, - 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, - 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1, - 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, - 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, - 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1, - 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1, - 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1, - 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1, - 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, - 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, - 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1, - 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, - 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1, - 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1, - 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1, - 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, - 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1, - 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1, - 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1, - 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, - 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1, - 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, - 1, 1, 
1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, - 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -}; -#endif - -void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_iq2_xxs * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#if defined(__ARM_NEON) - - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - uint32_t aux32[4]; - const uint8_t * aux8 = (const uint8_t *)aux32; - - ggml_int8x16x4_t q2u; - ggml_int8x16x4_t q2s; - ggml_int8x16x4_t q8b; - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - float sumf1 = 0, sumf2 = 0; - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - q8b = ggml_vld1q_s8_x4(q8); q8 += 64; - memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; - q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 0])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 1]))); - q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 2])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 3]))); - q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 8])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 9]))); - q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[10])), vld1_s8((const void *)(iq2xxs_grid + aux8[11]))); - q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127)))); - q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127)))); - q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 7) & 127)))); - q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 21) & 127)))); - q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]); - q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]); - q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]); - q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]); - const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]), q2u.val[1], q8b.val[1]); - const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]), q2u.val[3], q8b.val[3]); - sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[1] >> 28)); - sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[3] >> 28)); - } - sumf += d*(sumf1 + sumf2); - } - *s = 0.25f * sumf; - -#elif defined(__AVX2__) - - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - uint32_t aux32[4]; - const uint8_t * aux8 = (const uint8_t *)aux32; - - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - __m256i sumi1 = _mm256_setzero_si256(); - __m256i sumi2 = _mm256_setzero_si256(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const 
__m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; - const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]); - const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]); - const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], - signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); - const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127], - signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]); - const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); - const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2); - const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); - const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); - const uint16_t ls1 = aux32[1] >> 28; - const uint16_t ls2 = aux32[3] >> 28; - const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); - const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); - sumi1 = _mm256_add_epi32(sumi1, p1); - sumi2 = _mm256_add_epi32(sumi2, p2); - } - - accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); - - } - - *s = 0.125f * hsum_float_8(accumf); - -#elif defined(__AVX__) - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - uint32_t aux32[4]; - const uint8_t * aux8 = (const uint8_t *)aux32; - - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - __m128i sumi1_0 = _mm_setzero_si128(); - __m128i sumi1_1 = _mm_setzero_si128(); - __m128i sumi2_0 = _mm_setzero_si128(); - __m128i sumi2_1 = _mm_setzero_si128(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; - const __m128i q2_1_0 = _mm_set_epi64x(iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]); - const __m128i q2_1_1 = _mm_set_epi64x(iq2xxs_grid[aux8[3]], iq2xxs_grid[aux8[2]]); - const __m128i q2_2_0 = _mm_set_epi64x(iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]); - const __m128i q2_2_1 = _mm_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]]); - const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); - const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]); - const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]); - const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127]); - const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0); - const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1); - const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0); - const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1); - const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); - const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); - const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); - const __m128i dot2_1 
= _mm_maddubs_epi16(q2_2_1, q8s_2_1); - const uint16_t ls1 = aux32[1] >> 28; - const uint16_t ls2 = aux32[3] >> 28; - const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1)); - const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1)); - const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1)); - const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1)); - sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); - sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); - sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); - sumi2_1 = _mm_add_epi32(sumi2_1, p2_1); - } - - accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); - - } - - *s = 0.125f * hsum_float_8(accumf); - -#elif defined(__POWER9_VECTOR__) - const vector int v0 = vec_splats((int32_t)0); - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - vector signed int vsumi2 = v0; - vector signed int vsumi3 = v0; - - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - for (int j = 0; j < QK_K/32; j += 2) { - __builtin_prefetch(q2, 0, 1); - __builtin_prefetch(q8, 0, 1); - - uint32_t aux32[4]; - const uint8_t * aux8 = (const uint8_t *)aux32; - - memcpy(aux32, q2, 4*sizeof(uint32_t)); - q2 += 8; - - vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1])}; - vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3])}; - vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9])}; - vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11])}; - - vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((aux32[1] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 7) & 127))}; - vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127))}; - vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((aux32[3] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 7) & 127))}; - vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127))}; - - vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0); - vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1); - vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2); - vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3); - - vector signed char q8y0 = vec_xl( 0, q8); - vector signed char q8y1 = vec_xl(16, q8); - vector signed char q8y2 = vec_xl(32, q8); - vector signed char q8y3 = vec_xl(48, q8); - q8 += 64; - - vector signed short qv0 = 
vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0)); - vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1)); - vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2)); - vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3)); - - const uint16_t ls0 = aux32[1] >> 28; - const uint16_t ls1 = aux32[3] >> 28; - - vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1)); - vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1)); - - vsumi0 = vec_msum(qv0, vscales01, vsumi0); - vsumi1 = vec_msum(qv1, vscales01, vsumi1); - vsumi2 = vec_msum(qv2, vscales23, vsumi2); - vsumi3 = vec_msum(qv3, vscales23, vsumi3); - } - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = 0.125f * vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - uint32_t aux32[4]; - const uint8_t * aux8 = (const uint8_t *)aux32; - - __m256 accumf = (__m256)__lasx_xvldi(0); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - __m256i sumi1 = __lasx_xvldi(0); - __m256i sumi2 = __lasx_xvldi(0); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; - - const __m256i q2_1 = lasx_set_d(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]); - const __m256i q2_2 = lasx_set_d(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]); - const __m256i s2_1 = lasx_set_d(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], - signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); - const __m256i s2_2 = lasx_set_d(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127], - signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]); - const __m256i q8s_1 = __lasx_xvsigncov_b(s2_1, q8_1); - const __m256i q8s_2 = __lasx_xvsigncov_b(s2_2, q8_2); - const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); - const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); - const uint16_t ls1 = aux32[1] >> 28; - const uint16_t ls2 = aux32[3] >> 28; - const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1)); - const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1)); - sumi1 = __lasx_xvadd_w(sumi1, p1); - sumi2 = __lasx_xvadd_w(sumi2, p2); - } - - accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); - } - - *s = 0.125f * hsum_float_8(accumf); -//#elif defined(__VXE__) || defined(__VXE2__) -// const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; -// -// uint32_t aux32[4]; -// const uint8_t * aux8 = (const uint8_t *)aux32; -// -// float sumf = 0; -// -// for (int i = 0; i < nb; ++i) { -// const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; -// const uint16_t * GGML_RESTRICT q2 = x[i].qs; -// const int8_t * GGML_RESTRICT q8 = y[i].qs; 
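-// // Per 32-weight block, q2 supplies four uint32 words: aux32[0] and aux32[2]
-// // each hold four byte indices into iq2xxs_grid (8 weights per index), while
-// // aux32[1] and aux32[3] each pack four 7-bit sign-table indices plus a 4-bit
-// // scale ls in bits 28..31; the block scale 2*ls + 1 appears here as
-// // (0.5f + ls) combined with the final 0.25f factor.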
-// -// float sumf1 = 0, sumf2 = 0; -// -// for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { -// int8x16_t q8b0 = vec_xl( 0, q8); -// int8x16_t q8b1 = vec_xl(16, q8); -// int8x16_t q8b2 = vec_xl(32, q8); -// int8x16_t q8b3 = vec_xl(48, q8); -// q8 += 64; -// -// memcpy(aux32, q2, 4 * sizeof(uint32_t)); -// q2 += 8; -// -// int8x16_t q2u0 = { *(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1]) }; -// int8x16_t q2u1 = { *(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3]) }; -// int8x16_t q2u2 = { *(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9]) }; -// int8x16_t q2u3 = { *(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11]) }; -// -// int8x16_t q2s0 = { *(const int64_t *)(signs64 + ((aux32[1] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 7) & 127)) }; -// int8x16_t q2s1 = { *(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127)) }; -// int8x16_t q2s2 = { *(const int64_t *)(signs64 + ((aux32[3] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 7) & 127)) }; -// int8x16_t q2s3 = { *(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127)) }; -// -// q2u0 = vec_mul(q2u0, q2s0); -// q2u1 = vec_mul(q2u1, q2s1); -// q2u2 = vec_mul(q2u2, q2s2); -// q2u3 = vec_mul(q2u3, q2s3); -// -// const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u0, q8b0), q2u1, q8b1); -// const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u2, q8b2), q2u3, q8b3); -// -// sumf1 += (p1[0] + p1[1] + p1[2] + p1[3]) * (0.5f + (aux32[1] >> 28)); -// sumf2 += (p2[0] + p2[1] + p2[2] + p2[3]) * (0.5f + (aux32[3] >> 28)); -// } -// -// sumf += d * (sumf1 + sumf2); -// } -// -// *s = 0.25f * sumf; -#else - - uint32_t aux32[2]; - const uint8_t * aux8 = (const uint8_t *)aux32; - - float sumf = 0.f; - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - int32_t bsum = 0; - for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { - memcpy(aux32, q2, 2*sizeof(uint32_t)); - q2 += 4; - const uint32_t ls = 2*(aux32[1] >> 28) + 1; - int32_t sumi = 0; - for (int l = 0; l < 4; ++l) { - const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); - const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127]; - for (int j = 0; j < 8; ++j) { - sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ?
-1 : 1); - } - q8 += 8; - } - bsum += sumi * ls; - } - sumf += d * bsum; - } - *s = 0.125f * sumf; -#endif -} - -void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_iq2_xs * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#if defined(__ARM_NEON) - - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - ggml_int8x16x4_t q2u; - ggml_int8x16x4_t q2s; - ggml_int8x16x4_t q8b; - - int32x4x4_t scales32; - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - const uint8x8_t scales8 = vld1_u8(x[i].scales); - const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf)); - const uint8x8_t scales_h = vshr_n_u8(scales8, 4); - uint8x16_t scales = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h)); - scales = vaddq_u8(vshlq_n_u8(scales, 1), vdupq_n_u8(1)); - const uint16x8_t scales1 = vmovl_u8(vget_low_u8(scales)); - const uint16x8_t scales2 = vmovl_u8(vget_high_u8(scales)); - scales32.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales1))); - scales32.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales1))); - scales32.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales2))); - scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2))); - int32x4_t sumi = vdupq_n_s32(0); - for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { - q8b = ggml_vld1q_s8_x4(q8); q8 += 64; - q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511)))); - q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511)))); - q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511)))); - q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[6] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[7] & 511)))); - q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[0] >> 9))), vld1_s8((const void *)(signs64 + (q2[1] >> 9)))); - q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[2] >> 9))), vld1_s8((const void *)(signs64 + (q2[3] >> 9)))); - q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[4] >> 9))), vld1_s8((const void *)(signs64 + (q2[5] >> 9)))); - q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[6] >> 9))), vld1_s8((const void *)(signs64 + (q2[7] >> 9)))); - q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]); - q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]); - q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]); - q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]); - const int32x4_t p1 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]); - const int32x4_t p2 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[1], q8b.val[1]); - const int32x4_t p3 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]); - const int32x4_t p4 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[3], q8b.val[3]); - const int32x4_t p = vpaddq_s32(vpaddq_s32(p1, p2), vpaddq_s32(p3, p4)); - sumi = vmlaq_s32(sumi, p, scales32.val[ib64]); - q2 += 8; - } - sumf += d*vaddvq_s32(sumi); - } - *s = 0.125f * sumf; - -#elif defined(__AVX2__) - - const 
__m256i mone = _mm256_set1_epi8(1); - static const char block_sign_shuffle_mask_1[32] = { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, - 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, - }; - static const char block_sign_shuffle_mask_2[32] = { - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, - 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, - }; - static const uint8_t bit_selector_mask_bytes[32] = { - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - }; - - const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes); - const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1); - const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2); - - static const uint8_t k_bit_helper[32] = { - 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, - 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, - }; - const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper); - const __m256i m511 = _mm256_set1_epi16(511); - const __m128i m4 = _mm_set1_epi8(0xf); - const __m128i m1 = _mm_set1_epi8(1); - - uint64_t aux64; - - // somewhat hacky, but gives a significant boost in performance - __m256i aux_gindex; - const uint16_t * gindex = (const uint16_t *)&aux_gindex; - - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - memcpy(&aux64, x[i].scales, 8); - __m128i stmp = _mm_set1_epi64x(aux64); - stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4)); - const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1); - - __m256i sumi1 = _mm256_setzero_si256(); - __m256i sumi2 = _mm256_setzero_si256(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) { - - const __m256i q2_data = _mm256_loadu_si256((const __m256i*)q2); q2 += 16; - aux_gindex = _mm256_and_si256(q2_data, m511); - - const __m256i partial_sign_bits = _mm256_srli_epi16(q2_data, 9); - const __m256i partial_sign_bits_upper = _mm256_srli_epi16(q2_data, 13); - const __m256i partial_sign_bits_for_counting = _mm256_xor_si256(partial_sign_bits, partial_sign_bits_upper); - - const __m256i odd_bits = _mm256_shuffle_epi8(bit_helper, partial_sign_bits_for_counting); - const __m256i full_sign_bits = _mm256_or_si256(partial_sign_bits, odd_bits); - - const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q8_3 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q8_4 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - - const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]], - iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]); - const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]], - iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]); - const __m256i q2_3 = _mm256_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]], - 
iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]); - const __m256i q2_4 = _mm256_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]], - iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]); - - const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits); - const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1); - const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l); - const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h); - - __m256i signs; - signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1); - signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone)); - - signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_2); - signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone)); - - signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_1); - signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_3 = _mm256_sign_epi8(q8_3, _mm256_or_si256(signs, mone)); - - signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_2); - signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_4 = _mm256_sign_epi8(q8_4, _mm256_or_si256(signs, mone)); - - const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); - const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); - const __m256i dot3 = _mm256_maddubs_epi16(q2_3, q8s_3); - const __m256i dot4 = _mm256_maddubs_epi16(q2_4, q8s_4); - - const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0))); - const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1))); - const __m256i sc3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2))); - const __m256i sc4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3))); - - sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1)); - sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2)); - sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot3, sc3)); - sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot4, sc4)); - } - - accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); - - } - - *s = 0.125f * hsum_float_8(accumf); - -#elif defined(__AVX__) - const __m128i mone = _mm_set1_epi8(1); - static const char block_sign_shuffle_mask_1[32] = { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, - 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, - }; - static const char block_sign_shuffle_mask_2[32] = { - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, - 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, - }; - static const uint8_t bit_selector_mask_bytes[32] = { - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - }; - - const __m128i bit_selector_mask_0 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes); - const __m128i bit_selector_mask_1 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes + 1); - const __m128i 
block_sign_shuffle_1_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1); - const __m128i block_sign_shuffle_1_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1 + 1); - const __m128i block_sign_shuffle_2_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2); - const __m128i block_sign_shuffle_2_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2 + 1); - - static const uint8_t k_bit_helper[32] = { - 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, - 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, - }; - const __m128i bit_helper_0 = _mm_loadu_si128((const __m128i*)k_bit_helper); - const __m128i bit_helper_1 = _mm_loadu_si128((const __m128i*)k_bit_helper + 1); - const __m128i m511 = _mm_set1_epi16(511); - const __m128i m4 = _mm_set1_epi8(0xf); - const __m128i m1 = _mm_set1_epi8(1); - - uint64_t aux64; - - // somewhat hacky, but gives a significant boost in performance - __m256i aux_gindex; - const uint16_t * gindex = (const uint16_t *)&aux_gindex; - - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - memcpy(&aux64, x[i].scales, 8); - __m128i stmp = _mm_set1_epi64x(aux64); - stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4)); - const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1); - - __m128i sumi1_0 = _mm_setzero_si128(); - __m128i sumi1_1 = _mm_setzero_si128(); - __m128i sumi2_0 = _mm_setzero_si128(); - __m128i sumi2_1 = _mm_setzero_si128(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) { - - const __m128i q2_data_0 = _mm_loadu_si128((const __m128i*)q2); - const __m128i q2_data_1 = _mm_loadu_si128((const __m128i*)q2 + 1); q2 += 16; - aux_gindex = MM256_SET_M128I(_mm_and_si128(q2_data_1, m511), _mm_and_si128(q2_data_0, m511)); - - const __m128i partial_sign_bits_0 = _mm_srli_epi16(q2_data_0, 9); - const __m128i partial_sign_bits_1 = _mm_srli_epi16(q2_data_1, 9); - const __m128i partial_sign_bits_upper_0 = _mm_srli_epi16(q2_data_0, 13); - const __m128i partial_sign_bits_upper_1 = _mm_srli_epi16(q2_data_1, 13); - const __m128i partial_sign_bits_for_counting_0 = _mm_xor_si128(partial_sign_bits_0, partial_sign_bits_upper_0); - const __m128i partial_sign_bits_for_counting_1 = _mm_xor_si128(partial_sign_bits_1, partial_sign_bits_upper_1); - - const __m128i odd_bits_0 = _mm_shuffle_epi8(bit_helper_0, partial_sign_bits_for_counting_0); - const __m128i odd_bits_1 = _mm_shuffle_epi8(bit_helper_1, partial_sign_bits_for_counting_1); - const __m128i full_sign_bits_0 = _mm_or_si128(partial_sign_bits_0, odd_bits_0); - const __m128i full_sign_bits_1 = _mm_or_si128(partial_sign_bits_1, odd_bits_1); - - const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_3_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_3_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_4_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_4_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - - const __m128i q2_1_0 = _mm_set_epi64x(iq2xs_grid[gindex[1]], 
iq2xs_grid[gindex[0]]); - const __m128i q2_1_1 = _mm_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]]); - const __m128i q2_2_0 = _mm_set_epi64x(iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]); - const __m128i q2_2_1 = _mm_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]]); - const __m128i q2_3_0 = _mm_set_epi64x(iq2xs_grid[gindex[9]], iq2xs_grid[gindex[8]]); - const __m128i q2_3_1 = _mm_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]]); - const __m128i q2_4_0 = _mm_set_epi64x(iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]); - const __m128i q2_4_1 = _mm_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]]); - - // AVX2 full_signs_1 is full_sign_bits_0 here - // AVX2 full_signs_2 is full_sign_bits_1 here - __m128i signs_0, signs_1; - signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_0); - signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_1); - signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); - signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); - const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, _mm_or_si128(signs_0, mone)); - const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, _mm_or_si128(signs_1, mone)); - - signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_0); - signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_1); - signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); - signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); - const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, _mm_or_si128(signs_0, mone)); - const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, _mm_or_si128(signs_1, mone)); - - signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_0); - signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_1); - signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); - signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); - const __m128i q8s_3_0 = _mm_sign_epi8(q8_3_0, _mm_or_si128(signs_0, mone)); - const __m128i q8s_3_1 = _mm_sign_epi8(q8_3_1, _mm_or_si128(signs_1, mone)); - - signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_0); - signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_1); - signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); - signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); - const __m128i q8s_4_0 = _mm_sign_epi8(q8_4_0, _mm_or_si128(signs_0, mone)); - const __m128i q8s_4_1 = _mm_sign_epi8(q8_4_1, _mm_or_si128(signs_1, mone)); - - const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); - const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); - const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); - const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); - const __m128i dot3_0 = _mm_maddubs_epi16(q2_3_0, q8s_3_0); - const __m128i dot3_1 = _mm_maddubs_epi16(q2_3_1, q8s_3_1); - const __m128i dot4_0 = _mm_maddubs_epi16(q2_4_0, q8s_4_0); - const __m128i dot4_1 = _mm_maddubs_epi16(q2_4_1, q8s_4_1); - - __m128i sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0)); - const __m128i sc1_0 = _mm_cvtepi8_epi16(sc_tmp); - const __m128i sc1_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); - sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)); - const __m128i sc2_0 = _mm_cvtepi8_epi16(sc_tmp); - const __m128i sc2_1 = 
_mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); - sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2)); - const __m128i sc3_0 = _mm_cvtepi8_epi16(sc_tmp); - const __m128i sc3_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); - sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3)); - const __m128i sc4_0 = _mm_cvtepi8_epi16(sc_tmp); - const __m128i sc4_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); - - sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot1_0, sc1_0)); - sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot1_1, sc1_1)); - sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot2_0, sc2_0)); - sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot2_1, sc2_1)); - sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot3_0, sc3_0)); - sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot3_1, sc3_1)); - sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot4_0, sc4_0)); - sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot4_1, sc4_1)); - } - - accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); - - } - - *s = 0.125f * hsum_float_8(accumf); - -#elif defined(__loongarch_asx) - - const __m256i mone = __lasx_xvreplgr2vr_b(1); - static const char block_sign_shuffle_mask_1[32] = { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, - 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, - }; - static const char block_sign_shuffle_mask_2[32] = { - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, - 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, - }; - static const uint8_t bit_selector_mask_bytes[32] = { - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - }; - - const __m256i bit_selector_mask = __lasx_xvld((const __m256i*)bit_selector_mask_bytes, 0); - const __m256i block_sign_shuffle_1 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_1, 0); - const __m256i block_sign_shuffle_2 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_2, 0); - - static const uint8_t k_bit_helper[32] = { - 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, - 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, - }; - const __m256i bit_helper = __lasx_xvld((const __m256i*)k_bit_helper, 0); - const __m256i m511 = __lasx_xvreplgr2vr_h(511); - const __m128i m4 = __lsx_vreplgr2vr_b(0xf); - const __m128i m1 = __lsx_vreplgr2vr_b(1); - - uint64_t aux64; - - // somewhat hacky, but gives a significant boost in performance - __m256i aux_gindex; - const uint16_t * gindex = (const uint16_t *)&aux_gindex; - - __m256 accumf = (__m256)__lasx_xvldi(0); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - memcpy(&aux64, x[i].scales, 8); - __m128i stmp = __lsx_vreplgr2vr_d(aux64); - stmp = __lsx_vilvl_b( __lsx_vand_v(__lsx_vsrli_h(stmp, 4), m4), __lsx_vand_v(stmp, m4)); - const __m128i scales = __lsx_vadd_b(__lsx_vslli_h(stmp, 1), m1); - - __m256i sumi1 = __lasx_xvldi(0); - __m256i sumi2 = __lasx_xvldi(0); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) { - - const __m256i 
q2_data = __lasx_xvld((const __m256i*)q2, 0); q2 += 16; - aux_gindex = __lasx_xvand_v(q2_data, m511); - - const __m256i partial_sign_bits = __lasx_xvsrli_h(q2_data, 9); - const __m256i partial_sign_bits_upper = __lasx_xvsrli_h(q2_data, 13); - const __m256i partial_sign_bits_for_counting = __lasx_xvxor_v(partial_sign_bits, partial_sign_bits_upper); - - const __m256i odd_bits = lasx_shuffle_b(bit_helper, partial_sign_bits_for_counting); - const __m256i full_sign_bits = __lasx_xvor_v(partial_sign_bits, odd_bits); - - const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i q8_3 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i q8_4 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - - const __m256i q2_1 = lasx_set_d(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]], - iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]); - const __m256i q2_2 = lasx_set_d(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]], - iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]); - const __m256i q2_3 = lasx_set_d(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]], - iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]); - const __m256i q2_4 = lasx_set_d(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]], - iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]); - - const __m128i full_signs_l = lasx_extracti128(full_sign_bits, 0); - const __m128i full_signs_h = lasx_extracti128(full_sign_bits, 1); - const __m256i full_signs_1 = lasx_insertf128(full_signs_l, full_signs_l); - const __m256i full_signs_2 = lasx_insertf128(full_signs_h, full_signs_h); - - __m256i signs; - signs = lasx_shuffle_b(full_signs_1, block_sign_shuffle_1); - signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_1 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_1); - - signs = lasx_shuffle_b(full_signs_1, block_sign_shuffle_2); - signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_2 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_2); - - signs = lasx_shuffle_b(full_signs_2, block_sign_shuffle_1); - signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_3 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_3); - - signs = lasx_shuffle_b(full_signs_2, block_sign_shuffle_2); - signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); - const __m256i q8s_4 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_4); - - const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); - const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); - const __m256i dot3 = lasx_maddubs_h(q2_3, q8s_3); - const __m256i dot4 = lasx_maddubs_h(q2_4, q8s_4); - - const __m256i sc1 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+0))); - const __m256i sc2 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+1))); - const __m256i sc3 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+2))); - const __m256i sc4 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+3))); - - sumi1 = __lasx_xvadd_w(sumi1, lasx_madd_h(dot1, sc1)); - sumi2 = __lasx_xvadd_w(sumi2, lasx_madd_h(dot2, sc2)); - sumi1 = __lasx_xvadd_w(sumi1, lasx_madd_h(dot3, sc3)); - sumi2 = __lasx_xvadd_w(sumi2, lasx_madd_h(dot4, sc4)); - } - - accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); - - } - - *s = 0.125f * hsum_float_8(accumf); -#elif defined(__POWER9_VECTOR__) - const vector 
int v0 = vec_splats((int32_t)0); - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - vector signed int vsumi2 = v0; - vector signed int vsumi3 = v0; - - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const uint8_t * GGML_RESTRICT sc = x[i].scales; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - for (int j = 0; j < QK_K/64; ++j) { - __builtin_prefetch(q2, 0, 1); - __builtin_prefetch(q8, 0, 1); - - vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xs_grid + (q2[0] & 511)), *(const int64_t *)(iq2xs_grid + (q2[1] & 511))}; - vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xs_grid + (q2[2] & 511)), *(const int64_t *)(iq2xs_grid + (q2[3] & 511))}; - vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xs_grid + (q2[4] & 511)), *(const int64_t *)(iq2xs_grid + (q2[5] & 511))}; - vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xs_grid + (q2[6] & 511)), *(const int64_t *)(iq2xs_grid + (q2[7] & 511))}; - - vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((q2[0] >> 9))), *(const int64_t *)(signs64 + ((q2[1] >> 9)))}; - vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((q2[2] >> 9))), *(const int64_t *)(signs64 + ((q2[3] >> 9)))}; - vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((q2[4] >> 9))), *(const int64_t *)(signs64 + ((q2[5] >> 9)))}; - vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((q2[6] >> 9))), *(const int64_t *)(signs64 + ((q2[7] >> 9)))}; - q2 += 8; - - vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0); - vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1); - vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2); - vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3); - - vector signed char q8y0 = vec_xl( 0, q8); - vector signed char q8y1 = vec_xl(16, q8); - vector signed char q8y2 = vec_xl(32, q8); - vector signed char q8y3 = vec_xl(48, q8); - q8 += 64; - - vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0)); - vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1)); - vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2)); - vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3)); - - const uint16_t ls0 = (uint16_t)(sc[0] & 0xf); - const uint16_t ls1 = (uint16_t)(sc[0] >> 4); - const uint16_t ls2 = (uint16_t)(sc[1] & 0xf); - const uint16_t ls3 = (uint16_t)(sc[1] >> 4); - sc += 2; - - vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1)); - vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1)); - vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1)); - vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1)); - - vsumi0 = vec_msum(qv0, vscales0, vsumi0); - vsumi1 = vec_msum(qv1, vscales1, vsumi1); - vsumi2 = vec_msum(qv2, vscales2, vsumi2); - vsumi3 = vec_msum(qv3, vscales3, vsumi3); - } - - vsumf0 = 
vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = 0.125f * vec_extract(vsumf0, 0); -#else - - float sumf = 0.f; - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * GGML_RESTRICT q2 = x[i].qs; - const uint8_t * GGML_RESTRICT sc = x[i].scales; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - int32_t bsum = 0; - for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { - const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1; - const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1; - int32_t sumi = 0; - for (int l = 0; l < 2; ++l) { - const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); - const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; - for (int j = 0; j < 8; ++j) { - sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); - } - q8 += 8; - } - bsum += sumi * ls1; - sumi = 0; - for (int l = 2; l < 4; ++l) { - const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); - const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; - for (int j = 0; j < 8; ++j) { - sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); - } - q8 += 8; - } - bsum += sumi * ls2; - q2 += 4; - } - sumf += d * bsum; - } - *s = 0.125f * sumf; -#endif -} - -void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_iq2_s * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#if defined(__ARM_NEON) - - static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 - }; - - static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; - - const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1); - const uint8x16_t mask2 = vld1q_u8(k_mask2); - const uint8x16_t m1 = vdupq_n_u8(1); - const int32x4_t vzero = vdupq_n_s32(0); - - uint8x16x2_t vs; - ggml_int8x16x4_t q2s; - ggml_int8x16x4_t q8b; - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - - const uint8_t * GGML_RESTRICT qs = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - int sumi1 = 0, sumi2 = 0; - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - q8b = ggml_vld1q_s8_x4(q8); q8 += 64; - q2s.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[0] | ((qh[ib32+0] << 8) & 0x300)))), - vld1_s8((const int8_t *)(iq2s_grid + (qs[1] | ((qh[ib32+0] << 6) & 0x300))))); - q2s.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[2] | ((qh[ib32+0] << 4) & 0x300)))), - vld1_s8((const int8_t *)(iq2s_grid + (qs[3] | ((qh[ib32+0] << 2) & 0x300))))); - q2s.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[4] | ((qh[ib32+1] << 8) & 0x300)))), - 
vld1_s8((const int8_t *)(iq2s_grid + (qs[5] | ((qh[ib32+1] << 6) & 0x300))))); - q2s.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq2s_grid + (qs[6] | ((qh[ib32+1] << 4) & 0x300)))), - vld1_s8((const int8_t *)(iq2s_grid + (qs[7] | ((qh[ib32+1] << 2) & 0x300))))); - qs += 8; - - vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16))); - vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); - vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); - vs.val[0] = vceqq_u8(vs.val[0], mask2); - vs.val[1] = vceqq_u8(vs.val[1], mask2); - - q2s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[0]); - q2s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[1]); - - vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16))); - vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); - vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); - vs.val[0] = vceqq_u8(vs.val[0], mask2); - vs.val[1] = vceqq_u8(vs.val[1], mask2); - - signs += 4; - - q2s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[0], m1)), q2s.val[2]); - q2s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vorrq_u8(vs.val[1], m1)), q2s.val[3]); - - const int32x4_t p1 = ggml_vdotq_s32(vzero, q2s.val[0], q8b.val[0]); - const int32x4_t p2 = ggml_vdotq_s32(vzero, q2s.val[1], q8b.val[1]); - const int32x4_t p3 = ggml_vdotq_s32(vzero, q2s.val[2], q8b.val[2]); - const int32x4_t p4 = ggml_vdotq_s32(vzero, q2s.val[3], q8b.val[3]); - - sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32+0] & 0xf)); - sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32+0] >> 4)); - sumi1 += vaddvq_s32(p3) * (1 + 2*(x[i].scales[ib32+1] & 0xf)); - sumi2 += vaddvq_s32(p4) * (1 + 2*(x[i].scales[ib32+1] >> 4)); - } - sumf += d*(sumi1 + sumi2); - } - - *s = 0.125f * sumf; - -#elif defined(__AVX2__) - - static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 - }; - - static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - }; - - const __m128i m4 = _mm_set1_epi8(0xf); - const __m128i m1 = _mm_set1_epi8(1); - - const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1); - const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2); - - uint64_t aux64; - - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT qs = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - memcpy(&aux64, x[i].scales, 8); - const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1); - const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15 - - __m256i sumi1 = _mm256_setzero_si256(); - __m256i sumi2 = _mm256_setzero_si256(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q2_1 = 
_mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)], - iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)], - iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)], - iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]); - const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)], - iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)], - iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)], - iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]); - qs += 8; - - __m256i aux256 = _mm256_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16)); - aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); - const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2); - const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1); - - aux256 = _mm256_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16)); - aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); - const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2); - const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2); - - signs += 4; - - const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1 - const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3 - - const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0))); - const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1))); - sumi1 = _mm256_add_epi32(sumi1, p1); - sumi2 = _mm256_add_epi32(sumi2, p2); - } - - accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); - - } - - *s = 0.125f * hsum_float_8(accumf); - -#elif defined(__AVX__) - static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 - }; - - static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - }; - - const __m128i m4 = _mm_set1_epi8(0xf); - const __m128i m1 = _mm_set1_epi8(1); - - const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1); - const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1); - const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2); - const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1); - - uint64_t aux64; - - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT qs = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - memcpy(&aux64, x[i].scales, 8); - const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1); - const __m128i scales16_0 = _mm_cvtepi8_epi16(scales8); - const __m128i scales16_1 = _mm_cvtepi8_epi16(_mm_srli_si128(scales8, 8)); - - __m128i sumi1_0 = _mm_setzero_si128(); - __m128i sumi1_1 = _mm_setzero_si128(); - __m128i sumi2_0 = _mm_setzero_si128(); - __m128i sumi2_1 = _mm_setzero_si128(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - 
const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q2_1_0 = _mm_set_epi64x(iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)], - iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]); - const __m128i q2_1_1 = _mm_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)], - iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)]); - const __m128i q2_2_0 = _mm_set_epi64x(iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)], - iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]); - const __m128i q2_2_1 = _mm_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)], - iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)]); - qs += 8; - - __m128i aux128_0 = _mm_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16)); - __m128i aux128_1 = aux128_0; - aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); - aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); - const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); - const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); - const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0); - const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1); - - aux128_0 = _mm_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16)); - aux128_1 = aux128_0; - aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); - aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); - const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); - const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); - const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0); - const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1); - - signs += 4; - - const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); - const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); - const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); - const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); - - const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 0))); - const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 1))); - const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 0))); - const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 1))); - sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); - sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); - sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); - sumi2_1 = _mm_add_epi32(sumi2_1, p2_1); - } - - accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); - - } - - *s = 0.125f * hsum_float_8(accumf); - -#elif defined(__POWER9_VECTOR__) - static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 - }; - - static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; - - const vector int v0 = vec_splats((int32_t)0); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - 
vector float vsumf3 = vec_splats(0.0f); - - const vector unsigned char mask0 = vec_xl( 0, k_mask1); - const vector unsigned char mask1 = vec_xl(16, k_mask1); - const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2); - - for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - vector signed int vsumi2 = v0; - vector signed int vsumi3 = v0; - - const uint8_t * GGML_RESTRICT q2 = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); - const uint8_t * GGML_RESTRICT sc = x[i].scales; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - for (int j = 0; j < QK_K/32; j += 2) { - __builtin_prefetch(q2, 0, 1); - __builtin_prefetch(q8, 0, 1); - - vector signed long long aux64x2_0 = {*(const int64_t *)(iq2s_grid + (q2[0] | ((qh[0] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[1] | ((qh[0] << 6) & 0x300)))}; - vector signed long long aux64x2_1 = {*(const int64_t *)(iq2s_grid + (q2[2] | ((qh[0] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[3] | ((qh[0] << 2) & 0x300)))}; - vector signed long long aux64x2_2 = {*(const int64_t *)(iq2s_grid + (q2[4] | ((qh[1] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[5] | ((qh[1] << 6) & 0x300)))}; - vector signed long long aux64x2_3 = {*(const int64_t *)(iq2s_grid + (q2[6] | ((qh[1] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[7] | ((qh[1] << 2) & 0x300)))}; - q2 += 8; - qh += 2; - - vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]); - vector signed char vsigns23 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]); - signs += 4; - - vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0); - vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1); - vector signed char vsigns2 = vec_perm(vsigns23, vsigns23, mask0); - vector signed char vsigns3 = vec_perm(vsigns23, vsigns23, mask1); - - vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2); - vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2); - vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2); - vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2); - - vector signed char q2x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux64x2_0), vsigns0); - vector signed char q2x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux64x2_1), vsigns1); - vector signed char q2x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux64x2_2), vsigns2); - vector signed char q2x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux64x2_3), vsigns3); - - vector signed char q8y0 = vec_xl( 0, q8); - vector signed char q8y1 = vec_xl(16, q8); - vector signed char q8y2 = vec_xl(32, q8); - vector signed char q8y3 = vec_xl(48, q8); - q8 += 64; - - vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0)); - vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1)); - vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2)); - vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3)); - - const uint16_t ls0 = (uint16_t)(sc[0] & 0xf); - const uint16_t ls1 = (uint16_t)(sc[0] >> 4); - const uint16_t ls2 = (uint16_t)(sc[1] & 0xf); - const uint16_t ls3 = (uint16_t)(sc[1] >> 4); - sc += 2; - - vector signed short vscales0 = 
vec_splats((int16_t)(2*ls0+1)); - vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1)); - vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1)); - vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1)); - - vsumi0 = vec_msum(qv0, vscales0, vsumi0); - vsumi1 = vec_msum(qv1, vscales1, vsumi1); - vsumi2 = vec_msum(qv2, vscales2, vsumi2); - vsumi3 = vec_msum(qv3, vscales3, vsumi3); - } - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = 0.125f * vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - - static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 - }; - - static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - }; - - - const __m128i m4 = __lsx_vreplgr2vr_b(0xf); - const __m128i m1 = __lsx_vreplgr2vr_b(1); - - const __m256i mask1 = __lasx_xvld((const __m256i*)k_mask1, 0); - const __m256i mask2 = __lasx_xvld((const __m256i*)k_mask2, 0); - uint64_t aux64; - - __m256 accumf = (__m256)__lasx_xvldi(0); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT qs = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - __m128i tmp1; - memcpy(&aux64, x[i].scales, 8); - tmp1 = __lsx_vinsgr2vr_d(tmp1, aux64, 0); - tmp1 = __lsx_vinsgr2vr_d(tmp1, aux64 >> 4, 1); - const __m128i scales8 = __lsx_vadd_b(__lsx_vslli_h(__lsx_vand_v(tmp1, m4), 1), m1); - const __m256i scales16 = lasx_ext8_16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15 - - __m256i sumi1 = __lasx_xvldi(0); - __m256i sumi2 = __lasx_xvldi(0); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i q2_1 = lasx_set_d(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)], - iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)], - iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)], - iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]); - const __m256i q2_2 = lasx_set_d(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)], - iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)], - iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)], - iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]); - qs += 8; - - __m256i aux256 = __lasx_xvreplgr2vr_w(signs[0] | ((uint32_t) signs[1] << 16)); - aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2); - const __m256i s2_1 = __lasx_xvseq_b(aux256, mask2); - const __m256i q8s_1 = __lasx_xvsub_b(__lasx_xvxor_v(s2_1, q8_1), s2_1); - - aux256 = __lasx_xvreplgr2vr_w(signs[2] | ((uint32_t) signs[3] << 16)); - aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2); - const __m256i s2_2 = __lasx_xvseq_b(aux256, mask2); - const __m256i q8s_2 = 
__lasx_xvsub_b(__lasx_xvxor_v(s2_2, q8_2), s2_2); - - signs += 4; - - const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1 - const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3 - - const __m256i p1 = lasx_madd_h(dot1, lasx_shuffle_b(scales16, get_scale_shuffle_k4(ib32+0))); - const __m256i p2 = lasx_madd_h(dot2, lasx_shuffle_b(scales16, get_scale_shuffle_k4(ib32+1))); - sumi1 = __lasx_xvadd_w(sumi1, p1); - sumi2 = __lasx_xvadd_w(sumi2, p2); - } - - accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); - } - - *s = 0.125f * hsum_float_8(accumf); - -#else - - float sumf = 0; - for (int i = 0; i < nb; i++) { - - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint8_t * qh = x[i].qh; - const uint8_t * signs = qs + QK_K/8; - - int bsum = 0; - for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { - int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf); - int ls2 = 1 + 2*(x[i].scales[ib32] >> 4); - int sumi1 = 0, sumi2 = 0; - for (int l = 0; l < 2; ++l) { - const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); - for (int j = 0; j < 8; ++j) { - sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1); - } - q8 += 8; - } - for (int l = 2; l < 4; ++l) { - const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); - for (int j = 0; j < 8; ++j) { - sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1); - } - q8 += 8; - } - bsum += ls1 * sumi1 + ls2 * sumi2; - qs += 4; - signs += 4; - } - - sumf += d * bsum; - } - - *s = 0.125f * sumf; - -#endif - -} - -void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_iq3_xxs * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#if defined(__ARM_NEON) - - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - uint32_t aux32[2]; - - ggml_int8x16x4_t q3s; - ggml_int8x16x4_t q8b; - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - float sumf1 = 0, sumf2 = 0; - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - q8b = ggml_vld1q_s8_x4(q8); q8 += 64; - memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t); - const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]); - const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]); - const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]); - const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]); - q3 += 16; - q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127)))); - q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 
127)))); - q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127)))); - q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127)))); - q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0)); - q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1)); - q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2)); - q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3)); - const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]); - const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]); - sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28)); - sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28)); - } - sumf += d*(sumf1 + sumf2); - } - *s = 0.5f * sumf; - -#elif defined(__AVX2__) - - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - uint32_t aux32[2]; - - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - __m256i sumi1 = _mm256_setzero_si256(); - __m256i sumi2 = _mm256_setzero_si256(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], - iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); - q3 += 8; - const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], - iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); - q3 += 8; - memcpy(aux32, gas, 8); gas += 8; - const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127], - signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]); - const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], - signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); - const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); - const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2); - const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); - const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); - const uint16_t ls1 = aux32[0] >> 28; - const uint16_t ls2 = aux32[1] >> 28; - const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); - const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); - sumi1 = _mm256_add_epi32(sumi1, p1); - sumi2 = _mm256_add_epi32(sumi2, p2); - } - - accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); - - } - - *s = 0.25f * hsum_float_8(accumf); - -#elif defined(__AVX__) - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - uint32_t aux32[2]; - - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - 
__m128i sumi1_0 = _mm_setzero_si128(); - __m128i sumi1_1 = _mm_setzero_si128(); - __m128i sumi2_0 = _mm_setzero_si128(); - __m128i sumi2_1 = _mm_setzero_si128(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q2_1_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); - const __m128i q2_1_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]); - q3 += 8; - const __m128i q2_2_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); - const __m128i q2_2_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]); - q3 += 8; - memcpy(aux32, gas, 8); gas += 8; - const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]); - const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127]); - const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); - const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]); - const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0); - const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1); - const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0); - const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1); - const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); - const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); - const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); - const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); - const uint16_t ls1 = aux32[0] >> 28; - const uint16_t ls2 = aux32[1] >> 28; - const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1)); - const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1)); - const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1)); - const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1)); - sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); - sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); - sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); - sumi2_1 = _mm_add_epi32(sumi2_1, p2_1); - } - - accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); - - } - - *s = 0.25f * hsum_float_8(accumf); - -#elif defined(__POWER9_VECTOR__) - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - const vector int v0 = vec_splats((int32_t)0); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - - for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - vector signed int vsumi2 = v0; - vector signed int vsumi3 = v0; - - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const uint32_t * GGML_RESTRICT signs = (const uint32_t *)(x[i].qs + QK_K/4); - const int8_t * GGML_RESTRICT q8 = y[i].qs; - -#pragma GCC unroll 1 - for (int j = 0; j < QK_K/32; j += 
2) { - __builtin_prefetch(q3, 0, 1); - __builtin_prefetch(q8, 0, 1); - - vector unsigned int aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]}; - vector unsigned int aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]}; - vector unsigned int aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]}; - vector unsigned int aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]}; - q3 += 16; - - vector unsigned long long aux64x2_0 = {(uint64_t)(signs64[(signs[0] >> 0) & 127]), (uint64_t)(signs64[(signs[0] >> 7) & 127])}; - vector unsigned long long aux64x2_1 = {(uint64_t)(signs64[(signs[0] >> 14) & 127]), (uint64_t)(signs64[(signs[0] >> 21) & 127])}; - vector unsigned long long aux64x2_2 = {(uint64_t)(signs64[(signs[1] >> 0) & 127]), (uint64_t)(signs64[(signs[1] >> 7) & 127])}; - vector unsigned long long aux64x2_3 = {(uint64_t)(signs64[(signs[1] >> 14) & 127]), (uint64_t)(signs64[(signs[1] >> 21) & 127])}; - - vector signed char q3x0 = vec_mul((vector signed char)aux64x2_0, (vector signed char)aux32x4_0); - vector signed char q3x1 = vec_mul((vector signed char)aux64x2_1, (vector signed char)aux32x4_1); - vector signed char q3x2 = vec_mul((vector signed char)aux64x2_2, (vector signed char)aux32x4_2); - vector signed char q3x3 = vec_mul((vector signed char)aux64x2_3, (vector signed char)aux32x4_3); - - vector signed char q8y0 = vec_xl( 0, q8); - vector signed char q8y1 = vec_xl(16, q8); - vector signed char q8y2 = vec_xl(32, q8); - vector signed char q8y3 = vec_xl(48, q8); - q8 += 64; - - vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0)); - vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1)); - vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2)); - vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3)); - - const uint16_t ls0 = (uint16_t)(signs[0] >> 28); - const uint16_t ls1 = (uint16_t)(signs[1] >> 28); - signs += 2; - - vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1)); - vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1)); - - vsumi0 = vec_msum(qv0, vscales01, vsumi0); - vsumi1 = vec_msum(qv1, vscales01, vsumi1); - vsumi2 = vec_msum(qv2, vscales23, vsumi2); - vsumi3 = vec_msum(qv3, vscales23, vsumi3); - } - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = 0.25f * vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; - - uint32_t aux32[2]; - - __m256 accumf = (__m256)__lasx_xvldi(0); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - __m256i sumi1 = __lasx_xvldi(0); - __m256i sumi2 = __lasx_xvldi(0); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); 
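// [editor's note] A shared idiom in the iq3_xxs kernels above and below: each
// uint32 sign word carries four 7-bit sign patterns in bits 0..27 plus the
// 4-bit block scale in bits 28..31 (hence the >> 0/7/14/21 and >> 28
// extractions), and signs64[s & 127] expands one 7-bit pattern into 8 sign
// bytes, the eighth sign chosen so the number of negative signs stays even
// (the keven_signs_q2xs convention). A hedged scalar sketch of that
// expansion -- illustrative only, expand_signs7 is not part of this file:
//
//     static inline uint64_t expand_signs7(uint32_t s7) {
//         // bit j set -> sign byte j is 0xff (-1), else 0x01 (+1);
//         // bit 7 is the parity of bits 0..6, keeping overall parity even
//         const uint32_t s8 = (s7 & 127) | ((__builtin_popcount(s7 & 127) & 1) << 7);
//         uint64_t out = 0;
//         for (int j = 0; j < 8; ++j) {
//             out |= (uint64_t)(((s8 >> j) & 1) ? 0xff : 0x01) << (8*j);
//         }
//         return out;
//     }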
q8 += 32; - const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i q2_1 = lasx_set_w(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], - iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); - q3 += 8; - const __m256i q2_2 = lasx_set_w(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], - iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); - q3 += 8; - memcpy(aux32, gas, 8); gas += 8; - - const __m256i s2_1 = lasx_set_d(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127], - signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]); - const __m256i s2_2 = lasx_set_d(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], - signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); - const __m256i q8s_1 = __lasx_xvsigncov_b(s2_1, q8_1); - const __m256i q8s_2 = __lasx_xvsigncov_b(s2_2, q8_2); - const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); - const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); - const uint16_t ls1 = aux32[0] >> 28; - const uint16_t ls2 = aux32[1] >> 28; - - const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1)); - const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1)); - sumi1 = __lasx_xvadd_w(sumi1, p1); - sumi2 = __lasx_xvadd_w(sumi2, p2); - } - - accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); - } - - *s = 0.25f * hsum_float_8(accumf); - -#else - - uint32_t aux32; - - float sumf = 0.f; - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - int32_t bsum = 0; - for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { - memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t); - const uint32_t ls = 2*(aux32 >> 28) + 1; - int32_t sumi = 0; - for (int l = 0; l < 4; ++l) { - const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]); - const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]); - const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127]; - for (int j = 0; j < 4; ++j) { - sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1); - sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? 
-1 : 1); - } - q8 += 8; - } - q3 += 8; - bsum += sumi * ls; - } - sumf += d * bsum; - } - *s = 0.25f * sumf; -#endif -} - -void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_iq3_s * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#if defined(__ARM_NEON) - - typedef union { - uint16x8_t vec_index; - uint16_t index[8]; - } vec_index_t; - - static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 - }; - - static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; - - static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1}; - - const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1); - const uint8x16_t mask2 = vld1q_u8(k_mask2); - - const int16x8_t hshift = vld1q_s16(k_shift); - const uint16x8_t m256 = vdupq_n_u16(256); - const uint8x16_t m1 = vdupq_n_u8(1); - - uint8x16x2_t vs; - ggml_int8x16x4_t q3s; - ggml_int8x16x4_t q8b; - vec_index_t idx; - - uint32_t scales32[2]; - const uint8_t * scales8 = (const uint8_t *)scales32; - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT qs = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - memcpy(scales32, x[i].scales, 4); - scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101; - scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101; - - int sumi1 = 0, sumi2 = 0; - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - q8b = ggml_vld1q_s8_x4(q8); q8 += 64; - - const uint8x16_t idx_l = vld1q_u8(qs); qs += 16; - idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256)); - const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]], - iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]); - const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]], - iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]); - idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256)); - const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]], - iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]); - const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]], - iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]); - - - vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | ((uint32_t) signs[1] << 16))); - vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); - vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); - vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1); - vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1); - - q3s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_0)); - q3s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_1)); - - vs.val[0] = 
vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | ((uint32_t) signs[3] << 16))); - vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); - vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); - vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1); - vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1); - - signs += 4; - - q3s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_2)); - q3s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_3)); - - const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]); - const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]); - - sumi1 += vaddvq_s32(p1) * scales8[ib32/2+0]; - sumi2 += vaddvq_s32(p2) * scales8[ib32/2+4]; - } - sumf += d*(sumi1 + sumi2); - } - *s = sumf; - -#elif defined(__AVX2__) - - static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 - }; - - static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - }; - - const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1); - const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2); - - const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); - const __m256i idx_mask = _mm256_set1_epi32(256); - - typedef union { - __m256i vec[2]; - uint32_t index[16]; - } index_t; - - index_t idx; - - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT qs = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - __m256i sumi1 = _mm256_setzero_si256(); - __m256i sumi2 = _mm256_setzero_si256(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16; - idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]); - idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]); - idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask); - idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask); - idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l))); - idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1))); - - // At least on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
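// [editor's note] The cmpeq/xor/sub sequence a few lines below is branchless
// conditional negation: s2 is 0xff in every byte whose sign bit is set and
// 0x00 elsewhere, so (q8 ^ s2) - s2 leaves a byte unchanged when s2 == 0x00
// and produces ~q8 + 1 == -q8 when s2 == 0xff. A hedged scalar equivalent --
// illustrative only, cond_negate is not part of this file:
//
//     static inline int8_t cond_negate(int8_t v, uint8_t s /* 0x00 or 0xff */) {
//         return (int8_t)((v ^ (int8_t)s) - (int8_t)s);
//     }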
- //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4); - //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4); - const __m256i q2_1 = _mm256_set_epi32( - iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]], - iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]] - ); - const __m256i q2_2 = _mm256_set_epi32( - iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]], - iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]] - ); - - __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16)); - aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); - const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2); - const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1); - - aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16)); - aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); - const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2); - const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2); - - signs += 4; - - const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); - const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); - const uint16_t ls1 = x[i].scales[ib32/2] & 0xf; - const uint16_t ls2 = x[i].scales[ib32/2] >> 4; - const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); - const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); - sumi1 = _mm256_add_epi32(sumi1, p1); - sumi2 = _mm256_add_epi32(sumi2, p2); - } - - accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); - - } - - *s = hsum_float_8(accumf); - -#elif defined(__AVX__) - static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 - }; - - static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - }; - - const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1); - const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1); - const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2); - const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1); - - const __m128i idx_mul_0 = _mm_set_epi32(32, 64, 128, 256); - const __m128i idx_mul_1 = _mm_set_epi32(2, 4, 8, 16); - const __m128i idx_mask = _mm_set1_epi32(256); - - typedef union { - __m128i vec[4]; - uint32_t index[16]; - } index_t; - - index_t idx; - - __m256 accumf = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT qs = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - __m128i sumi1_0 = _mm_setzero_si128(); - __m128i sumi1_1 = _mm_setzero_si128(); - __m128i sumi2_0 = _mm_setzero_si128(); - __m128i sumi2_1 = _mm_setzero_si128(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_1_1 = _mm_loadu_si128((const __m128i 
*)q8); q8 += 16; - const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i qs_tmp = _mm_loadu_si128((const __m128i *)qs); - const __m128i idx_l_0 = _mm_cvtepu8_epi16(qs_tmp); - const __m128i idx_l_1 = _mm_cvtepu8_epi16(_mm_srli_si128(qs_tmp, 8)); qs += 16; - idx.vec[0] = _mm_set1_epi32(qh[ib32+0]); - idx.vec[1] = idx.vec[0]; - idx.vec[2] = _mm_set1_epi32(qh[ib32+1]); - idx.vec[3] = idx.vec[2]; - - idx.vec[0] = _mm_and_si128(_mm_mullo_epi32(idx.vec[0], idx_mul_0), idx_mask); - idx.vec[1] = _mm_and_si128(_mm_mullo_epi32(idx.vec[1], idx_mul_1), idx_mask); - idx.vec[2] = _mm_and_si128(_mm_mullo_epi32(idx.vec[2], idx_mul_0), idx_mask); - idx.vec[3] = _mm_and_si128(_mm_mullo_epi32(idx.vec[3], idx_mul_1), idx_mask); - - idx.vec[0] = _mm_or_si128(idx.vec[0], _mm_cvtepi16_epi32(idx_l_0)); - idx.vec[1] = _mm_or_si128(idx.vec[1], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_0, 8))); - idx.vec[2] = _mm_or_si128(idx.vec[2], _mm_cvtepi16_epi32(idx_l_1)); - idx.vec[3] = _mm_or_si128(idx.vec[3], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_1, 8))); - - const __m128i q2_1_0 = _mm_set_epi32(iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]); - const __m128i q2_1_1 = _mm_set_epi32(iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]]); - const __m128i q2_2_0 = _mm_set_epi32(iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[9]], iq3s_grid[idx.index[8]]); - const __m128i q2_2_1 = _mm_set_epi32(iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]]); - - __m128i aux128_0 = _mm_set1_epi32(signs[0] | (signs[1] << 16)); - __m128i aux128_1 = aux128_0; - aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); - aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); - const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); - const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); - const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0); - const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1); - - aux128_0 = _mm_set1_epi32(signs[2] | (signs[3] << 16)); - aux128_1 = aux128_0; - aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); - aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); - const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); - const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); - const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0); - const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1); - - signs += 4; - - const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); - const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); - const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); - const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); - const uint16_t ls1 = x[i].scales[ib32/2] & 0xf; - const uint16_t ls2 = x[i].scales[ib32/2] >> 4; - const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1)); - const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1)); - const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1)); - const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1)); - sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); - sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); - sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); - sumi2_1 = _mm_add_epi32(sumi2_1, 
p2_1); - } - - accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); - - } - - *s = hsum_float_8(accumf); - -#elif defined(__POWER9_VECTOR__) - static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 - }; - - static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; - - const vector int v0 = vec_splats((int32_t)0); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - - const vector unsigned char mask0 = vec_xl( 0, k_mask1); - const vector unsigned char mask1 = vec_xl(16, k_mask1); - const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2); - - for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - const uint8_t * GGML_RESTRICT q3 = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].signs); - const uint8_t * GGML_RESTRICT sc = x[i].scales; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - vector signed int vsumi2 = v0; - vector signed int vsumi3 = v0; - - for (int j = 0; j < QK_K/32; j += 2) { - __builtin_prefetch(q3, 0, 1); - __builtin_prefetch(q8, 0, 1); - - vector unsigned int aux32x4_0 = {iq3s_grid[q3[ 0] | ((qh[0] << 8) & 256)], iq3s_grid[q3[ 1] | ((qh[0] << 7) & 256)], - iq3s_grid[q3[ 2] | ((qh[0] << 6) & 256)], iq3s_grid[q3[ 3] | ((qh[0] << 5) & 256)]}; - vector unsigned int aux32x4_1 = {iq3s_grid[q3[ 4] | ((qh[0] << 4) & 256)], iq3s_grid[q3[ 5] | ((qh[0] << 3) & 256)], - iq3s_grid[q3[ 6] | ((qh[0] << 2) & 256)], iq3s_grid[q3[ 7] | ((qh[0] << 1) & 256)]}; - vector unsigned int aux32x4_2 = {iq3s_grid[q3[ 8] | ((qh[1] << 8) & 256)], iq3s_grid[q3[ 9] | ((qh[1] << 7) & 256)], - iq3s_grid[q3[10] | ((qh[1] << 6) & 256)], iq3s_grid[q3[11] | ((qh[1] << 5) & 256)]}; - vector unsigned int aux32x4_3 = {iq3s_grid[q3[12] | ((qh[1] << 4) & 256)], iq3s_grid[q3[13] | ((qh[1] << 3) & 256)], - iq3s_grid[q3[14] | ((qh[1] << 2) & 256)], iq3s_grid[q3[15] | ((qh[1] << 1) & 256)]}; - q3 += 16; - qh += 2; - - vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]); - vector signed char vsigns02 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]); - signs += 4; - - vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0); - vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1); - vector signed char vsigns2 = vec_perm(vsigns02, vsigns02, mask0); - vector signed char vsigns3 = vec_perm(vsigns02, vsigns02, mask1); - - vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2); - vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2); - vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2); - vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2); - - vector signed char q3x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux32x4_0), vsigns0); - vector signed char q3x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux32x4_1), vsigns1); - vector 
signed char q3x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux32x4_2), vsigns2); - vector signed char q3x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux32x4_3), vsigns3); - - vector signed char q8y0 = vec_xl( 0, q8); - vector signed char q8y1 = vec_xl(16, q8); - vector signed char q8y2 = vec_xl(32, q8); - vector signed char q8y3 = vec_xl(48, q8); - q8 += 64; - - vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0)); - vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1)); - vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2)); - vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3)); - - const uint16_t ls0 = (uint16_t)(sc[0] & 0xf); - const uint16_t ls1 = (uint16_t)(sc[0] >> 4); - sc ++; - - vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1)); - vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1)); - - vsumi0 = vec_msum(qv0, vscales01, vsumi0); - vsumi1 = vec_msum(qv1, vscales01, vsumi1); - vsumi2 = vec_msum(qv2, vscales23, vsumi2); - vsumi3 = vec_msum(qv3, vscales23, vsumi3); - } - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - - static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 - }; - - static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - }; - - const __m256i mask1 = __lasx_xvld((const __m256i*)k_mask1, 0); - const __m256i mask2 = __lasx_xvld((const __m256i*)k_mask2, 0); - - __m256i idx_shift = lasx_set_w(1, 2, 3, 4, 5, 6, 7, 8); - const __m256i idx_mask = __lasx_xvreplgr2vr_w(256); - - typedef union { - __m256i vec[2]; - uint32_t index[16]; - } index_t; - - index_t idx; - - __m256 accumf = (__m256)__lasx_xvldi(0); - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT qs = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - __m256i sumi1 = __lasx_xvldi(0); - __m256i sumi2 = __lasx_xvldi(0); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i idx_l = lasx_extu8_16(__lsx_vld(qs, 0)); qs += 16; - idx.vec[0] = __lasx_xvreplgr2vr_w(qh[ib32+0]); - idx.vec[1] = __lasx_xvreplgr2vr_w(qh[ib32+1]); - idx.vec[0] = __lasx_xvand_v(__lasx_xvsll_w(idx.vec[0], idx_shift), idx_mask); - idx.vec[1] = __lasx_xvand_v(__lasx_xvsll_w(idx.vec[1], idx_shift), idx_mask); - idx.vec[0] = __lasx_xvor_v(idx.vec[0], lasx_ext16_32(lasx_extracti128(idx_l, 0))); - idx.vec[1] = __lasx_xvor_v(idx.vec[1], 
lasx_ext16_32(lasx_extracti128(idx_l, 1))); - - // At least on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange. - //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4); - //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4); - const __m256i q2_1 = lasx_set_w( - iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]], - iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]] - ); - const __m256i q2_2 = lasx_set_w( - iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]], - iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]] - ); - - __m256i aux256 = __lasx_xvreplgr2vr_w(signs[0] | (signs[1] << 16)); - aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2); - const __m256i s2_1 = __lasx_xvseq_b(aux256, mask2); - const __m256i q8s_1 = __lasx_xvsub_b(__lasx_xvxor_v(s2_1, q8_1), s2_1); - - aux256 = __lasx_xvreplgr2vr_w(signs[2] | (signs[3] << 16)); - aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2); - const __m256i s2_2 = __lasx_xvseq_b(aux256, mask2); - const __m256i q8s_2 = __lasx_xvsub_b(__lasx_xvxor_v(s2_2, q8_2), s2_2); - - signs += 4; - - const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); - const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); - const uint16_t ls1 = x[i].scales[ib32/2] & 0xf; - const uint16_t ls2 = x[i].scales[ib32/2] >> 4; - const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1)); - const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1)); - sumi1 = __lasx_xvadd_w(sumi1, p1); - sumi2 = __lasx_xvadd_w(sumi2, p2); - } - - accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); - } - - *s = hsum_float_8(accumf); - -#else - - float sumf = 0.f; - for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * GGML_RESTRICT qs = x[i].qs; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const uint8_t * GGML_RESTRICT signs = x[i].signs; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - int32_t bsum = 0; - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1; - const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1; - int32_t sumi = 0; - for (int l = 0; l < 4; ++l) { - const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256))); - const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256))); - for (int j = 0; j < 4; ++j) { - sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); - sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); - } - q8 += 8; - } - qs += 8; - signs += 4; - bsum += sumi * ls1; - sumi = 0; - for (int l = 0; l < 4; ++l) { - const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256))); - const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256))); - for (int j = 0; j < 4; ++j) { - sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); - sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? 
-1 : 1); - } - q8 += 8; - } - qs += 8; - signs += 4; - bsum += sumi * ls2; - } - sumf += d * bsum; - } - *s = sumf; -#endif -} - -#if defined(__AVX2__) -static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { - const __m256i ax = _mm256_sign_epi8(x, x); - const __m256i sy = _mm256_sign_epi8(y, x); - return _mm256_maddubs_epi16(ax, sy); -} -#elif defined(__loongarch_asx) -static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { - const __m256i a = __lasx_xvmulwev_h_b(x, y); - const __m256i b = __lasx_xvmulwod_h_b(x, y); - return __lasx_xvadd_h(a, b); -} -#endif - -void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_iq1_s * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#if defined __ARM_NEON - - ggml_int8x16x4_t q1b; - ggml_int8x16x4_t q8b; - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint16_t * qh = x[i].qh; - - int sumi1 = 0, sumi2 = 0, sumi3 = 0; - - for (int ib = 0; ib < QK_K/32; ib += 2) { - - q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[ib+0] << 8) & 0x700)))), - vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[ib+0] << 5) & 0x700))))); - q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[ib+0] << 2) & 0x700)))), - vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[ib+0] >> 1) & 0x700))))); - q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[ib+1] << 8) & 0x700)))), - vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[ib+1] << 5) & 0x700))))); - q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[ib+1] << 2) & 0x700)))), - vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[ib+1] >> 1) & 0x700))))); - qs += 8; - - q8b = ggml_vld1q_s8_x4(q8); q8 += 64; - - const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[0], q8b.val[0]), q1b.val[1], q8b.val[1]); - const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[2], q8b.val[2]), q1b.val[3], q8b.val[3]); - - const int ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; - const int ls2 = 2*((qh[ib+1] >> 12) & 7) + 1; - sumi1 += vaddvq_s32(p1) * ls1; - sumi2 += vaddvq_s32(p2) * ls2; - sumi3 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * ls1 * (qh[ib+0] & 0x8000 ? -1 : 1) - + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * ls2 * (qh[ib+1] & 0x8000 ? 
-1 : 1); - - } - - sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3); - } - - *s = sumf; - -#elif defined __AVX2__ - - __m256 accum = _mm256_setzero_ps(); - float accum1 = 0; - for (int i = 0; i < nb; ++i) { - - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint16_t * qh = x[i].qh; - - __m256i sumi = _mm256_setzero_si256(); - int sumi1 = 0; - for (int ib = 0; ib < QK_K/32; ib += 2) { -#ifdef __BMI2__ - const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib], 0x700070007000700ULL); - const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib + 1], 0x700070007000700ULL); - const uint16_t *idx1 = (const uint16_t *)(&packed_idx1); - const uint16_t *idx2 = (const uint16_t *)(&packed_idx2); - const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]); - const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]); -#else - const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)], - iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]); - const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)], - iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]); -#endif - qs += 8; - const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - - const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1); - const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2); - const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; - const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1; - const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(ls1)); - const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(ls2)); - - sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p1, p2)); - sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1 - + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? 
-1 : 1) * ls2; - } - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum); - accum1 += d * sumi1; - - } - - *s = hsum_float_8(accum) + IQ1S_DELTA * accum1; - -#elif defined __AVX__ - __m256 accum = _mm256_setzero_ps(); - float accum1 = 0; - for (int i = 0; i < nb; ++i) { - - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint16_t * qh = x[i].qh; - - __m128i sumi1_0 = _mm_setzero_si128(); - __m128i sumi1_1 = _mm_setzero_si128(); - int sumi1 = 0; - for (int ib = 0; ib < QK_K/32; ib += 2) { - const __m128i q1b_1_0 = _mm_set_epi64x(iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]); - const __m128i q1b_1_1 = _mm_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)]); - const __m128i q1b_2_0 = _mm_set_epi64x(iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]); - const __m128i q1b_2_1 = _mm_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)]); - qs += 8; - const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - - const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0); - const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1); - const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0); - const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1); - const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; - const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1; - const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(ls1)); - const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(ls1)); - const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(ls2)); - const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(ls2)); - - sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0)); - sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1)); - sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1 - + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? 
-1 : 1) * ls2; - } - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum); - accum1 += d * sumi1; - - } - - *s = hsum_float_8(accum) + IQ1S_DELTA * accum1; - -#elif defined(__POWER9_VECTOR__) - const vector unsigned char v0 = vec_splats((unsigned char)0x0); - const vector unsigned short vsign = vec_splats((unsigned short)0x8000); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - - for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); - vector float vyd = vec_splats(y[i].d); - vector float vd = vec_mul(vxd, vyd); - - vector signed int vsumi0 = vec_splats((int32_t)0); - vector signed int vsumi1 = vec_splats((int32_t)0); - vector signed int vsumi2 = vec_splats((int32_t)0); - vector signed int vsumi3 = vec_splats((int32_t)0); - vector signed int vsumi8 = vec_splats((int32_t)0); - - const uint8_t * GGML_RESTRICT q1 = x[i].qs; - const uint16_t * GGML_RESTRICT qh = x[i].qh; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int16_t * GGML_RESTRICT qs = y[i].bsums; - - for (int j = 0; j < QK_K/32; j += 2) { - __builtin_prefetch(q1, 0, 1); - __builtin_prefetch(qh, 0, 1); - __builtin_prefetch(q8, 0, 1); - - vector signed long long aux64x2_0 = {*(const int64_t *)(iq1s_grid + (q1[0] | ((qh[0] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[1] | ((qh[0] << 5) & 0x700)))}; - vector signed long long aux64x2_1 = {*(const int64_t *)(iq1s_grid + (q1[2] | ((qh[0] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[3] | ((qh[0] >> 1) & 0x700)))}; - vector signed long long aux64x2_2 = {*(const int64_t *)(iq1s_grid + (q1[4] | ((qh[1] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[5] | ((qh[1] << 5) & 0x700)))}; - vector signed long long aux64x2_3 = {*(const int64_t *)(iq1s_grid + (q1[6] | ((qh[1] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[7] | ((qh[1] >> 1) & 0x700)))}; - q1 += 8; - - vector signed char q1x0 = (vector signed char)aux64x2_0; - vector signed char q1x1 = (vector signed char)aux64x2_1; - vector signed char q1x2 = (vector signed char)aux64x2_2; - vector signed char q1x3 = (vector signed char)aux64x2_3; - - vector signed char q8y0 = vec_xl( 0, q8); - vector signed char q8y1 = vec_xl(16, q8); - vector signed char q8y2 = vec_xl(32, q8); - vector signed char q8y3 = vec_xl(48, q8); - q8 += 64; - - vector signed short qv0 = vec_add(vec_mule(q1x0, q8y0), vec_mulo(q1x0, q8y0)); - vector signed short qv1 = vec_add(vec_mule(q1x1, q8y1), vec_mulo(q1x1, q8y1)); - vector signed short qv2 = vec_add(vec_mule(q1x2, q8y2), vec_mulo(q1x2, q8y2)); - vector signed short qv3 = vec_add(vec_mule(q1x3, q8y3), vec_mulo(q1x3, q8y3)); - - const uint16_t ls0 = (uint16_t)((qh[0] >> 12) & 7); - const uint16_t ls1 = (uint16_t)((qh[1] >> 12) & 7); - - vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1)); - vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1)); - vector signed short vscales = vec_sld(vscales23, vscales01, 8); - - vsumi0 = vec_msum(qv0, vscales01, vsumi0); - vsumi1 = vec_msum(qv1, vscales01, vsumi1); - vsumi2 = vec_msum(qv2, vscales23, vsumi2); - vsumi3 = vec_msum(qv3, vscales23, vsumi3); - - vector signed short q8ysums = vec_xl_len(qs, 8); - qs += 4; - q8ysums = vec_mergeh(q8ysums, (vector signed short)v0); - - vector signed short qxh = (vector signed 
short)vec_sld(vec_splats(qh[1]), vec_splats(qh[0]), 8); - qh += 2; - vector __bool short vsel = vec_cmpge(qxh, (vector signed short)v0); - - vector signed short q8ysum = vec_sel((vector signed short)vec_xor((vector unsigned short)q8ysums, vsign), q8ysums, vsel); - - vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8); - } - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - - vsumf0 = vec_madd(vec_ctf(vsumi8, 0), vec_mul(vd, vec_splats(IQ1S_DELTA)), vsumf0); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - - __m256 accum = (__m256)__lasx_xvldi(0); - float accum1 = 0; - for (int i = 0; i < nb; ++i) { - - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint16_t * qh = x[i].qh; - - __m256i sumi = __lasx_xvldi(0); - int sumi1 = 0; - for (int ib = 0; ib < QK_K/32; ib += 2) { - __m256i q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)], 0); - q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], 1); - q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)], 2); - q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], 3); - - __m256i q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)], 0); - q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], 1); - q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)], 2); - q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], 3); - - qs += 8; - const __m256i q8b_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - const __m256i q8b_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32; - - const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1); - const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2); - const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; - const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1; - - __m256i tmp1, tmp5, tmp6; - tmp1 = __lasx_xvreplgr2vr_h(ls1); - tmp5 = __lasx_xvmulwev_w_h(dot1, tmp1); - tmp6 = __lasx_xvmulwod_w_h(dot1, tmp1); - const __m256i p1 = __lasx_xvadd_w(tmp5, tmp6); - - tmp1 = __lasx_xvreplgr2vr_h(ls2); - tmp5 = __lasx_xvmulwev_w_h(dot2, tmp1); - tmp6 = __lasx_xvmulwod_w_h(dot2, tmp1); - const __m256i p2 = __lasx_xvadd_w(tmp5, tmp6); - - sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p1, p2)); - sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1 - + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2; - } - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), accum); - accum1 += d * sumi1; - } - - *s = hsum_float_8(accum) + IQ1S_DELTA * accum1; - -#else - - float sumf = 0; - for (int i = 0; i < nb; i++) { - - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint16_t * qh = x[i].qh; - - int sumi = 0, sumi1 = 0; - for (int ib = 0; ib < QK_K/32; ++ib) { - const int ls = 2*((qh[ib] >> 12) & 7) + 1; - const int delta = qh[ib] & 0x8000 ? 
-1 : 1; - int lsum = 0; - for (int l = 0; l < 4; ++l) { - const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8))); - for (int j = 0; j < 8; ++j) { - lsum += q8[j] * grid[j]; - } - q8 += 8; - } - sumi += ls * lsum; - sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]); - qs += 4; - } - - sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); - } - - *s = sumf; - -#endif -} - -void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_iq1_m * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - - iq1m_scale_t scale; - -#if defined __ARM_NEON - const int32x4_t mask = vdupq_n_s32(0x7); - const int32x4_t mone = vdupq_n_s32(1); - const int32x4_t mzero = vdupq_n_s32(0); - - ggml_int8x16x4_t deltas; - deltas.val[0] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(+1)); - deltas.val[1] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(+1)); - deltas.val[2] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(-1)); - deltas.val[3] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(-1)); - - ggml_int8x16x4_t q1b; - ggml_int8x16x4_t q8b; - - uint32_t aux32; - const uint8_t * aux8 = (const uint8_t *)&aux32; - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint8_t * qh = x[i].qh; - const uint16_t * sc = (const uint16_t *)x[i].scales; - - scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); - - int32x4_t sumi1 = mzero; - int32x4_t sumi2 = mzero; - - for (int ib = 0; ib < QK_K/32; ib += 2) { - - q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[0] << 8) & 0x700)))), - vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[0] << 4) & 0x700))))); - q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[1] << 8) & 0x700)))), - vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[1] << 4) & 0x700))))); - q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[2] << 8) & 0x700)))), - vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[2] << 4) & 0x700))))); - q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[3] << 8) & 0x700)))), - vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[3] << 4) & 0x700))))); - - q8b = ggml_vld1q_s8_x4(q8); q8 += 64; - - const int32x4_t p1 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(mzero, q1b.val[1], q8b.val[1])); - const int32x4_t p2 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(mzero, q1b.val[3], q8b.val[3])); - const int32x4_t p12 = vpaddq_s32(p1, p2); - - const uint32_t * qh32 = (const uint32_t *)qh; // we are 4-byte aligned, so we can do that - aux32 = ((qh32[0] >> 3) & 0x01010101) | ((qh32[0] >> 6) & 0x02020202); - - const int32x4_t p3 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[0]], q8b.val[0]), ggml_vdotq_s32(mzero, deltas.val[aux8[1]], q8b.val[1])); - const int32x4_t p4 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[2]], q8b.val[2]), ggml_vdotq_s32(mzero, deltas.val[aux8[3]], q8b.val[3])); - const int32x4_t p34 = vpaddq_s32(p3, p4); - - int32x4_t scales_4 = ggml_vld1q_u32(sc[ib/2] >> 0, sc[ib/2] >> 3, sc[ib/2] >> 6, sc[ib/2] >> 9); - - scales_4 = vaddq_s32(vshlq_n_s32(vandq_s32(scales_4, mask), 1), mone); - - sumi1 
= vmlaq_s32(sumi1, scales_4, p12); - sumi2 = vmlaq_s32(sumi2, scales_4, p34); - - qs += 8; qh += 4; - - } - - sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2)); - } - - *s = sumf; - -#elif defined __AVX2__ - - const __m256i mask = _mm256_set1_epi16(0x7); - const __m256i mone = _mm256_set1_epi16(1); - const __m256i mone8 = _mm256_set1_epi8(1); - const __m256i mtwo8 = _mm256_set1_epi8(2); - // VPSHUFB cannot cross 128-bit lanes so odd shifts go to upper half. - const __m256i scales_shift = _mm256_set_epi64x(9, 3, 6, 0); - - __m256 accum1 = _mm256_setzero_ps(); - __m256 accum2 = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint8_t * qh = x[i].qh; - const uint16_t * sc = (const uint16_t *)x[i].scales; - - scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); - // Extract 3-bit scales (16 values) - __m256i scales = _mm256_set1_epi64x(*(const uint64_t*)sc); - scales = _mm256_srlv_epi64(scales, scales_shift); - scales = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scales, mask), 1), mone); - - // Indices to repeat each scale 8 times. - __m256i scales_idx1 = _mm256_set1_epi16(0x0100); - __m256i scales_idx2 = _mm256_add_epi8(scales_idx1, _mm256_set1_epi8(8)); - - __m256i sumi1 = _mm256_setzero_si256(); - __m256i sumi2 = _mm256_setzero_si256(); - for (int ib = 0; ib < QK_K/32; ib += 2) { -#ifdef __BMI2__ - const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) - | _pdep_u64(*(const uint16_t*)(qh) & 0x7777, 0xf000f000f000f00ULL); - const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) - | _pdep_u64(*(const uint16_t*)(qh + 2) & 0x7777, 0xf000f000f000f00ULL); - const uint16_t *idx1 = (const uint16_t *)(&packed_idx1); - const uint16_t *idx2 = (const uint16_t *)(&packed_idx2); - const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]); - const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]); - - // Convert signs to bytes 0x81 (negative) or 0x01 (positive) - const uint64_t delta_sign = _pdep_u64(*(const uint32_t*)(qh) & 0x88888888, 0xf0f0f0f0f0f0f0f0ULL); - const __m256i delta1 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign))); - const __m256i delta2 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign >> 32))); -#else - const __m256i q1b_1 = _mm256_set_epi64x( - iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)], - iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)] - ); - const __m256i q1b_2 = _mm256_set_epi64x( - iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)], - iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)] - ); - - const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, - qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101, - qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, - qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); - const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, - qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101, - qh[2] & 0x80 ? 
0xffffffffffffffff : 0x0101010101010101, - qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); -#endif - const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - - const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1); - const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2); - const __m256i dot3 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_1, delta1)); - const __m256i dot4 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_2, delta2)); - - __m256i scale1 = _mm256_shuffle_epi8(scales, scales_idx1); - __m256i scale2 = _mm256_shuffle_epi8(scales, scales_idx2); - - scales_idx1 = _mm256_add_epi8(scales_idx1, mtwo8); - scales_idx2 = _mm256_add_epi8(scales_idx2, mtwo8); - - const __m256i p1 = _mm256_madd_epi16(dot1, scale1); - const __m256i p2 = _mm256_madd_epi16(dot2, scale2); - const __m256i p3 = _mm256_madd_epi16(dot3, scale1); - const __m256i p4 = _mm256_madd_epi16(dot4, scale2); - - sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2)); - sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4)); - - qs += 8; qh += 4; - } - - const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16)); - - accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1); - accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2); - } - - *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2); - -#elif defined __AVX__ - const __m128i mask = _mm_set1_epi16(0x7); - const __m128i mone = _mm_set1_epi16(1); - - __m256 accum1 = _mm256_setzero_ps(); - __m256 accum2 = _mm256_setzero_ps(); - for (int i = 0; i < nb; ++i) { - - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint8_t * qh = x[i].qh; - const uint16_t * sc = (const uint16_t *)x[i].scales; - - scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); - - __m128i sumi1_0 = _mm_setzero_si128(); - __m128i sumi1_1 = _mm_setzero_si128(); - __m128i sumi2_0 = _mm_setzero_si128(); - __m128i sumi2_1 = _mm_setzero_si128(); - for (int ib = 0; ib < QK_K/32; ib += 2) { - const __m128i q1b_1_0 = _mm_set_epi64x( - iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]); - const __m128i q1b_1_1 = _mm_set_epi64x( - iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)]); - const __m128i q1b_2_0 = _mm_set_epi64x( - iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]); - const __m128i q1b_2_1 = _mm_set_epi64x( - iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)]); - const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - - const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0); - const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1); - const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0); - const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1); - - const __m128i delta1_0 = _mm_set_epi64x(qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, - qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); - const __m128i delta1_1 = _mm_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, - qh[1] & 0x08 ? 
0xffffffffffffffff : 0x0101010101010101); - const __m128i delta2_0 = _mm_set_epi64x(qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, - qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); - const __m128i delta2_1 = _mm_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, - qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); - - const __m128i dot3_0 = mul_add_epi8_sse(delta1_0, q8b_1_0); - const __m128i dot3_1 = mul_add_epi8_sse(delta1_1, q8b_1_1); - const __m128i dot4_0 = mul_add_epi8_sse(delta2_0, q8b_2_0); - const __m128i dot4_1 = mul_add_epi8_sse(delta2_1, q8b_2_1); - - __m128i scale1_0 = _mm_set1_epi16(sc[ib/2] >> 0); - __m128i scale1_1 = _mm_set1_epi16(sc[ib/2] >> 3); - __m128i scale2_0 = _mm_set1_epi16(sc[ib/2] >> 6); - __m128i scale2_1 = _mm_set1_epi16(sc[ib/2] >> 9); - - scale1_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_0, mask), 1), mone); - scale1_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_1, mask), 1), mone); - scale2_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_0, mask), 1), mone); - scale2_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_1, mask), 1), mone); - const __m128i p1_0 = _mm_madd_epi16(dot1_0, scale1_0); - const __m128i p1_1 = _mm_madd_epi16(dot1_1, scale1_1); - const __m128i p2_0 = _mm_madd_epi16(dot2_0, scale2_0); - const __m128i p2_1 = _mm_madd_epi16(dot2_1, scale2_1); - const __m128i p3_0 = _mm_madd_epi16(dot3_0, scale1_0); - const __m128i p3_1 = _mm_madd_epi16(dot3_1, scale1_1); - const __m128i p4_0 = _mm_madd_epi16(dot4_0, scale2_0); - const __m128i p4_1 = _mm_madd_epi16(dot4_1, scale2_1); - - sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0)); - sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1)); - sumi2_0 = _mm_add_epi32(sumi2_0, _mm_add_epi32(p3_0, p4_0)); - sumi2_1 = _mm_add_epi32(sumi2_1, _mm_add_epi32(p3_1, p4_1)); - - qs += 8; qh += 4; - } - - const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16)); - - accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1); - accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2); - } - - *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2); - -#else - - int sum1[2], sum2[2], delta[4]; - - float sumf = 0; - for (int i = 0; i < nb; i++) { - - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint8_t * qh = x[i].qh; - const uint16_t * sc = (const uint16_t *)x[i].scales; - - scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); - - int sumi1 = 0, sumi2 = 0; - for (int ib = 0; ib < QK_K/32; ++ib) { - delta[0] = qh[0] & 0x08 ? -1 : 1; - delta[1] = qh[0] & 0x80 ? -1 : 1; - delta[2] = qh[1] & 0x08 ? -1 : 1; - delta[3] = qh[1] & 0x80 ? 
-1 : 1; - sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0; - for (int l = 0; l < 4; ++l) { - const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700))); - int lsum1 = 0, lsum2 = 0; - for (int j = 0; j < 8; ++j) { - lsum1 += q8[j] * grid[j]; - lsum2 += q8[j]; - } - q8 += 8; - sum1[l/2] += lsum1; - sum2[l/2] += lsum2*delta[l]; - } - - const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1; - const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1; - - sumi1 += sum1[0] * ls1 + sum1[1] * ls2; - sumi2 += sum2[0] * ls1 + sum2[1] * ls2; - qs += 4; - qh += 2; - } - - sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); - } - - *s = sumf; - -#endif -} - -void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - assert(n % QK4_NL == 0); - static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); - - const block_iq4_nl * GGML_RESTRICT x = vx; - const block_q8_0 * GGML_RESTRICT y = vy; - - const int nb = n / QK4_NL; - - int ib = 0; - float sumf = 0; - -#if defined __ARM_NEON - const int8x16_t values = vld1q_s8(kvalues_iq4nl); - const uint8x16_t m4b = vdupq_n_u8(0x0f); - uint8x16x2_t q4bits; - int8x16x4_t q4b; - int8x16x4_t q8b; - int32x4_t prod_1, prod_2; - - for (; ib + 1 < nb; ib += 2) { - - q4bits.val[0] = vld1q_u8(x[ib + 0].qs); - q4bits.val[1] = vld1q_u8(x[ib + 1].qs); - q8b.val[0] = vld1q_s8(y[ib + 0].qs); - q8b.val[1] = vld1q_s8(y[ib + 0].qs + 16); - q8b.val[2] = vld1q_s8(y[ib + 1].qs); - q8b.val[3] = vld1q_s8(y[ib + 1].qs + 16); - - q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b)); - q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4)); - q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b)); - q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4)); - - prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]); - prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]); - - sumf += - GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) + - GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2); - } - -#elif defined __AVX2__ - - const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); - const __m128i m4b = _mm_set1_epi8(0x0f); - const __m256i mone = _mm256_set1_epi16(1); - - __m256 accum1 = _mm256_setzero_ps(); - __m256 accum2 = _mm256_setzero_ps(); - for (; ib + 1 < nb; ib += 2) { - const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs); - const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs); - const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs); - const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs); - const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)), - _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b))); - const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)), - _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b))); - const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); - const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); - const __m256i p_1 = _mm256_madd_epi16(p16_1, mone); - const __m256i p_2 = 
_mm256_madd_epi16(p16_2, mone); - accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)), - _mm256_cvtepi32_ps(p_1), accum1); - accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)), - _mm256_cvtepi32_ps(p_2), accum2); - } - - sumf = hsum_float_8(_mm256_add_ps(accum1, accum2)); - -#elif defined __AVX__ - const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); - const __m128i m4b = _mm_set1_epi8(0x0f); - - __m256 accum = _mm256_setzero_ps(); - for (; ib + 1 < nb; ib += 2) { - const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs); - const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); - const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs); - const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1); - const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); - const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1); - - const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)); - const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)); - const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)); - const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)); - - const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1); - const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d); - accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum); - } - - sumf = hsum_float_8(accum); - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0xF); - const vector signed int v0 = vec_splats((int32_t)0); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - - const vector signed char values = vec_xl( 0, kvalues_iq4nl); - -#pragma GCC unroll 4 - for (; ib < nb; ++ib) { - __builtin_prefetch(x[ib].qs, 0, 1); - __builtin_prefetch(y[ib].qs, 0, 1); - - - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); - vector float vd = vec_mul(vxd, vyd); - - vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); - vector signed char q4x0 = vec_and(qxs, lowMask); - vector signed char q4x1 = vec_sr(qxs, v4); - - q4x0 = vec_perm(values, values, (vector unsigned char)q4x0); - q4x1 = vec_perm(values, values, (vector unsigned char)q4x1); - - vector signed char q8y0 = vec_xl( 0, y[ib].qs); - vector signed char q8y1 = vec_xl(16, y[ib].qs); - - vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0)); - vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1)); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - - vsumi0 = vec_sum4s(qv0, vsumi0); - vsumi1 = vec_sum4s(qv1, vsumi1); - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - } - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - sumf = vec_extract(vsumf0, 0); - -#elif defined (__loongarch_asx) - - const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0); - const __m128i m4b = __lsx_vreplgr2vr_b(0x0f); - 
const __m256i mone = __lasx_xvreplgr2vr_h(1); - - __m256 accum1 = (__m256)__lasx_xvldi(0); - __m256 accum2 = (__m256)__lasx_xvldi(0); - for (; ib + 1 < nb; ib += 2) { - const __m128i q4bits_1 = __lsx_vld((const __m128i*)x[ib + 0].qs, 0); - const __m128i q4bits_2 = __lsx_vld((const __m128i*)x[ib + 1].qs, 0); - const __m256i q8b_1 = __lasx_xvld((const __m256i *)y[ib + 0].qs, 0); - const __m256i q8b_2 = __lasx_xvld((const __m256i *)y[ib + 1].qs, 0); - const __m256i q4b_1 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_1, 4), m4b)), - lsx_shuffle_b(values128, __lsx_vand_v(q4bits_1, m4b))); - const __m256i q4b_2 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_2, 4), m4b)), - lsx_shuffle_b(values128, __lsx_vand_v(q4bits_2, m4b))); - const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); - const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); - const __m256i p_1 = lasx_madd_h(p16_1, mone); - const __m256i p_2 = lasx_madd_h(p16_2, mone); - accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)), - __lasx_xvffint_s_w(p_1), accum1); - accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)), - __lasx_xvffint_s_w(p_2), accum2); - } - - sumf = hsum_float_8(__lasx_xvfadd_s(accum1, accum2)); - -#elif defined(__VXE__) || defined(__VXE2__) - const int8x16_t v_k = vec_xl(0, kvalues_iq4nl); - const uint8x16_t v_m = vec_splat_u8(0x0F); - - for (; ib < nb; ++ib) { - const block_iq4_nl * GGML_RESTRICT x0 = &x[ib]; - const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; - - const uint8x16_t v_x = vec_xl(0, x0->qs); - int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m); - int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4); - - v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl); - v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh); - - const int8x16_t v_yl = vec_xl(0 , y0->qs); - const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs); - const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); - - sumf += GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]); - } -#endif - for (; ib < nb; ++ib) { - const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d); - int sumi1 = 0, sumi2 = 0; - for (int j = 0; j < QK4_NL/2; ++j) { - sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; - sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4]; - } - sumf += d * (sumi1 + sumi2); - } - *s = sumf; -} - -void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - assert(n % QK_K == 0); - - const block_iq4_xs * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - -#if defined __ARM_NEON - const int8x16_t values = vld1q_s8(kvalues_iq4nl); - const uint8x16_t m4b = vdupq_n_u8(0x0f); - ggml_uint8x16x2_t q4bits; - ggml_int8x16x4_t q4b; - ggml_int8x16x4_t q8b; - int32x4_t prod_1, prod_2; - - float sumf = 0; - - for (int ibl = 0; ibl < nb; ++ibl) { - - const int8_t * q8 = y[ibl].qs; - const uint8_t * q4 = x[ibl].qs; - uint16_t h = x[ibl].scales_h; - - int sumi1 = 0, sumi2 = 0; - for (int ib = 0; ib < QK_K/64; ++ib) { - - q4bits = ggml_vld1q_u8_x2(q4); q4 += 32; - q8b = ggml_vld1q_s8_x4(q8); q8 += 64; - - q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b)); - q4b.val[1] = 
ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4)); - q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b)); - q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4)); - - prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]); - prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]); - - int ls1 = ((x[ibl].scales_l[ib] & 0xf) | ((h << 4) & 0x30)) - 32; - int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32; - h >>= 4; - sumi1 += vaddvq_s32(prod_1) * ls1; - sumi2 += vaddvq_s32(prod_2) * ls2; - - } - - sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2); - } - - *s = sumf; - -#elif defined __AVX2__ - - const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); - const __m128i m4b = _mm_set1_epi8(0x0f); - - __m256 accum = _mm256_setzero_ps(); - for (int ibl = 0; ibl < nb; ++ibl) { - const uint8_t * qs = x[ibl].qs; - const int8_t * q8 = y[ibl].qs; - uint16_t sh = x[ibl].scales_h; - __m256i sumi1 = _mm256_setzero_si256(); - __m256i sumi2 = _mm256_setzero_si256(); - for (int ib = 0; ib < QK_K/32; ib += 2) { - const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)qs); qs += 16; - const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16; - const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)), - _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b))); - const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)), - _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b))); - const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); - const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); - const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32; - const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32; - sh >>= 4; - const __m256i p_1 = _mm256_madd_epi16(p16_1, _mm256_set1_epi16(ls1)); - const __m256i p_2 = _mm256_madd_epi16(p16_2, _mm256_set1_epi16(ls2)); - sumi1 = _mm256_add_epi32(p_1, sumi1); - sumi2 = _mm256_add_epi32(p_2, sumi2); - } - accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d), - _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum); - } - - *s = hsum_float_8(accum); - -#elif defined __AVX__ - const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); - const __m128i m4b = _mm_set1_epi8(0x0f); - - __m256 accum = _mm256_setzero_ps(); - for (int ibl = 0; ibl < nb; ++ibl) { - const uint8_t * qs = x[ibl].qs; - const int8_t * q8 = y[ibl].qs; - uint16_t sh = x[ibl].scales_h; - __m128i sumi1_0 = _mm_setzero_si128(); - __m128i sumi1_1 = _mm_setzero_si128(); - __m128i sumi2_0 = _mm_setzero_si128(); - __m128i sumi2_1 = _mm_setzero_si128(); - for (int ib = 0; ib < QK_K/32; ib += 2) { - const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)qs); qs += 16; - const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)qs); qs += 16; - const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; - const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, 
_mm_and_si128(q4bits_1, m4b)); - const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)); - const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)); - const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)); - const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0); - const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1); - const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0); - const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1); - const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32; - const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32; - sh >>= 4; - const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, _mm_set1_epi16(ls1)); - const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, _mm_set1_epi16(ls1)); - const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, _mm_set1_epi16(ls2)); - const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, _mm_set1_epi16(ls2)); - sumi1_0 = _mm_add_epi32(p_1_0, sumi1_0); - sumi1_1 = _mm_add_epi32(p_1_1, sumi1_1); - sumi2_0 = _mm_add_epi32(p_2_0, sumi2_0); - sumi2_1 = _mm_add_epi32(p_2_1, sumi2_1); - } - __m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0); - __m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1); - accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d), - _mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum); - } - - *s = hsum_float_8(accum); - -#elif defined(__POWER9_VECTOR__) - const vector signed char lowMask = vec_splats((signed char)0xF); - const vector int v0 = vec_splats((int32_t)0); - const vector unsigned char v4 = vec_splats((unsigned char)0x4); - - vector float vsumf0 = vec_splats(0.0f); - vector float vsumf1 = vec_splats(0.0f); - vector float vsumf2 = vec_splats(0.0f); - vector float vsumf3 = vec_splats(0.0f); - - const vector signed char values = vec_xl( 0, kvalues_iq4nl); - - for (int ibl = 0; ibl < nb; ++ibl) { - - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ibl].d)); - vector float vyd = vec_splats(y[ibl].d); - vector float vd = vec_mul(vxd, vyd); - - vector signed int vsumi0 = v0; - vector signed int vsumi1 = v0; - vector signed int vsumi2 = v0; - vector signed int vsumi3 = v0; - - uint16_t h = x[ibl].scales_h; - - const uint8_t * GGML_RESTRICT q4 = x[ibl].qs; - const uint8_t * GGML_RESTRICT sc = x[ibl].scales_l; - const int8_t * GGML_RESTRICT q8 = y[ibl].qs; - - for (int ib = 0; ib < QK_K/64; ib ++ ) { - __builtin_prefetch(q4, 0, 1); - __builtin_prefetch(q8, 0, 1); - - vector signed char qxs0 = (vector signed char)vec_xl( 0, q4); - vector signed char qxs1 = (vector signed char)vec_xl(16, q4); - q4 += 32; - - vector signed char q4x00 = (vector signed char)vec_and(qxs0, lowMask); - vector signed char q4x01 = (vector signed char)vec_sr(qxs0, v4); - vector signed char q4x10 = (vector signed char)vec_and(qxs1, lowMask); - vector signed char q4x11 = (vector signed char)vec_sr(qxs1, v4); - - q4x00 = vec_perm(values, values, (vector unsigned char)q4x00); - q4x01 = vec_perm(values, values, (vector unsigned char)q4x01); - q4x10 = vec_perm(values, values, (vector unsigned char)q4x10); - q4x11 = vec_perm(values, values, (vector unsigned char)q4x11); - - vector signed char q8y0 = vec_xl( 0, q8); - vector signed char q8y1 = vec_xl(16, q8); - vector signed char q8y2 = vec_xl(32, q8); - vector signed char q8y3 = vec_xl(48, q8); - q8 += 64; - - vector signed short qv0 = vec_add(vec_mule(q4x00, q8y0), vec_mulo(q4x00, q8y0)); - vector signed short 
qv1 = vec_add(vec_mule(q4x01, q8y1), vec_mulo(q4x01, q8y1)); - vector signed short qv2 = vec_add(vec_mule(q4x10, q8y2), vec_mulo(q4x10, q8y2)); - vector signed short qv3 = vec_add(vec_mule(q4x11, q8y3), vec_mulo(q4x11, q8y3)); - - const uint16_t ls0 = (uint16_t)(((sc[0] & 0xf) | ((h << 4) & 0x30)) - 32); - const uint16_t ls1 = (uint16_t)(((sc[0] >> 4) | ((h << 2) & 0x30)) - 32); - h >>= 4; - sc ++; - - vector signed short vscales01 = vec_splats((int16_t)ls0); - vector signed short vscales23 = vec_splats((int16_t)ls1); - - vsumi0 = vec_msum(qv0, vscales01, vsumi0); - vsumi1 = vec_msum(qv1, vscales01, vsumi1); - vsumi2 = vec_msum(qv2, vscales23, vsumi2); - vsumi3 = vec_msum(qv3, vscales23, vsumi3); - } - - vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); - vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1); - vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2); - vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3); - } - - vsumf0 = vec_add(vsumf0, vsumf2); - vsumf1 = vec_add(vsumf1, vsumf3); - - vsumf0 = vec_add(vsumf0, vsumf1); - - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); - vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); - - *s = vec_extract(vsumf0, 0); - -#elif defined(__loongarch_asx) - - const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0); - - __m256 accum = (__m256)__lasx_xvldi(0); - - for (int ibl = 0; ibl < nb; ++ibl) { - const uint8_t * qs = x[ibl].qs; - const int8_t * q8 = y[ibl].qs; - uint16_t sh = x[ibl].scales_h; - __m256i sumi1 = __lasx_xvldi(0); - __m256i sumi2 = __lasx_xvldi(0); - for (int ib = 0; ib < QK_K/32; ib += 2) { - const __m128i q4bits_1 = __lsx_vld((const __m128i*)qs, 0); qs += 16; - const __m128i q4bits_2 = __lsx_vld((const __m128i*)qs, 0); qs += 16; - const __m256i q8b_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i q8b_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32; - const __m256i q4b_1 = lasx_insertf128(__lsx_vshuf_b(values128, values128, __lsx_vsrli_b(q4bits_1, 4)), - __lsx_vshuf_b(values128, values128, __lsx_vandi_b(q4bits_1, 0xf))); - const __m256i q4b_2 = lasx_insertf128(__lsx_vshuf_b(values128, values128, __lsx_vsrli_b(q4bits_2, 4)), - __lsx_vshuf_b(values128, values128, __lsx_vandi_b(q4bits_2, 0xf))); - const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); - const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); - const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32; - const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32; - sh >>= 4; - const __m256i p_1 = lasx_madd_h(p16_1, __lasx_xvreplgr2vr_h(ls1)); - const __m256i p_2 = lasx_madd_h(p16_2, __lasx_xvreplgr2vr_h(ls2)); - sumi1 = __lasx_xvadd_w(p_1, sumi1); - sumi2 = __lasx_xvadd_w(p_2, sumi2); - } - accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d), - __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accum); - } - - *s = hsum_float_8(accum); -#elif defined(__VXE__) || defined(__VXE2__) - const int8x16_t v_k = vec_xl(0, kvalues_iq4nl); - const uint8x16_t v_m = vec_splat_u8(0x0F); - - float sumf = 0; - - for (int ibl = 0; ibl < nb; ++ibl) { - const uint8_t * GGML_RESTRICT q4 = x[ibl].qs; - const int8_t * GGML_RESTRICT q8 = y[ibl].qs; - - uint16_t h = x[ibl].scales_h; - - int sumi1 = 0, sumi2 = 0; - for (int ib = 0; ib < QK_K/64; ++ib) { - const uint8x16_t v_x0 = vec_xl(0 , q4); - const uint8x16_t v_x1 = vec_xl(QK4_NL/2, q4); - q4 += 32; - - int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m); - int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4); - int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, 
v_m); - int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4); - - v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l); - v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h); - v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l); - v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h); - - const int8x16_t v_y0 = vec_xl( 0, q8); - const int8x16_t v_y1 = vec_xl(16, q8); - const int8x16_t v_y2 = vec_xl(32, q8); - const int8x16_t v_y3 = vec_xl(48, q8); - q8 += 64; - - int32x4_t vsumi0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0), v_x0h, v_y1); - int32x4_t vsumi1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y2), v_x1h, v_y3); - - int ls1 = ((x[ibl].scales_l[ib] & 0xF) | ((h << 4) & 0x30)) - 32; - int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32; - - h >>= 4; - - sumi1 += (vsumi0[0] + vsumi0[1] + vsumi0[2] + vsumi0[3]) * ls1; - sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2; - } - - sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2); - } - - *s = sumf; - -#else - float sumf = 0; - for (int ibl = 0; ibl < nb; ++ibl) { - const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d; - uint16_t h = x[ibl].scales_h; - const uint8_t * qs = x[ibl].qs; - const int8_t * q8 = y[ibl].qs; - for (int ib = 0; ib < QK_K/32; ib += 2) { - const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30); - const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30); - h >>= 4; - const float d1 = d4d8*(ls1 - 32); - const float d2 = d4d8*(ls2 - 32); - int sumi1 = 0, sumi2 = 0; - for (int j = 0; j < 16; ++j) { - sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf]; - sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4]; - } - sumf += d1 * (sumi1 + sumi2); - qs += 16; - q8 += 32; - sumi1 = sumi2 = 0; - for (int j = 0; j < 16; ++j) { - sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf]; - sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4]; - } - sumf += d2 * (sumi1 + sumi2); - qs += 16; - q8 += 32; - } - } - *s = sumf; -#endif -} - -// ============================ 4-bit non-linear quants - -void quantize_row_iq4_nl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { - assert(k % QK4_NL == 0); - quantize_row_iq4_nl_ref(x, y, k); -} - -void quantize_row_iq4_xs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { - assert(k % QK_K == 0); - quantize_iq4_xs(x, y, 1, k, NULL); -} diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index c7426df2b851b..7cae96f4b4885 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3,11 +3,11 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" -#include "ggml-cpu-traits.h" +#include "traits.h" #include "ggml-cpu-impl.h" #include "ggml-cpu.h" #include "ggml-impl.h" -#include "ggml-cpu-quants.h" +#include "quants.h" #include "ggml-threading.h" #include "unary-ops.h" #include "binary-ops.h" @@ -72,15 +72,13 @@ #define UNUSED GGML_UNUSED #define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0) +// precomputed f32 table for f16 (256 KB) (simd-mappings.h) +float ggml_table_f32_f16[1 << 16]; + #if defined(__ARM_ARCH) struct ggml_arm_arch_features_type { - int has_neon; - int has_dotprod; - int has_i8mm; - int has_sve; int sve_cnt; - int has_sme; -} ggml_arm_arch_features = {-1, -1, -1, -1, 0, -1}; +} ggml_arm_arch_features = { 0 }; #endif @@ -559,6 +557,14 @@ void ggml_barrier(struct ggml_threadpool * tp) { #endif } +void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value) { + atomic_store_explicit(&tp->current_chunk, value, memory_order_relaxed); +} + +int 
ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) { + return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed); +} + #if defined(__gnu_linux__) static cpu_set_t ggml_get_numa_affinity(void) { cpu_set_t cpuset; @@ -670,87 +676,15 @@ bool ggml_is_numa(void) { #if defined(__linux__) && defined(__aarch64__) #include -#elif defined(__APPLE__) -#include -#endif - -#if !defined(HWCAP2_I8MM) -#define HWCAP2_I8MM (1 << 13) -#endif - -#if !defined(HWCAP2_SME) -#define HWCAP2_SME (1 << 23) #endif static void ggml_init_arm_arch_features(void) { -#if defined(__linux__) && defined(__aarch64__) - uint32_t hwcap = getauxval(AT_HWCAP); - uint32_t hwcap2 = getauxval(AT_HWCAP2); - - ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD); - ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP); - ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM); - ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE); - ggml_arm_arch_features.has_sme = !!(hwcap2 & HWCAP2_SME); - -#if defined(__ARM_FEATURE_SVE) +#if defined(__linux__) && defined(__aarch64__) && defined(__ARM_FEATURE_SVE) ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL); #endif -#elif defined(__APPLE__) - int oldp = 0; - size_t size = sizeof(oldp); - if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) { - oldp = 0; - } - ggml_arm_arch_features.has_neon = oldp; - - if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) { - oldp = 0; - } - ggml_arm_arch_features.has_dotprod = oldp; - - if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) { - oldp = 0; - } - ggml_arm_arch_features.has_i8mm = oldp; - - if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) != 0) { - oldp = 0; - } - ggml_arm_arch_features.has_sme = oldp; - - ggml_arm_arch_features.has_sve = 0; - ggml_arm_arch_features.sve_cnt = 0; -#else -// Run-time CPU feature detection not implemented for this platform, fallback to compile time -#if defined(__ARM_NEON) - ggml_arm_arch_features.has_neon = 1; -#else - ggml_arm_arch_features.has_neon = 0; -#endif - -#if defined(__ARM_FEATURE_MATMUL_INT8) - ggml_arm_arch_features.has_i8mm = 1; -#else - ggml_arm_arch_features.has_i8mm = 0; -#endif - -#if defined(__ARM_FEATURE_SVE) - ggml_arm_arch_features.has_sve = 1; - ggml_arm_arch_features.sve_cnt = 16; -#else - ggml_arm_arch_features.has_sve = 0; - ggml_arm_arch_features.sve_cnt = 0; -#endif - -#if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_SME2) - ggml_arm_arch_features.has_sme = 1; -#else - ggml_arm_arch_features.has_sme = 0; -#endif -#endif } -#endif + +#endif // __ARM_ARCH struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) { GGML_ASSERT(!ggml_get_no_alloc(ctx)); @@ -805,7 +739,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { { assert(tensor->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value)); } } break; case GGML_TYPE_BF16: @@ -864,7 +798,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { { assert(tensor->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value)); } } break; case GGML_TYPE_BF16: @@ -915,7 +849,7 @@ 
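// ---------------------------------------------------------------------------
// A freestanding sketch (hypothetical names, not ggml's code) of the idea
// behind the ggml_table_f32_f16 declaration and the GGML_CPU_FP16_TO_FP32 /
// GGML_CPU_FP32_TO_FP16 conversions in the surrounding hunks: fp16 has only
// 2^16 bit patterns, so filling a 256 KB float table once turns every later
// f16 -> f32 conversion into a single indexed load.
#include <stdint.h>

static float example_table_f32_f16[1 << 16];    // 65536 floats = 256 KB

// portable bitwise IEEE 754 binary16 -> binary32 conversion
static float example_fp16_to_fp32(uint16_t h) {
    const uint32_t sign = (uint32_t)(h & 0x8000) << 16;
    const uint32_t exp  = (h >> 10) & 0x1f;
    const uint32_t mant = h & 0x3ff;
    union { uint32_t u; float f; } out;
    if (exp == 0) {                        // zero or subnormal
        out.f = (float) mant * (1.0f / 16777216.0f);   // mant * 2^-24
        out.u |= sign;
    } else if (exp == 31) {                // infinity or NaN
        out.u = sign | 0x7f800000u | (mant << 13);
    } else {                               // normal: rebias exponent 15 -> 127
        out.u = sign | ((exp + 112) << 23) | (mant << 13);
    }
    return out.f;
}

static void example_init_table(void) {     // one-time init, e.g. at startup
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        example_table_f32_f16[i] = example_fp16_to_fp32((uint16_t) i);
    }
}

static inline float example_lookup(uint16_t h) {
    return example_table_f32_f16[h];       // conversion is now just a load
}
// ---------------------------------------------------------------------------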
int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { case GGML_TYPE_F16: { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); - return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); + return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); } case GGML_TYPE_BF16: { @@ -960,7 +894,7 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { case GGML_TYPE_F16: { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); - ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); + ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value); } break; case GGML_TYPE_BF16: { @@ -989,7 +923,7 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i case GGML_TYPE_I32: return ((int32_t *) data)[0]; case GGML_TYPE_F16: - return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); + return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); case GGML_TYPE_BF16: return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]); case GGML_TYPE_F32: @@ -1016,7 +950,7 @@ void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, } break; case GGML_TYPE_F16: { - ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value); + ((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value); } break; case GGML_TYPE_BF16: { @@ -1054,7 +988,7 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { } case GGML_TYPE_F16: { - return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); + return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); } case GGML_TYPE_BF16: { @@ -1093,7 +1027,7 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { } break; case GGML_TYPE_F16: { - ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); + ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value); } break; case GGML_TYPE_BF16: { @@ -1120,7 +1054,7 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, case GGML_TYPE_I32: return ((int32_t *) data)[0]; case GGML_TYPE_F16: - return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); + return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); case GGML_TYPE_BF16: return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]); case GGML_TYPE_F32: @@ -1147,7 +1081,7 @@ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, } break; case GGML_TYPE_F16: { - ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value); + ((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value); } break; case GGML_TYPE_BF16: { @@ -1959,6 +1893,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_pad_reflect_1d(params, tensor); } break; + case GGML_OP_ROLL: + { + ggml_compute_forward_roll(params, tensor); + } break; case GGML_OP_ARANGE: { ggml_compute_forward_arange(params, tensor); @@ -2283,6 +2221,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_UPSCALE: case GGML_OP_PAD: case GGML_OP_PAD_REFLECT_1D: + case GGML_OP_ROLL: case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: @@ -3205,9 +3144,24 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); _mm_storel_epi64((__m128i *)(y + i), y_vec); } +#elif defined(__NNPA__) + for (; i + 7 < n; i += 8) { + float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0)); + float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4)); + uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0); + 
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0); + vec_xst(v_y, 0, (ggml_fp16_t *)(y + i)); + } + for (; i + 3 < n; i += 4) { + float32x4_t v_x = vec_xl(0, (const float *)(x + i)); + float32x4_t v_zero = vec_splats(0.0f); + uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0); + uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0); + vec_xst(v_y, 0, (ggml_fp16_t *)(y + i)); + } #endif for (; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16(x[i]); } } @@ -3231,9 +3185,25 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) { __m128 y_vec = _mm_cvtph_ps(x_vec); _mm_storeu_ps(y + i, y_vec); } +#elif defined(__NNPA__) + for (; i + 7 < n; i += 8) { + uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i)); + uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0); + float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0); + float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0); + vec_xst(v_yh, 0, (float *)(y + i + 0)); + vec_xst(v_yl, 0, (float *)(y + i + 4)); + } + for (; i + 3 < n; i += 4) { + uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i)); + uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0); + float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0); + vec_xst(v_yh, 0, (float *)(y + i)); + } #endif + for (; i < n; ++i) { - y[i] = GGML_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP16_TO_FP32(x[i]); } } @@ -3433,9 +3403,17 @@ int ggml_cpu_has_vxe(void) { #endif } +int ggml_cpu_has_nnpa(void) { +#if defined(GGML_NNPA) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_neon(void) { #if defined(__ARM_ARCH) && defined(__ARM_NEON) - return ggml_arm_arch_features.has_neon; + return 1; #else return 0; #endif @@ -3443,7 +3421,7 @@ int ggml_cpu_has_neon(void) { int ggml_cpu_has_dotprod(void) { #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD) - return ggml_arm_arch_features.has_dotprod; + return 1; #else return 0; #endif @@ -3451,7 +3429,7 @@ int ggml_cpu_has_dotprod(void) { int ggml_cpu_has_sve(void) { #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE) - return ggml_arm_arch_features.has_sve; + return 1; #else return 0; #endif @@ -3459,7 +3437,7 @@ int ggml_cpu_has_sve(void) { int ggml_cpu_has_matmul_int8(void) { #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8) - return ggml_arm_arch_features.has_i8mm; + return 1; #else return 0; #endif @@ -3475,14 +3453,14 @@ int ggml_cpu_get_sve_cnt(void) { int ggml_cpu_has_sme(void) { #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SME) - return ggml_arm_arch_features.has_sme; + return 1; #else return 0; #endif } void ggml_cpu_init(void) { - // needed to initialize f16 tables + // needed to initialize ggml_time { struct ggml_init_params params = { 0, NULL, false }; struct ggml_context * ctx = ggml_init(params); @@ -3503,9 +3481,10 @@ void ggml_cpu_init(void) { uint16_t u16; ggml_fp16_t fp16; } u = {i}; - float f = GGML_FP16_TO_FP32(u.fp16); - ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f)); - ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f)); + float f = GGML_COMPUTE_FP16_TO_FP32(u.fp16); + ggml_table_f32_f16[i] = f; + ggml_table_gelu_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_f32(f)); + ggml_table_gelu_quick_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_quick_f32(f)); } const uint64_t t_end = ggml_time_us(); UNUSED(t_end); diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index e013e8b416222..a98866a2d8052 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -1,8 +1,8 @@ #include "ggml-backend.h" #include 
"ggml-backend-impl.h" #include "ggml-cpu.h" -#include "ggml-cpu-aarch64.h" -#include "ggml-cpu-traits.h" +#include "repack.h" +#include "traits.h" #include "ggml-impl.h" #include "amx/amx.h" @@ -11,7 +11,7 @@ #include #ifdef GGML_USE_CPU_HBM -# include "ggml-cpu-hbm.h" +# include "hbm.h" #endif #ifdef GGML_USE_CPU_KLEIDIAI @@ -51,9 +51,9 @@ std::vector& ggml_backend_cpu_get_extra_buffers_type } #endif -#ifdef GGML_USE_CPU_AARCH64 - if (ggml_backend_cpu_aarch64_buffer_type()) { - bufts.push_back(ggml_backend_cpu_aarch64_buffer_type()); +#ifdef GGML_USE_CPU_REPACK + if (ggml_backend_cpu_repack_buffer_type()) { + bufts.push_back(ggml_backend_cpu_repack_buffer_type()); } #endif @@ -578,6 +578,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_vxe()) { features.push_back({ "VXE", "1" }); } + if (ggml_cpu_has_nnpa()) { + features.push_back({ "NNPA", "1" }); + } if (ggml_cpu_has_wasm_simd()) { features.push_back({ "WASM_SIMD", "1" }); } @@ -596,8 +599,8 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r #ifdef GGML_USE_CPU_KLEIDIAI features.push_back({ "KLEIDIAI", "1" }); #endif - #ifdef GGML_USE_CPU_AARCH64 - features.push_back({ "AARCH64_REPACK", "1" }); + #ifdef GGML_USE_CPU_REPACK + features.push_back({ "REPACK", "1" }); #endif features.push_back({ nullptr, nullptr }); diff --git a/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp b/ggml/src/ggml-cpu/hbm.cpp similarity index 98% rename from ggml/src/ggml-cpu/ggml-cpu-hbm.cpp rename to ggml/src/ggml-cpu/hbm.cpp index fa8dea2af9c72..a4073c15e6c90 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +++ b/ggml/src/ggml-cpu/hbm.cpp @@ -5,7 +5,7 @@ #include "ggml-cpu.h" #include "ggml-impl.h" -#include "ggml-cpu-hbm.h" +#include "hbm.h" // buffer type HBM diff --git a/ggml/src/ggml-cpu/ggml-cpu-hbm.h b/ggml/src/ggml-cpu/hbm.h similarity index 100% rename from ggml/src/ggml-cpu/ggml-cpu-hbm.h rename to ggml/src/ggml-cpu/hbm.h diff --git a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp index 15f0cd1540686..fafe45e6c5c51 100644 --- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp @@ -26,7 +26,7 @@ #include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-threading.h" -#include "ggml-cpu-traits.h" +#include "traits.h" #include "kernels.h" diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index 1d46158f928c4..ed61869a5508a 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -52,8 +52,8 @@ #include "ggml-impl.h" #include "ggml-cpu-impl.h" #include "ggml-quants.h" +#include "simd-mappings.h" -#include #include #include @@ -63,7 +63,7 @@ #define NOINLINE __attribute__((__noinline__)) #endif -#if defined(__ARM_NEON) || defined(__AVX512F__) +#if defined(__ARM_NEON) || defined(__AVX512F__) || defined(__VXE__) || defined(__VXE2__) #define VECTOR_REGISTERS 32 #else #define VECTOR_REGISTERS 16 @@ -74,7 +74,7 @@ namespace { inline float unhalf(ggml_fp16_t d) { - return GGML_FP16_TO_FP32(d); + return GGML_CPU_FP16_TO_FP32(d); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -110,6 +110,12 @@ inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); } inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#if defined(__VXE__) || defined(__VXE2__) +inline float32x4_t add(float32x4_t 
x, float32x4_t y) { return vec_add(x, y); }
+inline float32x4_t sub(float32x4_t x, float32x4_t y) { return vec_sub(x, y); }
+inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vec_mul(x, y); }
+#endif
+
 #if defined(__MMA__)
 typedef vector unsigned char vec_t;
 typedef __vector_quad acc_t;
@@ -163,6 +169,13 @@ inline float16x8_t madd(float16x8_t a, float16x8_t b, float16x8_t c) {
 #endif
 #endif
 
+#if defined(__VXE__) || defined(__VXE2__)
+template <>
+inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) {
+    return vec_madd(a, b, c);
+}
+#endif
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // VECTORIZED HORIZONTAL SUM
 
@@ -179,6 +192,13 @@ inline float hsum(float16x8_t x) {
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
+#if defined(__VXE__) || defined(__VXE2__)
+inline float hsum(float32x4_t x) {
+    float32x4_t tmp = x + vec_reve(x);
+    return tmp[0] + tmp[1];
+}
+#endif
+
 #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 inline float hsum(__m128 x) {
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
@@ -228,6 +248,21 @@ template <> inline float32x4_t load(const ggml_fp16_t *p) {
 #endif // _MSC_VER
 #endif // __ARM_NEON
 
+#if defined(__VXE__) || defined(__VXE2__)
+template <> inline float32x4_t load(const ggml_fp16_t * p) {
+    float tmp[4];
+
+    for (int i = 0; i < 4; i++) {
+        tmp[i] = GGML_CPU_FP16_TO_FP32(p[i]);
+    }
+
+    return vec_xl(0, (const float *)(tmp));
+}
+template <> inline float32x4_t load(const float * p) {
+    return vec_xl(0, p);
+}
+#endif
+
 #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 template <> inline __m128 load(const float *p) {
   return _mm_loadu_ps(p);
@@ -394,8 +429,6 @@ class tinyBLAS {
 
     template <int RM, int RN, int BM>
     NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
-        static std::atomic<int64_t> current_chunk;
-
         GGML_ASSERT(m % (RM * BM) == 0);
         const int64_t ytiles = m / (RM * BM);
         const int64_t xtiles = (n + RN -1) / RN;
@@ -410,7 +443,7 @@ class tinyBLAS {
         if (params->ith == 0) {
             GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
             // Every thread starts at ith, so the first unprocessed chunk is nth.  This save a bit of coordination right at the start.
-            std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
+            ggml_threadpool_chunk_set(params->threadpool, params->nth);
         }
 
         ggml_barrier(params->threadpool);
@@ -439,8 +472,7 @@
                 GGML_ASSERT(jj == jj2);
             }
 
-            // next step.
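// ---------------------------------------------------------------------------
// A freestanding sketch (plain C11 atomics, hypothetical names) of the
// chunk-claiming scheme used by the gemm loop here. Thread i starts on chunk
// i, and the shared counter is seeded with nth, so the first fetch_add hands
// out chunk nth, the next nth+1, and so on until every chunk is claimed. The
// diff replaces the function-local static std::atomic with
// ggml_threadpool_chunk_set/add so the counter lives in the shared threadpool.
#include <stdatomic.h>

typedef struct {
    atomic_int current_chunk;   // stand-in for the threadpool's shared counter
} example_pool;

static void example_worker(example_pool * pool, int ith, int nth,
                           int n_chunks, void (*process)(int chunk)) {
    if (ith == 0) {
        // seed: chunks 0 .. nth-1 are implicitly claimed by the nth threads
        atomic_store_explicit(&pool->current_chunk, nth, memory_order_relaxed);
    }
    // (a barrier goes here in ggml so no thread reads the counter before the seed)

    int job = ith;
    while (job < n_chunks) {
        process(job);
        // atomically claim the next unprocessed chunk
        job = atomic_fetch_add_explicit(&pool->current_chunk, 1, memory_order_relaxed);
    }
}
// ---------------------------------------------------------------------------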
-            job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
+            job = ggml_threadpool_chunk_add(params->threadpool, 1);
         }
 
         ggml_barrier(params->threadpool);
@@ -3323,6 +3355,14 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
                     (const float *)B, ldb,
                     (float *)C, ldc};
         return tb.matmul(m, n);
+#elif defined(__VXE__) || defined(__VXE2__)
+    if (n < 4)
+        return false;
+    tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ params,
+        k, (const float *)A, lda,
+        (const float *)B, ldb,
+        (float *)C, ldc};
+    return tb.matmul(m, n);
 #elif defined(__MMA__)
     if (k % 8)
         return false;
@@ -3414,6 +3454,16 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
                     (float *)C, ldc};
         return tb.matmul(m, n);
     }
+#elif defined(__VXE__) || defined(__VXE2__)
+    if (n < 4)
+        return false;
+    if (Btype == GGML_TYPE_F16) {
+        tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
+            k, (const ggml_fp16_t *)A, lda,
+            (const ggml_fp16_t *)B, ldb,
+            (float *)C, ldc};
+        return tb.matmul(m, n);
+    }
 #endif
     return false;
 }
diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.h b/ggml/src/ggml-cpu/llamafile/sgemm.h
index 3d2909515242a..729e8853d516c 100644
--- a/ggml/src/ggml-cpu/llamafile/sgemm.h
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.h
@@ -1,6 +1,11 @@
 #pragma once
 #include
 #include
+
+#if defined(__VXE__) || defined(__VXE2__)
+#include
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index d8de7531b0e5f..8531baf6c57fb 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -108,7 +108,7 @@ static void ggml_compute_forward_dup_f16(
                 for (int i01 = ir0; i01 < ir1; i01++) {
                     const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
                     for (int i00 = 0; i00 < ne00; i00++) {
-                        dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]);
+                        dst_ptr[id] = GGML_CPU_FP16_TO_FP32(src0_ptr[i00]);
                         id++;
                     }
                 }
@@ -130,7 +130,7 @@ static void ggml_compute_forward_dup_f16(
 
                     const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
                     for (int i00 = 0; i00 < ne00; i00++) {
-                        src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]);
+                        src0_f32[i00] = GGML_CPU_FP16_TO_FP32(src0_ptr[i00]);
                     }
 
                     quantize_row_q(src0_f32, dst_ptr + id, ne00);
@@ -156,7 +156,7 @@ static void ggml_compute_forward_dup_f16(
                         for (int i00 = 0; i00 < ne00; i00++) {
                             const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
 
-                            dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
+                            dst_ptr[id] = GGML_CPU_FP16_TO_FP32(*src0_ptr);
                             id++;
                         }
                     }
@@ -267,7 +267,7 @@ static void ggml_compute_forward_dup_f16(
                   const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
                         char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
 
-                        *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
+                        *(float *) dst_ptr = GGML_CPU_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
 
                         if (++i10 == ne0) {
                             i10 = 0;
@@ -372,7 +372,7 @@ static void ggml_compute_forward_dup_bf16(
                 for (int i01 = ir0; i01 < ir1; i01++) {
                     const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
                     for (int i00 = 0; i00 < ne00; i00++) {
-                        dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00]));
+                        dst_ptr[id] = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00]));
                         id++;
                     }
                 }
@@ -473,7 +473,7 @@ static void
ggml_compute_forward_dup_bf16( for (int i00 = 0; i00 < ne00; i00++) { const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr)); + dst_ptr[id] = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr)); id++; } } @@ -566,7 +566,7 @@ static void ggml_compute_forward_dup_bf16( const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr)); + *(ggml_fp16_t *) dst_ptr = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr)); if (++i10 == ne0) { i10 = 0; @@ -765,7 +765,7 @@ static void ggml_compute_forward_dup_f32( for (int i00 = 0; i00 < ne00; i00++) { const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); + dst_ptr[id] = GGML_CPU_FP32_TO_FP16(*src0_ptr); id++; } } @@ -878,7 +878,7 @@ static void ggml_compute_forward_dup_f32( const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr); + *(ggml_fp16_t *) dst_ptr = GGML_CPU_FP32_TO_FP16(*(const float *) src0_ptr); if (++i10 == ne0) { i10 = 0; @@ -1419,7 +1419,7 @@ static void ggml_compute_forward_add1_f16_f32( ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); + dst_ptr[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(src0_ptr[i]) + v); } } } @@ -1435,7 +1435,7 @@ static void ggml_compute_forward_add1_f16_f16( GGML_ASSERT(ggml_is_scalar(src1)); // scalar to add - const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data); + const float v = GGML_CPU_FP16_TO_FP32(*(ggml_fp16_t *) src1->data); const int ith = params->ith; const int nth = params->nth; @@ -1467,7 +1467,7 @@ static void ggml_compute_forward_add1_f16_f16( ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); + dst_ptr[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(src0_ptr[i]) + v); } } } @@ -1889,7 +1889,7 @@ static void ggml_compute_forward_sum_f16( } } } - ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum); + ((ggml_fp16_t *) dst->data)[0] = GGML_CPU_FP32_TO_FP16(sum); } static void ggml_compute_forward_sum_bf16( @@ -2660,7 +2660,7 @@ static void ggml_compute_forward_gelu_f16( #ifndef NDEBUG for (int k = 0; k < nc; k++) { const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); assert(!isinf(v)); @@ -2763,7 +2763,7 @@ static void ggml_compute_forward_gelu_erf_f16( #ifndef NDEBUG for (int k = 0; k < nc; k++) { const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; - const float v = GGML_FP16_TO_FP32(x); + const float v = 
GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); assert(!isinf(v)); @@ -2866,7 +2866,7 @@ static void ggml_compute_forward_gelu_quick_f16( #ifndef NDEBUG for (int k = 0; k < nc; k++) { const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); assert(!isinf(v)); @@ -2969,7 +2969,7 @@ static void ggml_compute_forward_silu_f16( #ifndef NDEBUG for (int k = 0; k < nc; k++) { const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])))[k]; - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); assert(!isinf(v)); @@ -3163,7 +3163,7 @@ static void ggml_compute_forward_silu_back_f16( #ifndef NDEBUG for (int k = 0; k < nc; k++) { const float x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); assert(!isinf(v)); @@ -4500,7 +4500,7 @@ static void ggml_compute_forward_get_rows_back_f32_f16( for (int j = 0; j < nc; ++j) { ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j]; - ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_FP16_TO_FP32(v); + ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_CPU_FP16_TO_FP32(v); } } } @@ -4792,7 +4792,7 @@ static void ggml_compute_forward_soft_max_f32( if (mp_f32) { if (use_f16) { for (int i = 0; i < nc; ++i) { - wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]); + wp[i] += slope*GGML_CPU_FP16_TO_FP32(mp_f16[i]); } } else { for (int i = 0; i < nc; ++i) { @@ -5018,8 +5018,8 @@ static void ggml_compute_forward_clamp_f16( ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01); for (int i = 0; i < nc; i++) { - float v = GGML_FP16_TO_FP32(src0_ptr[i]); - dst_ptr[i] = GGML_FP32_TO_FP16(MAX(MIN(v, max), min)); + float v = GGML_CPU_FP16_TO_FP32(src0_ptr[i]); + dst_ptr[i] = GGML_CPU_FP32_TO_FP16(MAX(MIN(v, max), min)); } } } @@ -5476,11 +5476,11 @@ static void ggml_compute_forward_rope_f16( const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[n_dims]); + const float x0 = GGML_CPU_FP16_TO_FP32(src[0]); + const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } else { for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { @@ -5492,11 +5492,11 @@ static void ggml_compute_forward_rope_f16( const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); + const float x0 = GGML_CPU_FP16_TO_FP32(src[0]); + const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims/2]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[0] = 
GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims/2] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } } else { @@ -5507,11 +5507,11 @@ static void ggml_compute_forward_rope_f16( const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[1]); + const float x0 = GGML_CPU_FP16_TO_FP32(src[0]); + const float x1 = GGML_CPU_FP16_TO_FP32(src[1]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[1] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } @@ -5525,11 +5525,11 @@ static void ggml_compute_forward_rope_f16( const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[n_dims]); + const float x0 = GGML_CPU_FP16_TO_FP32(src[0]); + const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } else { for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) { @@ -5640,7 +5640,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( for (int64_t i11 = 0; i11 < ne11; i11++) { const float * const src = (float *)((char *) src1->data + i11*nb11); for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]); + dst_data[i10*ne11 + i11] = GGML_CPU_FP32_TO_FP16(src[i10]); } } } @@ -5933,7 +5933,7 @@ static void ggml_compute_forward_im2col_f16( if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0; } else { - dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); + dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_CPU_FP32_TO_FP16(src_data[iih*IW + iiw]); } } } @@ -6109,7 +6109,7 @@ void ggml_compute_forward_conv_transpose_2d( const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11); ggml_fp16_t * dst_data = wdata + i11*ne10*ne12; for (int i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]); + dst_data[i10*ne12 + i12] = GGML_CPU_FP32_TO_FP16(src[i10]); } } } @@ -6358,7 +6358,7 @@ static void ggml_compute_forward_pool_1d_sk_p0( case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } for (int ki = 0; ki < k; ++ki) { - const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); + const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); switch (op) { case GGML_OP_POOL_AVG: drow[i] += srow_j; break; case GGML_OP_POOL_MAX: if (srow_j > drow[i]) drow[i] = srow_j; break; @@ -6450,7 +6450,7 @@ void ggml_compute_forward_pool_2d( for (int kx = 0; kx < k0; ++kx) { int j = ix + kx; if (j < 0 || j >= src->ne[0]) continue; - const float srow_j = (src->type == GGML_TYPE_F32) ? 
((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
+                        const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
                         switch (op) {
                             case GGML_OP_POOL_AVG:                    *out += srow_j; break;
                             case GGML_OP_POOL_MAX: if (srow_j > *out) *out  = srow_j; break;
@@ -6538,7 +6538,7 @@ void ggml_compute_forward_pool_2d_back(
                 }
                 const float val = dst->type == GGML_TYPE_F32 ?
-                    ((const float *) drowf)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t *) drowf)[j]);
+                    ((const float *) drowf)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t *) drowf)[j]);
                 if (val <= maxval) {
                     continue;
                 }
@@ -6558,7 +6558,7 @@
                     if (dst->type == GGML_TYPE_F32) {
                         ((float *) drow)[j] += grad0;
                     } else {
-                        ((ggml_fp16_t *) drow)[j] = GGML_FP32_TO_FP16(grad0 + GGML_FP16_TO_FP32(((const ggml_fp16_t *) drow)[j]));
+                        ((ggml_fp16_t *) drow)[j] = GGML_CPU_FP32_TO_FP16(grad0 + GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t *) drow)[j]));
                     }
                 } else if (op == GGML_OP_POOL_AVG) {
                     const float grad = grad0 / ka;
@@ -6577,7 +6577,7 @@
                             ((float *) drow)[j] += grad;
                         } else {
-                            ((ggml_fp16_t *) drow)[j] += GGML_FP32_TO_FP16(grad);
+                            ((ggml_fp16_t *) drow)[j] += GGML_CPU_FP32_TO_FP16(grad);
                         }
                     }
                 }
@@ -6793,6 +6793,73 @@ void ggml_compute_forward_pad_reflect_1d(
     }
 }
 
+// ggml_compute_forward_roll
+
+static int64_t ggml_wrap_index(int64_t i, int64_t ne) {
+    if (i < 0) {
+        return i + ne;
+    } else if (i >= ne) {
+        return i - ne;
+    }
+    return i;
+}
+
+static void ggml_compute_forward_roll_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src_data = (const float *) src0->data;
+    float * dst_data = (float *) dst->data;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const int s0 = ggml_get_op_params_i32(dst, 0);
+    const int s1 = ggml_get_op_params_i32(dst, 1);
+    const int s2 = ggml_get_op_params_i32(dst, 2);
+    const int s3 = ggml_get_op_params_i32(dst, 3);
+
+    const int64_t total      = ne1 * ne2 * ne3;
+    const int64_t per_thread = (total + params->nth) / params->nth;
+    const int64_t start      = params->ith * per_thread;
+    const int64_t end        = std::min(start + per_thread, total);
+
+    for (int64_t i = start; i < end; ++i) {
+        const int64_t i1 = i % ne1;
+        const int64_t i2 = (i / ne1) % ne2;
+        const int64_t i3 = i / (ne2 * ne1);
+        float * dst_row = dst_data + (i3*nb3 + i2*nb2 + i1*nb1) / sizeof(float);
+
+        const int64_t i01 = ggml_wrap_index(i1 - s1, ne01);
+        const int64_t i02 = ggml_wrap_index(i2 - s2, ne02);
+        const int64_t i03 = ggml_wrap_index(i3 - s3, ne03);
+        const float * src_row = src_data + (i03*nb03 + i02*nb02 + i01*nb01) / sizeof(float);
+
+        const int64_t s = ggml_wrap_index(-s0, ne00);
+        const int64_t n = ne00 - s;
+        ggml_vec_cpy_f32(n, dst_row, src_row + s);
+        ggml_vec_cpy_f32(s, dst_row + n, src_row);
+    }
+}
+
+void ggml_compute_forward_roll(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_roll_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
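// ---------------------------------------------------------------------------
// A tiny self-contained illustration (hypothetical names) of the rotation
// performed per row by ggml_compute_forward_roll_f32 above. Since
// dst[i] = src[wrap(i - s0)], the source row splits at s = wrap(-s0, ne00)
// and is copied as two contiguous spans, which is exactly what the two
// ggml_vec_cpy_f32 calls do. Example: ne00 = 5, s0 = 2 gives s = 3, so
// {0,1,2,3,4} -> {3,4,0,1,2}.
#include <stdio.h>
#include <string.h>

// same wrapping rule as ggml_wrap_index (assumes |shift| < ne)
static long example_wrap(long i, long ne) {
    if (i < 0)   return i + ne;
    if (i >= ne) return i - ne;
    return i;
}

static void example_roll_row(const float * src, float * dst, long ne00, long s0) {
    const long s = example_wrap(-s0, ne00);       // split point in the source
    const long n = ne00 - s;
    memcpy(dst,     src + s, n * sizeof(float));  // src tail goes to dst head
    memcpy(dst + n, src,     s * sizeof(float));  // src head goes to dst tail
}

int main(void) {
    const float src[5] = {0, 1, 2, 3, 4};
    float dst[5];
    example_roll_row(src, dst, 5, 2);
    for (int i = 0; i < 5; ++i) {
        printf("%g ", dst[i]);                    // prints: 3 4 0 1 2
    }
    printf("\n");
    return 0;
}
// ---------------------------------------------------------------------------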
slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
+                const float mv = mp ? slope*GGML_CPU_FP16_TO_FP32(mp[ic]) : 0.0f;
                 if (mv == -INFINITY) {
                     continue;
                 }
@@ -7143,7 +7210,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
         if (v->type == GGML_TYPE_F16) {
             for (int64_t d = 0; d < DV; ++d) {
-                VKQ32[d] = GGML_FP16_TO_FP32(VKQ16[d]);
+                VKQ32[d] = GGML_CPU_FP16_TO_FP32(VKQ16[d]);
             }
         }
@@ -8132,8 +8199,8 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
     #define WKV_VECTOR_SIZE 4
 #endif
-    int wkv_vector_size;
 #ifdef WKV_VECTOR_SIZE
+    int wkv_vector_size;
     #if defined(__ARM_FEATURE_SVE)
         wkv_vector_size = svcntw();
     #else
@@ -8348,8 +8415,8 @@ static void ggml_compute_forward_gla_f32(
     #define GLA_VECTOR_SIZE 4
 #endif
-    int gla_vector_size;
 #ifdef GLA_VECTOR_SIZE
+    int gla_vector_size;
     #if defined(__ARM_FEATURE_SVE)
         gla_vector_size = svcntw();
     #else
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index dc081b9e66397..2d8544d7d3d43 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
 void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_roll(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c
new file mode 100644
index 0000000000000..ee35ab42fda07
--- /dev/null
+++ b/ggml/src/ggml-cpu/quants.c
@@ -0,0 +1,1158 @@
+#define GGML_COMMON_IMPL_C
+#include "ggml-common.h"
+
+#include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
+#include "ggml-quants.h"
+#include "quants.h"
+
+#include "arch-fallback.h"
+
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+#include <stdlib.h> // for qsort
+#include <stdio.h>  // for GGML_ASSERT
+
+#define GROUP_MAX_EPS 1e-15f
+#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
+#define GROUP_MAX_EPS_IQ2_S 1e-8f
+#define GROUP_MAX_EPS_IQ1_M 1e-7f
+#define GROUP_MAX_EPS_IQ1_S 1e-12f
+
+#define UNUSED GGML_UNUSED
+
+void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
+    quantize_row_q4_0_ref(x, y, k);
+}
+
+void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
+    quantize_row_q4_1_ref(x, y, k);
+}
+
+void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
+    quantize_row_q5_0_ref(x, y, k);
+}
+
+void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
+    quantize_row_q5_1_ref(x, y, k);
+}
+
+void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
+    quantize_row_q8_0_ref(x, y, k);
+}
+
+void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
+    quantize_row_q8_1_ref(x, y, k);
+}
+
+//
+// 2-6 bit quantization in super-blocks
+//
+
+//========================- 2-bit (de)-quantization
+
+void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+    quantize_row_q2_K_ref(x, vy, k);
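+    // no SIMD fast path for q2_K quantization: always use the reference code
+    // (usage sketch: for a row of k floats with k % QK_K == 0:
+    //    block_q2_K dst[k/QK_K]; quantize_row_q2_K(src, dst, k);)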
+} + +//========================= 3-bit (de)-quantization + +void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + quantize_row_q3_K_ref(x, vy, k); +} + +// ====================== 4-bit (de)-quantization + +void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK_K == 0); + block_q4_K * GGML_RESTRICT y = vy; + quantize_row_q4_K_ref(x, y, k); +} + +// ====================== 5-bit (de)-quantization + +void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK_K == 0); + block_q5_K * GGML_RESTRICT y = vy; + quantize_row_q5_K_ref(x, y, k); +} + +// ====================== 6-bit (de)-quantization + +void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK_K == 0); + block_q6_K * GGML_RESTRICT y = vy; + quantize_row_q6_K_ref(x, y, k); +} + +// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs) + +void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK_K == 0); + block_tq1_0 * GGML_RESTRICT y = vy; + quantize_row_tq1_0_ref(x, y, k); +} + +void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK_K == 0); + block_tq2_0 * GGML_RESTRICT y = vy; + quantize_row_tq2_0_ref(x, y, k); +} + +//===================================== Q8_K ============================================== + +void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { + quantize_row_q8_K_ref(x, y, k); +} + +//===================================== Dot products ================================= + +void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F) - 8; + const int v1 = (x[ib].qs[j] >> 4) - 8; + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); + } + + *s = sumf; +} + +// TODO: add WASM SIMD +void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F); + const int v1 = (x[ib].qs[j] >> 4); + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q5_0_q8_0_generic(int n, float * 
GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16); + const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16); + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; + } + + *s = sumf; +} + +void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + int ib = 0; + float sumf = 0; + + assert(n % qk == 0); + assert(qk == QK5_1); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + + for (; ib < nb; ++ib) { + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1; + + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); + } + + *s = sumf; +} + +void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q8_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + + for (; ib < nb; ++ib) { + int sumi = 0; + + for (int j = 0; j < qk; j++) { + sumi += x[ib].qs[j]*y[ib].qs[j]; + } + + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); + } + + *s = sumf; +} + +void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_tq1_0 * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243}; + + float sumf = 0.0f; + + for (int i = 0; i < nb; ++i) { + int sum = 0; + + for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) { + for (size_t l = 0; l < 5; ++l) { + for (size_t m = 0; m < 32; ++m) { + uint8_t q = x[i].qs[j + m] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi 
- 1) * y[i].qs[j*5 + l*32 + m]; + } + } + } + for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) { + for (size_t l = 0; l < 5; ++l) { + for (size_t m = 0; m < 16; ++m) { + uint8_t q = x[i].qs[j + m] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[j*5 + l*16 + m]; + } + } + } + + for (size_t l = 0; l < 4; ++l) { + for (size_t j = 0; j < sizeof(x->qh); ++j) { + uint8_t q = x[i].qh[j] * pow3[l]; + uint16_t xi = ((uint16_t) q * 3) >> 8; + sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j]; + } + } + + sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d); + } + + *s = sumf; +} + +void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_tq2_0 * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + float sumf = 0.0f; + + for (int i = 0; i < nb; ++i) { + int32_t sumi = 0; + + for (size_t j = 0; j < sizeof(x->qs); j += 32) { + for (size_t l = 0; l < 4; ++l) { + for (size_t k = 0; k < 32; ++k) { + sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1); + } + } + } + + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + + sumf += (float) sumi * d; + } + + *s = sumf; +} + +void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q2_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + int summs = 0; + for (int j = 0; j < 16; ++j) { + summs += y[i].bsums[j] * (sc[j] >> 4); + } + + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + int isum = 0; + int is = 0; + int d; + for (int k = 0; k < QK_K/128; ++k) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + d = sc[is++] & 0xF; + int isuml = 0; + for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + d = sc[is++] & 0xF; + isuml = 0; + for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3); + isum += d * isuml; + shift += 2; + q8 += 32; + } + q2 += 32; + } + sumf += dall * isum - dmin * summs; + } + *s = sumf; +} + +void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + const block_q3_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + // scalar version + // This function is written like this so the compiler can manage to vectorize most of it + // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the + // manually vectorized version above. Every other version I tried would run at least 4 times slower. 
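+    // (The scalar loop below first expands each 128-value chunk into plain int8 values in aux8,
+    // combining the 2-bit quants with the hmask high bit to get values in [-4, 3], so the hot
+    // inner loops are simple multiply-accumulates that auto-vectorize well.)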
+ // The ideal situation would be if we could just write the code once, and the compiler would + // automatically produce the best possible set of machine instructions, instead of us having to manually + // write vectorized versions for AVX, ARM_NEON, etc. + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + uint32_t auxs[4]; + const int8_t * scales = (const int8_t*)auxs; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3; + for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); + a += 32; m <<= 1; + q3 += 32; + } + a = aux8; + + memcpy(auxs, x[i].scales, 12); + uint32_t tmp = auxs[2]; + auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +} + +void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + a += 32; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + a += 32; q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] 
>> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +} + +void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + uint8_t m = 1; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 
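/* the hmask bit supplies the 5th quant bit (+16) */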
16 : 0); + a += 32; m <<= 1; + q4 += 32; + } + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +} + +void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + } + a += 128; + q4 += 64; + qh += 32; + } + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/16; ++j) { + int scale = x[i].scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +} + +void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + uint32_t aux32[2]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + float sumf = 0.f; + for (int i = 0; i 
< nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(aux32, q2, 2*sizeof(uint32_t)); + q2 += 4; + const uint32_t ls = 2*(aux32[1] >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); + const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +} + +void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1; + const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls1; + sumi = 0; + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls2; + q2 += 4; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +} + +void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq2_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint8_t * signs = qs + QK_K/8; + + int bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf); + int ls2 = 1 + 2*(x[i].scales[ib32] >> 4); + int sumi1 = 0, sumi2 = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); + for (int j = 0; j < 8; ++j) { + sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300))); + for (int j = 0; j < 8; ++j) { + sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? 
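/* each bit of the packed sign byte flips one grid value */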
-1 : 1); + } + q8 += 8; + } + bsum += ls1 * sumi1 + ls2 * sumi2; + qs += 4; + signs += 4; + } + + sumf += d * bsum; + } + + *s = 0.125f * sumf; +} + +void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + uint32_t aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t); + const uint32_t ls = 2*(aux32 >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]); + const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]); + const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127]; + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + q3 += 8; + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.25f * sumf; +} + +void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint8_t * GGML_RESTRICT signs = x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1; + const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls1; + sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? 
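/* signs[l] carries 8 sign bits, one per dequantized grid value */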
-1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls2; + } + sumf += d * bsum; + } + *s = sumf; +} + +void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq1_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; + + int sumi = 0, sumi1 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + const int ls = 2*((qh[ib] >> 12) & 7) + 1; + const int delta = qh[ib] & 0x8000 ? -1 : 1; + int lsum = 0; + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8))); + for (int j = 0; j < 8; ++j) { + lsum += q8[j] * grid[j]; + } + q8 += 8; + } + sumi += ls * lsum; + sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]); + qs += 4; + } + + sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); + } + + *s = sumf; +} + +void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq1_m * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + iq1m_scale_t scale; + + int sum1[2], sum2[2], delta[4]; + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint16_t * sc = (const uint16_t *)x[i].scales; + + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + + int sumi1 = 0, sumi2 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + delta[0] = qh[0] & 0x08 ? -1 : 1; + delta[1] = qh[0] & 0x80 ? -1 : 1; + delta[2] = qh[1] & 0x08 ? -1 : 1; + delta[3] = qh[1] & 0x80 ? 
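/* bits 3 and 7 of each qh byte select the per-group delta sign */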
-1 : 1; + sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0; + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700))); + int lsum1 = 0, lsum2 = 0; + for (int j = 0; j < 8; ++j) { + lsum1 += q8[j] * grid[j]; + lsum2 += q8[j]; + } + q8 += 8; + sum1[l/2] += lsum1; + sum2[l/2] += lsum2*delta[l]; + } + + const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1; + const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1; + + sumi1 += sum1[0] * ls1 + sum1[1] * ls2; + sumi2 += sum2[0] * ls1 + sum2[1] * ls2; + qs += 4; + qh += 2; + } + + sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); + } + + *s = sumf; +} + +void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK4_NL == 0); + static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); + + const block_iq4_nl * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + const int nb = n / QK4_NL; + + int ib = 0; + float sumf = 0; + + for (; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; + sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4]; + } + sumf += d * (sumi1 + sumi2); + } + *s = sumf; +} + +void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK_K == 0); + + const block_iq4_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + float sumf = 0; + for (int ibl = 0; ibl < nb; ++ibl) { + const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + uint16_t h = x[ibl].scales_h; + const uint8_t * qs = x[ibl].qs; + const int8_t * q8 = y[ibl].qs; + for (int ib = 0; ib < QK_K/32; ib += 2) { + const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30); + const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30); + h >>= 4; + const float d1 = d4d8*(ls1 - 32); + const float d2 = d4d8*(ls2 - 32); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < 16; ++j) { + sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf]; + sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4]; + } + sumf += d1 * (sumi1 + sumi2); + qs += 16; + q8 += 32; + sumi1 = sumi2 = 0; + for (int j = 0; j < 16; ++j) { + sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf]; + sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4]; + } + sumf += d2 * (sumi1 + sumi2); + qs += 16; + q8 += 32; + } + } + *s = sumf; +} + +// ============================ 4-bit non-linear quants + +void quantize_row_iq4_nl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { + assert(k % QK4_NL == 0); + quantize_row_iq4_nl_ref(x, y, k); +} + +void quantize_row_iq4_xs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + quantize_iq4_xs(x, y, 1, k, NULL); +} diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.h b/ggml/src/ggml-cpu/quants.h similarity index 56% rename from ggml/src/ggml-cpu/ggml-cpu-quants.h rename to ggml/src/ggml-cpu/quants.h index e33d9d473ea66..dc4342c87f592 100644 --- 
a/ggml/src/ggml-cpu/ggml-cpu-quants.h +++ b/ggml/src/ggml-cpu/quants.h @@ -58,6 +58,32 @@ void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +// Generic implementation +void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * 
GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
new file mode 100644
index 0000000000000..72ee93a5abc7c
--- /dev/null
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -0,0 +1,1571 @@
+#define GGML_COMMON_IMPL_CPP
+#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
+#include "ggml-backend-impl.h"
+
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+#include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
+#include "traits.h"
+
+#include "arch-fallback.h"
+
+#include <cmath>
+#include <cstring>
+#include <cassert>
+#include <cstdlib> // for qsort
+#include <cstdio>  // for GGML_ASSERT
+
+#include "repack.h"
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+#endif
+
+#define UNUSED GGML_UNUSED
+
+// fast round-to-nearest: adding 1.5*2^23 pins the integer part of fval in the low
+// mantissa bits, which are then extracted and re-biased by 2^22
+static inline int nearest_int(float fval) {
+    assert(fabsf(fval) <= 4194303.f);
+    float val = fval + 12582912.f;
+    int i; memcpy(&i, &val, sizeof(int));
+    return (i & 0x007fffff) - 0x00400000;
+}
+
+// Functions to create the interleaved data layout formats
+
+// interleave 4 block_q4_0s in blocks of blck_size_interleave
+// returns an interleaved block_q4_0x4
+// in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks
+// first, then interleave quants from 4 block_q4_0s in blocks of blck_size_interleave
+//
+// - in                   : an array of block_q4_0 pointers
+// - blck_size_interleave : the block_q4_0 quants bytes are interleaved in blocks of
+//                          blck_size_interleave bytes
+// - xor_mask             : the mask to convert the nibbles in block_q4_0 quants bytes
+//                          from bias offset form to pure sign form (this saves subtract
+//                          operations during unpacking)
+//
+
+extern "C" {
+
+void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+    assert(QK8_0 == 32);
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
+
+    // scalar
+    const int blck_size_interleave = 4;
+    float srcv[4][QK8_0];
+    float id[4];
+
+    for (int i = 0; i < nb; i++) {
+        for (int row_iter = 0; row_iter < 4; row_iter++) {
+            float amax = 0.0f; // absolute max
+
+            for (int j = 0; j < QK8_0; j++) {
+                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
+                amax = MAX(amax, fabsf(srcv[row_iter][j]));
+            }
+
+            const float d = amax / ((1 << 7) - 1);
+            id[row_iter] = d ?
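/* guard against division by zero for all-zero rows */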
1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); + } + + for (int j = 0; j < QK8_0 * 4; j++) { + int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; + int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; + src_offset += (j % blck_size_interleave); + + float x0 = srcv[src_id][src_offset] * id[src_id]; + y[i].qs[j] = roundf(x0); + } + } +} + +void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(QK8_0 == 32); + assert(k % QK8_0 == 0); + const int nb = k / QK8_0; + + block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; + + // scalar + const int blck_size_interleave = 8; + float srcv[4][QK8_0]; + float id[4]; + + for (int i = 0; i < nb; i++) { + for (int row_iter = 0; row_iter < 4; row_iter++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j]; + amax = MAX(amax, fabsf(srcv[row_iter][j])); + } + + const float d = amax / ((1 << 7) - 1); + id[row_iter] = d ? 1.0f / d : 0.0f; + + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); + } + + for (int j = 0; j < QK8_0 * 4; j++) { + int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; + int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; + src_offset += (j % blck_size_interleave); + + float x0 = srcv[src_id][src_offset] * id[src_id]; + y[i].qs[j] = roundf(x0); + } + } +} + +void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(QK_K == 256); + assert(k % QK_K == 0); + const int nb = k / QK_K; + + block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy; + + // scalar + const int blck_size_interleave = 8; + float srcv[4][QK_K]; + float iscale[4]; + + for (int i = 0; i < nb; i++) { + for (int row_iter = 0; row_iter < 4; row_iter++) { + float amax = 0.0f; // absolute max + float max = 0; + + for (int j = 0; j < QK_K; j++) { + srcv[row_iter][j] = x[row_iter * k + i * QK_K + j]; + // Update the maximum value of the corresponding super block + if(amax < fabsf(srcv[row_iter][j])) { + amax = fabsf(srcv[row_iter][j]); + max = srcv[row_iter][j]; + } + } + + iscale[row_iter] = amax ? -127.f/max : 0; + + y[i].d[row_iter] = amax ? 
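/* q8_K stores the row scale as a plain float, not fp16 */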
1/iscale[row_iter] : 0; + } + + for (int j = 0; j < QK_K / 4; j++) { + y[i].bsums[j] = 0; + } + + // Quants values are interleaved in sequence of eight bytes from corresponding super blocks + // Bsums values are interleaved in sequence of four bsums from each super block taken for interleaving + // i.e first four bsums from the first super block, followed by first four bsums from second super block and so on + for (int j = 0; j < QK_K * 4; j++) { + int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; + int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; + src_offset += (j % blck_size_interleave); + int index = (((j & 31) >> 3) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3); + + float x0 = srcv[src_id][src_offset] * iscale[src_id]; + y[i].qs[j] = nearest_int(x0); + y[i].bsums[index] += y[i].qs[j]; + } + } +} + +} // extern "C" + +template +void ggml_quantize_mat_t(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row); + +template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { + assert(nrow == 4); + UNUSED(nrow); + ggml_quantize_mat_q8_0_4x4(x, vy, n_per_row); +} + +template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { + assert(nrow == 4); + UNUSED(nrow); + ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row); +} + +template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) { + assert(nrow == 4); + UNUSED(nrow); + ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row); +} + +extern "C" { + +void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + + float sumf[4]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } +} + +void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + 
UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + + float sumf[4]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } +} + +void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + + { + float sumf[8]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; + } + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } + } +} + +void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK_K; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + + float sumf[8]; + float sum_minf[8]; + uint32_t utmp[32]; + int sumi1; + int sumi2; + int sumi; + + const block_q8_K * a_ptr = (const block_q8_K *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) { + sumf[j] = 0.0; + sum_minf[j] = 0.0; + } + for (int l = 0; l < nb; 
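/* one interleaved q4_K super-block per iteration */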
l++) { + for (int sb = 0; sb < 8; sb++) { + memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12); + utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4); + const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1; + utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4); + utmp[sb * 4 + 2] = uaux_0; + utmp[sb * 4 + 0] &= kmask1; + } + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32; + uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16; + for (int j = 0; j < ncols_interleaved; j++) { + sumi1 = 0; + sumi2 = 0; + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4); + sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]); + sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]); + sumi1 = sumi1 * scales_0[j]; + sumi2 = sumi2 * scales_1[j]; + sumi += sumi1 + sumi2; + } + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; + } + } + for (int sb = 0; sb < 8; sb++) { + uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16; + for (int j = 0; j < ncols_interleaved; j++) { + sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; + } + } + } + for (int j = 0; j < ncols_interleaved; j++) { + s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j]; + } + } +} + +void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + + { + float sumf[4]; + int sumi; + + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); + + for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F]; + const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; + sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])); + } + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); + } + } + } + for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; + } + } +} + +void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + 
UNUSED(ncols_interleaved); + UNUSED(blocklen); + + { + float sumf[4][4]; + int sumi; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; + } + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } + } +} + +void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + + float sumf[4][4]; + int sumi; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; + } + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } +} + +void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + + float sumf[4][8]; + int sumi; + + for (int y = 0; y < nr / 4; 
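/* activations arrive pre-packed in 4-row tiles (block_q8_0x4) */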
y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; + } + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } +} + +void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK_K; + const int nb = n / qk; + const int ncols_interleaved = 8; + const int blocklen = 8; + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + + float sumf[4][8]; + float sum_minf[4][8]; + uint32_t utmp[32]; + int sumi1; + int sumi2; + int sumi; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumf[m][j] = 0.0; + sum_minf[m][j] = 0.0; + } + } + for (int l = 0; l < nb; l++) { + for (int sb = 0; sb < 8; sb++) { + memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12); + utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4); + const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1; + utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4); + utmp[sb * 4 + 2] = uaux_0; + utmp[sb * 4 + 0] &= kmask1; + } + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32; + uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16; + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi1 = 0; + sumi2 = 0; + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF); + const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4); + sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]); + sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]); + sumi1 = sumi1 * scales_0[j]; + sumi2 = sumi2 * scales_1[j]; + sumi += sumi1 + sumi2; + } + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * 
a_ptr[l].d[m]; + } + } + } + for (int sb = 0; sb < 8; sb++) { + uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16; + for(int m = 0; m < 4; m++) { + const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6); + for(int j = 0; j < ncols_interleaved; j++) { + sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j]; + } + } + } + } +} + +void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { + const int qk = QK8_0; + const int nb = n / qk; + const int ncols_interleaved = 4; + const int blocklen = 4; + + assert (n % qk == 0); + assert (nr % 4 == 0); + assert (nc % ncols_interleaved == 0); + + UNUSED(s); + UNUSED(bs); + UNUSED(vx); + UNUSED(vy); + UNUSED(nr); + UNUSED(nc); + UNUSED(nb); + UNUSED(ncols_interleaved); + UNUSED(blocklen); + + { + float sumf[4][4]; + int sumi; + + for (int y = 0; y < nr / 4; y++) { + const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); + for (int x = 0; x < nc / ncols_interleaved; x++) { + const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; + } + for (int l = 0; l < nb; l++) { + for (int k = 0; k < (qk / (2 * blocklen)); k++) { + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) { + sumi = 0; + for (int i = 0; i < blocklen; ++i) { + const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F]; + const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; + sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])); + } + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); + } + } + } + } + for (int m = 0; m < 4; m++) { + for (int j = 0; j < ncols_interleaved; j++) + s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; + } + } + } + } +} + +} // extern "C" + +static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) { + block_q4_0x4 out; + + for (int i = 0; i < 4; i++) { + out.d[i] = in[i].d; + } + + const int end = QK4_0 * 2 / blck_size_interleave; + + if (blck_size_interleave == 8) { + const uint64_t xor_mask = 0x8888888888888888ULL; + for (int i = 0; i < end; ++i) { + int src_id = i % 4; + int src_offset = (i / 4) * blck_size_interleave; + int dst_offset = i * blck_size_interleave; + + uint64_t elems; + // Using memcpy to avoid unaligned memory accesses + memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t)); + elems ^= xor_mask; + memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t)); + } + } else if (blck_size_interleave == 4) { + const uint32_t xor_mask = 0x88888888; + for (int i = 0; i < end; ++i) { + int src_id = i % 4; + int src_offset = (i / 4) * blck_size_interleave; + int dst_offset = i * blck_size_interleave; + + uint32_t elems; + memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t)); + elems ^= xor_mask; + memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t)); + } + } else { + GGML_ASSERT(false); + } + + return out; +} + +// interleave 8 block_q4_0s in blocks of blck_size_interleave +// returns an 
interleaved block_q4_0x8 +// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
+// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
+static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) { + block_q4_0x8 out; + + for (int i = 0; i < 8; i++) { + out.d[i] = in[i].d; + } + + const int end = QK4_0 * 4 / blck_size_interleave; + const uint64_t xor_mask = 0x8888888888888888ULL; + + for (int i = 0; i < end; ++i) { + int src_id = i % 8; + int src_offset = (i / 8) * blck_size_interleave; + int dst_offset = i * blck_size_interleave; + + uint64_t elems; + memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t)); + elems ^= xor_mask; + memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t)); + } + + return out; +} +
+static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) { + block_q4_Kx8 out; + // Delta (scale) and dmin values of the eight Q4_K structures are copied onto the output interleaved structure + for (int i = 0; i < 8; i++) { + out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d; + } + + for (int i = 0; i < 8; i++) { + out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin; + } + + const int end = QK_K * 4 / blck_size_interleave; + + // Interleave Q4_K quants by taking 8 bytes at a time + for (int i = 0; i < end; ++i) { + int src_id = i % 8; + int src_offset = (i / 8) * blck_size_interleave; + int dst_offset = i * blck_size_interleave; + + uint64_t elems; + memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t)); + memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t)); + } +
+ // The logic below unpacks and rearranges the scale and min values of Q4_K
+ // Currently the Q4_K structure packs 8 scales and 8 mins into 12 bytes (6 bits per value)
+ // The output Q4_Kx8 structure has 96 scale bytes
+ // Each 12-byte group packs the scales and mins of the corresponding sub-block from the 8 Q4_K structures
+ // E.g. the first 12 bytes contain the 8 scales and 8 mins of the first sub-block of each Q4_K structure
+ uint8_t s[8], m[8]; + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 8; j++) { + s[j] = in[j].scales[i] & 63; + m[j] = in[j].scales[i + 4] & 63; + } + + out.scales[i * 12] = (s[0] & 63) + ((s[4] & 48) << 2); + out.scales[i * 12 + 1] = (s[1] & 63) + ((s[5] & 48) << 2); + out.scales[i * 12 + 2] = (s[2] & 63) + ((s[6] & 48) << 2); + out.scales[i * 12 + 3] = (s[3] & 63) + ((s[7] & 48) << 2); + out.scales[i * 12 + 4] = (m[0] & 63) + ((m[4] & 48) << 2); + out.scales[i * 12 + 5] = (m[1] & 63) + ((m[5] & 48) << 2); + out.scales[i * 12 + 6] = (m[2] & 63) + ((m[6] & 48) << 2); + out.scales[i * 12 + 7] = (m[3] & 63) + ((m[7] & 48) << 2); + out.scales[i * 12 + 8] = (s[4] & 15) + ((m[4] & 15) << 4); + out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4); + out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4); + out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4); + + } +
+ for (int i = 0; i < 4; i++) { + for (int j = 0; j < 8; j++) { + s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15); + m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4); + } + + out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2); + out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2); + out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2); + out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2); + out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2); + out.scales[i * 
12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2); + out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2); + out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2); + out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4); + out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4); + out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4); + out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4); + + } + + return out; +} + +static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { + GGML_ASSERT(t->type == GGML_TYPE_Q4_0); + GGML_ASSERT(interleave_block == 4 || interleave_block == 8); + constexpr int nrows_interleaved = 4; + + block_q4_0x4 * dst = (block_q4_0x4 *)t->data; + const block_q4_0 * src = (const block_q4_0 *)data; + block_q4_0 dst_tmp[4]; + int nrow = ggml_nrows(t); + int nblocks = t->ne[0] / QK4_0; + + GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0)); + + if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { + return -1; + } + + for (int b = 0; b < nrow; b += nrows_interleaved) { + for (int64_t x = 0; x < nblocks; x++) { + for (int i = 0; i < nrows_interleaved; i++) { + dst_tmp[i] = src[x + i * nblocks]; + } + *dst++ = make_block_q4_0x4(dst_tmp, interleave_block); + } + src += nrows_interleaved * nblocks; + } + return 0; + + GGML_UNUSED(data_size); +} +static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { + GGML_ASSERT(t->type == GGML_TYPE_Q4_K); + GGML_ASSERT(interleave_block == 8); + constexpr int nrows_interleaved = 8; + + block_q4_Kx8 * dst = (block_q4_Kx8*)t->data; + const block_q4_K * src = (const block_q4_K*) data; + block_q4_K dst_tmp[8]; + int nrow = ggml_nrows(t); + int nblocks = t->ne[0] / QK_K; + + GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K)); + + if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { + return -1; + } + + for (int b = 0; b < nrow; b += nrows_interleaved) { + for (int64_t x = 0; x < nblocks; x++) { + for (int i = 0; i < nrows_interleaved; i++ ) { + dst_tmp[i] = src[x + i * nblocks]; + } + *dst++ = make_block_q4_Kx8(dst_tmp, interleave_block); + } + src += nrows_interleaved * nblocks; + } + return 0; + + GGML_UNUSED(data_size); +} + +static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { + GGML_ASSERT(t->type == GGML_TYPE_Q4_0); + GGML_ASSERT(interleave_block == 8); + constexpr int nrows_interleaved = 8; + + block_q4_0x8 * dst = (block_q4_0x8*)t->data; + const block_q4_0 * src = (const block_q4_0*) data; + block_q4_0 dst_tmp[8]; + int nrow = ggml_nrows(t); + int nblocks = t->ne[0] / QK4_0; + + GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0)); + + if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { + return -1; + } + + for (int b = 0; b < nrow; b += nrows_interleaved) { + for (int64_t x = 0; x < nblocks; x++) { + for (int i = 0; i < nrows_interleaved; i++ ) { + dst_tmp[i] = src[x + i * nblocks]; + } + *dst++ = make_block_q4_0x8(dst_tmp, interleave_block); + } + src += nrows_interleaved * nblocks; + } + return 0; + + GGML_UNUSED(data_size); +} + +static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) { + block_iq4_nlx4 out; + + for (int i = 0; i < 4; i++) { + out.d[i] = in[i].d; + } + + const int end = QK4_NL * 2 / blck_size_interleave; + + // TODO: this branch seems wrong + //if 
(blck_size_interleave == 8) { + // for (int i = 0; i < end; ++i) { + // int src_id = i % 4; + // int src_offset = (i / 4) * blck_size_interleave; + // int dst_offset = i * blck_size_interleave; + + // // Using memcpy to avoid unaligned memory accesses + // memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t)); + // } + //} else + if (blck_size_interleave == 4) { + for (int i = 0; i < end; ++i) { + int src_id = i % 4; + int src_offset = (i / 4) * blck_size_interleave; + int dst_offset = i * blck_size_interleave; + + memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t)); + } + } else { + GGML_ASSERT(false); + } + + return out; +} +
+static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { + GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL); + //GGML_ASSERT(interleave_block == 4 || interleave_block == 8); + GGML_ASSERT(interleave_block == 4); + + block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data; + const block_iq4_nl * src = (const block_iq4_nl *)data; + block_iq4_nl dst_tmp[4]; + int nrow = ggml_nrows(t); + int nrows_interleaved = 4; + int nblocks = t->ne[0] / QK4_0; + + GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl)); + + if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { + return -1; + } + + for (int b = 0; b < nrow; b += nrows_interleaved) { + for (int64_t x = 0; x < nblocks; x++) { + for (int i = 0; i < nrows_interleaved; i++) { + dst_tmp[i] = src[x + i * nblocks]; + } + *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block); + } + src += nrows_interleaved * nblocks; + } + return 0; + + GGML_UNUSED(data_size); +} +
+namespace ggml::cpu::repack { +// repack +template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> +int repack(struct ggml_tensor *, const void *, size_t); + +// TODO: generalise. 
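// Editorial sketch (not part of the patch): the specializations below resolve a
// compile-time (block type, interleave size, column count) triple to one of the
// repack_*_bl helpers, so dispatch costs nothing at run time. A minimal
// standalone instance of the same pattern, with hypothetical names:
template <typename BLOCK, int64_t INTER, int64_t COLS>
int repack_example(struct ggml_tensor *, const void *, size_t); // primary: declared, never defined

template <> int repack_example<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * d, size_t n) {
    return repack_q4_0_to_q4_0_8_bl(t, 8, d, n); // statically selected, no vtable involved
}
// Any unsupported combination fails at link time rather than at run time.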
+template <> int repack<block_q4_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size); +} +
+template <> int repack<block_q4_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size); +} +
+template <> int repack<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size); +} +
+template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size); +} +
+template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size); +} +
+// TODO: needs to be revisited +//template <> int repack<block_iq4_nl, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { +// return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size); +//} +
+// gemv +template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> +void gemv(int, float *, size_t, const void *, const void *, int, int); +
+template <> void gemv<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc); +} +
+template <> void gemv<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc); +} +
+template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc); +} +
+template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); +} +
+template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc); +} +
+// gemm +template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> +void gemm(int, float *, size_t, const void *, const void *, int, int); +
+template <> void gemm<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc); +} +
+template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc); +} +
+template <> void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc); +} +
+template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); +} +
+template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc); +} +
+class tensor_traits_base : public ggml::cpu::tensor_traits { + public: + virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0; +}; +
+template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> class tensor_traits : public tensor_traits_base { + + bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override { + // not really a GGML_TYPE_Q8_0 but same size. + switch (op->op) { + case GGML_OP_MUL_MAT: + { + size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1])); + return true; + } + case GGML_OP_MUL_MAT_ID: + { + size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1])); + size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block. 
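// Editorial note (not part of the patch): a worked example of this sizing,
// under assumed shapes. Take PARAM_TYPE = GGML_TYPE_Q8_0 (34 bytes per 32
// values), src1 with ne10 = 4096, ne11 = 1, ne12 = 8 tokens, and ne02 = 4
// experts:
//   ggml_row_size(Q8_0, 4096*1*8) = 32768/32 * 34 = 34816 bytes
//   GGML_PAD(34816, sizeof(int64_t))              = 34816 (already aligned)
//   + mappings: sizeof(int64_t) * 4 * (8 + 1)     =   288 bytes
//   total                                         = 35104 bytes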
+ + const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert + const int64_t ne12 = op->src[1]->ne[2]; // n_tokens + + const size_t sizeof_mmid_row_mapping = sizeof(int64_t); + + size += sizeof_mmid_row_mapping*ne02*(ne12 + 1); + + return true; + } + default: + // GGML_ABORT("fatal error"); + break; + } + return false; + } +
+ bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override { + switch (op->op) { + case GGML_OP_MUL_MAT: + forward_mul_mat(params, op); + return true; + case GGML_OP_MUL_MAT_ID: + forward_mul_mat_id(params, op); + return true; + default: + // GGML_ABORT("fatal error"); + break; + } + return false; + } +
+ void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) { + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + GGML_ASSERT(ggml_n_dims(op->src[0]) == 2); + // GGML_ASSERT(ggml_n_dims(op->src[1]) == 2); +
+ char * wdata = static_cast<char *>(params->wdata); + const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10); + + assert(params->wsize >= nbw1 * ne11); + + const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float; +
+ int64_t i11_processed = 0; + for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) { + ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10); + } +
+ i11_processed = ne11 - ne11 % 4; + for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) { + from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10); + } +
+ ggml_barrier(params->threadpool); +
+ const void * src1_wdata = params->wdata; + const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10); + int64_t src0_start = (ith * ne01) / nth; + int64_t src0_end = ((ith + 1) * ne01) / nth; + src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start; + src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end; + if (src0_start >= src0_end) { + return; + } +
+ // If there are more than three rows in src1, use gemm; otherwise, use gemv. 
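// Editorial note (not part of the patch): concretely, with ne11 = 10 rows of
// src1, the gemm call below processes rows 0..7 (ne11 - ne11 % 4 = 8) four at
// a time, and the gemv loop then handles rows 8 and 9 one at a time; with
// ne11 <= 3 the gemm branch is skipped and gemv covers every row.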
+ if (ne11 > 3) { + gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, + (float *) ((char *) dst->data) + src0_start, ne01, + (const char *) src0->data + src0_start * nb01, + (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start); + } + for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) { + gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, + (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01, + (const char *) src0->data + src0_start * nb01, + (const char *) src1_wdata + (src1_col_stride * iter), 1, + src0_end - src0_start); + } + } +
+ void forward_mul_mat_id(ggml_compute_params * params, ggml_tensor * op) { + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + const ggml_tensor * ids = op->src[2]; + ggml_tensor * dst = op; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float; + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(src0->type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ne03 == 1); + GGML_ASSERT(ne13 == 1); + GGML_ASSERT(ne3 == 1); + + GGML_ASSERT(src1->type == GGML_TYPE_F32); +
+ // row groups + const int n_ids = ids->ne[0]; // n_expert_used + const int n_as = ne02; // n_expert + + const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10); + const size_t nbw2 = nbw1*ne11; + const size_t nbw3 = nbw2*ne12; + + struct mmid_row_mapping { + int32_t i1; + int32_t i2; + }; + + GGML_ASSERT(params->wsize >= + (GGML_PAD(nbw3, sizeof(int64_t)) + + n_as*(ne12 + 1)*sizeof(mmid_row_mapping)) + ); + + auto * wdata = (char *)params->wdata; + auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t)); +
+ // total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t) + auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] + struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12] +
+ // src1: float32 => param type + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = ith; i11 < ne11; i11 += nth) { + from_float((float *)((char *) src1->data + i12 * nb12 + i11 * nb11), + (void *) (wdata + i12 * nbw2 + i11 * nbw1), + ne10); + } + } +
+#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ne12 + (i1)] +
+ if (ith == 0) { + // initialize matrix_row_counts + memset(matrix_row_counts, 0, n_as * sizeof(int64_t)); + + // group rows by src0 matrix + for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) { + for (int32_t id = 0; id < n_ids; ++id) { + const int32_t i02 = + *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]); + + GGML_ASSERT(i02 >= 0 && i02 < n_as); + + MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = { id, iid1 }; + matrix_row_counts[i02] += 1; + } + } + } +
+ ggml_barrier(params->threadpool); +
+ // compute each matrix multiplication in sequence + for (int cur_a = 0; cur_a < n_as; ++cur_a) { + const int64_t cne1 = matrix_row_counts[cur_a]; + + if (cne1 == 0) { + continue; + } + + const auto * src0_cur = (const char *) src0->data + cur_a*nb02; + + //const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = cne1; // src1 rows + + int64_t src0_cur_start = (ith * ne01) / nth; + int64_t src0_cur_end = ((ith + 1) * ne01) / nth; +
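// Editorial note (not part of the patch): as in forward_mul_mat above, each
// thread's [start, end) range of src0 rows is rounded up to a multiple of
// NB_COLS, since the weights are stored NB_COLS-rows interleaved and a kernel
// call cannot begin or end inside an interleaved group; a thread whose rounded
// range collapses to empty bails out.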
+ src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start; + src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end; + + if (src0_cur_start >= src0_cur_end) { + return; + } +
+ for (int ir1 = 0; ir1 < nr1; ir1++) { + struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1); + + const int id = row_mapping.i1; // selected expert index + + const int64_t i11 = id % ne11; + const int64_t i12 = row_mapping.i2; // row index in src1 + + const int64_t i1 = id; // selected expert index + const int64_t i2 = i12; // row + + const auto * src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2); + + gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, + (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01, + src0_cur + src0_cur_start * nb01, + src1_col, 1, src0_cur_end - src0_cur_start); + } + } +#undef MMID_MATRIX_ROW + } +
+ int repack(struct ggml_tensor * t, const void * data, size_t data_size) override { + GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type), + (int) NB_COLS, (int) INTER_SIZE); + return ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size); + } +}; +
+} // namespace ggml::cpu::repack +
+static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) { + + // instance for Q4 + static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0; + static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0; + static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0; + static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K; + + // instance for IQ4 + static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0; +
+ if (cur->type == GGML_TYPE_Q4_0) { + if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) { + if (cur->ne[1] % 8 == 0) { + return &q4_0_8x8_q8_0; + } + } + if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { + if (cur->ne[1] % 4 == 0) { + return &q4_0_4x8_q8_0; + } + } + if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { + if (cur->ne[1] % 4 == 0) { + return &q4_0_4x4_q8_0; + } + } + } else if (cur->type == GGML_TYPE_Q4_K) { + if (ggml_cpu_has_avx2()) { + if (cur->ne[1] % 8 == 0) { + return &q4_K_8x8_q8_K; + } + } + } else if (cur->type == GGML_TYPE_IQ4_NL) { + if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { + if (cur->ne[1] % 4 == 0) { + return &iq4_nl_4x4_q8_0; + } + } + } + + return nullptr; +} +
+static enum ggml_status ggml_backend_cpu_repack_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { + tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_repack_get_optimal_repack_type(tensor)); + + GGML_UNUSED(buffer); + return GGML_STATUS_SUCCESS; +} +
+static void ggml_backend_cpu_repack_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, + const void * data, size_t offset, size_t size) { + GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); + + auto tensor_traits = (ggml::cpu::repack::tensor_traits_base *) tensor->extra; + auto OK = tensor_traits->repack(tensor, data, size); + + GGML_ASSERT(OK == 0); + GGML_UNUSED(buffer); +} +
+static const char * ggml_backend_cpu_repack_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "CPU_REPACK"; + + GGML_UNUSED(buft); +} +
+static ggml_backend_buffer_t ggml_backend_cpu_repack_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + + 
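// Editorial note (not part of the patch): the pattern here is to borrow the
// plain CPU buffer and override only the hooks that matter: init_tensor
// stashes the repack traits in tensor->extra, and set_tensor intercepts the
// weight upload so the data is rewritten into the interleaved layout instead
// of being copied verbatim. get_tensor is deliberately left null below: once
// repacked, the original byte layout can no longer be read back.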
if (buffer == nullptr) { + return nullptr; + } + + buffer->buft = buft; + buffer->iface.init_tensor = ggml_backend_cpu_repack_buffer_init_tensor; + buffer->iface.set_tensor = ggml_backend_cpu_repack_buffer_set_tensor; + buffer->iface.get_tensor = nullptr; + buffer->iface.cpy_tensor = nullptr; + return buffer; +} +
+static size_t ggml_backend_cpu_repack_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return TENSOR_ALIGNMENT; + + GGML_UNUSED(buft); +} +
+namespace ggml::cpu::repack { +class extra_buffer_type : ggml::cpu::extra_buffer_type { + bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override { + if ( op->op == GGML_OP_MUL_MAT && + op->src[0]->buffer && + (ggml_n_dims(op->src[0]) == 2) && + op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type() && + ggml_repack_get_optimal_repack_type(op->src[0]) + ) { + if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { + return false; + } + if (op->src[1]->type == GGML_TYPE_F32) { + return true; + } + //if (op->src[1]->type == GGML_TYPE_Q8_0) { + // return true; + //} + // may be possible if Q8_0 packed... + } else if (op->op == GGML_OP_MUL_MAT_ID + && op->src[0]->buffer + && (ggml_n_dims(op->src[0]) == 3) + && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type() + && ggml_repack_get_optimal_repack_type(op->src[0]) + ) { + if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { + return false; + } + if (op->src[1]->type == GGML_TYPE_F32) { + return true; + } + //if (op->src[1]->type == GGML_TYPE_Q8_0) { + // return true; + //} + } + return false; + } +
+ ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override { + if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) { + if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()) { + return (ggml::cpu::tensor_traits *) op->src[0]->extra; + } + } + return nullptr; + } +}; +} // namespace ggml::cpu::repack +
+ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void) { + static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_repack = { + /* .iface = */ { + /* .get_name = */ ggml_backend_cpu_repack_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_cpu_repack_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_repack_buffer_type_get_alignment, + /* .get_max_size = */ nullptr, // defaults to SIZE_MAX + /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .is_host = */ nullptr, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), + /* .context = */ new ggml::cpu::repack::extra_buffer_type(), + }; + + return &ggml_backend_cpu_buffer_type_repack; +} 
diff --git a/ggml/src/ggml-cpu/repack.h b/ggml/src/ggml-cpu/repack.h new file mode 100644 index 0000000000000..4421e5f8e7046 --- /dev/null +++ b/ggml/src/ggml-cpu/repack.h @@ -0,0 +1,98 @@ +#pragma once + +#define GGML_COMMON_DECL_CPP +#include "ggml-common.h" + +#include "traits.h" +#include "ggml.h" + +// GGML internal header + +ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void); +
+template <int K> constexpr int QK_0() { + if constexpr (K == 4) { + return QK4_0; + } + if constexpr (K == 8) { + return QK8_0; + } + return -1; +} +
+template <int K, int N> struct block { + ggml_half d[N]; // deltas for N qK_0 blocks + int8_t qs[(QK_0<K>() * N * K) / 8]; // quants for N qK_0 blocks +}; +
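// Editorial note (not part of the patch): worked size check for block<4, 4>,
// i.e. four interleaved q4_0 blocks: 4 ggml_half deltas = 8 bytes, plus
// (QK_0<4>() * 4 * 4) / 8 = (32 * 4 * 4) / 8 = 64 quant bytes, 72 bytes in
// total, which is what the first static_assert below (4 * sizeof(ggml_half)
// + QK8_0 * 2 = 8 + 64) pins down.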
size/padding"); +static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding"); +static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding"); +static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding"); + +using block_q4_0x4 = block<4, 4>; +using block_q4_0x8 = block<4, 8>; +using block_q8_0x4 = block<8, 4>; +using block_q8_0x8 = block<8, 8>; + +struct block_q4_Kx8 { + ggml_half d[8]; // super-block scale for quantized scales + ggml_half dmin[8]; // super-block scale for quantized mins + uint8_t scales[96]; // scales and mins, quantized with 6 bits + uint8_t qs[1024]; // 4--bit quants +}; + +static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding"); + +struct block_q8_Kx4 { + float d[4]; // delta + int8_t qs[QK_K * 4]; // quants + int16_t bsums[QK_K / 4]; // sum of quants in groups of 16 +}; + +static_assert(sizeof(block_q8_Kx4) == sizeof(float) * 4 + QK_K * 4 + (QK_K / 4) * sizeof(int16_t), "wrong q8_K block size/padding"); + +struct block_iq4_nlx4 { + ggml_half d[4]; // deltas for 4 iq4_nl blocks + uint8_t qs[QK4_NL * 2]; // nibbles / quants for 4 iq4_nl blocks +}; + +static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding"); + +#if defined(__cplusplus) +extern "C" { +#endif + +void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); + +// Native implementations +void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void 
+// Native implementations +void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); +
+#if defined(__cplusplus) +} // extern "C" +#endif 
diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h index 2e3669c0186c9..b68ac0dd68b40 100644 --- a/ggml/src/ggml-cpu/simd-mappings.h +++ b/ggml/src/ggml-cpu/simd-mappings.h @@ -2,10 +2,167 @@ #include "ggml-cpu-impl.h" +#ifdef __ARM_FEATURE_SVE +#include <arm_sve.h> +#endif // __ARM_FEATURE_SVE +
+#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__) +// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example: +// +// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ +// +#include <arm_neon.h> +#endif +
+#if defined(__F16C__) +#include <immintrin.h> +#endif +
+#ifdef __cplusplus +extern "C" { +#endif + // // simd mappings // +// FP16 to FP32 conversion +
+// 16-bit float +// on Arm, we use __fp16 +// on x86, we use uint16_t +// +// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616 +// for MUSA compilers, we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843 +// +#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x) + + #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) + + static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) { + __fp16 tmp; + memcpy(&tmp, &h, sizeof(ggml_fp16_t)); + return (float)tmp; + } + + static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) { + ggml_fp16_t res; + __fp16 tmp = f; + memcpy(&res, &tmp, sizeof(ggml_fp16_t)); + return res; + } +#elif defined(__F16C__) + #ifdef _MSC_VER + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) + #define 
GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) + #else + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) + #endif +#elif defined(__POWER9_VECTOR__) + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x) + /* the inline asm below is about 12% faster than the lookup method */ + #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) + #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x) + + static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) { + float f; + double d; + __asm__( + "mtfprd %0,%2\n" + "xscvhpdp %0,%0\n" + "frsp %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=f"(f): + /* in */ "r"(h)); + return f; + } + + static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) { + double d; + ggml_fp16_t r; + __asm__( /* xscvdphp can work on double or single precision */ + "xscvdphp %0,%2\n" + "mffprd %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=r"(r): + /* in */ "f"(f)); + return r; + } +#elif defined(__riscv) && defined(__riscv_zfhmin) + static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) { + float f; + __asm__( + "fmv.h.x %[f], %[h]\n\t" + "fcvt.s.h %[f], %[f]" + : [f] "=&f" (f) + : [h] "r" (h) + ); + return f; + } + + static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) { + ggml_fp16_t res; + __asm__( + "fcvt.h.s %[f], %[f]\n\t" + "fmv.x.h %[h], %[f]" + : [h] "=&r" (res) + : [f] "f" (f) + ); + return res; + } + + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x) + #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) + #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x) +#elif defined(__NNPA__) + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x) + + #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) + #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x) + + static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) { + uint16x8_t v_h = vec_splats(h); + uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0); + return vec_extend_to_fp32_hi(v_hd, 0)[0]; + } + + static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) { + float32x4_t v_f = vec_splats(f); + float32x4_t v_zero = vec_splats(0.0f); + uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0); + uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0); + return vec_extract(v_h, 0); + } +#endif + +// precomputed f32 table for f16 (256 KB) +// defined in ggml-cpu.c, initialized in ggml_cpu_init() +extern float ggml_table_f32_f16[1 << 16]; + +// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, +// so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON. +// This is also true for POWER9. 
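// Editorial sketch (not part of the patch): the table covers all 1 << 16 fp16
// bit patterns (256 KB), so the fallback below turns conversion into a single
// indexed load. Its initialization in ggml_cpu_init() is conceptually:
//
//   for (uint32_t i = 0; i < (1u << 16); ++i) {
//       union { uint16_t u; ggml_fp16_t f; } u = { (uint16_t) i };
//       ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.f);
//   }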
+#if !defined(GGML_CPU_FP16_TO_FP32) +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { + uint16_t s; + memcpy(&s, &f, sizeof(uint16_t)); + return ggml_table_f32_f16[s]; +} + +#define GGML_CPU_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) +#endif + +#if !defined(GGML_CPU_FP32_TO_FP16) +#define GGML_CPU_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +#endif + + // we define a common set of C macros which map to specific intrinsics based on the current architecture // we then implement the fundamental computation operations below using only these macros // adding support for new architectures requires to define the corresponding SIMD macros @@ -415,7 +572,7 @@ static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) { float tmp[8]; for (int i = 0; i < 8; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); } return _mm256_loadu_ps(tmp); @@ -426,7 +583,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { _mm256_storeu_ps(arr, y); for (int i = 0; i < 8; i++) - x[i] = GGML_FP32_TO_FP16(arr[i]); + x[i] = GGML_CPU_FP32_TO_FP16(arr[i]); } #define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x) #define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) @@ -574,10 +731,10 @@ static inline unsigned char ggml_endian_byte(int i) { inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) { float tmp[4]; - tmp[0] = GGML_FP16_TO_FP32(p[0]); - tmp[1] = GGML_FP16_TO_FP32(p[1]); - tmp[2] = GGML_FP16_TO_FP32(p[2]); - tmp[3] = GGML_FP16_TO_FP32(p[3]); + tmp[0] = GGML_CPU_FP16_TO_FP32(p[0]); + tmp[1] = GGML_CPU_FP16_TO_FP32(p[1]); + tmp[2] = GGML_CPU_FP16_TO_FP32(p[2]); + tmp[3] = GGML_CPU_FP16_TO_FP32(p[3]); return wasm_v128_load(tmp); } @@ -587,10 +744,10 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { wasm_v128_store(tmp, x); - p[0] = GGML_FP32_TO_FP16(tmp[0]); - p[1] = GGML_FP32_TO_FP16(tmp[1]); - p[2] = GGML_FP32_TO_FP16(tmp[2]); - p[3] = GGML_FP32_TO_FP16(tmp[3]); + p[0] = GGML_CPU_FP32_TO_FP16(tmp[0]); + p[1] = GGML_CPU_FP32_TO_FP16(tmp[1]); + p[2] = GGML_CPU_FP32_TO_FP16(tmp[2]); + p[3] = GGML_CPU_FP32_TO_FP16(tmp[3]); } #define GGML_F16x4 v128_t @@ -690,10 +847,10 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) { float tmp[4]; - tmp[0] = GGML_FP16_TO_FP32(x[0]); - tmp[1] = GGML_FP16_TO_FP32(x[1]); - tmp[2] = GGML_FP16_TO_FP32(x[2]); - tmp[3] = GGML_FP16_TO_FP32(x[3]); + tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]); + tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]); + tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]); + tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]); return _mm_loadu_ps(tmp); } @@ -703,10 +860,10 @@ static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) { _mm_storeu_ps(arr, y); - x[0] = GGML_FP32_TO_FP16(arr[0]); - x[1] = GGML_FP32_TO_FP16(arr[1]); - x[2] = GGML_FP32_TO_FP16(arr[2]); - x[3] = GGML_FP32_TO_FP16(arr[3]); + x[0] = GGML_CPU_FP32_TO_FP16(arr[0]); + x[1] = GGML_CPU_FP32_TO_FP16(arr[1]); + x[2] = GGML_CPU_FP32_TO_FP16(arr[2]); + x[3] = GGML_CPU_FP32_TO_FP16(arr[3]); } #define GGML_F32Cx4 __m128 @@ -828,7 +985,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) { #define GGML_F32x4_ZERO __lsx_vldi(0) #define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0) #define GGML_F32x4_LOAD(x) __lsx_vld((x), 0) -#define GGML_F32x4_STORE((x),(y)) __lsx_vst((y), (x), 0) +#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0) #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a) #define GGML_F32x4_ADD __lsx_vfadd_s #define GGML_F32x4_MUL 
__lsx_vfmul_s @@ -874,10 +1031,10 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) { static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) { float tmp[4]; - tmp[0] = GGML_FP16_TO_FP32(x[0]); - tmp[1] = GGML_FP16_TO_FP32(x[1]); - tmp[2] = GGML_FP16_TO_FP32(x[2]); - tmp[3] = GGML_FP16_TO_FP32(x[3]); + tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]); + tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]); + tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]); + tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]); return __lsx_vld(tmp, 0); } @@ -887,10 +1044,10 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { __lsx_vst(y, arr, 0); - x[0] = GGML_FP32_TO_FP16(arr[0]); - x[1] = GGML_FP32_TO_FP16(arr[1]); - x[2] = GGML_FP32_TO_FP16(arr[2]); - x[3] = GGML_FP32_TO_FP16(arr[3]); + x[0] = GGML_CPU_FP32_TO_FP16(arr[0]); + x[1] = GGML_CPU_FP32_TO_FP16(arr[1]); + x[2] = GGML_CPU_FP32_TO_FP16(arr[2]); + x[3] = GGML_CPU_FP32_TO_FP16(arr[3]); } #define GGML_F32Cx4 __m128 @@ -922,7 +1079,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { #define GGML_F32_STEP 32 #define GGML_F32_EPR 4 -#define GGML_F32x4 __vector float +#define GGML_F32x4 float32x4_t #define GGML_F32x4_ZERO vec_splats(0.0f) #define GGML_F32x4_SET1 vec_splats #define GGML_F32x4_LOAD(p) vec_xl(0, p) @@ -944,10 +1101,8 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { for (int i = 0; i < offset; ++i) { \ x[i] = vec_add(x[i], x[offset + i]); \ } \ - res = vec_extract(x[0], 0) + \ - vec_extract(x[0], 1) + \ - vec_extract(x[0], 2) + \ - vec_extract(x[0], 3); \ + float32x4_t tmp = x[0] + vec_reve(x[0]); \ + res = tmp[0] + tmp[1]; \ } #define GGML_F32_VEC GGML_F32x4 @@ -964,28 +1119,45 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { #define GGML_F16_STEP GGML_F32_STEP #define GGML_F16_EPR GGML_F32_EPR -static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) { +static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) { +#if defined(__NNPA__) + uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x); + uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0); + return vec_extend_to_fp32_hi(v_xd, 0); +#else float tmp[4]; for (int i = 0; i < 4; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); } // note: keep type-cast here to prevent compiler bugs // see: https://github.com/ggml-org/llama.cpp/issues/12846 return vec_xl(0, (const float *)(tmp)); +#endif } -static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) { +static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) { +#if defined(__NNPA__) + float32x4_t v_zero = vec_splats(0.0f); + uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0); + uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0); + + x[0] = vec_extract(v_x, 0); + x[1] = vec_extract(v_x, 1); + x[2] = vec_extract(v_x, 2); + x[3] = vec_extract(v_x, 3); +#else float arr[4]; // note: keep type-cast here to prevent compiler bugs // see: https://github.com/ggml-org/llama.cpp/issues/12846 - vec_xst(y, 0, (float *)(arr)); + vec_xst(v_y, 0, (float *)(arr)); for (int i = 0; i < 4; i++) { - x[i] = GGML_FP32_TO_FP16(arr[i]); + x[i] = GGML_CPU_FP32_TO_FP16(arr[i]); } +#endif } #define GGML_F16_VEC GGML_F32x4 @@ -1006,3 +1178,7 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) { #define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) #endif + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/ggml-cpu/ggml-cpu-traits.cpp b/ggml/src/ggml-cpu/traits.cpp similarity 
index 97% rename from ggml/src/ggml-cpu/ggml-cpu-traits.cpp rename to ggml/src/ggml-cpu/traits.cpp index 62a0712dabbf6..139fa59641440 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +++ b/ggml/src/ggml-cpu/traits.cpp @@ -1,4 +1,4 @@ -#include "ggml-cpu-traits.h" +#include "traits.h" #include "ggml-backend-impl.h" #include "ggml-backend.h" diff --git a/ggml/src/ggml-cpu/ggml-cpu-traits.h b/ggml/src/ggml-cpu/traits.h similarity index 100% rename from ggml/src/ggml-cpu/ggml-cpu-traits.h rename to ggml/src/ggml-cpu/traits.h diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp index f7614568ea388..5e34d79a1695f 100644 --- a/ggml/src/ggml-cpu/vec.cpp +++ b/ggml/src/ggml-cpu/vec.cpp @@ -219,11 +219,11 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G // leftovers for (int i = np; i < n; ++i) { - sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); + sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i])); } #else for (int i = 0; i < n; ++i) { - sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); + sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i])); } #endif diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index 09dbade2179fb..84f6c0e6d26c4 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -58,7 +58,7 @@ inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { for (int i = 0; i < n; ++i) { - z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) + GGML_FP16_TO_FP32(y[i])); + z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i])); } } inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } @@ -67,7 +67,7 @@ inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { for (int i = 0; i < n; ++i) { - z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) - GGML_FP16_TO_FP32(y[i])); + z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i])); } } inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; } @@ -75,20 +75,20 @@ inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(-GGML_FP16_TO_FP32(x[i])); + y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i])); } } inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { for (int i = 0; i < n; ++i) { - z[i] = 
GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) * GGML_FP16_TO_FP32(y[i])); + z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i])); } } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { for (int i = 0; i < n; ++i) { - z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) / GGML_FP16_TO_FP32(y[i])); + z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i])); } } @@ -131,13 +131,13 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG // leftovers for (int i = np; i < n; ++i) { for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); + sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i])); } } #else for (int i = 0; i < n; ++i) { for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); + sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i])); } } #endif @@ -280,12 +280,12 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, // leftovers for (int i = np; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v); } #else // scalar for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v); } #endif } @@ -430,12 +430,12 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float // leftovers for (int i = np; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v); + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v); } #else // scalar for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v); + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v); } #endif } @@ -444,103 +444,103 @@ inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16(v*v); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16(v*v); } } inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(sqrtf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(sqrtf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); } inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(logf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(logf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void 
ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); } inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(sinf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(sinf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); } inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(cosf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(cosf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(fabsf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(fabsf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f)); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f)); } } inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16((GGML_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f); + y[i] = GGML_CPU_FP32_TO_FP16((GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f); } } inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(tanhf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(tanhf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); } inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(expm1f(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(expm1f(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v : 0.f); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f); } } inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? 
x[i] : 0.f); } inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f)); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f)); } } inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); } inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(1.f / (1.f + expf(-GGML_FP16_TO_FP32(x[i])))); + y[i] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-GGML_CPU_FP16_TO_FP32(x[i])))); } } // TODO: optimize performance inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f))); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f))); } } inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f))); + y[i] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f))); } } inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); } inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(expf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(expf(GGML_CPU_FP16_TO_FP32(x[i]))); } } @@ -562,9 +562,9 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float xi = GGML_FP16_TO_FP32(x[i]); + float xi = GGML_CPU_FP16_TO_FP32(x[i]); float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV)); - y[i] = GGML_FP32_TO_FP16(res); + y[i] = GGML_CPU_FP32_TO_FP16(res); } } @@ -577,9 +577,9 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { } else if (x[i] >= 10.0f) { y[i] = x[i]; } else { - ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]); + y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]); } } } @@ -613,9 +613,9 @@ inline static float ggml_gelu_quick_f32(float x) { inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { uint16_t t; for (int i = 0; i < n; ++i) { - ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]); + y[i] = 
GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]); } } #else @@ -628,8 +628,8 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v)))); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v)))); } } @@ -638,8 +638,8 @@ inline static float ggml_silu_f32(float x) { return x/(1.0f + expf(-x)); } inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) { - float v = GGML_FP16_TO_FP32(x); - return GGML_FP32_TO_FP16(v/(1.0f + expf(-v))); + float v = GGML_CPU_FP16_TO_FP32(x); + return GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v))); } #if __FINITE_MATH_ONLY__ @@ -888,9 +888,9 @@ inline static float ggml_silu_backward_f32(float x, float dy) { } inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) { - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); const float s = 1.0f/(1.0f + expf(-v)); - return GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s))); + return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s))); } inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { @@ -928,7 +928,7 @@ inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) { float sum = 0.0f; for (int i = 0; i < n; ++i) { - sum += GGML_FP16_TO_FP32(x[i]); + sum += GGML_CPU_FP16_TO_FP32(x[i]); } *s = sum; } diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index e1ce1d4cd1558..ea20355023825 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -19,10 +19,10 @@ #endif #include "ggml-common.h" -#include #include #include #include +#include #include #include @@ -76,11 +76,9 @@ #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1) // Moore Threads -#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210) - -#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000 -#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000 -#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD +#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000 +#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000 +#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD #define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD) #define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2) @@ -203,13 +201,13 @@ typedef float2 dfloat2; #define FAST_FP16_AVAILABLE #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610 -#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA +#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA) #define FP16_MMA_AVAILABLE -#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA +#endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA) -#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || 
defined(RDNA4)) +#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4))) #define FP16_MMA_AVAILABLE -#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4)) +#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4))) #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING #define NEW_MMA_AVAILABLE @@ -219,9 +217,9 @@ typedef float2 dfloat2; #define CP_ASYNC_AVAILABLE #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE -#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1) +#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220) #define FLASH_ATTN_AVAILABLE -#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1) +#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220) static bool fp16_available(const int cc) { return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL; @@ -233,7 +231,8 @@ static bool fast_fp16_available(const int cc) { // To be used for feature selection of external libraries, e.g. cuBLAS. static bool fast_fp16_hardware_available(const int cc) { - return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc); + return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc) || + (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2); } // Any FP16 tensor core instructions are available for ggml code. @@ -241,15 +240,35 @@ static bool fp16_mma_available(const int cc) { #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN) return false; #else - return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) || - GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc); + if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) || + GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || + GGML_CUDA_CC_IS_MTHREADS(cc)) { + return true; + } else if (GGML_CUDA_CC_IS_RDNA4(cc)) { +#if defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12) + return true; +#else + return false; +#endif // defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12) + } else { + return false; + } #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN) } // To be used for feature selection of external libraries, e.g. cuBLAS. static bool fp16_mma_hardware_available(const int cc) { return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || - GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc); + GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc) || + (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2); +} + +static bool bf16_mma_hardware_available(const int cc) { + return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3; +} + +static bool fp32_mma_hardware_available(const int cc) { + return GGML_CUDA_CC_IS_CDNA(cc); } // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later. 
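Note on the capability helpers above: the patch keeps the split between "the compiled code can use the instructions" (fp16_mma_available) and "the physical GPU has the units" (fp16_mma_hardware_available), and now also admits Moore Threads QY2-and-newer parts to the fast-FP16 and FP16-MMA hardware checks. A hedged, self-contained sketch of how such layered checks are typically consumed by a dispatch site — pick_gemm_path() and the numeric cc thresholds below are illustrative stand-ins, not code from this patch:

```cpp
#include <cstdio>

// Stand-ins for the real GGML_CUDA_CC_* constants and helpers (cc is encoded
// as major*100 + minor*10, so 610 == compute capability 6.1).
static bool fast_fp16(int cc) { return cc >= 600 && cc != 610; } // Pascal+, minus 6.1's slow FP16
static bool fp16_mma(int cc)  { return cc >= 700; }              // Volta+ tensor cores

static const char * pick_gemm_path(int cc) {
    if (fp16_mma(cc))  return "FP16 tensor-core GEMM";
    if (fast_fp16(cc)) return "FP16 cuBLAS GEMM";
    return "FP32 fallback";
}

int main() {
    const int ccs[] = {520, 610, 700, 890};
    for (int cc : ccs) {
        std::printf("cc %d -> %s\n", cc, pick_gemm_path(cc));
    }
    return 0;
}
```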
@@ -262,11 +281,11 @@ static bool cp_async_available(const int cc) { } static constexpr __device__ int ggml_cuda_get_physical_warp_size() { -#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) - return __AMDGCN_WAVEFRONT_SIZE; +#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__)) + return 64; #else return 32; -#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) +#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__)) } [[noreturn]] @@ -362,6 +381,26 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { #endif // FP16_AVAILABLE } +// Row reduction kernel template - compute sum (norm=false) or mean (norm=true) +template <bool norm> +static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) { + const int row = blockIdx.x; + const int col = threadIdx.x; + + float sum = 0.0f; + for (int i = col; i < ncols; i += blockDim.x) { + sum += x[row * ncols + i]; + } + + sum = warp_reduce_sum(sum); + + if (col != 0) { + return; + } + + dst[row] = norm ? sum / ncols : sum; +} + template <int width = WARP_SIZE> static __device__ __forceinline__ float warp_reduce_max(float x) { #pragma unroll @@ -466,9 +505,6 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) } -// TODO: move to ggml-common.h -static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; - typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v); static __device__ __forceinline__ float get_alibi_slope( @@ -770,21 +806,7 @@ struct ggml_backend_cuda_context { name(GGML_CUDA_NAME + std::to_string(device)) { } - ~ggml_backend_cuda_context() { - if (copy_event != nullptr) { - CUDA_CHECK(cudaEventDestroy(copy_event)); - } - for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) { - for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) { - if (streams[i][j] != nullptr) { - CUDA_CHECK(cudaStreamDestroy(streams[i][j])); - } - } - if (cublas_handles[i] != nullptr) { - CUBLAS_CHECK(cublasDestroy(cublas_handles[i])); - } - } - } + ~ggml_backend_cuda_context(); cudaStream_t stream(int device, int stream) { if (streams[device][stream] == nullptr) { diff --git a/ggml/src/ggml-cuda/conv2d-dw.cu b/ggml/src/ggml-cuda/conv2d-dw.cu new file mode 100644 index 0000000000000..7583233b1b7cd --- /dev/null +++ b/ggml/src/ggml-cuda/conv2d-dw.cu @@ -0,0 +1,161 @@ +#include "conv2d-dw.cuh" + +struct conv_params { + int in_w, in_h; + int out_w, out_h; + int kernel_w, kernel_h; + int stride_x, stride_y; + int padding_x, padding_y; + int dilation_x, dilation_y; + int channels, batches; +}; + +struct kernel_bounds { + int y_min, y_max; + int x_min, x_max; +}; + +__device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int out_x, int out_y, const conv_params & params) { + kernel_bounds bounds; + bounds.y_min = max(0, (params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); + bounds.y_max = + min(params.kernel_h, + (params.in_h + params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y); + bounds.x_min = max(0, (params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); + bounds.x_max = + min(params.kernel_w, + (params.in_w + params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x); + return bounds; +} + +__device__
__forceinline__ int calculate_input_coord(int out_coord, int kern_coord, int stride, int dilation, int padding) { + return out_coord * stride + kern_coord * dilation - padding; +} + +struct whcn_layout { + __device__ static int input_index(int n, int c, int y, int x, const conv_params & params) { + return n * (params.channels * params.in_w * params.in_h) + c * params.in_w * params.in_h + y * params.in_w + x; + } + + __device__ static int kernel_index(int c, int ky, int kx, const conv_params & params) { + return c * params.kernel_h * params.kernel_w + ky * params.kernel_w + kx; + } + + __device__ static int output_index(int n, int c, int y, int x, const conv_params & params) { + return n * (params.channels * params.out_w * params.out_h) + c * params.out_w * params.out_h + + y * params.out_w + x; + } + + __device__ static void unpack_indices(int global_idx, const conv_params & params, int & n, int & c, int & out_y, + int & out_x) { + out_x = global_idx % params.out_w; + out_y = (global_idx / params.out_w) % params.out_h; + c = (global_idx / (params.out_w * params.out_h)) % params.channels; + n = global_idx / (params.out_w * params.out_h * params.channels); + } +}; + +struct cwhn_layout { + __device__ static int input_index(int n, int c, int y, int x, const conv_params & params) { + return n * (params.channels * params.in_w * params.in_h) + (y * params.in_w + x) * params.channels + c; + } + + __device__ static int kernel_index(int c, int ky, int kx, const conv_params & params) { + return (ky * params.kernel_w + kx) * params.channels + c; + } + + __device__ static int output_index(int n, int c, int y, int x, const conv_params & params) { + return n * (params.channels * params.out_w * params.out_h) + y * (params.out_w * params.channels) + + x * params.channels + c; + } + + __device__ static void unpack_indices(int global_idx, const conv_params & params, int & n, int & c, int & out_y, + int & out_x) { + c = global_idx % params.channels; + out_x = (global_idx / params.channels) % params.out_w; + out_y = (global_idx / (params.channels * params.out_w)) % params.out_h; + n = global_idx / (params.channels * params.out_w * params.out_h); + } +}; + +template <typename T, typename Layout> +__global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restrict__ kernel, T * __restrict__ output, + const int in_w, const int in_h, const int out_w, const int out_h, + const int kernel_w, const int kernel_h, const int stride_x, const int stride_y, + const int padding_x, const int padding_y, const int dilation_x, const int dilation_y, + const int channels, const int batches) { + const int global_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int total_elements = batches * channels * out_h * out_w; + + if (global_idx >= total_elements) { + return; + } + + conv_params params = { in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, + stride_y, padding_x, padding_y, dilation_x, dilation_y, channels, batches }; + + int batch_idx, channel_idx, out_y_idx, out_x_idx; + Layout::unpack_indices(global_idx, params, batch_idx, channel_idx, out_y_idx, out_x_idx); + + T accumulator = 0; + kernel_bounds bounds = calculate_kernel_bounds(out_x_idx, out_y_idx, params); + + for (int kern_y = bounds.y_min; kern_y < bounds.y_max; ++kern_y) { + int in_y_idx = calculate_input_coord(out_y_idx, kern_y, params.stride_y, params.dilation_y, params.padding_y); + + for (int kern_x = bounds.x_min; kern_x < bounds.x_max; ++kern_x) { + int in_x_idx = calculate_input_coord(out_x_idx, kern_x, params.stride_x, params.dilation_x, params.padding_x); + 
const T input_val = input[Layout::input_index(batch_idx, channel_idx, in_y_idx, in_x_idx, params)]; + const T kernel_val = kernel[Layout::kernel_index(channel_idx, kern_y, kern_x, params)]; + + accumulator += input_val * kernel_val; + } + } + + output[Layout::output_index(batch_idx, channel_idx, out_y_idx, out_x_idx, params)] = accumulator; +} + +void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * kernel = dst->src[0]; + const ggml_tensor * input = dst->src[1]; + + GGML_ASSERT(kernel->type == GGML_TYPE_F32 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + const float * w_d = (const float *) kernel->data; + const float * x_d = (const float *) input->data; + float * y_d = (float *) dst->data; + + const int32_t * p = (const int32_t *) dst->op_params; + const int stride_x = p[0]; + const int stride_y = p[1]; + const int padding_x = p[2]; + const int padding_y = p[3]; + const int dilation_x = p[4]; + const int dilation_y = p[5]; + + const int in_w = input->ne[0]; + const int in_h = input->ne[1]; + const int kernel_w = kernel->ne[0]; + const int kernel_h = kernel->ne[1]; + const int out_w = dst->ne[0]; + const int out_h = dst->ne[1]; + const int channels = dst->ne[2]; + const int batches = dst->ne[3]; + + cudaStream_t st = ctx.stream(); + + const int total = batches * channels * out_h * out_w; + const int blocks = (total + CUDA_CONV2D_DW_BLOCK_SIZE - 1) / CUDA_CONV2D_DW_BLOCK_SIZE; + + if (ggml_is_contiguous(input)) { + conv2d_dw_kernel<float, whcn_layout><<<blocks, CUDA_CONV2D_DW_BLOCK_SIZE, 0, st>>>( + x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y, + dilation_x, dilation_y, channels, batches); + } else if (ggml_is_contiguous_channels(input)) { + conv2d_dw_kernel<float, cwhn_layout><<<blocks, CUDA_CONV2D_DW_BLOCK_SIZE, 0, st>>>( + x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y, + dilation_x, dilation_y, channels, batches); + } else { + GGML_ABORT("Unsupported memory layout for conv_2d_dw"); + } +} diff --git a/ggml/src/ggml-cuda/conv2d-dw.cuh b/ggml/src/ggml-cuda/conv2d-dw.cuh new file mode 100644 index 0000000000000..b5d5a69d345cf --- /dev/null +++ b/ggml/src/ggml-cuda/conv2d-dw.cuh @@ -0,0 +1,5 @@ +#pragma once +#include "common.cuh" + +#define CUDA_CONV2D_DW_BLOCK_SIZE 256 +void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/conv2d-transpose.cu b/ggml/src/ggml-cuda/conv2d-transpose.cu new file mode 100644 index 0000000000000..03224e404d32d --- /dev/null +++ b/ggml/src/ggml-cuda/conv2d-transpose.cu @@ -0,0 +1,91 @@ +#include + +#include "conv2d-transpose.cuh" +#include "ggml.h" + +__global__ void conv2d_transpose_kernel(const float * __restrict__ input, const half * __restrict__ kernel, + float * __restrict__ output, const int in_w, const int in_h, const int out_w, + const int out_h, const int kernel_w, const int kernel_h, const int stride, + const int c_in, const int c_out, const int batches) { + const int global_idx = blockIdx.x * blockDim.x + threadIdx.x; + + const int total_elements = out_w * out_h * c_out * batches; + + if (global_idx >= total_elements) { + return; + } + + const int out_x_idx = global_idx % out_w; + const int out_y_idx = (global_idx / out_w) % out_h; + const int c_idx = (global_idx / (out_w * out_h)) % c_out; + const int n_idx = global_idx / (out_w * out_h * c_out); + + float accumulator = 0; + // For each output idx, find the inputs that contribute to it by checking stride alignment and bounds + + for (int c_in_idx = 0; c_in_idx < c_in; c_in_idx++) { + for
(int kh = 0; kh < kernel_h; ++kh) { + int in_y = out_y_idx - kh; + if (in_y < 0 || in_y % stride) continue; + in_y /= stride; + if (in_y >= in_h) continue; + + for (int kw = 0; kw < kernel_w; ++kw) { + int in_x = out_x_idx - kw; + if (in_x < 0 || in_x % stride) continue; + in_x /= stride; + if (in_x >= in_w) continue; + + const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + (in_w) *in_y + in_x; + const int kernel_idx = + (kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + (kernel_w) *kh + kw; + + float input_val = input[input_idx]; + half kern_val = kernel[kernel_idx]; + + accumulator += input_val * (float) kern_val; + } + } + } + + output[(out_w * out_h * c_out) * n_idx + (out_w * out_h) * c_idx + (out_w) *out_y_idx + out_x_idx] = accumulator; +} + +//input is (W, H, C_in, N), Kernel is (W, H, C_out, C_in) +void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * kernel = dst->src[0]; + const ggml_tensor * input = dst->src[1]; + + GGML_ASSERT(kernel->type == GGML_TYPE_F16 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + + const float * input_data = (const float *) input->data; + float * output_data = (float *) dst->data; + const half * kernel_data = (const half *) kernel->data; + + const int input_w = input->ne[0]; + const int input_h = input->ne[1]; + const int output_w = dst->ne[0]; + const int output_h = dst->ne[1]; + const int channels_in = input->ne[2]; + const int channels_out = kernel->ne[2]; + const int kernel_w = kernel->ne[0]; + const int kernel_h = kernel->ne[1]; + const int stride = dst->op_params[0]; + const int batches = input->ne[3]; + + GGML_ASSERT(channels_in == kernel->ne[3]); + GGML_ASSERT(stride > 0); + + cudaStream_t st = ctx.stream(); + + GGML_ASSERT(ggml_is_contiguous(input)); + GGML_ASSERT(ggml_is_contiguous(kernel)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + const int total = (output_w * output_h * channels_out * batches); + const int blocks = (total + CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE - 1) / CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE; + + conv2d_transpose_kernel<<<blocks, CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE, 0, st>>>( + input_data, kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w, kernel_h, stride, + channels_in, channels_out, batches); } diff --git a/ggml/src/ggml-cuda/conv2d-transpose.cuh b/ggml/src/ggml-cuda/conv2d-transpose.cuh new file mode 100644 index 0000000000000..c9430b2485021 --- /dev/null +++ b/ggml/src/ggml-cuda/conv2d-transpose.cuh @@ -0,0 +1,4 @@ +#include "common.cuh" + +#define CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE 256 +void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index 925f39e890db9..e230f6d494d77 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -652,9 +652,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( float KQ_max_scale[cols_per_thread]; #pragma unroll for (int col = 0; col < cols_per_thread; ++col) { - KQ_max_scale[col] = expf(KQ_max[col] - KQ_max_new[col]); + const float KQ_max_diff = KQ_max[col] - KQ_max_new[col]; + KQ_max_scale[col] = expf(KQ_max_diff); KQ_max[col] = KQ_max_new[col]; + *((uint32_t *) &KQ_max_scale[col]) *= KQ_max_diff >= SOFTMAX_FTZ_THRESHOLD; + // Scale previous KQ_rowsum to account for a potential increase in KQ_max: KQ_rowsum[col] = KQ_max_scale[col]*KQ_rowsum[col] + KQ_rowsum_add[col]; } diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu 
b/ggml/src/ggml-cuda/fattn-wmma-f16.cu index c5668adb152b2..f3b794c3644c8 100644 --- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu +++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu @@ -9,7 +9,11 @@ #ifdef FP16_MMA_AVAILABLE #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) #include +#ifdef GGML_USE_MUSA +namespace wmma = mtmusa::wmma; +#else // GGML_USE_MUSA namespace wmma = nvcuda::wmma; +#endif // GGML_USE_MUSA #elif defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE) #undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers #include diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 2a6f7f108b3f8..b30c13c62f25c 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -11,6 +11,8 @@ #include "ggml-cuda/clamp.cuh" #include "ggml-cuda/concat.cuh" #include "ggml-cuda/conv-transpose-1d.cuh" +#include "ggml-cuda/conv2d-dw.cuh" +#include "ggml-cuda/conv2d-transpose.cuh" #include "ggml-cuda/convert.cuh" #include "ggml-cuda/count-equal.cuh" #include "ggml-cuda/cpy.cuh" @@ -35,6 +37,7 @@ #include "ggml-cuda/ssm-scan.cuh" #include "ggml-cuda/sum.cuh" #include "ggml-cuda/sumrows.cuh" +#include "ggml-cuda/mean.cuh" #include "ggml-cuda/tsembd.cuh" #include "ggml-cuda/unary.cuh" #include "ggml-cuda/upscale.cuh" @@ -47,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -54,9 +58,8 @@ #include #include #include -#include -#include #include +#include #include #include #include @@ -97,8 +100,7 @@ int ggml_cuda_get_device() { static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) { ggml_cuda_set_device(device); cudaError_t err; - if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) - { + if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) { err = cudaMallocManaged(ptr, size); #if defined(GGML_USE_HIP) if (err == hipSuccess) { @@ -116,9 +118,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) err = cudaMalloc(ptr, size); } #endif // defined(GGML_USE_HIP) - } - else - { + } else { err = cudaMalloc(ptr, size); } return err; @@ -514,6 +514,33 @@ std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(i return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device)); } +// destroying a cuBLAS handle while a graph is being captured in a different thread can result in a CUDA error +// this lock is used to ensure that no cuBLAS handle is destroyed while a graph is being captured + +static std::mutex ggml_cuda_lock; +static std::condition_variable ggml_cuda_lock_cv; +static std::atomic<int> ggml_cuda_lock_counter; + +ggml_backend_cuda_context::~ggml_backend_cuda_context() { + std::unique_lock<std::mutex> lock(ggml_cuda_lock); + ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter.load(std::memory_order_relaxed) == 0; }); + + if (copy_event != nullptr) { + CUDA_CHECK(cudaEventDestroy(copy_event)); + } + for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) { + for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) { + if (streams[i][j] != nullptr) { + CUDA_CHECK(cudaStreamDestroy(streams[i][j])); + } + } + if (cublas_handles[i] != nullptr) { + CUBLAS_CHECK(cublasDestroy(cublas_handles[i])); + } + } +} + + // cuda buffer struct ggml_backend_cuda_buffer_context { @@ -615,9 +642,8 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); - CUDA_CHECK(cudaDeviceSynchronize()); - 
CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size)); - CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaMemsetAsync(ctx->dev_ptr, value, buffer->size, cudaStreamPerThread)); + CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); } static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { @@ -1144,7 +1170,6 @@ typedef void (*ggml_cuda_op_mul_mat_t)( static cudaError_t ggml_cuda_cpy_tensor_2d( void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) { - GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer)); const char * src_ptr = (const char *) src->data; char * dst_ptr = (char *) dst; @@ -1202,9 +1227,12 @@ static void ggml_cuda_op_mul_mat_cublas( const int cc = ggml_cuda_info().devices[id].cc; + const bool supports_bf16 = GGML_CUDA_CC_IS_NVIDIA(cc) || GGML_CUDA_CC_IS_AMD(cc) || + (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2); + const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT; - if (src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) { + if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) { ggml_cuda_pool_alloc src1_as_bf16(ctx.pool(id)); if (src1->type != GGML_TYPE_BF16) { const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type); @@ -1232,7 +1260,7 @@ static void ggml_cuda_op_mul_mat_cublas( const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16); to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream); - } else if (((GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) { + } else if (fast_fp16_hardware_available(cc) && use_fp16) { // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 ggml_cuda_pool_alloc src0_as_f16(ctx.pool(id)); if (src0->type != GGML_TYPE_F16) { @@ -1427,8 +1455,6 @@ static void ggml_cuda_op_mul_mat( const int64_t nb2 = dst->nb[2]; const int64_t nb3 = dst->nb[3]; - GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer)); - GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer)); ggml_backend_cuda_buffer_context * src1_ctx = (ggml_backend_cuda_buffer_context *) src1->buffer->context; ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *) dst->buffer->context; @@ -1750,7 +1776,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); - GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer)); + GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft)); GGML_ASSERT(src0->type == GGML_TYPE_F16); // Byte offsets and tensor dimensions are currently used in an inconsistent way for dst. 
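Aside on the ggml_backend_cuda_buffer_clear hunk above: it swaps two device-wide cudaDeviceSynchronize() calls around a blocking cudaMemset for an asynchronous memset on the per-thread default stream, then waits on that stream only. A self-contained sketch of the same pattern — buffer size and variable names here are illustrative, not from the patch:

```cpp
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    void * dev_ptr = nullptr;
    const size_t size = 1 << 20;
    if (cudaMalloc(&dev_ptr, size) != cudaSuccess) {
        std::puts("no CUDA device available");
        return 0;
    }
    // Enqueue the clear on the per-thread default stream...
    cudaMemsetAsync(dev_ptr, 0, size, cudaStreamPerThread);
    // ...then drain only this stream, instead of synchronizing the whole device twice.
    cudaStreamSynchronize(cudaStreamPerThread);
    cudaFree(dev_ptr);
    return 0;
}
```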
@@ -1920,16 +1946,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src; bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) - && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 - && src0->ne[0] % 2 == 0 && src1->ne[1] == 1; + && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32; bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE; bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32; - bool any_gpus_with_slow_fp16 = false; - bool any_gpus_without_fp16_mma = false; + bool any_gpus_with_slow_fp16 = false; if (split) { ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; @@ -1940,16 +1964,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor continue; } - const int cc = ggml_cuda_info().devices[id].cc; - use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); - any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc); - any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc); + const int cc = ggml_cuda_info().devices[id].cc; + use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); + use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]); + any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc); } } else { - const int cc = ggml_cuda_info().devices[ctx.device].cc; - use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); - any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc); - any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc); + const int cc = ggml_cuda_info().devices[ctx.device].cc; + use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); + use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]); + any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc); } // debug helpers @@ -1960,7 +1984,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); - if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) { + if (!split && use_mul_mat_vec) { // the custom F16 vector kernel can be used over batched cuBLAS GEMM // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention) ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst); @@ -2314,6 +2338,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_IM2COL: ggml_cuda_op_im2col(ctx, dst); break; + case GGML_OP_CONV_2D_DW: + 
ggml_cuda_op_conv2d_dw(ctx, dst); + break; + case GGML_OP_CONV_TRANSPOSE_2D: + ggml_cuda_conv_2d_transpose_p0(ctx, dst); + break; case GGML_OP_CONV_TRANSPOSE_1D: ggml_cuda_op_conv_transpose_1d(ctx,dst); break; @@ -2326,6 +2356,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_SUM_ROWS: ggml_cuda_op_sum_rows(ctx, dst); break; + case GGML_OP_MEAN: + ggml_cuda_op_mean(ctx, dst); + break; case GGML_OP_SSM_CONV: ggml_cuda_op_ssm_conv(ctx, dst); break; @@ -2668,7 +2701,9 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft))); } } -#endif +#else + GGML_UNUSED(integrated); +#endif // NDEBUG bool ok = ggml_cuda_compute_forward(*cuda_ctx, node); if (!ok) { @@ -2687,6 +2722,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph)); graph_evaluated_or_captured = true; // CUDA graph has been captured + + std::lock_guard lock(ggml_cuda_lock); + if (ggml_cuda_lock_counter.fetch_sub(1, std::memory_order_relaxed) == 1) { + ggml_cuda_lock_cv.notify_all(); + } } else { graph_evaluated_or_captured = true; // ggml graph has been directly evaluated } @@ -2762,7 +2802,13 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, } } - if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture + if (use_cuda_graph && cuda_graph_update_required) { + // Start CUDA graph capture + { + std::lock_guard lock(ggml_cuda_lock); + ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed); + } + CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed)); } @@ -3018,9 +3064,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return false; } #ifdef GGML_USE_MUSA - if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 && - !ggml_is_transposed(a) && !ggml_is_transposed(b)) { - return false; + const int cc = ggml_cuda_info().devices[dev_ctx->device].cc; + if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) { + if (GGML_CUDA_CC_IS_QY1(cc) && op->op == GGML_OP_MUL_MAT && + a->type == GGML_TYPE_F16 && b->type == GGML_TYPE_F16) { + return false; + } + if (GGML_CUDA_CC_IS_QY2(cc) && op->op == GGML_OP_MUL_MAT_ID && + a->type == GGML_TYPE_Q2_K && b->type == GGML_TYPE_F32) { + return false; + } } #endif // GGML_USE_MUSA switch (a->type) { @@ -3047,11 +3100,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_BF16: -#ifdef GGML_USE_MUSA - if (a->type == GGML_TYPE_Q3_K) { - return false; - } -#endif // GGML_USE_MUSA return true; default: return false; @@ -3211,9 +3259,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]); } case GGML_OP_IM2COL: + case GGML_OP_CONV_2D_DW: + case GGML_OP_CONV_TRANSPOSE_2D: case GGML_OP_POOL_2D: case GGML_OP_SUM: case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: case GGML_OP_ARGSORT: case GGML_OP_ACC: return true; diff --git a/ggml/src/ggml-cuda/mean.cu b/ggml/src/ggml-cuda/mean.cu new file mode 100644 index 0000000000000..4b238a3998ba3 --- /dev/null +++ b/ggml/src/ggml-cuda/mean.cu @@ -0,0 +1,19 @@ +#include "mean.cuh" + +void 
ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *) src0->data; + float * dst_d = (float *) dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); + + const int64_t ncols = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + const dim3 block_dims(WARP_SIZE, 1, 1); + const dim3 block_nums(nrows, 1, 1); + reduce_rows_f32<true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols); +} diff --git a/ggml/src/ggml-cuda/mean.cuh b/ggml/src/ggml-cuda/mean.cuh new file mode 100644 index 0000000000000..2b9b10433438e --- /dev/null +++ b/ggml/src/ggml-cuda/mean.cuh @@ -0,0 +1,3 @@ +#include "common.cuh" + +void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/mmv.cu b/ggml/src/ggml-cuda/mmv.cu index d8c385e2399ae..e14c93516bddf 100644 --- a/ggml/src/ggml-cuda/mmv.cu +++ b/ggml/src/ggml-cuda/mmv.cu @@ -2,25 +2,26 @@ #include "common.cuh" #include "mmv.cuh" -template <typename T, typename type_acc, int block_size> +template <typename T, typename type_acc, int ncols_dst, int block_size> static __global__ void mul_mat_vec( const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst, - const int64_t ncols2, const int64_t nchannels_y, const int64_t stride_row, - const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, - const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst) { - const int64_t row = blockIdx.x; - const int64_t channel_dst = blockIdx.y; - const int64_t channel_x = ids ? ids[channel_dst] : channel_dst / channel_ratio; - const int64_t channel_y = ids ? channel_dst % nchannels_y : channel_dst; - const int64_t sample_dst = blockIdx.z; - const int64_t sample_x = sample_dst / sample_ratio; - const int64_t sample_y = sample_dst; - const int tid = threadIdx.x; + const int ncols2, const int nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst, + const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, + const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { + const int row = blockIdx.x; + const int channel_dst = blockIdx.y; + const int channel_x = ids ? ids[channel_dst] : channel_dst / channel_ratio; + const int channel_y = ids ? 
channel_dst % nchannels_y : channel_dst; + const int sample_dst = blockIdx.z; + const int sample_x = sample_dst / sample_ratio; + const int sample_y = sample_dst; + const int tid = threadIdx.x; + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); - x += sample_x *stride_sample_x + channel_x *stride_channel_x + row*stride_row; - y += sample_y *stride_sample_y + channel_y *stride_channel_y; - dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst; + x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row; + y += int64_t(sample_y) *stride_sample_y + channel_y *stride_channel_y; + dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst; const float2 * y2 = (const float2 *) y; @@ -34,81 +35,108 @@ static __global__ void mul_mat_vec( __syncthreads(); } - float sumf = 0.0f; + float sumf[ncols_dst] = {0.0f}; if constexpr (std::is_same<T, float>::value) { const float2 * x2 = (const float2 *) x; - for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { + for (int col2 = tid; col2 < ncols2; col2 += block_size) { const float2 tmpx = x2[col2]; - const float2 tmpy = y2[col2]; - sumf += tmpx.x*tmpy.x; - sumf += tmpx.y*tmpy.y; + +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + const float2 tmpy = y2[j*stride_col_y2 + col2]; + sumf[j] += tmpx.x*tmpy.x; + sumf[j] += tmpx.y*tmpy.y; + } } } else if constexpr (std::is_same<T, half>::value) { const half2 * x2 = (const half2 *) x; if (std::is_same<type_acc, float>::value) { - for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { + for (int col2 = tid; col2 < ncols2; col2 += block_size) { const float2 tmpx = __half22float2(x2[col2]); - const float2 tmpy = y2[col2]; - sumf += tmpx.x * tmpy.x; - sumf += tmpx.y * tmpy.y; + +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + const float2 tmpy = y2[j*stride_col_y2 + col2]; + sumf[j] += tmpx.x * tmpy.x; + sumf[j] += tmpx.y * tmpy.y; + } } } else { #ifdef FP16_AVAILABLE - half2 sumh2 = make_half2(0.0f, 0.0f); + half2 sumh2[ncols_dst] = {{0.0f, 0.0f}}; + + for (int col2 = tid; col2 < ncols2; col2 += block_size) { + const half2 tmpx = x2[col2]; - for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { - const float2 tmp = y2[col2]; - sumh2 += x2[col2] * make_half2(tmp.x, tmp.y); +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + const float2 tmpy = y2[j*stride_col_y2 + col2]; + sumh2[j] += tmpx * make_half2(tmpy.x, tmpy.y); + } } - sumf = __low2float(sumh2) + __high2float(sumh2); +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + sumf[j] = __low2float(sumh2[j]) + __high2float(sumh2[j]); + } #else NO_DEVICE_CODE; #endif // FP16_AVAILABLE } } else if constexpr (std::is_same<T, nv_bfloat16>::value) { const int * x2 = (const int *) x; - for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { - const int tmpx = x2[col2]; - const float2 tmpy = y2[col2]; - sumf += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x; - sumf += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y; + for (int col2 = tid; col2 < ncols2; col2 += block_size) { + const int tmpx = x2[col2]; +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + const float2 tmpy = y2[j*stride_col_y2 + col2]; + sumf[j] += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x; + sumf[j] += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y; + } } } else { static_assert(std::is_same<T, void>::value, "unsupported type"); } - sumf = warp_reduce_sum(sumf); +#pragma unroll + for (int j = 0; j < ncols_dst; ++j) { + sumf[j] = warp_reduce_sum(sumf[j]); - if (block_size > warp_size) { - buf_iw[tid/warp_size] = sumf; - __syncthreads(); - if (tid 
>= warp_size) { - return; + if (block_size > warp_size) { + buf_iw[tid/warp_size] = sumf[j]; + __syncthreads(); + if (tid < warp_size) { + sumf[j] = buf_iw[tid]; + sumf[j] = warp_reduce_sum(sumf[j]); + } + if (j < ncols_dst) { + __syncthreads(); + } } - sumf = buf_iw[tid]; - sumf = warp_reduce_sum(sumf); } - if (tid != 0) { + if (tid >= ncols_dst) { return; } - dst[row] = sumf; + dst[tid*stride_col_dst + row] = sumf[tid]; } -template +template static void launch_mul_mat_vec_cuda( const T * x, const float * y, const int32_t * ids, float * dst, - const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, + const int64_t ncols, const int64_t nrows, + const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, + const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, cudaStream_t stream) { - GGML_ASSERT(ncols % 2 == 0); - GGML_ASSERT(stride_row % 2 == 0); + GGML_ASSERT(ncols % 2 == 0); + GGML_ASSERT(stride_row % 2 == 0); + GGML_ASSERT(stride_col_y % 2 == 0); GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0); GGML_ASSERT( nsamples_dst % nsamples_x == 0); const int64_t channel_ratio = nchannels_dst / nchannels_x; @@ -138,44 +166,52 @@ static void launch_mul_mat_vec_cuda( const dim3 block_dims(block_size_best, 1, 1); switch (block_size_best) { case 32: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 64: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 96: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 128: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); 
} break; case 160: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 192: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 224: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 256: { - mul_mat_vec<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, - stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + mul_mat_vec<<>> + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; default: { GGML_ABORT("fatal error"); @@ -183,23 +219,91 @@ static void launch_mul_mat_vec_cuda( } } +template +static void mul_mat_vec_cuda_switch_ncols_dst( + const T * x, const float * y, const int32_t * ids, float * dst, + const int64_t ncols, const int64_t nrows, const int64_t ncols_dst, + const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, + const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, + const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, + const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, + cudaStream_t stream) { + switch (ncols_dst) { + case 1: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + case 2: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + case 3: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, 
stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + case 4: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + case 5: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + case 6: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + case 7: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + case 8: + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + break; + default: + GGML_ABORT("fatal error"); + break; + } +} + template static void mul_mat_vec_cuda( const T * x, const float * y, const int32_t * ids, float * dst, - const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, + const int64_t ncols, const int64_t nrows, const int64_t ncols_dst, + const int64_t stride_row, const int64_t stride_col_y, const int stride_col_dst, + const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, enum ggml_prec prec, cudaStream_t stream) { if constexpr(std::is_same::value) { if (prec == GGML_PREC_DEFAULT) { - launch_mul_mat_vec_cuda - (x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + mul_mat_vec_cuda_switch_ncols_dst + (x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); return; } } - launch_mul_mat_vec_cuda - (x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + mul_mat_vec_cuda_switch_ncols_dst + (x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); } @@ -246,24 +350,24 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor 
* const int64_t stride_channel_dst = ids ? s1 : s2; const int64_t stride_channel_y = ids ? s11 : s12; - GGML_ASSERT(ncols_dst == 1); + GGML_ASSERT(!ids || ncols_dst == 1); switch (src0->type) { case GGML_TYPE_F32: { const float * src0_d = (const float *) src0->data; - mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01, + mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; case GGML_TYPE_F16: { const half * src0_d = (const half *) src0->data; - mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01, + mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; case GGML_TYPE_BF16: { const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data; - mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01, + mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; @@ -282,16 +386,19 @@ void ggml_cuda_op_mul_mat_vec( GGML_ASSERT(dst->type == GGML_TYPE_F32); const int64_t ne00 = src0->ne[0]; + const int64_t ne10 = src1->ne[0]; + const int64_t ne0 = dst->ne[0]; const int64_t row_diff = row_high - row_low; - GGML_ASSERT(src1_ncols == 1); - - const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; + const int id = ggml_cuda_get_device(); + const int cc = ggml_cuda_info().devices[id].cc; const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32; // ggml_cuda_op provides single, contiguous matrices const int64_t stride_row = ne00; + const int64_t stride_col_y = ne10; + const int64_t stride_col_dst = id == ctx.device ? 
ne0 : row_diff; // main device has larger memory buffer const int64_t nchannels_x = 1; const int64_t nchannels_y = 1; const int64_t nchannels_dst = 1; @@ -307,19 +414,19 @@ void ggml_cuda_op_mul_mat_vec( switch (src0->type) { case GGML_TYPE_F32: { const float * src0_d = (const float *) src0_dd_i; - mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row, + mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); } break; case GGML_TYPE_F16: { const half * src0_d = (const half *) src0_dd_i; - mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row, + mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); } break; case GGML_TYPE_BF16: { const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i; - mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row, + mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); } break; @@ -334,3 +441,66 @@ void ggml_cuda_op_mul_mat_vec( GGML_UNUSED(src1_ncols); GGML_UNUSED(src1_padded_row_size); } + +bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11) { + if (src0_ne[0] % 2 != 0) { + return false; + } + switch (type) { + case GGML_TYPE_F32: + if (GGML_CUDA_CC_IS_NVIDIA(cc)) { + if (cc >= GGML_CUDA_CC_ADA_LOVELACE) { + return ne11 <= 8; + } + if (cc >= GGML_CUDA_CC_TURING) { + return ne11 <= 4; + } + return ne11 <= 3; + } else if (GGML_CUDA_CC_IS_AMD(cc)) { + if (fp32_mma_hardware_available(cc)) { + return ne11 <= 3; + } + return ne11 <= 8; + } + return ne11 <= 8; + case GGML_TYPE_F16: + if (GGML_CUDA_CC_IS_NVIDIA(cc)) { + const bool src0_small = (src0_ne[1] <= 512 || src0_ne[2]*src0_ne[3] == 1); + if (cc >= GGML_CUDA_CC_ADA_LOVELACE) { + return src0_small && ne11 <= 4; + } + if (fp16_mma_hardware_available(cc)) { + return src0_small && ne11 <= 3; + } + return ne11 <= 8; + } else if (GGML_CUDA_CC_IS_AMD(cc)) { + if (fp16_mma_hardware_available(cc)) { + if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) { + return ne11 <= 5; + } + return ne11 <= 2; + } + return ne11 <= 8; + } + return ne11 <= 8; + case GGML_TYPE_BF16: + if (GGML_CUDA_CC_IS_NVIDIA(cc)) { + const bool src0_small = (src0_ne[1] <= 512 || src0_ne[2]*src0_ne[3] == 1); + if (cc >= GGML_CUDA_CC_ADA_LOVELACE) { + return src0_small && ne11 <= 4; + } + if (bf16_mma_hardware_available(cc)) { + return src0_small && ne11 <= 3; + } + return ne11 <= 8; + } else if (GGML_CUDA_CC_IS_AMD(cc)) { + if (bf16_mma_hardware_available(cc)) { + return ne11 <= 3; + } + return ne11 <= 8; + } + return ne11 <= 8; + default: + return false; + } +} diff --git a/ggml/src/ggml-cuda/mmv.cuh b/ggml/src/ggml-cuda/mmv.cuh index 756e7e1cc7fc3..1330bcb6a8860 100644 --- a/ggml/src/ggml-cuda/mmv.cuh +++ b/ggml/src/ggml-cuda/mmv.cuh 
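The ggml_cuda_should_use_mmv heuristic above gates the custom matrix-vector kernel on the batch size (ne11), the tensor type, and the GPU generation (it also bails out whenever src0_ne[0] is odd), replacing the single MMV_MAX_ROWS cutoff that the next hunk deletes from mmv.cuh. A minimal standalone sketch of the decision shape, assuming simplified tier names in place of the real GGML_CUDA_CC_* checks; the thresholds are copied from the GGML_TYPE_F32/NVIDIA branch:

    #include <cstdint>
    #include <cstdio>

    // Illustrative stand-ins for the compute-capability tiers that the real
    // heuristic derives from GGML_CUDA_CC_* constants.
    enum class cc_tier { older, turing, ada_lovelace };

    // Newer architectures keep the custom mul_mat_vec kernel profitable up to
    // larger batch sizes before cuBLAS/MMQ becomes the better choice.
    static bool should_use_mmv_f32(cc_tier tier, int64_t ne11) {
        switch (tier) {
            case cc_tier::ada_lovelace: return ne11 <= 8;
            case cc_tier::turing:       return ne11 <= 4;
            default:                    return ne11 <= 3;
        }
    }

    int main() {
        for (int64_t ne11 = 1; ne11 <= 9; ++ne11) {
            printf("ne11=%lld ada=%d turing=%d older=%d\n", (long long) ne11,
                   (int) should_use_mmv_f32(cc_tier::ada_lovelace, ne11),
                   (int) should_use_mmv_f32(cc_tier::turing,       ne11),
                   (int) should_use_mmv_f32(cc_tier::older,        ne11));
        }
        return 0;
    }

For F16 and BF16 the same structure applies, with the additional src0_small condition above restricting the fast path to small or non-batched weight matrices on tensor-core hardware.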
@@ -1,8 +1,5 @@ #include "common.cuh" -// maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available -#define MMV_MAX_ROWS 512 - void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); void ggml_cuda_op_mul_mat_vec( @@ -10,3 +7,5 @@ void ggml_cuda_op_mul_mat_vec( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, cudaStream_t stream); + +bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11); diff --git a/ggml/src/ggml-cuda/ssm-scan.cu b/ggml/src/ggml-cuda/ssm-scan.cu index 37ee208c09d46..2d34b836054f8 100644 --- a/ggml/src/ggml-cuda/ssm-scan.cu +++ b/ggml/src/ggml-cuda/ssm-scan.cu @@ -10,6 +10,8 @@ __global__ void __launch_bounds__(splitD, 2) float * __restrict__ dst, const int64_t L) { GGML_UNUSED(src1_nb0); GGML_UNUSED(src2_nb0); + + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); const int bidx = blockIdx.x; // split along B const int bidy = blockIdx.y; // split along D const int tid = threadIdx.x; @@ -44,16 +46,16 @@ __global__ void __launch_bounds__(splitD, 2) if (N == 16) { #pragma unroll for (size_t i = 0; i < splitD / 4; i += 2) { - float value = A_block[(wid * warpSize + i) * stride_A + wtid]; + float value = A_block[(wid * warp_size + i) * stride_A + wtid]; // todo: bank conflict // I am always confused with how to use the swizzling method to solve // bank conflit. Hoping somebody can tell me. - smem_A[(wid * warpSize + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value; + smem_A[(wid * warp_size + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value; } #pragma unroll for (size_t i = 0; i < splitD / 4; i += 2) { - float value = s0_block[(wid * warpSize + i) * stride_s0 + wtid]; - smem_s0[(wid * warpSize + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value; + float value = s0_block[(wid * warp_size + i) * stride_s0 + wtid]; + smem_s0[(wid * warp_size + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 
1 : 0)] = value; } } diff --git a/ggml/src/ggml-cuda/sumrows.cu b/ggml/src/ggml-cuda/sumrows.cu index 38dbf1b5e1fa9..2eee08fa07375 100644 --- a/ggml/src/ggml-cuda/sumrows.cu +++ b/ggml/src/ggml-cuda/sumrows.cu @@ -1,25 +1,9 @@ #include "sumrows.cuh" -static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) { - const int row = blockIdx.x; - const int col = threadIdx.x; - - float sum = 0.0f; - for (int i = col; i < ncols; i += blockDim.x) { - sum += x[row * ncols + i]; - } - - sum = warp_reduce_sum(sum); - - if (col == 0) { - dst[row] = sum; - } -} - void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { const dim3 block_dims(WARP_SIZE, 1, 1); const dim3 block_nums(nrows, 1, 1); - k_sum_rows_f32<<>>(x, dst, ncols); + reduce_rows_f32<<>>(x, dst, ncols); } void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -35,5 +19,8 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int64_t ncols = src0->ne[0]; const int64_t nrows = ggml_nrows(src0); - sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream); + const dim3 block_dims(WARP_SIZE, 1, 1); + const dim3 block_nums(nrows, 1, 1); + + reduce_rows_f32<<>>(src0_d, dst_d, ncols); } diff --git a/ggml/src/ggml-cuda/sumrows.cuh b/ggml/src/ggml-cuda/sumrows.cuh index 191db1c13167e..3431c599b1b89 100644 --- a/ggml/src/ggml-cuda/sumrows.cuh +++ b/ggml/src/ggml-cuda/sumrows.cuh @@ -1,5 +1,4 @@ #include "common.cuh" void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream); - void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt new file mode 100644 index 0000000000000..d644300387e32 --- /dev/null +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -0,0 +1,139 @@ +project(ggml-hexagon) +message(STATUS "Using HEXAGON backend") +message("CMAKE_SYSTEM_NAME : ${CMAKE_SYSTEM_NAME}") + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +if(NOT DEFINED QNN_SDK_PATH) + message(FATAL_ERROR "QNN_SDK_PATH not defined") +endif() + +if(NOT DEFINED HEXAGON_SDK_PATH) + message(FATAL_ERROR "HEXAGON_SDK_PATH not defined") +endif() + +message("QNN_SDK_PATH : ${QNN_SDK_PATH}") +message("HEXAGON_SDK_PATH: ${HEXAGON_SDK_PATH}") +message("HTP_ARCH_VERSION: ${HTP_ARCH_VERSION}") + +if (CMAKE_BUILD_TYPE STREQUAL "Debug") + set(DEBUG_FLAG "-DDEBUG -Wall") + message("Debug mode:${DEBUG_FLAG}") +else() + set(DEBUG_FLAG "-DNDEBUG -Wall") +#manually disable all verbose logs in ggml-hexagon/CMakeLists.txt to +#make NPU performance comparisons via llama-bench clearer +#set(DEBUG_FLAG "-DNDEBUG -Wall -DDISABLE_ALL_LOG") + message("Release mode:${DEBUG_FLAG}") +endif() + +#v68 --- Snapdragon 888 +#v69 --- Snapdragon 8 Gen1 +#v73 --- Snapdragon 8 Gen2 +#v75 --- Snapdragon 8 Gen3 +#v79 --- Snapdragon 8 Elite(aka Gen4) +if(NOT DEFINED HTP_ARCH_VERSION) + message(FATAL_ERROR "HTP_ARCH_VERSION not defined, valid htp arch: v68,v69,v73,v75,v79") +endif() + +#check whether the user-specified htp arch is valid +set(CHECK_HTP_ARCH "WRONG") +#ref: https://github.com/quic/ai-hub-apps/tree/main/tutorials/llm_on_genie +#foreach (feat v68 v69 v73 v75 v79) +#foreach (feat v73 v75 v79) +#to simplify the workflow, only v75 and v79 (i.e. 8 Gen 3 and 8 Elite) are supported +foreach (feat v75 v79) + if (${feat} STREQUAL ${HTP_ARCH_VERSION}) + set(CHECK_HTP_ARCH "GOOD") + endif() +endforeach()
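+#NOTE: the foreach() probe above is a plain list-membership test; an equivalent,
+#more direct spelling (illustrative only, not part of this patch; IN_LIST needs
+#CMake >= 3.3) would be:
+# set(SUPPORTED_HTP_ARCHS v75 v79)
+# if(NOT HTP_ARCH_VERSION IN_LIST SUPPORTED_HTP_ARCHS)
+#     message(FATAL_ERROR "ggml-hexagon backend only supports htp arch v75,v79")
+# endif()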
+if (${CHECK_HTP_ARCH} STREQUAL "WRONG") + #message(FATAL_ERROR "ggml-hexagon backend only supports htp arch v68,v69,v73,v75,v79") + #to simplify the workflow, only v75 and v79 (i.e. 8 Gen 3 and 8 Elite) are supported + message(FATAL_ERROR "ggml-hexagon backend only supports htp arch v75,v79") +endif() + +#check optimization flags +set(OPT_FLAG " ") +if (${HTP_ARCH_VERSION} STREQUAL "v75" OR ${HTP_ARCH_VERSION} STREQUAL "v79") + #works fine on Snapdragon 8 Gen 3 & 8 Elite with 1.5x - 3x performance gains compared with the default ggml backend + set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -flto -D_GNU_SOURCE -fvectorize -ffp-model=fast -fno-finite-math-only") +endif() +message("OPT_FLAG:${OPT_FLAG}") + +if(CMAKE_SYSTEM_NAME STREQUAL "Android") + find_library(LOG_LIB log) + + add_library(cdsprpc + SHARED + IMPORTED) + set_target_properties(cdsprpc + PROPERTIES + IMPORTED_LOCATION + ${HEXAGON_SDK_PATH}/ipc/fastrpc/remote/ship/android_aarch64/libcdsprpc.so) + + set(QNN_LINK_LIBRARIES ${LOG_LIB} cdsprpc) + set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") + + include_directories(${HEXAGON_SDK_PATH}/incs) + include_directories(${HEXAGON_SDK_PATH}/incs/stddef) + include_directories(${HEXAGON_SDK_PATH}/ipc/fastrpc/incs) + include_directories(${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc) + include_directories(${HEXAGON_SDK_PATH}/ipc/fastrpc/remote/ship/android_Debug_aarch64) + include_directories(${HEXAGON_SDK_PATH}/utils/examples) + include_directories(${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/android_aarch64) + include_directories(${HEXAGON_SDK_PATH}/libs/atomic/inc) + include_directories(${HEXAGON_SDK_PATH}/libs/atomic/android_Debug_aarch64/ship) + include_directories(${CMAKE_SOURCE_DIR}/ggml/src/ggml-hexagon/) + include_directories(${CMAKE_SOURCE_DIR}/ggml/src/ggml-hexagon/kernels/) +elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") + set(QNN_DEFAULT_LIB_SEARCH_PATH "C:\\" CACHE STRING "customized library search path for QNN backend") +else() + message(FATAL_ERROR "ggml-hexagon is currently only available on Android and Windows on ARM(WoA)") +endif() + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") +set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") + +file(GLOB HEXAGON_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/kernels/stub.c") +ggml_add_backend_library(ggml-hexagon ${HEXAGON_SOURCES}) + +target_include_directories(ggml-hexagon PRIVATE ${QNN_SDK_PATH}/include/QNN ${HEXAGON_SDK_PATH} ${CMAKE_CURRENT_LIST_DIR}) +target_link_libraries(ggml-hexagon PRIVATE ${QNN_LINK_LIBRARIES}) + +string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") +target_compile_definitions(ggml-hexagon PRIVATE QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/") + +#cross-compile the hexagon kernel sources which run on the cDSP side +function(ggml_hexagon_build_kernel KNAME) + message(STATUS "ggml_hexagon: build hexagon-kernel ${KNAME}") + + add_custom_command( + TARGET ${PROJECT_NAME} + POST_BUILD + COMMAND echo "current working path:`pwd`\n" + COMMAND echo "${CMAKE_CURRENT_LIST_DIR}/kernels" + COMMAND make -C ${CMAKE_CURRENT_LIST_DIR}/kernels/ clean + COMMAND make -C ${CMAKE_CURRENT_LIST_DIR}/kernels/
HEXAGON_SDK_PATH=${HEXAGON_SDK_PATH} HTP_ARCH_VERSION=${HTP_ARCH_VERSION} DEBUG_FLAG=${DEBUG_FLAG} + COMMAND echo "current working path:`pwd`\n" + COMMAND ls -l ../../../bin/libggmldsp-skel.so + COMMENT "build hexagon-kernel" + ) +endfunction() + +function(ggml_hexagon_setup_cfg KNAME) + message(STATUS "ggml_hexagon: setup runtime configuration file ${KNAME}") + add_custom_command( + TARGET ${PROJECT_NAME} + POST_BUILD + COMMAND echo "current working path:`pwd`\n" + COMMAND /bin/cp -fv ../../../../../scripts/${KNAME} ../../../bin/ + COMMENT "setup runtime configuration file" + ) +endfunction() + +ggml_hexagon_build_kernel("cdsp") +ggml_hexagon_setup_cfg("ggml-hexagon.cfg") diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp new file mode 100644 index 0000000000000..a8ab81a5cdc70 --- /dev/null +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -0,0 +1,7003 @@ +/* + * Copyright (c) 2024-2025 The ggml authors + * + * Qualcomm QNN SDK and reference tech guides could be found at: + * https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk + * Qualcomm Hexagon SDK and reference tech guides could be found at: + * https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools + * + * this single-source-file or self-contained implementation of ggml-hexagon backend has 8 sections: + * section-1 forward/prototype declaration, global vars, macros, data structures + * section-2 internal troubleshooting function/class + * section-3 helper function for WoA(Windows on ARM) + * section-4 general helper function + * section-5 QNN helper function/class + * section-6 implementation of hwaccel approach through QNN: offload ggmlop to QNN + * section-7 cDSP helper function + * section-8 implementation of ggml-hexagon backend according to specification in ggml backend subsystem + * + * currently provide following ggml op' implementation through QNN: + * - GGML_OP_ADD/GGML_OP_SUB/GGML_OP_MUL/GGML_OP_DIV/GGML_OP_LOG/GGML_OP_SQRT: + * this is a simple hwaccel skeleton, can expand other ggml ops according to expertise + * - GGML_OP_MUL_MAT: + * this is a complicated hwaccel skeleton, can expand other ggml ops accordingly + * + * currently provide following ggml op' implementation through cDSP in hexagon-kernels: + * - GGML_OP_ADD & GGML_OP_MUL_MAT: + * this is a hwaccel skeleton, can expand other ggml ops accordingly + * + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__ANDROID__) || defined(__linux__) +#include +#include +#include +#include +#include +#include +#include +#endif + +#if !defined(__ANDROID__) && !defined(__linux__) +#include +#include +#include +#endif + +#if defined(__ANDROID__) +#include "android/log.h" + +#include "rpcmem.h" +#include "remote.h" +#include "os_defines.h" +#include "domain.h" +#include "AEEStdErr.h" +#include "HAP_power.h" +#include "HAP_farf.h" +#endif + +#include "QnnTypes.h" +#include "QnnCommon.h" +#include "QnnContext.h" +#include "QnnBackend.h" +#include "QnnGraph.h" +#include "QnnProperty.h" +#include "QnnTensor.h" +#include "QnnInterface.h" +#include "Saver/QnnSaver.h" +#include "System/QnnSystemInterface.h" +#include "HTP/QnnHtpDevice.h" +#include "HTP/QnnHtpGraph.h" + +#include "ggml-hexagon.h" +#include "ggml-impl.h" +#include 
"ggml-backend-impl.h" + +#include "kernels/skel.h" + +// ================================================================================================= +// section-1: forward/prototype declaration, global vars, macros, data structures +// ================================================================================================= +class qnn_instance; +class hexagon_profiler; +struct ggml_backend_hexagon_context; + +#ifdef NDEBUG +#define GGMLHEXAGON_DEBUG 0 +#else +#define GGMLHEXAGON_DEBUG 1 +#endif + +#ifndef PROJECT_NAME +#define PROJECT_NAME "ggml-hexagon" +#endif + +#define GGMLHEXAGON_LOGBUF_LEN 4096 +#define GGMLHEXAGON_TMPBUF_LEN 256 + +#define GGMLHEXAGON_LOG_ERROR(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLHEXAGON_LOG_WARN(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_WARN , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#if !defined (DISABLE_ALL_LOG) +#define GGMLHEXAGON_LOG_INFO(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_INFO , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLHEXAGON_LOG_VERBOSE(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_CONT , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +//manually disable all verbose logs in ggml-hexagon/CMakeLists.txt to +//make compare NPU performance through llama-bench more clear +#define GGMLHEXAGON_LOG_INFO(...) +#define GGMLHEXAGON_LOG_VERBOSE(...) +#endif + +#if GGMLHEXAGON_DEBUG +#define GGMLHEXAGON_LOG_DEBUG(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define GGMLHEXAGON_LOG_DEBUG(...) +#endif + +#define QNN_VER_PTR(x) (&((x).v1)) +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 +#define SIZE_IN_MB (1 << 20) +#define STATUS_CONTEXT 0x12345678 + +#if !defined (_WINDOWS) +#pragma weak remote_system_request +#pragma weak remote_session_control +#endif + +#define CHECK_QNN_API(error, result) \ + do { \ + error = (result); \ + if (QNN_SUCCESS != error) { \ + if (error == QNN_COMMON_ERROR_NOT_SUPPORTED) { \ + GGMLHEXAGON_LOG_WARN("WARNING: QNN feature/API not supported\n"); \ + } else { \ + GGMLHEXAGON_LOG_INFO("QNN API error = %d(%s)\n", error, ggmlqnn_get_qnnerror_string(error)); \ + } \ + } \ + } while (0) + +#define GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst) \ + do { \ + if (g_hexagon_appcfg.hwaccel_approach != HWACCEL_CDSP) { \ + if (!ggmlqnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ + } \ + } while (0) \ + +#ifndef ggmlop_URI +#define ggmlop_URI "file:///libggmldsp-skel.so?ggmldsp_skel_handle_invoke&_modver=1.0&_idlver=0.0.1" +#endif +// ================================================================================================= +// section-1: data type, data structure, global vars +// ================================================================================================= +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); +using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); +using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); +using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); + +//QNN resource management for the general approach through QNN +using qnn_tensors_t = std::vector< Qnn_Tensor_t >; +using qnn_ptensors_t = std::vector< Qnn_Tensor_t *>; +using 
qnn_singlenode_res_t = std::tuple; + +typedef void (* ggmlqnn_op_func_t)(ggml_backend_hexagon_context * ctx, ggml_tensor * op); +typedef int (* notify_callback_fn)(void * context, int domain, int session, remote_rpc_status_flags_t status); +typedef int (* ggmlhexagon_op_func_t)(remote_handle64 handle, const dsptensor * src0, const dsptensor * src1, dsptensor * dst); + +enum qnn_index_type { + QNN_TENSOR_INDEX = 0, + QNN_OPCFG_INDEX = 1, +}; + +enum qnn_profile_level { + PROFILE_OFF = 0, + PROFILE_BASIC = 1, + PROFILE_DETAIL = 2, +}; + +enum hexagon_dsp_type { + HEXAGON_ADSP = 0, + HEXAGON_MDSP = 1, + HEXAGON_SDSP = 2, + HEXAGON_CDSP = 3, + HEXAGON_CDSP1 = 4, +}; + +enum qcom_htp_arch { + NONE = 0, + V68 = 68, + V69 = 69, + V73 = 73, + V75 = 75, + V79 = 79, +}; + +enum qcom_chipset_soc_model { + UNKNOWN_SM = 0, + SM7450 = 41, // v69, 7 Gen1 + SM8350 = 30, // v68, 888 + SM8450 = 36, // v69, SD 8 Gen 1 + SM8475 = 42, // v69, SD 8+ Gen 1 + SM8550 = 43, // v73, SD 8 Gen 2 + SM8650 = 57, // v75, SD 8 Gen 3 + SM8750 = 69, // v79, SD 8 Elite +#if !defined(__ANDROID__) && !defined(__linux__) + SC7280X = 44, + SC8280X = 37, + SC8380XP = 60, +#endif +}; + +//borrowed from Android source code, might not be accurate +enum ion_heap_ids { + INVALID_HEAP_ID = -1, + ION_CP_MM_HEAP_ID = 8, + ION_SECURE_HEAP_ID = 9, + ION_SECURE_DISPLAY_HEAP_ID = 10, + ION_CP_MFC_HEAP_ID = 12, + ION_SPSS_HEAP_ID = 13, + ION_CP_WB_HEAP_ID = 16, + ION_CAMERA_HEAP_ID = 20, + ION_SYSTEM_CONTIG_HEAP_ID = 21, + ION_ADSP_HEAP_ID = 22, + ION_PIL1_HEAP_ID = 23, + ION_SF_HEAP_ID = 24, + ION_SYSTEM_HEAP_ID = 25, + ION_PIL2_HEAP_ID = 26, + ION_QSECOM_HEAP_ID = 27, + ION_AUDIO_HEAP_ID = 28, + ION_MM_FIRMWARE_HEAP_ID = 29, + ION_HEAP_ID_RESERVED = 31 +}; + +struct qcom_socinfo { + uint32_t soc_model; + size_t htp_arch; + size_t vtcm_size_in_mb; + char soc_desc[GGML_MAX_NAME]; +}; + +struct ggml_backend_hexagon_context { + int device; + char name[GGML_MAX_NAME]; + char desc[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + qnn_instance * instance; + struct ggml_backend * backend; + QNN_INTERFACE_VER_TYPE raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; + struct qcom_socinfo socinfo; + + //QNN resource management for the general approach through QNN + std::map qnn_singlenode_graph_map; + + //quantize data -> fp32 + std::unique_ptr work_data; + std::vector> tasks; + size_t work_size; + size_t desired_size; + int n_threads; + + //Hexagon resource management for the general approach through Hexagaon cDSP + size_t rpc_mempool_capacity; + size_t rpc_mempool_len; + size_t rpc_mempool_usage; + void * rpc_mempool; + int rpc_mempool_handle; + remote_handle64 ggmlop_handle; + int domain_id; +}; + +struct qnn_op_caps { + bool supported; + ggml_op op; + const size_t input_param_count; + const char * qnn_op_name; +}; + +struct hexagon_op_caps { + bool supported; + ggml_op op; + const size_t input_param_count; + const char * hexagon_op_name; + ggmlhexagon_op_func_t dsp_op_func; +}; + +struct hexagon_appcfg_t { + int print_qnn_internal_log; // enable/disable QNN's internal log + int enable_perf; // enable/disable perf of a specified ggml op + int enable_profiler; // enable/disable profiler feature to visualization comparison between HWACCEL_CDSP and HWACCEL_QNN + int print_tensors_info; // enable/disable print tensors info in op function + int dump_op_info; // enable/disable dump op info in handle_op + int enable_q_mulmat; // enable/disable offload quantized mulmat + int enable_pinned_memory; // enable/disable pinned-memory feature + int 
precision_mode; // 0: default 1:fp16 + int hvx_threads; + int vtcm_size_in_mb; + int enable_dlbc; + int hwaccel_approach; // 0: HWACCEL_QNN 1: HWACCEL_QNN_SINGLEGRAPH 2: HWACCEL_CDSP + int hexagon_backend; // 0: HEXAGON_BACKEND_QNNCPU 1: HEXAGON_BACKEND_QNNGPU 2: HEXAGON_BACKEND_QNNNPU 3: HEXAGON_BACKEND_CDSP 4: ggml + int enable_rpc_ion_mempool; // enable/disable rpc ion memory pool + int enable_all_q_mulmat; // enable/disable offload all quantized type mulmat to cDSP + int profiler_duration; // threshold of duration in profiler, per seconds + int profiler_counts; // threshold of counts in profiler + int thread_counts; // thread_counts on cDSP side + int mulmat_algotype; // algorithm type of mulmat on cDSP side + const char * cfgfilename; + const char * runtime_libpath; + char ggml_hexagon_version[GGMLHEXAGON_TMPBUF_LEN]; + char ggml_dsp_version[GGMLHEXAGON_TMPBUF_LEN]; +}; + +static struct hexagon_appcfg_t g_hexagon_appcfg = { + .print_qnn_internal_log = 0, + .enable_perf = 1, + .enable_profiler = 0, + .print_tensors_info = 0, + .dump_op_info = 0, + .enable_q_mulmat = 0, + .enable_pinned_memory = 0, + .precision_mode = 0, + .hvx_threads = 4, + .vtcm_size_in_mb = 8, + .enable_dlbc = 1, + .hwaccel_approach = HWACCEL_CDSP, + .hexagon_backend = HEXAGON_BACKEND_CDSP, + .enable_rpc_ion_mempool = 0, + .enable_all_q_mulmat = 0, + .profiler_duration = 5, //seconds + .profiler_counts = 100, + .thread_counts = 4, + .mulmat_algotype = 0, + .cfgfilename = "ggml-hexagon.cfg", +#if defined(__ANDROID__) + #if defined(STANDARD_ANDROID_APP) + .runtime_libpath = "/data/data/com.kantvai.kantvplayer/", + #else + .runtime_libpath = "/data/local/tmp/", + #endif +#elif defined(__linux__) + .runtime_libpath = "/tmp/", +#elif defined(_WIN32) + .runtime_libpath = "C:\\", +#endif + .ggml_hexagon_version = {"1.13"}, + .ggml_dsp_version = {"0.63"}, +}; + +//file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices +static struct qcom_socinfo g_qnn_soc_info_table[] = { + /* Qualcomm SnapDragon 7 Gen 1 */ + { + .soc_model = SM7450, + .htp_arch = V69, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 7 Gen 1"}, + + /* Qualcomm SnapDragon 888 */ + { + .soc_model = SM8350, + .htp_arch = V68, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 888 "}, + + /* Qualcomm SnapDragon 8 Gen 1 */ + { + .soc_model = SM8450, + .htp_arch = V69, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 1"}, + + /* Qualcomm SnapDragon 8 Gen 1+ */ + { + .soc_model = SM8475, + .htp_arch = V69, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 1+"}, + + /* Qualcomm SnapDragon 8 Gen 2 */ + { + .soc_model = SM8550, + .htp_arch = V73, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 2"}, + + /* Qualcomm SnapDragon 8 Gen 3 */ + { + .soc_model = SM8650, + .htp_arch = V75, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 3 "}, + + /* Qualcomm SnapDragon 8 Gen 4 */ + { + .soc_model = SM8750, + .htp_arch = V79, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Elite"}, + +#if !defined(__ANDROID__) && !defined(__linux__) + /* Qualcomm SnapDragon 7c Gen 2 */ + { + .soc_model = SC7280X, + .htp_arch = V68, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 7c Gen 2"}, + + /* Qualcomm SnapDragon 8cx Gen 3 */ + { + .soc_model = SC8280X, + .htp_arch = V68, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8cx Gen 3"}, + + /* Qualcomm SnapDragon 8cx Gen 4 */ + { + .soc_model = SC8380XP, + .htp_arch =
V73, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8cx Gen 4"}, +#endif + +}; + +// file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/quantization.html +// CPU - Choose a non-quantized model.Quantized models are currently incompatible with the CPU backend +// GPU - Choose a non-quantized model.Quantized models are currently incompatible with the GPU backend +// HTP - Choose a quantized model. Quantized models are required when running on the HTP backend +// DSP - Choose a quantized model. Quantized models are required when running on the DSP backend +// HTA - Choose a quantized model. Quantized models are required when running on the HTA backend +static struct ggml_backend_hexagon_context g_hexagon_mgr[GGML_HEXAGON_MAX_DEVICES] = { + { .device = 0, + .name = "qnn-cpu", + .desc = "Qualcomm Kryo CPU", +#if !defined(__ANDROID__) && !defined(__linux__) + .lib = "QnnCpu.dll", +#else + .lib = "libQnnCpu.so", +#endif + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {}, + .qnn_singlenode_graph_map = {}, + .work_data = nullptr, + .tasks = {}, + .work_size = 0, + .desired_size = 0, + .n_threads = 8, + .rpc_mempool_capacity = 0, + .rpc_mempool_len = 0, + .rpc_mempool_usage = 0, + .rpc_mempool = nullptr, + .rpc_mempool_handle = 0, + .ggmlop_handle = 0, + .domain_id = -1, + }, + + { .device = 1, + .name = "qnn-gpu", + .desc = "Qualcomm Adreno GPU", +#if !defined(__ANDROID__) && !defined(__linux__) + .lib = "QnnGpu.dll", +#else + .lib = "libQnnGpu.so", +#endif + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {}, + .qnn_singlenode_graph_map = {}, + .work_data = nullptr, + .tasks = {}, + .work_size = 0, + .desired_size = 0, + .n_threads = 8, + .rpc_mempool_capacity = 0, + .rpc_mempool_len = 0, + .rpc_mempool_usage = 0, + .rpc_mempool = nullptr, + .rpc_mempool_handle = 0, + .ggmlop_handle = 0, + .domain_id = -1, + }, + + { .device = 2, + .name = "qnn-npu", + .desc = "Qualcomm NPU(Hexagon Tensor Processor)", +#if !defined(__ANDROID__) && !defined(__linux__) + .lib = "QnnHtp.dll", +#else + .lib = "libQnnHtp.so", +#endif + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {}, + .qnn_singlenode_graph_map = {}, + .work_data = nullptr, + .tasks = {}, + .work_size = 0, + .desired_size = 0, + .n_threads = 8, + .rpc_mempool_capacity = 0, + .rpc_mempool_len = 0, + .rpc_mempool_usage = 0, + .rpc_mempool = nullptr, + .rpc_mempool_handle = 0, + .ggmlop_handle = 0, + .domain_id = -1, + }, + { .device = 3, + .name = "Hexagon-cDSP", + .desc = "Qualcomm NPU(cDSP)", + .lib = "", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {}, + .qnn_singlenode_graph_map = {}, + .work_data = nullptr, + .tasks = {}, + .work_size = 0, + .desired_size = 0, + .n_threads = 8, + .rpc_mempool_capacity = 0, + .rpc_mempool_len = 0, + .rpc_mempool_usage = 0, + .rpc_mempool = nullptr, + .rpc_mempool_handle = 0, + .ggmlop_handle = 0, + .domain_id = HEXAGON_CDSP, + }, +}; + +static domain hexagon_supported_domains[] = { + {ADSP_DOMAIN_ID, ADSP_DOMAIN}, + {MDSP_DOMAIN_ID, MDSP_DOMAIN}, + {SDSP_DOMAIN_ID, SDSP_DOMAIN}, + {CDSP_DOMAIN_ID, CDSP_DOMAIN}, + {CDSP1_DOMAIN_ID, CDSP1_DOMAIN} +}; + +//supported ggml op by HWACCEL_QNN +static constexpr const qnn_op_caps ggmlqnn_k_op_caps[] = { + {true, GGML_OP_NONE, 0, nullptr}, + {false, GGML_OP_DUP, 0, nullptr}, + {true, GGML_OP_ADD, 2, 
QNN_OP_ELEMENT_WISE_ADD}, + {false, GGML_OP_ADD1, 0, nullptr}, + {false, GGML_OP_ACC, 0, nullptr}, + {true, GGML_OP_SUB, 2, QNN_OP_ELEMENT_WISE_SUBTRACT}, + {true, GGML_OP_MUL, 2, QNN_OP_ELEMENT_WISE_MULTIPLY}, + {true, GGML_OP_DIV, 2, QNN_OP_ELEMENT_WISE_DIVIDE}, + {false, GGML_OP_SQR, 0, nullptr}, + {true, GGML_OP_SQRT, 1, QNN_OP_ELEMENT_WISE_SQUARE_ROOT}, + {true, GGML_OP_LOG, 1, QNN_OP_ELEMENT_WISE_LOG}, + {false, GGML_OP_SIN, 0, nullptr}, + {false, GGML_OP_COS, 0, nullptr}, + {false, GGML_OP_SUM, 0, nullptr}, + {false, GGML_OP_SUM_ROWS, 0, nullptr}, + {false, GGML_OP_MEAN, 0, nullptr}, + {false, GGML_OP_ARGMAX, 0, nullptr}, + {false, GGML_OP_COUNT_EQUAL, 0, nullptr}, + {false, GGML_OP_REPEAT, 0, nullptr}, + {false, GGML_OP_REPEAT_BACK, 0, nullptr}, + {false, GGML_OP_CONCAT, 0, nullptr}, + {false, GGML_OP_SILU_BACK, 0, nullptr}, + {false, GGML_OP_NORM, 0, nullptr}, + {false, GGML_OP_RMS_NORM, 0, nullptr}, + {false, GGML_OP_RMS_NORM_BACK, 0, nullptr}, + {false, GGML_OP_GROUP_NORM, 0, nullptr}, + {false, GGML_OP_L2_NORM, 0, nullptr}, + {true, GGML_OP_MUL_MAT, 2, QNN_OP_MAT_MUL}, + {false, GGML_OP_MUL_MAT_ID, 0, nullptr}, + {false, GGML_OP_OUT_PROD, 0, nullptr}, + {false, GGML_OP_SCALE, 0, nullptr}, + {false, GGML_OP_SET, 0, nullptr}, + {false, GGML_OP_CPY, 0, nullptr}, + {false, GGML_OP_CONT, 0, nullptr}, + {false, GGML_OP_RESHAPE, 0, nullptr}, + {false, GGML_OP_VIEW, 0, nullptr}, + {false, GGML_OP_PERMUTE, 0, nullptr}, + {false, GGML_OP_TRANSPOSE, 0, nullptr}, + {false, GGML_OP_GET_ROWS, 0, nullptr}, + {false, GGML_OP_GET_ROWS_BACK, 0, nullptr}, + {false, GGML_OP_DIAG, 0, nullptr}, + {false, GGML_OP_DIAG_MASK_INF, 0, nullptr}, + {false, GGML_OP_DIAG_MASK_ZERO, 0, nullptr}, + {false, GGML_OP_SOFT_MAX, 0, nullptr}, + {false, GGML_OP_SOFT_MAX_BACK, 0, nullptr}, + {false, GGML_OP_ROPE, 0, nullptr}, + {false, GGML_OP_ROPE_BACK, 0, nullptr}, + {false, GGML_OP_CLAMP, 0, nullptr}, + {false, GGML_OP_CONV_TRANSPOSE_1D, 0, nullptr}, + {false, GGML_OP_IM2COL, 0, nullptr}, + {false, GGML_OP_IM2COL_BACK, 0, nullptr}, + {false, GGML_OP_CONV_2D_DW, 0, nullptr}, + {false, GGML_OP_CONV_TRANSPOSE_2D, 0, nullptr}, + {false, GGML_OP_POOL_1D, 0, nullptr}, + {false, GGML_OP_POOL_2D, 0, nullptr}, + {false, GGML_OP_POOL_2D_BACK, 0, nullptr}, + {false, GGML_OP_UPSCALE, 0, nullptr}, + {false, GGML_OP_PAD, 0, nullptr}, + {false, GGML_OP_PAD_REFLECT_1D, 0, nullptr}, + {false, GGML_OP_ROLL, 0, nullptr}, + {false, GGML_OP_ARANGE, 0, nullptr}, + {false, GGML_OP_TIMESTEP_EMBEDDING, 0, nullptr}, + {false, GGML_OP_ARGSORT, 0, nullptr}, + {false, GGML_OP_LEAKY_RELU, 0, nullptr}, + {false, GGML_OP_FLASH_ATTN_EXT, 0, nullptr}, + {false, GGML_OP_FLASH_ATTN_BACK, 0, nullptr}, + {false, GGML_OP_SSM_CONV, 0, nullptr}, + {false, GGML_OP_SSM_SCAN, 0, nullptr}, + {false, GGML_OP_WIN_PART, 0, nullptr}, + {false, GGML_OP_WIN_UNPART, 0, nullptr}, + {false, GGML_OP_GET_REL_POS, 0, nullptr}, + {false, GGML_OP_ADD_REL_POS, 0, nullptr}, + {false, GGML_OP_RWKV_WKV6, 0, nullptr}, + {false, GGML_OP_GATED_LINEAR_ATTN, 0, nullptr}, + {false, GGML_OP_RWKV_WKV7, 0, nullptr}, + {false, GGML_OP_UNARY, 0, nullptr}, + {false, GGML_OP_MAP_CUSTOM1, 0, nullptr}, + {false, GGML_OP_MAP_CUSTOM2, 0, nullptr}, + {false, GGML_OP_MAP_CUSTOM3, 0, nullptr}, + {false, GGML_OP_CUSTOM, 0, nullptr}, + {false, GGML_OP_CROSS_ENTROPY_LOSS, 0, nullptr}, + {false, GGML_OP_CROSS_ENTROPY_LOSS_BACK, 0, nullptr}, + {false, GGML_OP_OPT_STEP_ADAMW, 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_ABS), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_SGN), 0, nullptr}, + 
{false, static_cast(GGML_UNARY_OP_NEG), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_STEP), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_TANH), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_ELU), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_RELU), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_SIGMOID), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_GELU), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_GELU_ERF), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_GELU_QUICK), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_SILU), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_HARDSWISH), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_HARDSIGMOID), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_EXP), 0, nullptr} +}; + +static_assert(ggmlqnn_k_op_caps[GGML_OP_NONE].supported, "GGML_OP_NONE is not true"); +static_assert(ggmlqnn_k_op_caps[GGML_OP_ADD].supported, "GGML_OP_ADD is not true"); +static_assert(ggmlqnn_k_op_caps[GGML_OP_MUL].supported, "GGML_OP_MUL is not true"); +static_assert(ggmlqnn_k_op_caps[GGML_OP_MUL_MAT].supported, "GGML_OP_MUL_MAT is not true"); +static_assert(std::size(ggmlqnn_k_op_caps) == (static_cast(GGML_OP_COUNT) + static_cast(GGML_UNARY_OP_COUNT)), + "pls check ggmlqnn_k_op_caps and ensure is corresponding to latest ggml.h"); + +//supported ggml op by HWACCEL_CDSP +static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { + {true, GGML_OP_NONE, 0, nullptr, nullptr}, + {false, GGML_OP_DUP, 0, nullptr, nullptr}, + {true, GGML_OP_ADD, 2, "ggmlop_dsp_add", ggmlop_dsp_add}, + {false, GGML_OP_ADD1, 0, nullptr, nullptr}, + {false, GGML_OP_ACC, 0, nullptr, nullptr}, + {false, GGML_OP_SUB, 2, nullptr, nullptr}, + {false, GGML_OP_MUL, 2, nullptr, nullptr}, + {false, GGML_OP_DIV, 2, nullptr, nullptr}, + {false, GGML_OP_SQR, 0, nullptr, nullptr}, + {false, GGML_OP_SQRT, 0, nullptr, nullptr}, + {false, GGML_OP_LOG, 0, nullptr, nullptr}, + {false, GGML_OP_SIN, 0, nullptr, nullptr}, + {false, GGML_OP_COS, 0, nullptr, nullptr}, + {false, GGML_OP_SUM, 0, nullptr, nullptr}, + {false, GGML_OP_SUM_ROWS, 0, nullptr, nullptr}, + {false, GGML_OP_MEAN, 0, nullptr, nullptr}, + {false, GGML_OP_ARGMAX, 0, nullptr, nullptr}, + {false, GGML_OP_COUNT_EQUAL, 0, nullptr, nullptr}, + {false, GGML_OP_REPEAT, 0, nullptr, nullptr}, + {false, GGML_OP_REPEAT_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_CONCAT, 0, nullptr, nullptr}, + {false, GGML_OP_SILU_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_NORM, 0, nullptr, nullptr}, + {true, GGML_OP_RMS_NORM, 1, "ggmlop_dsp_rmsnorm", ggmlop_dsp_rmsnorm}, + {false, GGML_OP_RMS_NORM_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_GROUP_NORM, 0, nullptr, nullptr}, + {false, GGML_OP_L2_NORM, 0, nullptr, nullptr}, + {true, GGML_OP_MUL_MAT, 2, "ggmlop_dsp_mulmat", ggmlop_dsp_mulmat}, + {false, GGML_OP_MUL_MAT_ID, 0, nullptr, nullptr}, + {false, GGML_OP_OUT_PROD, 0, nullptr, nullptr}, + {false, GGML_OP_SCALE, 0, nullptr, nullptr}, + {false, GGML_OP_SET, 0, nullptr, nullptr}, + {false, GGML_OP_CPY, 0, nullptr, nullptr}, + {false, GGML_OP_CONT, 0, nullptr, nullptr}, + {false, GGML_OP_RESHAPE, 0, nullptr, nullptr}, + {false, GGML_OP_VIEW, 0, nullptr, nullptr}, + {false, GGML_OP_PERMUTE, 0, nullptr, nullptr}, + {false, GGML_OP_TRANSPOSE, 0, nullptr, nullptr}, + {false, GGML_OP_GET_ROWS, 0, nullptr, nullptr}, + {false, GGML_OP_GET_ROWS_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_DIAG, 0, nullptr, nullptr}, + {false, GGML_OP_DIAG_MASK_INF, 0, nullptr, nullptr}, + {false, GGML_OP_DIAG_MASK_ZERO, 0, nullptr, nullptr}, + {true, 
GGML_OP_SOFT_MAX, 1, "ggmlop_dsp_softmax", ggmlop_dsp_softmax}, + {false, GGML_OP_SOFT_MAX_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_ROPE, 0, nullptr, nullptr}, + {false, GGML_OP_ROPE_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_CLAMP, 0, nullptr, nullptr}, + {false, GGML_OP_CONV_TRANSPOSE_1D, 0, nullptr, nullptr}, + {false, GGML_OP_IM2COL, 0, nullptr, nullptr}, + {false, GGML_OP_IM2COL_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_CONV_2D_DW, 0, nullptr, nullptr}, + {false, GGML_OP_CONV_TRANSPOSE_2D, 0, nullptr, nullptr}, + {false, GGML_OP_POOL_1D, 0, nullptr, nullptr}, + {true, GGML_OP_POOL_2D, 1, "ggmlop_dsp_pool2d", ggmlop_dsp_pool2d}, + {false, GGML_OP_POOL_2D_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_UPSCALE, 0, nullptr, nullptr}, + {false, GGML_OP_PAD, 0, nullptr, nullptr}, + {false, GGML_OP_PAD_REFLECT_1D, 0, nullptr, nullptr}, + {false, GGML_OP_ROLL, 0, nullptr, nullptr}, + {false, GGML_OP_ARANGE, 0, nullptr, nullptr}, + {false, GGML_OP_TIMESTEP_EMBEDDING, 0, nullptr, nullptr}, + {false, GGML_OP_ARGSORT, 0, nullptr, nullptr}, + {false, GGML_OP_LEAKY_RELU, 0, nullptr, nullptr}, + {false, GGML_OP_FLASH_ATTN_EXT, 0, nullptr, nullptr}, + {false, GGML_OP_FLASH_ATTN_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_SSM_CONV, 0, nullptr, nullptr}, + {false, GGML_OP_SSM_SCAN, 0, nullptr, nullptr}, + {false, GGML_OP_WIN_PART, 0, nullptr, nullptr}, + {false, GGML_OP_WIN_UNPART, 0, nullptr, nullptr}, + {false, GGML_OP_GET_REL_POS, 0, nullptr, nullptr}, + {false, GGML_OP_ADD_REL_POS, 0, nullptr, nullptr}, + {false, GGML_OP_RWKV_WKV6, 0, nullptr, nullptr}, + {false, GGML_OP_GATED_LINEAR_ATTN, 0, nullptr, nullptr}, + {false, GGML_OP_RWKV_WKV7, 0, nullptr, nullptr}, + {false, GGML_OP_UNARY, 0, nullptr, nullptr}, + {false, GGML_OP_MAP_CUSTOM1, 0, nullptr, nullptr}, + {false, GGML_OP_MAP_CUSTOM2, 0, nullptr, nullptr}, + {false, GGML_OP_MAP_CUSTOM3, 0, nullptr, nullptr}, + {false, GGML_OP_CUSTOM, 0, nullptr, nullptr}, + {false, GGML_OP_CROSS_ENTROPY_LOSS, 0, nullptr, nullptr}, + {false, GGML_OP_CROSS_ENTROPY_LOSS_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_OPT_STEP_ADAMW, 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_ABS), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_SGN), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_NEG), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_STEP), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_TANH), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_ELU), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_RELU), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_SIGMOID), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_GELU), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_GELU_ERF), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_GELU_QUICK), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_SILU), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_HARDSWISH), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_HARDSIGMOID), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_EXP), 0, nullptr, nullptr} +}; + +static_assert(ggmlhexagon_k_op_caps[GGML_OP_NONE].supported, "GGML_OP_NONE is not true"); +static_assert(ggmlhexagon_k_op_caps[GGML_OP_ADD].supported, "GGML_OP_ADD is not true"); +static_assert(ggmlhexagon_k_op_caps[GGML_OP_MUL_MAT].supported, "GGML_OP_MUL_MAT is not true"); +static_assert(ggmlhexagon_k_op_caps[GGML_OP_SOFT_MAX].supported, "GGML_OP_SOFT_MAX is not true"); +static_assert(std::size(ggmlhexagon_k_op_caps) 
== (static_cast(GGML_OP_COUNT) + static_cast(GGML_UNARY_OP_COUNT)), + "pls check ggmlhexagon_k_op_caps and ensure is corresponding to latest ggml.h"); + +static int32_t g_qnntensor_idx = 0; //ensure every QNN tensor name is unique +static int32_t g_qnnopcfg_idx = 0; //ensure every QNN opconfig name is unique + +// ================================================================================================= +// section-2: ggml-hexagon internal troubleshooting and profiler function/class +// ================================================================================================= +static const char * ggmlhexagon_get_hwaccel_approach_name(int hwaccle_approach) { + switch (hwaccle_approach) { + case HWACCEL_QNN: + return "HWACCEL_QNN"; + case HWACCEL_QNN_SINGLEGRAPH: + return "HWACCEL_QNN_SINGLEGRAPH"; + case HWACCEL_CDSP: + return "HWACCEL_CDSP"; + default: + return "unknown hwaccel approach"; + } +} + +static void ggmlhexagon_get_timestring(char * p_currenttime) { + if (nullptr == p_currenttime) + return; + + auto time_to_string = [](const std::chrono::system_clock::time_point & tp)->std::string { + auto as_time_t = std::chrono::system_clock::to_time_t(tp); + struct tm tm; + + localtime_r(&as_time_t, &tm); + + std::chrono::milliseconds ms = std::chrono::duration_cast(tp.time_since_epoch()); + char buf[GGMLHEXAGON_TMPBUF_LEN]; + memset(buf, 0, GGMLHEXAGON_TMPBUF_LEN); + snprintf(buf, sizeof(buf), "%04d-%02d-%02d,%02d:%02d:%02d", + tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec); + GGML_UNUSED(ms); + return buf; + }; + + std::chrono::system_clock::time_point tp = std::chrono::system_clock::now(); + snprintf(p_currenttime, GGMLHEXAGON_TMPBUF_LEN, "%s", time_to_string(tp).c_str()); +} + +static void ggmlhexagon_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) 
{ + static std::mutex ggmlhexagon_log_internal_mutex; + static char s_ggmlhexagon_log_internal_buf[GGMLHEXAGON_LOGBUF_LEN]; + + GGML_UNUSED(file); +#if !(defined __ANDROID__) || !(defined ANDROID) + GGML_UNUSED(level); +#endif + { + std::lock_guard lock(ggmlhexagon_log_internal_mutex); + va_list args; + va_start(args, format); + int len_prefix = snprintf(s_ggmlhexagon_log_internal_buf, GGMLHEXAGON_LOGBUF_LEN, "[%s, %d]: ", func, line); + int len = vsnprintf(s_ggmlhexagon_log_internal_buf + len_prefix, GGMLHEXAGON_LOGBUF_LEN - len_prefix, format, args); + if (len < (GGMLHEXAGON_LOGBUF_LEN - len_prefix)) { +#if (defined __ANDROID__) || (defined ANDROID) + __android_log_print(ANDROID_LOG_INFO, PROJECT_NAME, "%s\n", s_ggmlhexagon_log_internal_buf); + if (GGML_LOG_LEVEL_INFO == level) { + printf("%s\n", s_ggmlhexagon_log_internal_buf); + } +#else + //for Snapdragon based WoA(Windows on ARM) device or Linux + printf("%s\n", s_ggmlhexagon_log_internal_buf); +#endif + } + va_end(args); + } +} + +static void ggmlhexagon_get_processname(char * p_name) { + if (nullptr == p_name) + return; + + char tmpbuf[GGMLHEXAGON_TMPBUF_LEN]; + memset(tmpbuf, 0, GGMLHEXAGON_TMPBUF_LEN); +#if defined(__ANDROID__) || defined(__linux__) + int result = readlink("/proc/self/exe", tmpbuf, GGMLHEXAGON_TMPBUF_LEN - 1); + if (result < 0) { + GGMLHEXAGON_LOG_WARN("failed to get process name, reason:%s", strerror(errno)); + return; + } + GGMLHEXAGON_LOG_DEBUG("process name %s", tmpbuf); + const char * realname = strrchr(tmpbuf, '/') + 1; + GGMLHEXAGON_LOG_DEBUG("process name %s", realname); + snprintf(p_name, GGMLHEXAGON_TMPBUF_LEN, "%s", realname); +#endif +} + +static bool ggmlhexagon_is_llamabench_running() { + char processname[GGMLHEXAGON_TMPBUF_LEN]; + memset(processname, 0, GGMLHEXAGON_TMPBUF_LEN); + + ggmlhexagon_get_processname(processname); + if (0 != processname[0] && 0 != processname[1] && 0 != processname[10]) { + if (0 == memcmp(processname, "llama-bench", strlen("llama-bench"))) { + return true; + } + if (0 == memcmp(processname, "test-thread-safety", strlen("test-thread-safety"))) { + return true; + } + } + return false; +} + +static void ggmlhexagon_print_tensors_info(const char * func_name, const ggml_backend_hexagon_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) { + //skip sanity check of params because of performance concern + if (0 == g_hexagon_appcfg.dump_op_info) { + if (0 == g_hexagon_appcfg.print_tensors_info) + return; + } + + if (nullptr != func_name && nullptr != ctx) { + GGMLHEXAGON_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name); + } + if (nullptr != src0) { + GGMLHEXAGON_LOG_DEBUG( + "%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->ne[3], + src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); + } + if (nullptr != src1) { + GGMLHEXAGON_LOG_DEBUG( + "%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->ne[3], + src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]); + } + GGMLHEXAGON_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + 
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]); + GGMLHEXAGON_LOG_DEBUG("\n"); +} + +static void ggmlhexagon_dump_op_info(const struct ggml_tensor * tensor) { + //skip sanity check of params because of performance concern + if (0 == g_hexagon_appcfg.dump_op_info) + return; + + const struct ggml_tensor * src0 = tensor->src[0]; + struct ggml_tensor * src1 = tensor->src[1]; + struct ggml_tensor * dst = const_cast(tensor); + GGMLHEXAGON_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), ggml_type_name(tensor->type)); + ggmlhexagon_print_tensors_info(nullptr, nullptr, src0, src1, dst); +} + +static void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor) { + float value = 0; + std::ostringstream tmposs; + if (tensor->type == GGML_TYPE_F32) { + for (int h = 0; h < tensor->ne[3]; h++) { + for (int i = 0; i < tensor->ne[2]; i++) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + value = ((float *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + + j * tensor->ne[0] + k]; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value + << " "; + } + if (strlen(tmposs.str().c_str()) <= (GGMLHEXAGON_LOGBUF_LEN - 96)) { + GGMLHEXAGON_LOG_DEBUG("%s\n", tmposs.str().c_str()); + } + tmposs.clear(); + tmposs.str(""); + } + } + } + } + + GGMLHEXAGON_LOG_DEBUG("\n"); +} + +static void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, const char * name) { + GGMLHEXAGON_LOG_DEBUG("dump ggml tensor %s(%s)\n", name, tensor->name); + GGMLHEXAGON_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64", nb = (%5zi, %5zi, %5zi, %5zi)\n", + name, + tensor->type, ggml_type_name(tensor->type), + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], + tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[2]); + ggmlhexagon_dump_tensor_elements(tensor); + + GGMLHEXAGON_LOG_DEBUG("\n"); +} + +//a simple high-cohesion and low-coupling class to collect necessary profiler data and visualize NPU performance accordingly +class hexagon_profiler { +public: + static hexagon_profiler & get_instance() { + //make thread-safety without using complex dynamic resource management + static hexagon_profiler instance; + return instance; + } + +public: + void profiler_init(int profiler_threshold_duration, int profiler_threshold_counts) { + reset(); + //here is not accurate profiler start time because inference wasn't launched at the moment + _profiler_starttime = ggml_time_us(); + + _profiler_threshold_duration = profiler_threshold_duration; + _profiler_threshold_counts = profiler_threshold_counts; + + std::string filename = std::string(g_hexagon_appcfg.runtime_libpath) + "/"; + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + if (g_hexagon_appcfg.thread_counts > 1) { + //multi-threading feature enabled on cDSP side + if (0 == g_hexagon_appcfg.enable_rpc_ion_mempool) { + filename = filename + "hexagon_perf_cdsp_mt.dat"; + } else { + filename = filename + "hexagon_perf_cdsp_ion_mt.dat"; + } + } else { + if (0 == g_hexagon_appcfg.enable_rpc_ion_mempool) { + filename = filename + "hexagon_perf_cdsp.dat"; + } else { + filename = filename + "hexagon_perf_cdsp_ion.dat"; + } + } + } else { + filename = filename + "hexagon_perf_qnn.dat"; + } + GGMLHEXAGON_LOG_DEBUG("profiler name:%s", filename.c_str()); + const char * profiler_filename = filename.c_str(); + _fp_profile_file = fopen(profiler_filename, "w"); + if (nullptr == _fp_profile_file) { + GGMLHEXAGON_LOG_WARN("can't open profiler file %s, reason:%s", 
profiler_filename, strerror(errno)); + reset(); + return; + } else { + size_t written_size = 0; + char profiler_info[GGMLHEXAGON_TMPBUF_LEN]; + const char * prefix = "### starting hexagon profiler at "; + + written_size = fwrite(prefix, 1, strlen(prefix), _fp_profile_file); + if (written_size != strlen(prefix)) { + GGMLHEXAGON_LOG_WARN("write data to file %s failed, reason: %s", profiler_filename, strerror(errno)); + profiler_deinit(); + return; + } + + memset(profiler_info, 0, GGMLHEXAGON_TMPBUF_LEN); + ggmlhexagon_get_timestring(profiler_info); + written_size = fwrite(profiler_info, 1, strlen(profiler_info), _fp_profile_file); + if (written_size != strlen(profiler_info)) { + GGMLHEXAGON_LOG_WARN("write data to file %s failed, reason: %s", profiler_filename, strerror(errno)); + profiler_deinit(); + return; + } + fprintf(_fp_profile_file, "\n\n"); + fprintf(_fp_profile_file, + "#frame input max total avg elapse frame max total avg\n"); + fprintf(_fp_profile_file, + "# inference inference inference inference\n"); + fprintf(_fp_profile_file, + "#index len i-len i-len i-speed time time time time time\n"); + fprintf(_fp_profile_file, "\n\n"); + } + _enable_profiler = true; + } + + void profiler_deinit() { + if (nullptr != _fp_profile_file) { + fclose(_fp_profile_file); + _fp_profile_file = nullptr; + } + reset(); + } + +/** + * \param inference_time microseconds, inference time for a single GGML op + * \param inference_input_size bytes, total input data size for a single GGML op + * \param inference_output_size bytes, total output data size for a single GGML op + */ + void profiler_update_profilerdata(const char * ggml_opname, int inference_time, int inference_input_size, int inference_output_size) { + if (!_enable_profiler) + return; + + //1.get the accurate profiler starting time in this function when frame index is 0 + //2.update frame index in this function accordingly + profiler_update_frameindex(); + + int64_t elapse_time = ggml_time_us() - profiler_get_starttime(); + profiler_update_elapsetime(elapse_time); + if (elapse_time > (_profiler_threshold_duration * SIZE_IN_MB)) { + //do nothing when elapsed profiler time > profiler_duration in ggml-hexagon.cfg + return; + } + if (profiler_get_frame_index() >= _profiler_threshold_counts) { + //do nothing when frame_index >= profiler_counts in ggml-hexagon.cfg + return; + } + + if (inference_input_size > profiler_get_max_inputsize()) { + profiler_set_max_inputsize(inference_input_size); + } + + if (inference_output_size > profiler_get_max_outputsize()) { + profiler_set_max_outputsize(inference_output_size); + } + + if (inference_time > profiler_get_max_inferencetime()) { + profiler_set_max_inferencetime(inference_time); + } + + profiler_update_total_inputsize(inference_input_size); + profiler_update_total_outputsize(inference_output_size); + profiler_update_total_inferencetime(inference_time); + profiler_update_elapsetime(elapse_time); + + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + if (10 > _frame_index) { + //FIXME:why some initial profiler data in llama-cli looks unusual + //return; + } + } + + if (0 == elapse_time) { + //filter invalid profiler data + return; + } + + if (NULL != _fp_profile_file) { + fprintf(_fp_profile_file, "%-8d %-6d %-6d %-10ld %-11ld %-10ld %-12d %-9d %-11ld %-3ld\n", + profiler_get_frame_index(), + inference_input_size, + profiler_get_max_inputsize(), + profiler_get_total_inputputsize(), + profiler_get_total_inputputsize() / profiler_get_frame_index(), + + elapse_time, + inference_time, + 
profiler_get_max_inferencetime(), + profiler_get_total_inferencetime(), + profiler_get_total_inferencetime() / profiler_get_frame_index() + ); + } + + //print/compare NPU I/O performance between 8 Gen 3 and 8 Elite, to be removed in the future + char bps_string[GGMLHEXAGON_TMPBUF_LEN]; + memset(bps_string, 0, GGMLHEXAGON_TMPBUF_LEN); + profiler_get_bpsstring(_total_inputsize + _total_outputsize, elapse_time, bps_string); + GGMLHEXAGON_LOG_VERBOSE("I/O performance:%s", bps_string); + } + + int profiler_get_frame_index() { + return _frame_index; + } + + int profiler_get_threshold_count() { + return _profiler_threshold_counts; + } + +private: + void profiler_set_max_inputsize(int input_size) { + _max_inputsize = input_size; + } + + void profiler_set_max_outputsize(int output_size) { + _max_outputsize = output_size; + } + + void profiler_set_max_inferencetime(int inference_time) { + _max_inferencetime = inference_time; + } + + void profiler_update_frameindex() { + if (0 == _frame_index) { + _profiler_starttime = ggml_time_us(); + } + _frame_index += 1; + } + + void profiler_update_elapsetime(int64_t elapse_time_microseconds) { + _profiler_elapsetime = elapse_time_microseconds; + } + + void profiler_update_total_inferencetime(int inference_time) { + _total_inferencetime += inference_time; + } + + void profiler_update_total_inputsize(int input_size) { + _total_inputsize += input_size; + } + + void profiler_update_total_outputsize(int output_size) { + _total_outputsize += output_size; + } + + int profiler_get_max_inputsize() { + return _max_inputsize; + } + + int profiler_get_max_outputsize() { + return _max_outputsize; + } + + int profiler_get_max_inferencetime() { + return _max_inferencetime; + } + + int64_t profiler_get_total_inferencetime() { + return _total_inferencetime; + } + + int64_t profiler_get_total_inputputsize() { + return _total_inputsize; + } + + //might be used to calculate total I/O performance in the future + int64_t profiler_get_total_outputsize() { + return _total_outputsize; + } + + int64_t profiler_get_starttime() { + return _profiler_starttime; + } + + int64_t profiler_get_elapsedtime() { + return _profiler_elapsetime; + } + + void profiler_get_bpsstring(int64_t data_size, int64_t elapse_time_microseconds, char * bps_string) { + if (nullptr == bps_string) { + return; + } + + float bps = 0.0f; + bps = (data_size * SIZE_IN_MB * 1.0f) / (elapse_time_microseconds * 1.0f); + if (bps >= SIZE_IN_MB) { + snprintf(bps_string, GGMLHEXAGON_TMPBUF_LEN, "%.2f MiB/s", ((float) bps) / SIZE_IN_MB); + } else if (bps >= 1000) { + snprintf(bps_string, GGMLHEXAGON_TMPBUF_LEN, "%.1f KiB/s", ((float) bps) / 1000); + } else { + snprintf(bps_string, GGMLHEXAGON_TMPBUF_LEN, "%.2f B/s", bps); + } + } + + void reset() { + _frame_index = 0; + + _max_inputsize = 0; + _max_outputsize = 0; + _max_inferencetime = 0; + + _total_inputsize = 0; + _total_outputsize = 0; + _total_inferencetime = 0; + + _profiler_starttime = 0; + _profiler_elapsetime = 0; + _fp_profile_file = nullptr; + _enable_profiler = false; + _profiler_threshold_counts = 100; + _profiler_threshold_duration = 5; + } + +private: + hexagon_profiler() { + reset(); + } + + hexagon_profiler(const hexagon_profiler &) = delete; + + hexagon_profiler(const hexagon_profiler &&) = delete; + + hexagon_profiler & operator= (const hexagon_profiler &) = delete; + +private: + int _frame_index; + + int _max_inputsize; //bytes + int _max_outputsize; //bytes + int _max_inferencetime; //microseconds + + int64_t _total_inputsize; //bytes + int64_t _total_outputsize;
+
+//a simple perf class to probe NPU performance
+class hexagon_perf {
+public:
+    explicit hexagon_perf(std::string perf_name) : _perf_name(std::move(perf_name)) {}
+
+    hexagon_perf(std::string perf_name, const char * op_name, int input_size, int output_size)
+            : _perf_name(std::move(perf_name)),
+              _op_name(op_name),
+              _input_size(input_size),
+              _output_size(output_size) {
+    }
+
+    void start() {
+        if (0 == g_hexagon_appcfg.enable_perf)
+            return;
+        _begin_time = ggml_time_us();
+    }
+
+    //explicit start()/info() calls are used rather than an RAII scope guard
+    void info() {
+        if (0 == g_hexagon_appcfg.enable_perf) {
+            return;
+        }
+
+        _end_time = ggml_time_us();
+        _duration = (_end_time - _begin_time);
+        //the following check is useful for other developers and AI experts although:
+        //  it deviates from the original logic
+        //  it's not mandatory
+        //  it required exposing two public functions in the hexagon_profiler class
+        if (g_hexagon_profiler.profiler_get_frame_index() <= g_hexagon_profiler.profiler_get_threshold_count()) {
+            const char * devname = ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend);
+            //safe here because the backend id was already validated in ggml_backend_hexagon_device_init_backend
+            if (g_hexagon_appcfg.hexagon_backend != HEXAGON_BACKEND_GGML) {
+                devname += 16; //skip the 16-char "HEXAGON_BACKEND_" prefix
+            }
+            GGMLHEXAGON_LOG_VERBOSE("inference duration of %s through %s: %lld microseconds",
+                                    _perf_name.c_str(), devname, (long long)_duration);
+        }
+
+        //update profiler data
+        g_hexagon_profiler.profiler_update_profilerdata(_op_name, (int)_duration, _input_size, _output_size);
+    }
+
+private:
+    hexagon_perf() = delete;
+    hexagon_perf(const hexagon_perf &) = delete;
+    hexagon_perf(hexagon_perf &&) = delete;
+    hexagon_perf & operator= (const hexagon_perf &) = delete;
+
+private:
+    int64_t _begin_time = 0LL;
+    int64_t _end_time   = 0LL;
+    int64_t _duration   = 0LL;
+    std::string _perf_name;
+    const char * _op_name = nullptr;
+    int _input_size  = 0;
+    int _output_size = 0;
+};
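+
+//Illustrative usage sketch (not part of this patch): hexagon_perf relies on
+//explicit calls around the offloaded op; the names below are hypothetical.
+//  hexagon_perf op_perf("ggml_op_mulmat", "GGML_OP_MUL_MAT", input_bytes, output_bytes);
+//  op_perf.start();
+//  /* ... run the op on the cDSP ... */
+//  op_perf.info(); //logs the duration and feeds g_hexagon_profiler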
+
+//a simple class to load/set running configurations in ggml-hexagon.cfg
+class hexagon_appcfg {
+public:
+    hexagon_appcfg() {}
+
+    void dump(std::function<void(const std::string &, const std::string &, const std::string &)> worker) {
+        if (!_load_success) {
+            GGMLHEXAGON_LOG_INFO("hexagon appcfg file %s not loaded", _cfg_filename.c_str());
+            return;
+        }
+        auto iter = _hexagon_appcfg.begin();
+        while (iter != _hexagon_appcfg.end()) {
+            auto kv_iter = iter->second.begin();
+            while (kv_iter != iter->second.end()) {
+                worker(iter->first, kv_iter->first, kv_iter->second);
+                ++kv_iter;
+            }
+            ++iter;
+        }
+    }
+
+    bool load(const std::string & file_name) {
+        if (file_name.empty()) {
+            return false;
+        }
+        _cfg_filename = file_name;
+        std::ifstream in;
+        std::string line;
+        in.open(file_name.c_str());
+        if (not in.is_open()) {
+            GGMLHEXAGON_LOG_WARN("can't open file %s", file_name.c_str());
+            return false;
+        }
+        while (getline(in, line)) {
+            std::string section, key, value;
+            if (not parse_line(line, section, key, value)) {
+                continue;
+            }
+            set_section_keyvalue(section, key, value);
+        }
+        _load_success = true;
+        return true;
+    }
+
+    void get_stringvalue(const std::string & section, const std::string & key, std::string & value, std::string default_value) {
+        value = default_value;
+        if (_hexagon_appcfg.find(section) == _hexagon_appcfg.end()) {
+            return;
+        }
+        if (_hexagon_appcfg[section].find(key) == _hexagon_appcfg[section].end()) {
+            return;
+        }
+        value = _hexagon_appcfg[section][key];
+    }
+
+    void get_intvalue(const std::string & section, const std::string & key, int & value, int default_value) {
+        value = default_value;
+        if (_hexagon_appcfg.find(section) == _hexagon_appcfg.end()) {
+            return;
+        }
+        if (_hexagon_appcfg[section].find(key) == _hexagon_appcfg[section].end()) {
+            return;
+        }
+        value = atol(_hexagon_appcfg[section][key].c_str());
+    }
+
+    bool modify_hexagon_config(std::string & cfg_filename, int new_hexagon_backend, int new_hwaccel_approach, int new_mulmat_algotype) {
+        std::ifstream inputfile(cfg_filename);
+        if (!inputfile.is_open()) {
+            GGMLHEXAGON_LOG_WARN("can't open file %s", cfg_filename.c_str());
+            return false;
+        }
+
+        std::string filedata = "";
+
+        std::string line;
+        std::string backupline;
+        bool is_rewrite = false;
+        bool is_found   = false;
+        bool is_key     = true;
+        std::string key;
+        std::string value;
+        std::string newvalue;
+        while (std::getline(inputfile, line)) {
+            is_found   = false;
+            backupline = line;
+            trim(line);
+            if (0 == line.rfind("#", 0)) {
+                filedata += backupline;
+                filedata += "\n";
+                continue;
+            }
+
+            newvalue = "";
+            if (0 == line.rfind("hexagon_backend", 0)) {
+                if (new_hexagon_backend >= 0) {
+                    is_found   = true;
+                    is_rewrite = true;
+                    newvalue   = std::to_string(new_hexagon_backend);
+                }
+            }
+
+            if (0 == line.rfind("hwaccel_approach", 0)) {
+                //compatible with previous logic
+                if (new_hwaccel_approach >= 0) {
+                    is_found   = true;
+                    is_rewrite = true;
+                    newvalue   = std::to_string(new_hwaccel_approach);
+                }
+            }
+
+            if (0 == line.rfind("mulmat_algotype", 0)) {
+                //compatible with previous logic
+                if (new_mulmat_algotype >= 0) {
+                    is_found   = true;
+                    is_rewrite = true;
+                    newvalue   = std::to_string(new_mulmat_algotype);
+                }
+            }
+
+            if (is_found) {
+                is_key = true;
+                key    = "";
+                value  = "";
+
+                for (size_t i = 0; i < line.size(); ++i) {
+                    if (line[i] == '=') {
+                        is_key = false;
+                        continue;
+                    }
+                    if (is_key) {
+                        key += line[i];
+                    } else {
+                        value += line[i];
+                    }
+                }
+                trim(key);
+                trim(value);
+                GGMLHEXAGON_LOG_VERBOSE("key %s value %s\n", key.c_str(), value.c_str());
+                GGMLHEXAGON_LOG_VERBOSE("key %s new value %s\n", key.c_str(), newvalue.c_str());
+                backupline = key + " = " + newvalue;
+            }
+            filedata += backupline;
+            filedata += "\n";
+        }
+        inputfile.close();
+
+        if (is_rewrite) {
+            std::ofstream outputfile;
+            outputfile.open(cfg_filename);
+            outputfile << filedata;
+            outputfile.close();
+        }
+        return true;
+    }
+
+    //compatible with previous code
+    bool modify_hexagon_config(std::string & cfg_filename, int new_hexagon_backend, int new_hwaccel_approach) {
+        return modify_hexagon_config(cfg_filename, new_hexagon_backend, new_hwaccel_approach, -1);
+    }
+
+private:
+    void ltrim(std::string & str) {
+        if (str.empty()) return;
+        size_t len = 0;
+        const char * temp = str.c_str();
+        while (*temp && isblank(*temp)) {
+            ++len;
+            ++temp;
+        }
+        if (len > 0) str.erase(0, len);
+    }
+
+    void rtrim(std::string & str) {
+        if (str.empty()) return;
+        size_t len = str.length();
+        size_t pos = len;
+        while (pos > 0) {
+            if (not isblank(str[pos - 1])) {
+                break;
+            }
+            --pos;
+        }
+        if (pos != len) str.erase(pos);
+    }
+
+    void trim(std::string & str) {
+        ltrim(str);
+        rtrim(str);
+    }
+
+    void set_section_keyvalue(std::string & section, std::string & key, std::string & value) {
+        if (_hexagon_appcfg.find(section) == _hexagon_appcfg.end()) {
+            std::unordered_map<std::string, std::string> kv_map;
+            _hexagon_appcfg[section] = kv_map;
+        }
+        if (key != "" && value != "") _hexagon_appcfg[section][key] = value;
+    }
+
+    bool parse_line(std::string & line, std::string & section, std::string & key, std::string & value) {
+        static std::string cur_section = "";
+        std::string nodes[2] = {"#", ";"};
+        for (int i = 0; i < 2; ++i) {
+            std::string::size_type pos = line.find(nodes[i]);
+            if (pos != std::string::npos) line.erase(pos);
+        }
+        trim(line);
+        if (line == "") return false;
+        if (line[0] == '[' && line[line.size() - 1] == ']') {
+            section = line.substr(1, line.size() - 2);
+            trim(section);
+            cur_section = section;
+            return false;
+        }
+        if (cur_section == "") return false;
+        bool is_key = true;
+        for (size_t i = 0; i < line.size(); ++i) {
+            if (line[i] == '=') {
+                is_key = false;
+                continue;
+            }
+            if (is_key) {
+                key += line[i];
+            } else {
+                value += line[i];
+            }
+        }
+        section = cur_section;
+        trim(key);
+        trim(value);
+
+        //strip surrounding quotes: "1.00" -> 1.00 (guard against an empty value)
+        if (value.size() >= 2 && value.front() == '"' && value.back() == '"') {
+            value.erase(0, 1);             //erase the leading "
+            value.erase(value.size() - 1); //erase the trailing "
+        }
+
+        return true;
+    }
+
+private:
+    hexagon_appcfg(const hexagon_appcfg &) = delete;
+    hexagon_appcfg(hexagon_appcfg &&) = delete;
+    hexagon_appcfg & operator= (const hexagon_appcfg &) = delete;
+
+private:
+    std::unordered_map<std::string, std::unordered_map<std::string, std::string>> _hexagon_appcfg;
+    bool _load_success = false;
+    std::string _cfg_filename;
+};
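+
+//For reference, a minimal ggml-hexagon.cfg that load()/parse_line() above can
+//digest looks like this (key names match ggmlhexagon_load_cfg(); the values
+//shown are illustrative):
+//  [general]
+//  version = "1.00"
+//  enable_perf = 1
+//  hwaccel_approach = 2
+//  hexagon_backend = 3
+//  [cdsp]
+//  thread_counts = 4
+//  mulmat_algotype = 0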
+
+// =================================================================================================
+//  section-3: helper functions for WoA (Windows on ARM)
+// =================================================================================================
+#if !defined(__ANDROID__) && !defined(__linux__)
+#define RTLD_GLOBAL 0x100
+#define RTLD_LOCAL  0x000
+#define RTLD_LAZY   0x000
+#define RTLD_NOW    0x001
+static void *       dlopen(const char * filename, int flag);
+static int          dlclose(void * handle);
+static void *       dlsym(void * handle, const char * name);
+static const char * dlerror(void);
+
+static const char * last_func = nullptr;
+static long last_err;
+static void * dlopen(const char * dll, int flags) {
+    HINSTANCE h = LoadLibraryA(dll);
+    GGML_UNUSED(flags);
+    if (h == NULL) {
+        last_err  = GetLastError();
+        last_func = "dlopen";
+    }
+    return h;
+}
+
+static int dlclose(void * h) {
+    if (!FreeLibrary((HINSTANCE)h)) {
+        last_err  = GetLastError();
+        last_func = "dlclose";
+        return -1;
+    }
+    return 0;
+}
+
+static void * dlsym(void * h, const char * name) {
+    FARPROC p = GetProcAddress((HINSTANCE)h, name);
+    if (!p) {
+        last_err  = GetLastError();
+        last_func = "dlsym";
+    }
+    return (void *)(intptr_t)p;
+}
+
+static const char * dlerror(void) {
+    static char str[512];
+    if (!last_err) return nullptr;
+
+    snprintf(str, 512, "%s error #%ld", last_func, last_err);
+    last_err  = 0;
+    last_func = nullptr;
+
+    return str;
+}
+#endif
+
+// =================================================================================================
+//  section-4: general helper functions
+// =================================================================================================
+static const char * ggmlhexagon_get_socmodel_desc(uint32_t soc_model) {
+    switch (soc_model) {
+        case SM7450:
+            return "SM7450";
+        case SM8350:
+            return "SM8350";
+        case SM8450:
+            return "SM8450";
+        case SM8475:
+            return "SM8475";
+        case SM8550:
+            return "SM8550";
+        case SM8650:
+            return "SM8650";
+        case SM8750:
+            return
"SM8750"; + default: + return "unknown"; + } +} + +//0x68 -> 68, 0x69 -> 69, 0x73 -> 73, 0x75 -> 75, 0x79 -> 79 +static size_t ggmlhexagon_htparch_hex_to_decimal(size_t htp_arch) { + //naive algorithm + int a = htp_arch / 16; + int b = htp_arch % 16; + return a * 10 + b; +} + +static const char * ggmlhexagon_get_htparch_desc(size_t htp_arch) { + switch (htp_arch) { + case V68: + return "QCOM_HTP_V68"; + case V69: + return "QCOM_HTP_V69"; + case V73: + return "QCOM_HTP_V73"; + case V75: + return "QCOM_HTP_V75"; + case V79: + return "QCOM_HTP_V79"; + default: + return "unknown"; + } +} + +static struct qcom_socinfo * ggmlhexagon_get_socinfo_from_socmodel(uint32_t soc_model) { + size_t items = sizeof(g_qnn_soc_info_table) / sizeof(g_qnn_soc_info_table[0]); + for (size_t idx = 0; idx < items; idx++) { + if (soc_model == g_qnn_soc_info_table[idx].soc_model) { + return &g_qnn_soc_info_table[idx]; + } + } + return nullptr; +} + +static struct qcom_socinfo * ggmlhexagon_get_socinfo_from_socmodel(size_t htp_arch) { + size_t items = sizeof(g_qnn_soc_info_table) / sizeof(g_qnn_soc_info_table[0]); + for (size_t idx = 0; idx < items; idx++) { + if (htp_arch == g_qnn_soc_info_table[idx].htp_arch) { + return &g_qnn_soc_info_table[idx]; + } + } + return nullptr; +} + +static inline uint32_t ggmlqnn_get_tensor_data_size(const ggml_tensor * tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = ggml_get_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; + } + + return data_size; + */ + return ggml_nbytes(tensor); +} + +static inline bool ggmlqnn_is_valid_params(ggml_backend_hexagon_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { + if ((nullptr == ctx) || (nullptr == src0) || (nullptr == dst)) { + GGMLHEXAGON_LOG_WARN("invalid params\n"); + return false; + } + + qnn_instance * instance = ctx->instance; + if (nullptr == instance) { + GGMLHEXAGON_LOG_WARN("invalid params\n"); + return false; + } + + return true; +} + +static size_t ggmlhexagon_get_system_total_memory_in_bytes() { +#if defined(__ANDROID__) || defined(__linux__) + struct sysinfo info = {}; + if (0 == sysinfo(&info)) { + return (info.totalram + info.totalswap) * info.mem_unit; + } + size_t pages = (size_t)sysconf(_SC_PHYS_PAGES); + size_t page_size = (size_t)sysconf(_SC_PAGE_SIZE); + + return pages * page_size; +#else + //TODO: Snapdragon based WoA(Windows on ARM) + MEMORYSTATUSEX statex; + statex.dwLength = sizeof(statex); + if (GlobalMemoryStatusEx(&statex)) { + GGMLHEXAGON_LOG_INFO("total physical mem:%llu Mb", statex.ullTotalPhys >> 20); + GGMLHEXAGON_LOG_INFO("avail physical mem:%llu Mb", statex.ullAvailPhys >> 20); + return statex.ullTotalPhys; + } + return 0; +#endif +} + +static size_t ggmlhexagon_get_system_free_memory_in_bytes() { +#if defined(__ANDROID__) || defined(__linux__) + struct sysinfo info = {}; + if (0 == sysinfo(&info)) { + return (info.freeram + info.freeswap) * info.mem_unit; + } + size_t avail_pages = (size_t)sysconf(_SC_AVPHYS_PAGES); + size_t page_size = (size_t)sysconf(_SC_PAGE_SIZE); + + return avail_pages * page_size; +#else + //TODO: Snapdragon based WoA(Windows on ARM) + MEMORYSTATUSEX statex; + statex.dwLength = sizeof(statex); + if (GlobalMemoryStatusEx(&statex)) { + GGMLHEXAGON_LOG_INFO("total physical mem:%llu Mb", statex.ullTotalPhys >> 20); + GGMLHEXAGON_LOG_INFO("avail physical mem:%llu Mb", statex.ullAvailPhys >> 20); + return statex.ullAvailPhys; + } + return 0; +#endif +} + +static 
bool ggmlhexagon_same_types(const ggml_backend_hexagon_context * ctx, const ggml_tensor * op_tensor) {
+    GGML_UNUSED(ctx);
+    ggml_tensor * src0 = op_tensor->src[0];
+    ggml_tensor * src1 = op_tensor->src[1];
+    if (nullptr != src1) {
+        if (src0->type != op_tensor->type || src1->type != op_tensor->type) {
+            return false;
+        }
+    } else {
+        if (src0->type != op_tensor->type) {
+            return false;
+        }
+    }
+
+    if (src0->type != GGML_TYPE_F32)
+        return false;
+
+    return true;
+}
+
+static const char * ggmlhexagon_get_ggml_type_name(ggml_type type) {
+    const auto * traits = ggml_get_type_traits(type);
+    return traits->type_name;
+}
+
+static void ggmlhexagon_append_tensor_dimensions(const ggml_tensor * tensor, std::string & output) {
+    char buffer[GGMLHEXAGON_TMPBUF_LEN] = {};
+    const char * type_name = ggmlhexagon_get_ggml_type_name(tensor->type);
+    int len = 0;
+    switch (ggml_n_dims(tensor)) {
+        case 1:
+            len = snprintf(buffer, sizeof(buffer), "%ldx1%s", (long)tensor->ne[0], type_name);
+            break;
+        case 2:
+            len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], type_name);
+            break;
+        case 3:
+            len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1],
+                           (long)tensor->ne[2], type_name);
+            break;
+        case 4:
+        default:
+            len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1],
+                           (long)tensor->ne[2], (long)tensor->ne[3], type_name);
+            break;
+    }
+    GGML_ASSERT(len > 0 && len < (int)sizeof(buffer));
+    output.append(buffer, len);
+}
+
+static size_t ggmlhexagon_get_op_index(const ggml_tensor * tensor) {
+    if (tensor->op == GGML_OP_UNARY) {
+        return static_cast<size_t>(GGML_OP_COUNT) + static_cast<size_t>(ggml_get_unary_op(tensor));
+    }
+
+    return tensor->op;
+}
+
+static size_t ggmlhexagon_get_op_input_param_count(const ggml_tensor * op) {
+    auto op_index = ggmlhexagon_get_op_index(op);
+    GGML_ASSERT(op_index < std::size(ggmlhexagon_k_op_caps));
+    return ggmlhexagon_k_op_caps[op_index].input_param_count;
+}
+
+static void ggmlhexagon_get_opkey_from_op(const ggml_tensor * op, std::string & output) {
+    GGML_ASSERT(op->op != GGML_OP_NONE);
+    output += ggml_op_desc(op);
+    output += ggmlhexagon_get_ggml_type_name(op->type);
+    size_t param_count = ggmlhexagon_get_op_input_param_count(op);
+    for (size_t i = 0; i < param_count; ++i) {
+        auto * input = op->src[i];
+        if (!input) {
+            break;
+        }
+        output += '_';
+        ggmlhexagon_append_tensor_dimensions(input, output);
+    }
+}
+
+static void * ggmlhexagon_type_trait(ggml_backend_hexagon_context * ctx, ggml_tensor * op) {
+    const ggml_tensor * src0 = op->src[0];
+    const ggml_tensor * src1 = op->src[1];
+    ggml_tensor * dst = op;
+    const enum ggml_type src0_type = src0->type;
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+    GGML_ASSERT(nb00 == ggml_type_size(src0_type));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+    const int64_t ne_plane = ne01 * ne00;
+    const size_t desired_size = ((GGML_TYPE_F32 == src0_type) ?
0 : ne03 * ne02 * ne_plane * sizeof(float)); + ctx->desired_size = desired_size; + if (ctx->work_size < desired_size) { + ctx->work_data.reset(new char[desired_size]); + ctx->work_size = desired_size; + } + ctx->n_threads = std::thread::hardware_concurrency(); + void * wdata = ctx->work_data.get(); + // convert src0 to float + if (src0_type != GGML_TYPE_F32) { + const auto * type_traits = ggml_get_type_traits(src0_type); + ggml_to_float_t const to_float = type_traits->to_float; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *)src0->data + i02 * nb02 + i03 * nb03; + float * const wplane = (float *)wdata + i02 * ne_plane + i03 * ne02 * ne_plane; + + const int min_cols_per_thread = 4096; + const int min_rows_per_thread = std::max((int)(min_cols_per_thread / ne00), 1); + const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01 / min_rows_per_thread)), 1); + for (int i = 1; i < n_threads; i++) { + const int64_t start = i * ne01 / n_threads; + const int64_t end = (i + 1) * ne01 / n_threads; + if (start < end) { + ctx->tasks.push_back(std::async(std::launch::async, [=]() { + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *)x + i01 * nb01, wplane + i01 * ne00, ne00); + } + })); + } + } + { + // reuse the current thread for the first task + const int64_t start = 0; + const int64_t end = ne01 / n_threads; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00); + } + } + } + } + + // wait for all tasks to finish + for (auto &task: ctx->tasks) { + task.get(); + } + ctx->tasks.clear(); + } + return wdata; +} + +static void ggmlhexagon_set_runtime_path(size_t device, const std::string & path) { +#if defined(__ANDROID__) + if ((HEXAGON_BACKEND_QNNNPU == device) || (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach)) { + std::string lib_runtime_path = path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images"; + if (0 == setenv("LD_LIBRARY_PATH", lib_runtime_path.c_str(), 1)) { + GGMLHEXAGON_LOG_DEBUG("setenv LD_LIBRARY_PATH %s successfully", lib_runtime_path.c_str()); + } else { + GGMLHEXAGON_LOG_ERROR("setenv LD_LIBRARY_PATH %s failure", lib_runtime_path.c_str()); + } + + std::string adsp_runtime_path = path + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp"; + if (0 == setenv("ADSP_LIBRARY_PATH", adsp_runtime_path.c_str(), 1)) { + GGMLHEXAGON_LOG_DEBUG("setenv ADSP_LIBRARY_PATH %s successfully", adsp_runtime_path.c_str()); + } else { + GGMLHEXAGON_LOG_ERROR("setenv ADSP_LIBRARY_PATH %s failure", adsp_runtime_path.c_str()); + } + } else { + if (0 == setenv("LD_LIBRARY_PATH", + (path + + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + 1)) { + GGMLHEXAGON_LOG_DEBUG("%s backend setenv successfully\n", + ggml_backend_hexagon_get_devname(device)); + } else { + GGMLHEXAGON_LOG_ERROR("%s backend setenv failure\n", + ggml_backend_hexagon_get_devname(device)); + } + } +#endif +} + +static void ggmlhexagon_load_cfg() { + //this function can be called in various scenarios + static bool initialized = false; + if (initialized) { + GGMLHEXAGON_LOG_DEBUG("hexagon appcfg file already loaded\n"); + return; + } + char time_string[GGMLHEXAGON_TMPBUF_LEN]; + memset(time_string, 0, GGMLHEXAGON_TMPBUF_LEN); + ggmlhexagon_get_timestring(time_string); + GGMLHEXAGON_LOG_DEBUG("program running start time:%s", time_string); + std::string cfg_filename = 
std::string(g_hexagon_appcfg.runtime_libpath) + std::string(g_hexagon_appcfg.cfgfilename);
+
+    hexagon_appcfg hexagoncfg_instance;
+    hexagoncfg_instance.load(cfg_filename);
+    hexagoncfg_instance.dump([](const std::string & section, const std::string & key, const std::string & value) {
+        std::ostringstream tmposs;
+        tmposs << "section[" << std::setw(10) << std::left << section << "],[" << std::setw(25) << std::left << key << "] = [" << value << "]";
+        GGMLHEXAGON_LOG_VERBOSE("%s", tmposs.str().c_str());
+    });
+    std::string precision_mode;
+    std::string version;         //version of ggml-hexagon.cpp
+    std::string ggmldsp_version; //version of ggml-dsp.c
+    hexagoncfg_instance.get_stringvalue("general", "version", version, "1.00");
+    hexagoncfg_instance.get_stringvalue("general", "ggmldsp_version", ggmldsp_version, "0.62");
+    hexagoncfg_instance.get_intvalue("general", "enable_perf", g_hexagon_appcfg.enable_perf, 1);
+    hexagoncfg_instance.get_intvalue("general", "print_tensors_info", g_hexagon_appcfg.print_tensors_info, 0);
+    hexagoncfg_instance.get_intvalue("general", "dump_op_info", g_hexagon_appcfg.dump_op_info, 0);
+    hexagoncfg_instance.get_intvalue("general", "hwaccel_approach", g_hexagon_appcfg.hwaccel_approach, HWACCEL_CDSP);
+    hexagoncfg_instance.get_intvalue("general", "hexagon_backend", g_hexagon_appcfg.hexagon_backend, HEXAGON_BACKEND_CDSP);
+    hexagoncfg_instance.get_intvalue("general", "enable_q_mulmat", g_hexagon_appcfg.enable_q_mulmat, 0);
+    hexagoncfg_instance.get_intvalue("general", "enable_profiler", g_hexagon_appcfg.enable_profiler, 0);
+    hexagoncfg_instance.get_intvalue("general", "profiler_duration", g_hexagon_appcfg.profiler_duration, 5);
+    hexagoncfg_instance.get_intvalue("general", "profiler_counts", g_hexagon_appcfg.profiler_counts, 100);
+    hexagoncfg_instance.get_intvalue("general", "enable_pinned_memory", g_hexagon_appcfg.enable_pinned_memory, 0);
+
+    hexagoncfg_instance.get_intvalue("qnn", "hvx_threads", g_hexagon_appcfg.hvx_threads, 4);
+    hexagoncfg_instance.get_intvalue("qnn", "vtcm_size_in_mb", g_hexagon_appcfg.vtcm_size_in_mb, 8);
+    hexagoncfg_instance.get_intvalue("qnn", "enable_dlbc", g_hexagon_appcfg.enable_dlbc, 1);
+    hexagoncfg_instance.get_stringvalue("qnn", "precision_mode", precision_mode, "fp32");
+    hexagoncfg_instance.get_intvalue("qnn", "print_qnn_internal_log", g_hexagon_appcfg.print_qnn_internal_log, 0);
+
+    hexagoncfg_instance.get_intvalue("cdsp", "enable_rpc_ion_mempool", g_hexagon_appcfg.enable_rpc_ion_mempool, 0);
+    hexagoncfg_instance.get_intvalue("cdsp", "enable_all_q_mulmat", g_hexagon_appcfg.enable_all_q_mulmat, 0);
+    hexagoncfg_instance.get_intvalue("cdsp", "thread_counts", g_hexagon_appcfg.thread_counts, 4);
+    hexagoncfg_instance.get_intvalue("cdsp", "mulmat_algotype", g_hexagon_appcfg.mulmat_algotype, 0);
+
+    //bounded copy with guaranteed NUL termination (the original memcpy copied strlen()
+    //bytes without the terminator); assumes ggml_dsp_version is a fixed-size char array
+    snprintf(g_hexagon_appcfg.ggml_dsp_version, sizeof(g_hexagon_appcfg.ggml_dsp_version), "%s", ggmldsp_version.c_str());
+
+    GGMLHEXAGON_LOG_VERBOSE("load hexagon appcfg from %s", cfg_filename.c_str());
+    GGMLHEXAGON_LOG_VERBOSE("internal ggml_hexagon_version=%s", g_hexagon_appcfg.ggml_hexagon_version);
+    GGMLHEXAGON_LOG_VERBOSE("internal ggml_dsp_version=%s", g_hexagon_appcfg.ggml_dsp_version);
+    GGMLHEXAGON_LOG_VERBOSE("external ggml_hexagon_version=%s", version.c_str());
+    GGMLHEXAGON_LOG_VERBOSE("external ggml_dsp_version=%s", ggmldsp_version.c_str());
+    GGMLHEXAGON_LOG_VERBOSE("hwaccel_approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach,
+                            ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach));
GGMLHEXAGON_LOG_VERBOSE("hexagon_backend=%d(%s)", g_hexagon_appcfg.hexagon_backend, + ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend)); + GGMLHEXAGON_LOG_VERBOSE("runtime libpath=%s", g_hexagon_appcfg.runtime_libpath); + GGMLHEXAGON_LOG_VERBOSE("enable_perf=%d", g_hexagon_appcfg.enable_perf); + GGMLHEXAGON_LOG_VERBOSE("enable_profiler=%d", g_hexagon_appcfg.enable_profiler); + + if (precision_mode.find("fp16") != std::string::npos) { + g_hexagon_appcfg.precision_mode = 1; + } else { + g_hexagon_appcfg.precision_mode = 0; + } + + ggmlhexagon_set_runtime_path(HEXAGON_BACKEND_CDSP, g_hexagon_appcfg.runtime_libpath); + + if (1 == g_hexagon_appcfg.enable_profiler) { + //make sure this function is called only once + g_hexagon_profiler.profiler_init(g_hexagon_appcfg.profiler_duration, g_hexagon_appcfg.profiler_counts); + } + + initialized = true; +} + +void ggml_backend_hexagon_set_cfg(int new_hexagon_backend, int new_hwaccel_approach) { + if (new_hexagon_backend < 0 || new_hexagon_backend > HEXAGON_BACKEND_GGML) { + GGMLHEXAGON_LOG_WARN("invalid new_hexagon_backend"); + return; + } + if (new_hwaccel_approach < 0 || new_hwaccel_approach > HWACCEL_CDSP) { + GGMLHEXAGON_LOG_WARN("invalid new_hwaccel_approach"); + return; + } + std::string cfg_filename = std::string(g_hexagon_appcfg.runtime_libpath) + std::string(g_hexagon_appcfg.cfgfilename); + GGMLHEXAGON_LOG_VERBOSE("load hexagon appcfg from %s", cfg_filename.c_str()); + hexagon_appcfg hexagoncfg_instance; + GGMLHEXAGON_LOG_VERBOSE("set_hexagon_cfg with new_hexagon_backend %d, new_hwaccel_approach %d", new_hexagon_backend, new_hwaccel_approach); + hexagoncfg_instance.modify_hexagon_config(cfg_filename, new_hexagon_backend, new_hwaccel_approach); + hexagoncfg_instance.load(cfg_filename); + hexagoncfg_instance.dump([](const std::string & section, const std::string & key, const std::string value) { + std::ostringstream tmposs; + tmposs << "section[" << std::setw(10) << std::left << section << "],[" << std::setw(25) << std::left << key << "] = [" << value << "]"; + GGMLHEXAGON_LOG_VERBOSE("%s", tmposs.str().c_str()); + }); +} + +int ggml_backend_hexagon_get_mulmat_algotype() { + std::string cfg_filename = std::string(g_hexagon_appcfg.runtime_libpath) + std::string(g_hexagon_appcfg.cfgfilename); + hexagon_appcfg hexagoncfg_instance; + hexagoncfg_instance.load(cfg_filename); + hexagoncfg_instance.get_intvalue("cdsp", "mulmat_algotype", g_hexagon_appcfg.mulmat_algotype, 0); + return g_hexagon_appcfg.mulmat_algotype; +} + +/** + * troubleshooting peformance of mulmat on cDSP during development stage + */ +void ggml_backend_hexagon_set_mulmat_algotype(int new_mulmat_algotype) { + //the logic here is different with logic in the ggml_backend_hexagon_set_cfg(int new_hexagon_backend, int new_hwaccel_approach) + if (new_mulmat_algotype < 0) { + GGMLHEXAGON_LOG_WARN("invalid new_mulmat_algotype"); + return; + } + std::string cfg_filename = std::string(g_hexagon_appcfg.runtime_libpath) + std::string(g_hexagon_appcfg.cfgfilename); + GGMLHEXAGON_LOG_VERBOSE("load hexagon appcfg from %s", cfg_filename.c_str()); + hexagon_appcfg hexagoncfg_instance; + GGMLHEXAGON_LOG_VERBOSE("set_hexagon_cfg with new_mulmat_algotype %d", new_mulmat_algotype); + hexagoncfg_instance.modify_hexagon_config(cfg_filename, -1, -1, new_mulmat_algotype); + hexagoncfg_instance.load(cfg_filename); + hexagoncfg_instance.dump([](const std::string & section, const std::string & key, const std::string value) { + std::ostringstream tmposs; + tmposs << "section[" << std::setw(10) 
<< std::left << section << "],[" << std::setw(25) << std::left << key << "] = [" << value << "]"; + GGMLHEXAGON_LOG_VERBOSE("%s", tmposs.str().c_str()); + }); +} + +static bool ggmlhexagon_check_valid_appcfg() { + bool is_valid_appcfg = true; + + GGMLHEXAGON_LOG_DEBUG("user's specified hwaccel approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, + ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); + GGMLHEXAGON_LOG_DEBUG("user's specified hexagon_backend=%d", g_hexagon_appcfg.hexagon_backend); + if (g_hexagon_appcfg.hexagon_backend >= GGML_HEXAGON_MAX_DEVICES) { + GGMLHEXAGON_LOG_VERBOSE("using default ggml backend"); + is_valid_appcfg = false; + } + + if (HWACCEL_QNN_SINGLEGRAPH == g_hexagon_appcfg.hwaccel_approach) { + GGMLHEXAGON_LOG_VERBOSE("HWACCEL_QNN_SINGLEGRAPH not supported"); + is_valid_appcfg = false; + } + + if (HWACCEL_QNN == g_hexagon_appcfg.hwaccel_approach) { + if (HEXAGON_BACKEND_CDSP == g_hexagon_appcfg.hexagon_backend) { + GGMLHEXAGON_LOG_VERBOSE("hexagon_backend HEXAGON_BACKEND_CDSP must match with hwaccel_approach HWACCEL_CDSP"); + is_valid_appcfg = false; + } + } + + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + if ((HEXAGON_BACKEND_CDSP != g_hexagon_appcfg.hexagon_backend) && (HEXAGON_BACKEND_GGML != g_hexagon_appcfg.hexagon_backend)) { + GGMLHEXAGON_LOG_VERBOSE("hwaccel_approach HWACCEL_CDSP must match with hexagon_backend HEXAGON_BACKEND_CDSP"); + is_valid_appcfg = false; + } + + if (1 == g_hexagon_appcfg.enable_all_q_mulmat) { + if (0 == g_hexagon_appcfg.enable_q_mulmat) { + GGMLHEXAGON_LOG_DEBUG("ensure set enable_q_mulmat to 1 firstly when set enable_all_q_mulmat to 1 if you are not currently comparing the performance of GGML_OP_ADD between QNNCPU, QNNGPU, QNNNPU, cDSP, ggml"); + //is_valid_appcfg = false; + } + } + } + + if (!is_valid_appcfg) { + GGMLHEXAGON_LOG_VERBOSE("it seems there is non-default configuration in ggml-hexagon.cfg, will using the default ggml backend accordingly"); + } + return is_valid_appcfg; +} + +static void ggmlhexagon_probe_dspinfo(ggml_backend_hexagon_context * ctx); +static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * ctx) { + char timestamp[GGMLHEXAGON_TMPBUF_LEN]; + memset(timestamp, 0, GGMLHEXAGON_TMPBUF_LEN); + + if (ggmlhexagon_is_llamabench_running()) { + //make llama-bench happy + return; + } + + GGMLHEXAGON_LOG_INFO("ggml_hexagon_version: %s", g_hexagon_appcfg.ggml_hexagon_version); + GGMLHEXAGON_LOG_INFO("ggml_dsp_version: %s", g_hexagon_appcfg.ggml_dsp_version); + GGMLHEXAGON_LOG_INFO("hwaccel approach: %d(%s)", g_hexagon_appcfg.hwaccel_approach, + ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); + GGMLHEXAGON_LOG_INFO("hexagon_backend: %d(%s)", g_hexagon_appcfg.hexagon_backend, + ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend)); + GGMLHEXAGON_LOG_INFO("enable pinned_memory: %s", g_hexagon_appcfg.enable_pinned_memory ? "YES" : "NO"); + ggmlhexagon_get_timestring(timestamp); + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? "YES" : "NO"); + GGMLHEXAGON_LOG_INFO("using rpc ion memory pool: %s", g_hexagon_appcfg.enable_rpc_ion_mempool ? 
"YES" : "NO"); + GGMLHEXAGON_LOG_INFO("thread_counts with HWACCEL_CDSP: %d", g_hexagon_appcfg.thread_counts); + GGMLHEXAGON_LOG_INFO("mulmat algo type on cDSP : %d", g_hexagon_appcfg.mulmat_algotype); + ggmlhexagon_probe_dspinfo(ctx); + } else { + GGMLHEXAGON_LOG_INFO("thread_counts with HWACCEL_QNN: %d", g_hexagon_appcfg.hvx_threads); + GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? "YES" : "NO"); + } + GGMLHEXAGON_LOG_INFO("running timestamp:%s", timestamp); + + if (1 == g_hexagon_appcfg.enable_profiler) { + //make sure this function is called only once + g_hexagon_profiler.profiler_deinit(); + } +} + +// ================================================================================================= +// section-5: QNN helper function/class +// ================================================================================================= +//make sure every QNN tensor/opcfg name is unique, threadsafe is not required at the moment +static void ggmlqnn_reset_idx() { + g_qnntensor_idx = 0; + g_qnnopcfg_idx = 0; +} + +static void ggmlqnn_inc_idx(int idx_type) { + switch (idx_type) { + case QNN_TENSOR_INDEX: + g_qnntensor_idx++; + break; + case QNN_OPCFG_INDEX: + g_qnnopcfg_idx++; + break; + default: + break; + } +} + +static int32_t ggmlqnn_get_idx(int idx_type) { + switch (idx_type) { + case QNN_TENSOR_INDEX: + return g_qnntensor_idx; + case QNN_OPCFG_INDEX: + return g_qnnopcfg_idx; + default: + break; + } + + //it's not make sense, just for fix compiler warning + return g_qnntensor_idx; +} + +static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) { + return offset % alignment == 0 ? offset + : offset + + (static_cast(alignment) - + offset % static_cast(alignment)); +} + +static size_t ggmlqnn_memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) { + if (!dst || !src || !dst_size || !copy_size) + return 0; + + size_t min_size = dst_size < copy_size ? 
dst_size : copy_size; + + memcpy(dst, src, min_size); + + return min_size; +} + +static char * ggmlqnn_strndup(const char * source, size_t maxlen) { +#if defined(__ANDROID__) || defined(__linux__) + return strndup(source, maxlen); +#else + //TODO:behaviour is not exactly same to Android&Linux + GGML_UNUSED(maxlen); + return strdup(source); +#endif +} + +static inline uint32_t ggmlqnn_get_tensorid(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.id; + } + return 0u; +} + +static inline const char * ggmlqnn_get_tensorname(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.name; + } + return nullptr; +} + +static inline Qnn_TensorType_t ggmlqnn_get_tensortype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.type; + } + return QNN_TENSOR_TYPE_UNDEFINED; +} + +static inline Qnn_TensorDataFormat_t ggmlqnn_get_tensor_dataformat(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataFormat; + } + return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; +} + +static inline Qnn_DataType_t ggmlqnn_get_tensor_datatype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataType; + } + return QNN_DATATYPE_UNDEFINED; +} + +static inline Qnn_QuantizeParams_t ggmlqnn_get_tensor_quantparams(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.quantizeParams; + } + return QNN_QUANTIZE_PARAMS_INIT; +} + +static inline uint32_t ggmlqnn_get_tensor_rank(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.rank; + } + return 0u; +} + +static inline uint32_t * ggmlqnn_get_tensor_dimensions(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dimensions; + } + return nullptr; +} + +static inline Qnn_TensorMemType_t ggmlqnn_get_tensor_memtype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memType; + } + return QNN_TENSORMEMTYPE_UNDEFINED; +} + +static inline void ggmlqnn_set_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.id = id; + } +} + +static inline void ggmlqnn_set_tensor_name(Qnn_Tensor_t & tensor, const char * name) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.name = name; + } +} + +static inline void ggmlqnn_set_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.type = type; + } +} + +static inline void ggmlqnn_set_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataFormat = format; + } +} + +static inline void ggmlqnn_set_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataType = dataType; + } +} + +static inline void ggmlqnn_set_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.quantizeParams = params; + } +} + +static inline void ggmlqnn_set_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.rank = rank; + } +} + +static inline void ggmlqnn_set_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + 
tensor.v1.dimensions = dims; + } +} + +static inline void ggmlqnn_set_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memType = memType; + } +} + +static inline void ggmlqnn_set_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.clientBuf = clientBuf; + } +} + +static inline void ggmlqnn_set_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memHandle = handle; + } +} + +static int ggmlqnn_deep_copy_qnntensor(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { + int err = 0; + + dst.version = src.version; + ggmlqnn_set_tensor_name(dst, ggmlqnn_strndup(ggmlqnn_get_tensorname(src), std::string(ggmlqnn_get_tensorname(src)).size())); + if (nullptr == ggmlqnn_get_tensorname(dst)) { + return 1; + } + ggmlqnn_set_tensor_id(dst, ggmlqnn_get_tensorid(src)); + ggmlqnn_set_tensor_type(dst, ggmlqnn_get_tensortype(src)); + ggmlqnn_set_tensor_dataformat(dst, ggmlqnn_get_tensor_dataformat(src)); + ggmlqnn_set_tensor_datatype(dst, ggmlqnn_get_tensor_datatype(src)); + ggmlqnn_set_tensor_memtype(dst, ggmlqnn_get_tensor_memtype(src)); + + if (ggmlqnn_get_tensor_memtype(src) == QNN_TENSORMEMTYPE_RAW) { + Qnn_ClientBuffer_t client_buf = {nullptr, 0}; + ggmlqnn_set_tensor_clientbuf(dst, client_buf); + } else if (ggmlqnn_get_tensor_memtype(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { + ggmlqnn_set_tensor_memhandle(dst, nullptr); + } else { + return 1; + } + + Qnn_QuantizeParams_t src_qparam = ggmlqnn_get_tensor_quantparams(src); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_AxisScaleOffset_t & axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; + Qnn_ScaleOffset_t ** scale_offset = &axis_scale_offset.scaleOffset; + size_t scale_offset_size = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); + *scale_offset = (Qnn_ScaleOffset_t *)malloc(scale_offset_size); + ggmlqnn_memscpy(*scale_offset, + scale_offset_size, + src_qparam.axisScaleOffsetEncoding.scaleOffset, + scale_offset_size); + ggmlqnn_set_tensor_quantparams(dst, src_qparam_cpy); + } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_BwAxisScaleOffset_t & bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; + size_t scale_size = bwaxis_scale_offset.numElements * sizeof(float); + float ** scales = &bwaxis_scale_offset.scales; + int32_t ** offsets = &bwaxis_scale_offset.offsets; + *scales = (float *)malloc(scale_size); + ggmlqnn_memscpy(*scales, scale_size, src_qparam.bwAxisScaleOffsetEncoding.scales, scale_size); + + if (bwaxis_scale_offset.offsets != nullptr) { + size_t offset_size = bwaxis_scale_offset.numElements * sizeof(int32_t); + *offsets = (int32_t *)malloc(offset_size); + ggmlqnn_memscpy(*offsets, offset_size, src_qparam.bwAxisScaleOffsetEncoding.offsets, offset_size); + } + ggmlqnn_set_tensor_quantparams(dst, src_qparam_cpy); + } else { + ggmlqnn_set_tensor_quantparams(dst, src_qparam); + } + + uint32_t rank = ggmlqnn_get_tensor_rank(src); + ggmlqnn_set_tensor_rank(dst, rank); + size_t dim_size = GGML_MAX_DIMS * sizeof(uint32_t); + uint32_t * dimensions = (uint32_t *)malloc(dim_size); + if (nullptr == dimensions) { + GGMLHEXAGON_LOG_WARN("deep_copy_qnn_tensors() allocation error 
while copying tensor %s\n", ggmlqnn_get_tensorname(src)); + return 1; + } + ggmlqnn_memscpy(dimensions, dim_size, ggmlqnn_get_tensor_dimensions(src), dim_size); + ggmlqnn_set_tensor_dimensions(dst, dimensions); + + return err; +} + +static int ggmlqnn_free_qnntensor(Qnn_Tensor_t * tensor) { + int err = 0; + free((void *) ggmlqnn_get_tensorname(*tensor)); + Qnn_QuantizeParams_t src_qparam = ggmlqnn_get_tensor_quantparams(*tensor); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { + free(src_qparam.axisScaleOffsetEncoding.scaleOffset); + } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { + free(src_qparam.bwAxisScaleOffsetEncoding.scales); + if (src_qparam.bwAxisScaleOffsetEncoding.offsets != nullptr) { + free(src_qparam.bwAxisScaleOffsetEncoding.offsets); + } + } + GGMLHEXAGON_LOG_DEBUG("free tensor %p", tensor); + free(ggmlqnn_get_tensor_dimensions(*tensor)); + free(tensor); + + return err; +} + +static const char * ggmlqnn_get_qnnerror_string(Qnn_ErrorHandle_t qnn_error_code) { + // file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/api_error_codes.html + switch (qnn_error_code) { + case QNN_SUCCESS: + return "QNN_SUCCESS"; + case QNN_COMMON_ERROR_GENERAL: + return "QNN_COMMON_ERROR_GENERAL"; + + // QnnGraph_Error_t + case QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE: + return "QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE"; + case QNN_GRAPH_ERROR_MEM_ALLOC: + return "QNN_GRAPH_ERROR_MEM_ALLOC"; + case QNN_GRAPH_ERROR_INVALID_ARGUMENT: + return "QNN_GRAPH_ERROR_INVALID_ARGUMENT"; + case QNN_GRAPH_ERROR_INVALID_HANDLE: + return "QNN_GRAPH_ERROR_INVALID_HANDLE"; + case QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST: + return "QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST"; + case QNN_GRAPH_ERROR_INVALID_NAME: + return "QNN_GRAPH_ERROR_INVALID_NAME"; + case QNN_GRAPH_ERROR_INVALID_TENSOR: + return "QNN_GRAPH_ERROR_INVALID_TENSOR"; + case QNN_GRAPH_ERROR_INVALID_OP_CONFIG: + return "QNN_GRAPH_ERROR_INVALID_OP_CONFIG"; + case QNN_GRAPH_ERROR_SET_PROFILE: + return "QNN_GRAPH_ERROR_SET_PROFILE"; + case QNN_GRAPH_ERROR_UNCONNECTED_NODE: + return "QNN_GRAPH_ERROR_UNCONNECTED_NODE"; + case QNN_GRAPH_ERROR_CREATE_FAILED: + return "QNN_GRAPH_ERROR_CREATE_FAILED"; + case QNN_GRAPH_ERROR_OPTIMIZATION_FAILED: + return "QNN_GRAPH_ERROR_OPTIMIZATION_FAILED"; + case QNN_GRAPH_ERROR_FINALIZE_FAILED: + return "QNN_GRAPH_ERROR_FINALIZE_FAILED"; + case QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED: + return "QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED"; + case QNN_GRAPH_ERROR_GRAPH_FINALIZED: + return "QNN_GRAPH_ERROR_GRAPH_FINALIZED"; + case QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL: + return "QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL"; + case QNN_GRAPH_ERROR_SIGNAL_IN_USE: + return "QNN_GRAPH_ERROR_SIGNAL_IN_USE"; + case QNN_GRAPH_ERROR_ABORTED: + return "QNN_GRAPH_ERROR_ABORTED"; + case QNN_GRAPH_ERROR_PROFILE_IN_USE: + return "QNN_GRAPH_ERROR_PROFILE_IN_USE"; + case QNN_GRAPH_ERROR_TIMED_OUT: + return "QNN_GRAPH_ERROR_TIMED_OUT"; + case QNN_GRAPH_ERROR_SUBGRAPH: + return "QNN_GRAPH_ERROR_SUBGRAPH"; + case QNN_GRAPH_ERROR_DISABLED: + return "QNN_GRAPH_ERROR_DISABLED"; + case QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE: + return "QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE"; + case QNN_GRAPH_ERROR_TENSOR_SPARSITY: + return "QNN_GRAPH_ERROR_TENSOR_SPARSITY"; + case QNN_GRAPH_ERROR_EARLY_TERMINATION: + return "QNN_GRAPH_ERROR_EARLY_TERMINATION"; + case QNN_GRAPH_ERROR_INVALID_CONTEXT: + return "QNN_GRAPH_ERROR_INVALID_CONTEXT"; + + //QQnnTensor_Error_t + 
//Invalid context/graph handle in creating tensor
+        case QNN_TENSOR_ERROR_INVALID_HANDLE:
+            return "QNN_TENSOR_ERROR_INVALID_HANDLE";
+        //Tensor with specified credentials not registered with a context/graph
+        case QNN_TENSOR_ERROR_DOES_NOT_EXIST:
+            return "QNN_TENSOR_ERROR_DOES_NOT_EXIST";
+        // (deprecated) Tensor has already been registered with backend
+        case QNN_TENSOR_ERROR_ALREADY_EXISTS:
+            return "QNN_TENSOR_ERROR_ALREADY_EXISTS";
+        // Invalid tensor param.
+        case QNN_TENSOR_ERROR_INVALID_TENSOR_PARAM:
+            return "QNN_TENSOR_ERROR_INVALID_TENSOR_PARAM";
+        // This tensor param is currently unsupported
+        case QNN_TENSOR_ERROR_UNSUPPORTED_TENSOR_PARAM:
+            return "QNN_TENSOR_ERROR_UNSUPPORTED_TENSOR_PARAM";
+        // Tensor provided for update is invalid
+        case QNN_TENSOR_ERROR_INCOMPATIBLE_TENSOR_UPDATE:
+            return "QNN_TENSOR_ERROR_INCOMPATIBLE_TENSOR_UPDATE";
+
+        // QnnOpPackage_Error_t
+        case QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED:
+            return "QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED";
+        case QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED:
+            return "QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED";
+        case QNN_OP_PACKAGE_ERROR_INVALID_HANDLE:
+            return "QNN_OP_PACKAGE_ERROR_INVALID_HANDLE";
+        case QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE:
+            return "QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE";
+        case QNN_OP_PACKAGE_ERROR_INVALID_INFO:
+            return "QNN_OP_PACKAGE_ERROR_INVALID_INFO";
+        case QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE:
+            return "QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE";
+        case QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT:
+            return "QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT";
+
+        default:
+            return "unknown QNN error";
+    }
+}
+
+// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684
+static Qnn_DataType_t ggmlqnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) {
+    switch (ggmltype) {
+        case GGML_TYPE_F16:
+            return QNN_DATATYPE_FLOAT_16;
+        case GGML_TYPE_F32:
+            return QNN_DATATYPE_FLOAT_32;
+        case GGML_TYPE_I8:
+            return QNN_DATATYPE_INT_8;
+        case GGML_TYPE_Q8_0:
+            return QNN_DATATYPE_SFIXED_POINT_8;
+        case GGML_TYPE_Q4_0:
+            return QNN_DATATYPE_SFIXED_POINT_4;
+        default:
+            break;
+    }
+    return QNN_DATATYPE_UNDEFINED;
+}
+
+static void ggmlqnn_get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, const uint32_t * ggml_dimensions, uint32_t rank) {
+    if (rank > GGML_MAX_DIMS) {
+        GGMLHEXAGON_LOG_WARN("invalid params");
+        return;
+    }
+    if (nullptr == qnn_dimensions || nullptr == ggml_dimensions) {
+        GGMLHEXAGON_LOG_WARN("invalid params");
+        return;
+    }
+    for (size_t idx = 0; idx < GGML_MAX_DIMS; idx++)
+        qnn_dimensions[idx] = ggml_dimensions[idx];
+
+    if (rank >= 2) {
+        qnn_dimensions[rank - 1] = ggml_dimensions[rank - 2];
+        qnn_dimensions[rank - 2] = ggml_dimensions[rank - 1];
+    }
+}
+
+template<typename Fn>
+Fn ggmlqnn_load_qnn_functionpointers(void * handle, const char * function_name) {
+    return reinterpret_cast<Fn>(dlsym(handle, function_name));
+}
+
+class qnn_interface {
+#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name)                     \
+    template<typename... Args>                                              \
+    inline auto qnn_##F(Args... args) const {                               \
+        return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)(       \
+                std::forward<Args>(args)...);                               \
+    }
+
+#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name)                 \
+    template<typename... Args>                                              \
+    inline auto qnn_##F(Args...
args) const {                                                               \
+        return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \
+                std::forward<Args>(args)...);                               \
+    }
+
+    friend class qnn_instance;
+
+public:
+    qnn_interface() = default;
+
+    // QnnBackend
+    DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion)
+
+    // QnnDevice
+    DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo)
+
+    // QnnContext
+    DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree)
+
+    // QnnGraph
+    DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve)
+
+    // QnnLog
+    DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel)
+
+    // QnnProfile
+    DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree)
+
+    // QnnMem
+    DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister)
+
+    // QnnProperty
+    DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability)
+
+    // QnnTensor
+    DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor)
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor)
+
+    // QnnSystem
+    DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate)
+
+    DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo)
+
+    DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree)
+
+    void set_qnn_interface(const QnnInterface_t * qnn_interface) {
+        _qnn_interface = qnn_interface;
+    }
+
+    void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) {
+        _qnn_sys_interface = qnn_sys_interface;
+    }
+
+    uint32_t get_backend_id() const {
+        return _qnn_interface->backendId;
+    }
+
+    bool is_loaded() const {
+        return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr));
+    }
+
+private:
+    const QnnInterface_t * _qnn_interface = nullptr;
+
+    const QnnSystemInterface_t * _qnn_sys_interface = nullptr;
+};
+
+class
qnn_instance { +public: + using BackendIdType = decltype(QnnInterface_t{}.backendId); + + explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, + const std::string & model_name) : + _lib_path(std::move(lib_path)), + _backend_name(std::move(backend_name)), + _model_name(std::move(model_name)) {} + + ~qnn_instance() { + } + + int qnn_init(const QnnSaver_Config_t ** saver_config); + + int qnn_finalize(); + + const qnn_interface & get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLHEXAGON_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } + + const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLHEXAGON_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_interface; + } + + const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLHEXAGON_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_system_interface; + } + + Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + + Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + + Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + + Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + + QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } + + Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + + int init_qnn_graph(const char * graph_name, + bool debug, + uint8_t do_node_validation = 1, + const QnnGraph_Config_t ** graph_configs = nullptr + ); + int init_qnn_graph(const std::string & graph_name, HEXAGONBackend device, size_t vtcm_size_in_mb = 8, size_t hvx_threads = 8); + + int finalize_qnn_graph(); + + bool is_valid_graph() const { return _qnn_graph_handle != nullptr; } + + int htp_init_perfinfra(); + + int htp_set_rpc_polling(); + + int htp_set_high_performance_mode(); + + std::string & get_qnn_graph_name() { return _graph_name; } + + bool is_rpcmem_initialized() { + return _rpcmem_initialized; + } + + void set_rpcmem_initialized(bool initialized) { + _rpcmem_initialized = initialized; + } + + size_t get_rpcmem_capacity() { return _rpcmem_capacity; } + size_t get_rpcmem_usage() { return _rpcmem_usage; } + + int32_t rpcmem_to_fd(void * buf); + + int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); + Qnn_MemHandle_t register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type); + + void unregister_rpcmem(); + void unregister_rpcmem(Qnn_MemHandle_t mem_handle); + + void * alloc_rpcmem(size_t bytes, size_t alignment); + void * get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle); + + void free_rpcmem(void * buf); + void free_rpcmem(); + + bool is_rpcmem_allocated(void * buf); + + bool is_rpcmem_registered(Qnn_MemHandle_t handle) { + return _qnn_mem_set.count(handle) != 0U; + } + + bool enable_qnn_rpc() { + return _enable_qnn_rpc; + } + + HEXAGONBackend get_device_id() { + return _device_id; + } + +private: + int load_system(); + + int unload_system(); + + int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config); + + int unload_backend(); + + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_interface = raw_interface; + } + + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & 
raw_interface) {
+        _qnn_raw_system_interface = raw_interface;
+    }
+
+    void * alloc_rpcmem_internal(size_t bytes, size_t alignment);
+
+    void htp_probe_rpc_meminfo();
+
+    void htp_print_info();
+
+    void print_backend_info();
+
+    void htp_set_memory_grow_size(size_t size = 1ul * 1024 * 1024);
+
+    void htp_enter_performance_mode();
+
+    void htp_set_n_hvx_threads(size_t n_threads);
+
+private:
+    static constexpr const int _required_num_providers = 1;
+
+private:
+    std::string _lib_path;
+    std::string _backend_name;
+    std::string _model_name; // name of prebuilt QNN model, might be used in the future
+    BackendIdType _backend_id;
+
+    bool _debug_tensor        = false; // flag to indicate if requested graph is to be run in debug mode
+    bool _do_node_validations = true;  // flag to indicate whether all add_node calls need to be validated
+    QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG;
+
+    qnn_profile_level _profile_level = PROFILE_OFF;
+
+    void * _system_lib_handle = nullptr;
+    void * _loaded_lib_handle = nullptr;
+    const QnnInterface_t * _loaded_backend = nullptr;
+
+    Qnn_GraphHandle_t _qnn_graph_handle = nullptr;
+
+    Qnn_LogHandle_t _qnn_log_handle = nullptr;
+
+    Qnn_ProfileHandle_t _qnn_profile_handle = nullptr;
+
+    Qnn_DeviceHandle_t _qnn_device_handle = nullptr;
+
+    Qnn_BackendHandle_t _qnn_backend_handle = nullptr;
+
+    Qnn_ContextHandle_t _qnn_context_handle = nullptr;
+
+    QnnSystemContext_Handle_t _qnn_system_handle = nullptr;
+
+    QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr;
+    uint32_t _qnn_htp_powerconfig_id = 1;
+    uint32_t _qnn_htp_device_id      = 0;
+    uint32_t _qnn_htp_core_id        = 0;
+
+    uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performance
+
+    qnn_interface _qnn_interface;
+    QNN_INTERFACE_VER_TYPE _qnn_raw_interface;
+    QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface;
+
+    std::unordered_map<void *, Qnn_MemHandle_t> _qnn_mem_set;
+    std::unordered_map<void *, Qnn_MemHandle_t> _qnn_rpc_buffer_to_handles;
+
+    std::atomic_bool _rpcmem_initialized{false};
+    pfn_rpc_mem_alloc  _pfn_rpc_mem_alloc;
+    pfn_rpc_mem_free   _pfn_rpc_mem_free;
+    pfn_rpc_mem_to_fd  _pfn_rpc_mem_to_fd;
+    pfn_rpc_mem_init   _pfn_rpc_mem_init;
+    pfn_rpc_mem_deinit _pfn_rpc_mem_deinit;
+    std::unordered_map<void *, void *> _rpcmem_store_map; //aligned ptr -> raw allocation
+    std::unordered_map<void *, size_t> _rpcmem_usage_map; //aligned ptr -> requested bytes
+    size_t _rpcmem_usage    = 0; // mempool usage in bytes
+    size_t _rpcmem_capacity = 0; // mempool size in bytes
+
+    std::string _graph_name;
+    HEXAGONBackend _device_id;
+    void * _rpc_lib_handle = nullptr;
+    bool _enable_qnn_rpc   = false; //TODO: unknown issue with the QNN RPC feature
+
+    qnn_instance(const qnn_instance &) = delete;
+    void operator=(const qnn_instance &) = delete;
+
+    qnn_instance(qnn_instance &&) = delete;
+    void operator=(qnn_instance &&) = delete;
+};
+
+void * qnn_instance::alloc_rpcmem_internal(size_t bytes, size_t alignment) {
+    if (!_rpcmem_initialized) {
+        GGMLHEXAGON_LOG_WARN("rpc memory not initialized\n");
+        return nullptr;
+    }
+
+    auto allocate_bytes = static_cast<size_t>(bytes + alignment);
+    void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes);
+    if (nullptr == buf) {
+        GGMLHEXAGON_LOG_WARN("failed to allocate rpc memory\n");
+        return nullptr;
+    }
+
+    auto aligned_buf = reinterpret_cast<void *>(ggmlqnn_align_to(alignment,
+                                                                 reinterpret_cast<intptr_t>(buf)));
+    bool status = _rpcmem_store_map.insert(std::pair<void *, void *>(aligned_buf, buf)).second;
+    if (!status) {
+        GGMLHEXAGON_LOG_WARN("failed to allocate rpc memory\n");
+        _pfn_rpc_mem_free(buf);
+    }
+    return aligned_buf;
+}
+
+void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) {
+    if
+    // reserve 8 MiB of headroom in the rpc mempool and guard against unsigned underflow
+    if ((_rpcmem_capacity <= (8 * SIZE_IN_MB)) ||
+        ((_rpcmem_usage + bytes) > (_rpcmem_capacity - (8 * SIZE_IN_MB)))) {
+        GGMLHEXAGON_LOG_WARN("rpc mempool capacity: %d MiB, usage: %d MiB",
+                             (int)(_rpcmem_capacity / SIZE_IN_MB), (int)(_rpcmem_usage / SIZE_IN_MB));
+        return nullptr;
+    }
+
+    auto aligned_buf = alloc_rpcmem_internal(bytes, alignment);
+    if (nullptr == aligned_buf)
+        return nullptr;
+    _rpcmem_usage_map.insert(std::pair(aligned_buf, bytes));
+
+    _rpcmem_usage += bytes;
+    return aligned_buf;
+}
+
+void qnn_instance::free_rpcmem(void * buf) {
+    if (!_rpcmem_initialized) {
+        GGMLHEXAGON_LOG_WARN("rpc memory not initialized\n");
+    } else if (0 == _rpcmem_store_map.count(buf)) {
+        GGMLHEXAGON_LOG_WARN("buffer %p was not allocated from the rpc mempool\n", buf);
+    } else {
+        GGMLHEXAGON_LOG_DEBUG("free rpc mem %p", _rpcmem_store_map[buf]);
+        // drop the usage accounting for this buffer
+        auto it = _rpcmem_usage_map.find(buf);
+        if (it != _rpcmem_usage_map.end()) {
+            _rpcmem_usage -= it->second;
+            _rpcmem_usage_map.erase(it);
+        }
+        _pfn_rpc_mem_free(_rpcmem_store_map[buf]);
+        _rpcmem_store_map.erase(buf);
+    }
+}
+
+void qnn_instance::free_rpcmem() {
+    if (_rpcmem_store_map.empty()) {
+        GGMLHEXAGON_LOG_WARN("no rpcmem allocated\n");
+        return;
+    }
+
+    for (const auto & [rpcbuffer, raw_rpcbuffer] : _rpcmem_store_map) {
+        GGMLHEXAGON_LOG_DEBUG("free rpc buffer %p", rpcbuffer);
+        _pfn_rpc_mem_free(raw_rpcbuffer); // free the raw allocation, not the aligned alias
+    }
+
+    _rpcmem_store_map.clear();
+    _rpcmem_usage_map.clear();
+    _rpcmem_usage = 0;
+}
+
+int32_t qnn_instance::rpcmem_to_fd(void * buf) {
+    int32_t mem_fd = -1;
+    if (!is_rpcmem_initialized()) {
+        GGMLHEXAGON_LOG_WARN("rpc memory not initialized\n");
+    } else {
+        mem_fd = _pfn_rpc_mem_to_fd(buf);
+    }
+
+    return mem_fd;
+}
+
+int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) {
+    if (nullptr == p_data || (nullptr == p_tensor)) {
+        GGMLHEXAGON_LOG_WARN("invalid param\n");
+        return 1;
+    }
+
+    if (!is_rpcmem_initialized()) {
+        GGMLHEXAGON_LOG_WARN("rpc memory not initialized\n");
+        return 2;
+    }
+
+    if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) {
+        GGMLHEXAGON_LOG_WARN("tensor %s is already registered to shared memory\n", (QNN_VER_PTR(*p_tensor)->name));
+        return 3;
+    }
+
+    int32_t mem_fd = rpcmem_to_fd(p_data);
+    if (-1 == mem_fd) {
+        GGMLHEXAGON_LOG_WARN("failed to get file descriptor\n");
+        return 4;
+    }
+    GGMLHEXAGON_LOG_DEBUG("mem_fd %d\n", mem_fd);
+    Qnn_MemDescriptor_t descriptor = {
+            {QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, nullptr},
+            QNN_VER_PTR(*p_tensor)->dataType,
+            QNN_MEM_TYPE_ION,
+            {{mem_fd}}};
+    Qnn_MemHandle_t handle = nullptr;
+    Qnn_ErrorHandle_t error = QNN_SUCCESS;
+    error = _qnn_interface.qnn_mem_register(
+            _qnn_context_handle,
+            &descriptor,
+            /*numDescriptors=*/1,
+            &handle);
+    if (error != QNN_SUCCESS) {
+        GGMLHEXAGON_LOG_WARN("failed to register shared memory, error %d\n", QNN_GET_ERROR_CODE(error));
+        return 5;
+    } else {
+        GGMLHEXAGON_LOG_INFO("tensor %s successfully registered to shared memory\n", (QNN_VER_PTR(*p_tensor)->name));
+    }
+    QNN_VER_PTR(*p_tensor)->memHandle = handle;
+    _qnn_mem_set.insert((std::pair(p_data, handle)));
+
+    return 0;
+}
+
+Qnn_MemHandle_t qnn_instance::register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type) {
+    if (!p_data) {
+        GGMLHEXAGON_LOG_WARN("invalid param");
+        return nullptr;
+    }
+
+    if (!is_rpcmem_initialized()) {
+        GGMLHEXAGON_LOG_WARN("rpc memory not initialized");
+        return nullptr;
+    }
+
+    if (is_rpcmem_registered(p_data)) {
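alloc_rpcmem_internal() above over-allocates by `alignment` bytes and rounds the raw pointer up, keeping the raw pointer in _rpcmem_store_map so it can be handed back to rpcmem_free later. ggmlqnn_align_to itself is outside this hunk; the conventional round-up form it presumably uses looks like this (standalone sketch, assuming a power-of-two alignment):

    #include <cassert>
    #include <cstddef>

    // Round `offset` up to the next multiple of `alignment` (power of two).
    // This is the usual implementation of a helper like ggmlqnn_align_to.
    static size_t align_to_sketch(size_t alignment, size_t offset) {
        assert((alignment & (alignment - 1)) == 0 && "alignment must be a power of two");
        return (offset + alignment - 1) & ~(alignment - 1);
    }

    int main() {
        // With alignment = 4 as used by the rpcmem callers in this file:
        assert(align_to_sketch(4, 0) == 0);
        assert(align_to_sketch(4, 1) == 4);
        assert(align_to_sketch(4, 8) == 8);
        // The aligned pointer is what callers see; the raw pointer stays in
        // _rpcmem_store_map so the real allocation can be freed later.
        return 0;
    }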
GGMLHEXAGON_LOG_WARN("rpc memory already registered"); + return _qnn_rpc_buffer_to_handles[p_data]; + } + + int32_t mem_fd = rpcmem_to_fd(p_data); + if (mem_fd == -1) { + GGMLHEXAGON_LOG_WARN("failed to get file descriptor"); + return nullptr; + } + + GGMLHEXAGON_LOG_DEBUG("mem_fd %d", mem_fd); + Qnn_MemDescriptor_t descriptor = { + {rank, dimensions, nullptr}, + data_type, QNN_MEM_TYPE_ION, + {{mem_fd}} + }; + Qnn_MemHandle_t handle = nullptr; + Qnn_ErrorHandle_t error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, /*numDescriptors=*/1, &handle); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to register shared memory, error %d, %s", QNN_GET_ERROR_CODE(error), strerror(error)); + return nullptr; + } + + _qnn_rpc_buffer_to_handles.insert({p_data, handle}); + GGMLHEXAGON_LOG_DEBUG("successfully register shared memory handler: %p", handle); + return handle; +} + +void * qnn_instance::get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) { + for (const auto & [ptr, handle] : _qnn_mem_set) { + if (mem_handle == handle) { + return ptr; + } + } + + GGMLHEXAGON_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle); + return nullptr; +} + +void qnn_instance::unregister_rpcmem() { + if (_qnn_mem_set.empty()) { + GGMLHEXAGON_LOG_WARN("no rpcmem registered\n"); + } + + for (const auto & [ptr, mem_handle] : _qnn_mem_set) { + auto error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); + } else { + GGMLHEXAGON_LOG_DEBUG("unregister shared memory ok"); + } + } + _qnn_mem_set.clear(); +} + +void qnn_instance::unregister_rpcmem(Qnn_MemHandle_t mem_handle) { + Qnn_ErrorHandle_t error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to unregister shared memory, error %d", QNN_GET_ERROR_CODE(error)); + } + + auto it = std::find_if(_qnn_mem_set.begin(), _qnn_mem_set.end(), + [mem_handle](const auto &kv) { return kv.second == mem_handle; }); + if (it == _qnn_mem_set.end()) { + GGMLHEXAGON_LOG_WARN("failed to find shared memory handler: %p", mem_handle); + return; + } + + _qnn_mem_set.erase(it); +} + +bool qnn_instance::is_rpcmem_allocated(void * buf) { + return _rpcmem_store_map.count(buf) != 0U; +} + +int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + GGMLHEXAGON_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + + void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + if (nullptr == lib_handle) { + GGMLHEXAGON_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); + return 1; + } + + auto get_providers = ggmlqnn_load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>( + lib_handle, + "QnnInterface_getProviders"); + if (nullptr == get_providers) { + GGMLHEXAGON_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); + return 2; + } + + std::uint32_t num_providers = 0; + const QnnInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + return 3; + } + GGMLHEXAGON_LOG_DEBUG("num_providers=%d\n", num_providers); + if (num_providers != _required_num_providers) { + GGMLHEXAGON_LOG_WARN("providers is %d instead of required %d", num_providers, 
_required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + GGMLHEXAGON_LOG_WARN("failed to get qnn interface providers\n"); + return 5; + } + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + break; + } + } + + if (!found_valid_interface) { + GGMLHEXAGON_LOG_WARN("unable to find a valid qnn interface\n"); + return 6; + } else { + GGMLHEXAGON_LOG_VERBOSE("find a valid qnn interface\n"); + } + set_qnn_raw_interface(qnn_interface); + + BackendIdType backend_id = provider_list[0]->backendId; + _loaded_backend = provider_list[0]; + _loaded_lib_handle = lib_handle; + _backend_id = backend_id; + + auto saver_initialize = + ggmlqnn_load_qnn_functionpointers<_pfn_QnnSaver_initialize *>(_loaded_lib_handle, "QnnSaver_initialize"); + if (nullptr != saver_initialize) { + error = saver_initialize(saver_config); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to saver_initialize,error %d", QNN_GET_ERROR_CODE(error)); + return 7; + } + } else { + GGMLHEXAGON_LOG_WARN("saver_initialize is null\n"); + } + + return 0; +} + +int qnn_instance::unload_backend() { + int dlclose_error = 0; + dlclose_error = dlclose(_loaded_lib_handle); + if (dlclose_error != 0) { + GGMLHEXAGON_LOG_WARN("failed to close QNN backend %d, error %s\n", _backend_id, dlerror()); + } + + return 0; +} + +int qnn_instance::load_system() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + +#if !defined(__ANDROID__) && !defined(__linux__) + std::string system_lib_path = _lib_path + "QnnSystem.dll"; +#else + std::string system_lib_path = _lib_path + "libQnnSystem.so"; +#endif + GGMLHEXAGON_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); + + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _system_lib_handle) { + GGMLHEXAGON_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + //re-try with default path of QNN binary runtime lib + _lib_path = std::string(g_hexagon_appcfg.runtime_libpath); +#if !defined(__ANDROID__) && !defined(__linux__) + system_lib_path = _lib_path + "QnnSystem.dll"; +#else + system_lib_path = _lib_path + "libQnnSystem.so"; +#endif + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _system_lib_handle) { + GGMLHEXAGON_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + return 1; + } + } + + auto * get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(dlsym( + _system_lib_handle, "QnnSystemInterface_getProviders")); + if (nullptr == get_providers) { + GGMLHEXAGON_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); + return 2; + } + + uint32_t num_providers = 0; + const QnnSystemInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); + return 3; + } + + if (num_providers != _required_num_providers) { + GGMLHEXAGON_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + 
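The provider loop above encodes QNN's compatibility rule: the core API major version must match exactly, while the provider may expose a newer minor version than the one the code was built against. As a standalone restatement with plain integers instead of the SDK's version structs:

    #include <cstdint>
    #include <cstdio>

    // QNN-style compatibility check: same major, provider minor >= required minor.
    static bool is_compatible(uint32_t req_major, uint32_t req_minor,
                              uint32_t got_major, uint32_t got_minor) {
        return req_major == got_major && req_minor <= got_minor;
    }

    int main() {
        printf("%d\n", is_compatible(2, 14, 2, 19)); // 1: a newer minor is fine
        printf("%d\n", is_compatible(2, 14, 3, 0));  // 0: a major bump breaks compatibility
        return 0;
    }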
GGMLHEXAGON_LOG_WARN("can not get providers\n"); + return 5; + } + + QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; + bool found_valid_system_interface = false; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_SYSTEM_API_VERSION_MAJOR == + provider_list[idx]->systemApiVersion.major && + QNN_SYSTEM_API_VERSION_MINOR <= + provider_list[idx]->systemApiVersion.minor) { + found_valid_system_interface = true; + qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + break; + } + } + if (!found_valid_system_interface) { + GGMLHEXAGON_LOG_WARN("unable to find a valid qnn system interface\n"); + return 6; + } else { + GGMLHEXAGON_LOG_VERBOSE("find a valid qnn system interface\n"); + } + set_qnn_raw_system_interface(qnn_system_interface); + + _qnn_interface.set_qnn_system_interface(provider_list[0]); + + _qnn_interface.qnn_system_context_create(&_qnn_system_handle); + if (nullptr == _qnn_system_handle) { + GGMLHEXAGON_LOG_WARN("can not create QNN system contenxt\n"); + } else { + GGMLHEXAGON_LOG_VERBOSE("initialize qnn system successfully\n"); + } + + return 0; +} + +int qnn_instance::unload_system() { + int result = 0; + + if (nullptr == _system_lib_handle) { + GGMLHEXAGON_LOG_DEBUG("system lib handle is null\n"); + return 1; + } + + if (nullptr != _qnn_system_handle) { + result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); + if (result != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to free QNN system context\n"); + } + _qnn_system_handle = nullptr; + } + + int dlclose_error = dlclose(_system_lib_handle); + if (dlclose_error != 0) { + GGMLHEXAGON_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); + return 2; + } + + _system_lib_handle = nullptr; + + return result; +} + +static void ggmlqnn_sdk_logcallback(const char * fmt, + QnnLog_Level_t level, + uint64_t timestamp, + va_list argp) { + + if (0 == g_hexagon_appcfg.print_qnn_internal_log) + return; + + static std::mutex log_mutex; + static unsigned char s_ggmlqnn_sdk_logbuf[GGMLHEXAGON_LOGBUF_LEN]; + + const char * log_level_desc = ""; + switch (level) { + case QNN_LOG_LEVEL_ERROR: + log_level_desc = " ERROR "; + break; + case QNN_LOG_LEVEL_WARN: + log_level_desc = "WARNING"; + break; + case QNN_LOG_LEVEL_INFO: + log_level_desc = " INFO "; + break; + case QNN_LOG_LEVEL_DEBUG: + log_level_desc = " DEBUG "; + break; + case QNN_LOG_LEVEL_VERBOSE: + log_level_desc = "VERBOSE"; + break; + case QNN_LOG_LEVEL_MAX: + log_level_desc = "UNKNOWN"; + break; + } + + double ms = (double) timestamp / 1000000.0; + { + std::lock_guard lock(log_mutex); + memset(s_ggmlqnn_sdk_logbuf, 0, GGMLHEXAGON_LOGBUF_LEN); + vsnprintf(reinterpret_cast(s_ggmlqnn_sdk_logbuf), GGMLHEXAGON_LOGBUF_LEN, fmt, argp); + GGMLHEXAGON_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggmlqnn_sdk_logbuf); + } +#if !GGMLHEXAGON_DEBUG + GGML_UNUSED(log_level_desc); + GGML_UNUSED(ms); +#endif +} + +int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { + GGMLHEXAGON_LOG_DEBUG("enter qni_init\n"); + + _device_id = HEXAGON_BACKEND_GGML; + if (_backend_name.find("QnnCpu") != std::string::npos) { + _device_id = HEXAGON_BACKEND_QNNCPU; + } + if (_backend_name.find("QnnGpu") != std::string::npos) { + _device_id = HEXAGON_BACKEND_QNNGPU; + } + if (_backend_name.find("QnnHtp") != std::string::npos) { + _device_id = HEXAGON_BACKEND_QNNNPU; + } + if (HEXAGON_BACKEND_GGML == _device_id) { + GGMLHEXAGON_LOG_INFO("user specified qnn backend is ggml, skip QNN initialize"); + return 0; + } + + if (0 != 
load_system()) { + GGMLHEXAGON_LOG_WARN("can not load QNN system lib, pls check why?\n"); + return 1; + } else { + GGMLHEXAGON_LOG_DEBUG("load QNN system lib successfully\n"); + } + + std::string backend_lib_path = _lib_path + _backend_name; + + int is_load_ok = load_backend(backend_lib_path, saver_config); + if (0 != is_load_ok) { + GGMLHEXAGON_LOG_WARN("failed to load QNN backend\n"); + return 2; + } + + _qnn_interface.set_qnn_interface(_loaded_backend); +#if 1 + _qnn_interface.qnn_log_create(ggmlqnn_sdk_logcallback, _qnn_log_level, &_qnn_log_handle); +#else + _qnn_raw_interface.logCreate(ggmlqnn_sdk_logcallback, _qnn_log_level, &_qnn_log_handle); +#endif + if (nullptr == _qnn_log_handle) { + GGMLHEXAGON_LOG_WARN("why failed to initialize qnn log\n"); //NPU backend not work on Qualcomm SoC based low-end phone + return 3; + } else { + GGMLHEXAGON_LOG_DEBUG("initialize qnn log successfully\n"); + } + + std::vector temp_backend_config; + _qnn_interface.qnn_backend_create(_qnn_log_handle, + temp_backend_config.empty() ? nullptr : temp_backend_config.data(), + &_qnn_backend_handle); + if (nullptr == _qnn_backend_handle) { + GGMLHEXAGON_LOG_WARN("why failed to initialize qnn backend\n"); + return 4; + } else { + GGMLHEXAGON_LOG_DEBUG("initialize qnn backend successfully\n"); + } + + if (nullptr != _qnn_raw_interface.propertyHasCapability) { + auto qnnstatus = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnnstatus) { + GGMLHEXAGON_LOG_WARN("device property is not supported\n"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnstatus) { + GGMLHEXAGON_LOG_WARN("device property is not known to backend\n"); + } + } + + Qnn_ErrorHandle_t qnnstatus = QNN_SUCCESS; + if (_device_id == HEXAGON_BACKEND_QNNNPU) { + const QnnDevice_PlatformInfo_t * p_info = nullptr; + qcom_socinfo soc_info = {}; + qnnstatus = _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + if (QNN_SUCCESS == qnnstatus) { + GGMLHEXAGON_LOG_VERBOSE("device counts %d\n", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; + for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) { + GGMLHEXAGON_LOG_VERBOSE("deviceID:%d, deviceType:%d, numCores %d\n", (int) infos[i].v1.deviceId, + (int) infos[i].v1.deviceType, (int) infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + chipinfo = devinfo->onChipDevice; + size_t htp_arch = (size_t) chipinfo.arch; + GGMLHEXAGON_LOG_VERBOSE("htp_type:%d(%s)\n", devinfo->devType, + (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"ON_CHIP" : ""); + soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize, {} }; + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + } else { + GGMLHEXAGON_LOG_WARN("failed to get platform info, are we in emulator?\n"); + soc_info = { NONE, UNKNOWN_SM, 0, {} }; + } + + QnnHtpDevice_CustomConfig_t soc_customconfig; + soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + soc_customconfig.socModel = soc_info.soc_model; + QnnDevice_Config_t soc_devconfig; + soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + soc_devconfig.customConfig = &soc_customconfig; + + /* + QnnHtpDevice_CustomConfig_t arch_customconfig; + arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; + arch_customconfig.arch.arch = (QnnHtpDevice_Arch_t)soc_info.htp_arch; + arch_customconfig.arch.deviceId = 0; + QnnDevice_Config_t arch_devconfig; + arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + arch_devconfig.customConfig = &arch_customconfig; + */ + const QnnDevice_Config_t * p_deviceconfig[] = { &soc_devconfig, nullptr }; + qnnstatus = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); + } else { + qnnstatus = _qnn_interface.qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); + } + if (QNN_SUCCESS != qnnstatus && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnnstatus) { + GGMLHEXAGON_LOG_WARN("failed to create QNN device\n"); + } else { + GGMLHEXAGON_LOG_VERBOSE("create device successfully\n"); + } + + if (PROFILE_OFF != _profile_level) { + GGMLHEXAGON_LOG_INFO("profiling turned on; level = %d", _profile_level); + if (PROFILE_BASIC == _profile_level) { + GGMLHEXAGON_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { + GGMLHEXAGON_LOG_WARN("unable to create profile handle in the backend\n"); + return 5; + } else { + GGMLHEXAGON_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } else if (PROFILE_DETAIL == _profile_level) { + GGMLHEXAGON_LOG_INFO("detailed profiling requested. 
Creating Qnn Profile object\n");
+            if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate(
+                    _qnn_backend_handle, QNN_PROFILE_LEVEL_DETAILED, &_qnn_profile_handle)) {
+                GGMLHEXAGON_LOG_WARN("unable to create profile handle in the backend\n");
+                return 6;
+            } else {
+                GGMLHEXAGON_LOG_DEBUG("initialized qnn profile successfully\n");
+            }
+        }
+    }
+
+#if defined(__ANDROID__) || defined(__linux__)
+    std::filesystem::path full_path(std::string(g_hexagon_appcfg.runtime_libpath) + "libcdsprpc.so");
+    _rpc_lib_handle = dlopen(full_path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
+    if (nullptr == _rpc_lib_handle) {
+        GGMLHEXAGON_LOG_WARN("failed to load %s\n", full_path.string().c_str());
+        _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
+    }
+#else
+    _rpc_lib_handle = dlopen("libcdsprpc.dll", RTLD_NOW | RTLD_LOCAL);
+#endif
+    if (nullptr == _rpc_lib_handle) {
+        GGMLHEXAGON_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror());
+        return 7;
+    } else {
+        GGMLHEXAGON_LOG_DEBUG("loaded rpcmem lib successfully\n");
+        set_rpcmem_initialized(true);
+    }
+    _pfn_rpc_mem_init   = reinterpret_cast<pfn_rpc_mem_init>(dlsym(_rpc_lib_handle, "rpcmem_init"));
+    _pfn_rpc_mem_deinit = reinterpret_cast<pfn_rpc_mem_deinit>(dlsym(_rpc_lib_handle, "rpcmem_deinit"));
+    _pfn_rpc_mem_alloc  = reinterpret_cast<pfn_rpc_mem_alloc>(dlsym(_rpc_lib_handle, "rpcmem_alloc"));
+    _pfn_rpc_mem_free   = reinterpret_cast<pfn_rpc_mem_free>(dlsym(_rpc_lib_handle, "rpcmem_free"));
+    _pfn_rpc_mem_to_fd  = reinterpret_cast<pfn_rpc_mem_to_fd>(dlsym(_rpc_lib_handle, "rpcmem_to_fd"));
+    if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || nullptr == _pfn_rpc_mem_to_fd) {
+        GGMLHEXAGON_LOG_WARN("unable to access symbols in QNN RPC lib, dlerror(): %s", dlerror());
+        dlclose(_rpc_lib_handle);
+        return 8;
+    }
+
+    if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy
+        _pfn_rpc_mem_init();
+
+    std::vector<const QnnContext_Config_t *> temp_context_config;
+    _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle,
+                                      temp_context_config.empty() ?
nullptr : temp_context_config.data(),
+                                      &_qnn_context_handle);
+    if (nullptr == _qnn_context_handle) {
+        GGMLHEXAGON_LOG_WARN("failed to initialize qnn context\n");
+        return 9;
+    } else {
+        GGMLHEXAGON_LOG_DEBUG("initialized qnn context successfully\n");
+    }
+
+    if (_backend_name.find("Htp") != std::string::npos) {
+        htp_print_info();
+        htp_probe_rpc_meminfo();
+
+        if (0 != htp_init_perfinfra()) {
+            GGMLHEXAGON_LOG_WARN("failed to initialize HTP perf infrastructure");
+        }
+
+        htp_enter_performance_mode();
+        htp_set_memory_grow_size();
+
+        if (enable_qnn_rpc()) {
+            GGMLHEXAGON_LOG_VERBOSE("NPU RPC feature enabled with QNN-NPU backend");
+        } else {
+            GGMLHEXAGON_LOG_VERBOSE("NPU RPC feature disabled with QNN-NPU backend");
+        }
+    }
+
+    print_backend_info();
+
+    GGMLHEXAGON_LOG_DEBUG("leave qnn_init\n");
+
+    return 0;
+}
+
+int qnn_instance::qnn_finalize() {
+    int ret_status = 0;
+    Qnn_ErrorHandle_t error = QNN_SUCCESS;
+
+    GGMLHEXAGON_LOG_VERBOSE("enter %s\n", __func__);
+    ggmlqnn_reset_idx();
+
+    free_rpcmem();
+    unregister_rpcmem();
+
+    if (nullptr != _pfn_rpc_mem_deinit)
+        _pfn_rpc_mem_deinit();
+
+    if (0 != dlclose(_rpc_lib_handle)) {
+        GGMLHEXAGON_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror());
+    } else {
+        GGMLHEXAGON_LOG_DEBUG("closed rpcmem lib successfully\n");
+    }
+
+    if (nullptr != _qnn_context_handle) {
+        error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle);
+        if (error != QNN_SUCCESS) {
+            GGMLHEXAGON_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n",
+                                 _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error));
+        }
+        _qnn_context_handle = nullptr;
+    }
+
+    if (nullptr != _qnn_profile_handle) {
+        error = _qnn_interface.qnn_profile_free(_qnn_profile_handle);
+        if (error != QNN_SUCCESS) {
+            GGMLHEXAGON_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n",
+                                 _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error));
+        }
+        _qnn_profile_handle = nullptr;
+    }
+
+    if (nullptr != _qnn_device_handle) {
+        error = _qnn_interface.qnn_device_free(_qnn_device_handle);
+        if (error != QNN_SUCCESS) {
+            GGMLHEXAGON_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n",
+                                 _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error));
+        }
+        _qnn_device_handle = nullptr;
+    }
+
+    if (nullptr != _qnn_backend_handle) {
+        error = _qnn_interface.qnn_backend_free(_qnn_backend_handle);
+        if (error != QNN_SUCCESS) {
+            GGMLHEXAGON_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n",
+                                 _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error));
+        }
+        _qnn_backend_handle = nullptr;
+    }
+
+    if (nullptr != _qnn_log_handle) {
+        error = _qnn_interface.qnn_log_free(_qnn_log_handle);
+        if (error != QNN_SUCCESS) {
+            GGMLHEXAGON_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n",
+                                 _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error));
+        }
+        _qnn_log_handle = nullptr;
+    }
+
+    unload_backend();
+    unload_system();
+
+    GGMLHEXAGON_LOG_VERBOSE("leave %s\n", __func__);
+    return ret_status;
+}
+
+int qnn_instance::init_qnn_graph(const std::string & graph_name, HEXAGONBackend device, size_t vtcm_size_in_mb, size_t hvx_threads) {
+    _graph_name = graph_name;
+    _device_id = device;
+
+    //GGMLHEXAGON_LOG_DEBUG("[%s][%s]created", ggml_backend_hexagon_get_devname(device), graph_name.c_str());
+
+    Qnn_ErrorHandle_t error = QNN_SUCCESS;
+    if (HEXAGON_BACKEND_QNNNPU == device) {
+        QnnHtpGraph_CustomConfig_t hvx_config;
+        hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS;
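qnn_finalize() above releases resources in the reverse order of qnn_init(): context, profile, device, backend, log, then the backend and system libraries. The backend code invokes it manually on every path; a small scope guard would make that harder to miss (hypothetical helper, not part of this patch):

    // Hypothetical scope guard: guarantees qnn_finalize() runs exactly once,
    // mirroring the manual teardown order in qnn_finalize() above.
    class qnn_instance_guard {
    public:
        explicit qnn_instance_guard(qnn_instance & inst) : _inst(&inst) {}
        ~qnn_instance_guard() { if (_inst) _inst->qnn_finalize(); }
        void release() { _inst = nullptr; } // call if ownership is transferred elsewhere
        qnn_instance_guard(const qnn_instance_guard &) = delete;
        qnn_instance_guard & operator=(const qnn_instance_guard &) = delete;
    private:
        qnn_instance * _inst;
    };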
hvx_config.numHvxThreads = hvx_threads; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + if (0 == g_hexagon_appcfg.enable_dlbc) + dlbc_config.optimizationOption.floatValue = 0.0; // set to 0.0 to turn off DLBC + else + dlbc_config.optimizationOption.floatValue = 1.0; // set to 1.0 to turn on DLBC + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + QnnHtpGraph_CustomConfig_t opt_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; // 1 / 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; + + std::vector graph_configs; + graph_configs.push_back(&graph_hvx_config); + graph_configs.push_back(&graph_dlbc_config); + graph_configs.push_back(&graph_vtcm_config); + graph_configs.push_back(&graph_opt_config); + if (1 == g_hexagon_appcfg.precision_mode) { + QnnHtpGraph_CustomConfig_t fp16_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + fp16_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION; + fp16_config.precision = QNN_PRECISION_FLOAT16; + QnnGraph_Config_t graph_fp16_config; + graph_fp16_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_fp16_config.customConfig = &fp16_config; + graph_configs.push_back(&graph_fp16_config); + } + graph_configs.push_back(nullptr); + error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), graph_configs.data(), &_qnn_graph_handle); + //GGMLHEXAGON_LOG_DEBUG("[%s][%s]created graph %p", ggml_backend_hexagon_get_devname(device), graph_name.c_str(), _qnn_graph_handle); + } else { + error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), nullptr, &_qnn_graph_handle); + } + if (QNN_SUCCESS != error) { + GGMLHEXAGON_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", + ggml_backend_hexagon_get_devname(device), graph_name.c_str(), + ggmlqnn_get_qnnerror_string(error)); + return error; + } + + GGMLHEXAGON_LOG_DEBUG("[%s]create graph %s succeed", ggml_backend_hexagon_get_devname(device), graph_name.c_str()); + if (HEXAGON_BACKEND_QNNNPU == device) { + htp_set_n_hvx_threads(hvx_threads); + } + return QNN_SUCCESS; +} + +int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation, + const QnnGraph_Config_t ** graph_configs) { + Qnn_ErrorHandle_t result = 0; + + if (nullptr == graph_name) { + GGMLHEXAGON_LOG_WARN("graph name is null\n"); + return 1; + } + + if (!_graph_name.empty()) { + GGMLHEXAGON_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); + return 2; + } + + if (!do_node_validation) { + GGMLHEXAGON_LOG_WARN("node 
validation disabled, backend will not perform op validation prior to adding node\n"); + } + + _graph_name = graph_name; + _debug_tensor = debug; + _do_node_validations = do_node_validation; + + result = _qnn_raw_interface.graphCreate(_qnn_context_handle, + graph_name, + graph_configs, + &_qnn_graph_handle); + if (QNN_GRAPH_NO_ERROR != result || nullptr == _qnn_graph_handle) { + GGMLHEXAGON_LOG_WARN("failed to create graph in qnn context\n"); + return 3; + } else { + GGMLHEXAGON_LOG_DEBUG("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); + } + + return 0; +} + +int qnn_instance::finalize_qnn_graph() { + if (nullptr != _qnn_graph_handle) { + if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, + _qnn_profile_handle, nullptr) + != QNN_GRAPH_NO_ERROR) { + GGMLHEXAGON_LOG_WARN("finalizing graph failure\n"); + return 1; + } + } else { + GGMLHEXAGON_LOG_DEBUG("qnn graph handle is null\n"); + } + + return 0; +} + +int qnn_instance::htp_init_perfinfra() { + QnnDevice_Infrastructure_t device_infra = nullptr; + Qnn_ErrorHandle_t error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); + if (QNN_SUCCESS != error) { + GGMLHEXAGON_LOG_WARN("failed to get qnn device infra\n"); + return 1; + } + + QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra; + uint32_t power_configid = 1; + uint32_t device_id = 0; + uint32_t core_id = 0; + htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); + _qnn_htp_perfinfra = htp_perfinfra; + _qnn_htp_powerconfig_id = power_configid; + //TODO:hardcode to 0 and 0 although it's correct + _qnn_htp_device_id = device_id; + _qnn_htp_core_id = core_id; + + return 0; +} + +void qnn_instance::htp_probe_rpc_meminfo() { + size_t candidate_size = 0; + uint8_t * rpc_buffer = nullptr; + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(alloc_rpcmem_internal(probe_slots[idx] * SIZE_IN_MB, 4)); + if (nullptr == rpc_buffer) { + GGMLHEXAGON_LOG_DEBUG("alloc rpcmem %d (MiB) failure during probe rpc memory info, reason: %s\n", probe_slots[idx], strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; + } + } + if (candidate_size > _rpcmem_capacity) + _rpcmem_capacity = candidate_size * SIZE_IN_MB; + + free_rpcmem(); + _rpcmem_usage = 0; + GGMLHEXAGON_LOG_VERBOSE("capacity of rpc ion memory %d MiB\n", _rpcmem_capacity / SIZE_IN_MB); +} + +void qnn_instance::htp_print_info() { + const QnnDevice_PlatformInfo_t * p_info = nullptr; + _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + GGMLHEXAGON_LOG_DEBUG("HTP device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; + for (size_t i = 0; i < p_info->v1.numHwDevices; i++) { + GGMLHEXAGON_LOG_DEBUG("HTP deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, + infos[i].v1.deviceType, infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; + QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; + GGMLHEXAGON_LOG_DEBUG("HTP_TYPE:%d(%s)", devinfo->devType, + (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
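htp_probe_rpc_meminfo() above discovers the ION pool size empirically, by walking a fixed ladder of trial allocations, since rpcmem exposes no capacity query; the ladder can only resolve to one of its four slot sizes. A hypothetical refinement would binary-search between the last success and the first failure (sketched as if it were another private member of qnn_instance, reusing alloc_rpcmem_internal, free_rpcmem, and the SIZE_IN_MB constant from this file):

    // Hypothetical alternative to the fixed probe ladder: binary-search the
    // largest allocation (in MiB) that rpcmem can satisfy.
    size_t qnn_instance::probe_rpcmem_capacity_mb(size_t hi_mb) {
        size_t lo = 0;                      // known-good size in MiB
        while (lo < hi_mb) {
            size_t mid = (lo + hi_mb + 1) / 2;
            void * buf = alloc_rpcmem_internal(mid * SIZE_IN_MB, 4);
            if (nullptr != buf) {
                free_rpcmem(buf);           // release the trial allocation
                lo = mid;                   // mid MiB fits, search upward
            } else {
                hi_mb = mid - 1;            // mid MiB failed, search downward
            }
        }
        return lo;
    }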
"QNN_HTP_DEVICE_TYPE_ON_CHIP" : "QNN_HTP_DEVICE_TYPE_UNKNOWN"); + GGMLHEXAGON_LOG_DEBUG("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MiB," \ + "dlbc_support:%d, signedpd_support:%d", \ + chipinfo.socModel, ggmlhexagon_get_socmodel_desc(chipinfo.socModel), \ + htp_arch, ggmlhexagon_get_htparch_desc(htp_arch), chipinfo.vtcmSize, \ + chipinfo.dlbcSupport, chipinfo.signedPdSupport); + struct qcom_socinfo * socinfo = ggmlhexagon_get_socinfo_from_socmodel(chipinfo.socModel); + g_hexagon_mgr[HEXAGON_BACKEND_QNNNPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize, {}}; + if (nullptr != socinfo) { + memcpy(g_hexagon_mgr[HEXAGON_BACKEND_QNNNPU].socinfo.soc_desc, socinfo->soc_desc, sizeof(socinfo->soc_desc)); + GGMLHEXAGON_LOG_DEBUG("soc info:%s", socinfo->soc_desc); + } else { + memcpy(g_hexagon_mgr[HEXAGON_BACKEND_QNNNPU].socinfo.soc_desc, "unknown", 7); + GGMLHEXAGON_LOG_DEBUG("soc info:unknown"); + } + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); +} + +void qnn_instance::print_backend_info() { + auto print_property = [&](const char * name, QnnProperty_Key_t property) { + auto ret = _qnn_raw_interface.propertyHasCapability(property); + + const char * status = "Unknown"; + if (ret == QNN_PROPERTY_SUPPORTED) { + status = "Yes"; + } else if (ret == QNN_PROPERTY_NOT_SUPPORTED) { + status = "No"; + } + + GGMLHEXAGON_LOG_VERBOSE("%s: %s", name, status); + }; + + GGMLHEXAGON_LOG_VERBOSE("QNN backend properties:"); + print_property("Create context from binary list", QNN_PROPERTY_CONTEXT_SUPPORT_CREATE_FROM_BINARY_LIST_ASYNC); + print_property("Dynamic batch", QNN_PROPERTY_GRAPH_SUPPORT_BATCH_MULTIPLE); + print_property("Early termination", QNN_PROPERTY_GRAPH_SUPPORT_EARLY_TERMINATION); + print_property("Dynamic dimensions", QNN_PROPERTY_TENSOR_SUPPORT_DYNAMIC_DIMENSIONS); + print_property("Blockwise quantization", QNN_PROPERTY_TENSOR_SUPPORT_QUANTIZATION_ENCODING_BLOCK); + print_property("Blockwise quantization with expansion", QNN_PROPERTY_TENSOR_SUPPORT_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION); + print_property("Vector quantization", QNN_PROPERTY_TENSOR_SUPPORT_QUANTIZATION_ENCODING_VECTOR); + print_property("Tensor sparsity", QNN_PROPERTY_TENSOR_SUPPORT_SPARSITY); + print_property("Updateable application tensor", QNN_PROPERTY_TENSOR_SUPPORT_UPDATEABLE_APP_TENSORS); + print_property("Updateable native tensor", QNN_PROPERTY_TENSOR_SUPPORT_UPDATEABLE_NATIVE_TENSORS); + print_property("Updateable static tensor", QNN_PROPERTY_TENSOR_SUPPORT_UPDATEABLE_STATIC_TENSORS); + print_property("Qnn group device", QNN_PROPERTY_GROUP_DEVICE); +} + +void qnn_instance::htp_set_memory_grow_size(size_t size) { + QnnHtpPerfInfrastructure_MemoryConfig_t grow_size_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_GROW_SIZE, + .memGrowSizeConfig = (uint32_t)size, + }; + + const QnnHtpPerfInfrastructure_MemoryConfig_t *memory_config[] = { + &grow_size_config, + nullptr, + }; + Qnn_ErrorHandle_t result = _qnn_htp_perfinfra->setMemoryConfig(_qnn_htp_device_id, _qnn_htp_core_id, memory_config); + if (QNN_SUCCESS != result) { + GGMLHEXAGON_LOG_WARN("failed to set HTP memory config"); + } else { + GGMLHEXAGON_LOG_VERBOSE("succeed to set HTP memory config"); + } +} + +void qnn_instance::htp_set_n_hvx_threads(size_t n_threads) { + QnnHtpGraph_CustomConfig_t htp_hvx_thread_config = { + .option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS, + .numHvxThreads = n_threads, + }; + + QnnGraph_Config_t hvx_thread_config = { + .option = QNN_GRAPH_CONFIG_OPTION_CUSTOM, + 
.customConfig = &htp_hvx_thread_config, + }; + + const QnnGraph_Config_t * graph_configs[] = {&hvx_thread_config, nullptr}; + Qnn_ErrorHandle_t result = _qnn_raw_interface.graphSetConfig(_qnn_graph_handle, graph_configs); + if (QNN_SUCCESS != result) { + GGMLHEXAGON_LOG_WARN("failed to set QNN graph config: set hvx threads %d", n_threads); + } else { + //GGMLHEXAGON_LOG_DEBUG("succeed to set QNN graph config: set hvx threads %d", n_threads); + } +} + +void qnn_instance::htp_enter_performance_mode() { + QnnHtpPerfInfrastructure_PowerConfig_t dcvs_v3_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3, + .dcvsV3Config = + { + .contextId = _qnn_htp_powerconfig_id, + + .setDcvsEnable = 1, + .dcvsEnable = 0, + + .powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE, + + .setSleepLatency = 1, + .sleepLatency = 40, + + .setSleepDisable = 1, + .sleepDisable = 1, + + .setBusParams = 1, + .busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + .busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + .busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + + .setCoreParams = 1, + .coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + .coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + .coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + }, + }; + + QnnHtpPerfInfrastructure_PowerConfig_t hmx_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_HMX_V2, + .hmxV2Config = + { + .hmxPickDefault = 0, + .hmxVoltageCornerMin = DCVS_EXP_VCORNER_MAX, + .hmxVoltageCornerTarget = DCVS_EXP_VCORNER_MAX, + .hmxVoltageCornerMax = DCVS_EXP_VCORNER_MAX, + .hmxPerfMode = QNN_HTP_PERF_INFRASTRUCTURE_CLK_PERF_HIGH, + }, + }; + + QnnHtpPerfInfrastructure_PowerConfig_t rpc_ctrl_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY, + .rpcControlLatencyConfig = 100, + }; + + QnnHtpPerfInfrastructure_PowerConfig_t rpc_poll_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME, + .rpcPollingTimeConfig = 9999, + }; + + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { + &dcvs_v3_config, + &hmx_config, + &rpc_ctrl_config, + &rpc_poll_config, + nullptr, + }; + Qnn_ErrorHandle_t ret = _qnn_htp_perfinfra->setPowerConfig(_qnn_htp_powerconfig_id, power_configs); + if (ret != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to set HTP power config"); + } else { + GGMLHEXAGON_LOG_VERBOSE("succeed to set HTP power config"); + } +} + +static uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata) { + if (nullptr == instance || nullptr == ggml_tensor || nullptr == qnn_tensor) { + GGMLHEXAGON_LOG_WARN("invalid params\n"); + return nullptr; + } + + uint8_t * qnn_rpcbuffer = static_cast(instance->alloc_rpcmem(ggml_nbytes(ggml_tensor), 4)); + if (nullptr == qnn_rpcbuffer) { + GGMLHEXAGON_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + return nullptr; + } else { + GGMLHEXAGON_LOG_DEBUG("alloc rpcmem %p successfully\n", qnn_rpcbuffer); + } + if (b_copydata) + memcpy(qnn_rpcbuffer, ggml_tensor->data, ggml_nbytes(ggml_tensor)); + instance->register_rpcmem(qnn_rpcbuffer, qnn_tensor); + return qnn_rpcbuffer; +} + +static Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, const char * type, + Qnn_Param_t * params, uint32_t num_params, + Qnn_Tensor_t * inputs, uint32_t num_inputs, + Qnn_Tensor_t * outputs, uint32_t 
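htp_enter_performance_mode() above pins every DCVS, bus, core, and HMX voltage corner to the maximum, trading power for latency. For contrast, a balanced profile would re-enable DCVS and relax the corners; a sketch follows (the SVS/NOM/TURBO corner names and the ADJUST_UP_DOWN power mode are assumptions drawn from the same QNN HTP enum families used above, and the values are illustrative only):

    // Hypothetical balanced power profile, contrasting with the burst profile above.
    // dcvsEnable = 1 lets the DSP scale clocks down when idle.
    static QnnHtpPerfInfrastructure_PowerConfig_t make_balanced_dcvs_config(uint32_t power_ctx_id) {
        QnnHtpPerfInfrastructure_PowerConfig_t cfg = {};
        cfg.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3;
        cfg.dcvsV3Config.contextId       = power_ctx_id;
        cfg.dcvsV3Config.setDcvsEnable   = 1;
        cfg.dcvsV3Config.dcvsEnable      = 1;    // allow down-clocking, unlike the burst profile
        cfg.dcvsV3Config.powerMode       = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_ADJUST_UP_DOWN;
        cfg.dcvsV3Config.setSleepLatency = 1;
        cfg.dcvsV3Config.sleepLatency    = 1000; // tolerate deeper sleep than the 40 us above
        cfg.dcvsV3Config.setBusParams    = 1;
        cfg.dcvsV3Config.busVoltageCornerMin     = DCVS_VOLTAGE_VCORNER_SVS;
        cfg.dcvsV3Config.busVoltageCornerTarget  = DCVS_VOLTAGE_VCORNER_NOM;
        cfg.dcvsV3Config.busVoltageCornerMax     = DCVS_VOLTAGE_VCORNER_TURBO;
        cfg.dcvsV3Config.setCoreParams   = 1;
        cfg.dcvsV3Config.coreVoltageCornerMin    = DCVS_VOLTAGE_VCORNER_SVS;
        cfg.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_NOM;
        cfg.dcvsV3Config.coreVoltageCornerMax    = DCVS_VOLTAGE_VCORNER_TURBO;
        return cfg;
    }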
num_outputs) { + + char opcfg_name[GGML_MAX_NAME] = {}; + + //ensure the opcfg name is unique + if (nullptr == name) { + snprintf(opcfg_name, GGML_MAX_NAME, "opcfg_%-8d", ggmlqnn_get_idx(QNN_OPCFG_INDEX)); + } else { + snprintf(opcfg_name, GGML_MAX_NAME, "opcfg_%s_%-8d", name, ggmlqnn_get_idx(QNN_OPCFG_INDEX)); + } + //GGMLHEXAGON_LOG_DEBUG("create qnn opconfig %s", opcfg_name); + ggmlqnn_inc_idx(QNN_OPCFG_INDEX); + + Qnn_OpConfigV1_t v1 = {opcfg_name, package, type, + num_params, params, + num_inputs, inputs, + num_outputs, outputs + }; + Qnn_OpConfig_t opcfg = {QNN_OPCONFIG_VERSION_1, {v1}}; + + return opcfg; +} + +static Qnn_Tensor_t * ggmlqnn_create_general_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, + const ggml_tensor * tensor, const char * name, + Qnn_TensorType_t qnn_tensor_type, + Qnn_DataType_t qnn_data_type, + uint32_t rank, uint32_t * dims, + void * data, uint32_t data_size, + bool b_transpose = false) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + char tensor_name[GGML_MAX_NAME] = {}; + + //ensure the tensor name is unique + if (nullptr == name) { + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", ggmlqnn_get_idx(QNN_TENSOR_INDEX)); + } else { + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%s%-8d", name, ggmlqnn_get_idx(QNN_TENSOR_INDEX)); + } + GGMLHEXAGON_LOG_DEBUG("init_tensor %s", tensor_name); + ggmlqnn_inc_idx(QNN_TENSOR_INDEX); + + uint32_t reverse_dims[GGML_MAX_DIMS] = {}; + uint32_t transpose_dims[GGML_MAX_DIMS] = {}; + uint32_t * tensor_dims = nullptr; + //case 1:use dims info from ggml tensor + if (nullptr != tensor) { + //there are different dimension order between ggml tensor and qnn tensor + for (size_t idx = 0; idx < rank; idx++) { + reverse_dims[idx] = (uint32_t)tensor->ne[rank - 1 - idx]; + } + tensor_dims = reverse_dims; + } + //case 2: use user's specified tensor_dims + if (nullptr != dims) { + tensor_dims = dims; + } + //case 3: transpose for dst tensor + if (b_transpose) { + GGML_ASSERT(tensor != nullptr); //ensure ggml_tensor is not nullptr for this special case + + ggmlqnn_get_qnn_dimensions_from_ggml_dimensions(transpose_dims, reverse_dims, ggml_n_dims(tensor)); + tensor_dims = transpose_dims; + } + + Qnn_Tensor_t qnn_tensor = { + .version = QNN_TENSOR_VERSION_1, + .v1 = { + .id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = {.encodingDefinition = QNN_DEFINITION_UNDEFINED, + .quantizationEncoding = QNN_QUANTIZATION_ENCODING_UNDEFINED, + .scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0} + }, + .rank = rank, + .dimensions = tensor_dims, + .memType = QNN_TENSORMEMTYPE_RAW, + .clientBuf = {.data = nullptr, .dataSize = 0} + } + }; + Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); + if (nullptr == p_qnn_tensor) { + GGMLHEXAGON_LOG_WARN("calloc failed"); + return nullptr; + } + error = ggmlqnn_deep_copy_qnntensor(qnn_tensor, *p_qnn_tensor); + if (error != QNN_SUCCESS) { + free(p_qnn_tensor); + GGMLHEXAGON_LOG_WARN("init tensor failed"); + return nullptr; + } + + bool enable_npu_rpc = (instance->enable_qnn_rpc() && instance->get_device_id() == HEXAGON_BACKEND_QNNNPU); + if (enable_npu_rpc) { + QNN_VER_PTR(*p_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {.data=nullptr, .dataSize=0}; + } else { + QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {data, data_size}; + } + QNN_INTERFACE_VER_TYPE qnn_raw_interface = instance->get_qnn_raw_interface(); + 
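The dimension handling above exists because ggml stores ne[] innermost-first (ne[0] is the contiguous axis) while QNN expects dimensions outermost-first. A standalone illustration of the reversal performed for case 1:

    #include <cstdint>
    #include <cstdio>

    // ggml: ne = {K, M, H, B} with ne[0] contiguous; QNN wants {B, H, M, K}.
    static void ggml_dims_to_qnn(const int64_t * ne, uint32_t rank, uint32_t * out) {
        for (uint32_t i = 0; i < rank; i++) {
            out[i] = (uint32_t) ne[rank - 1 - i];
        }
    }

    int main() {
        const int64_t ne[4] = { 128, 64, 12, 2 };   // K=128, M=64, H=12, B=2
        uint32_t qnn_dims[4];
        ggml_dims_to_qnn(ne, 4, qnn_dims);
        printf("%u %u %u %u\n", qnn_dims[0], qnn_dims[1], qnn_dims[2], qnn_dims[3]); // 2 12 64 128
        return 0;
    }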
CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_qnn_tensor)); + + return p_qnn_tensor; +} + +static Qnn_Tensor_t * ggmlqnn_create_compute_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, + const ggml_tensor * tensor, Qnn_TensorType_t tensor_type) { + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], + (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; + Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + + if (0 == tensor->flags) { + qnn_tensor_type = tensor_type; + } else { + if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; + } + } + + qnn_data_type = ggmlqnn_datatype_from_ggml_datatype(tensor->type); + Qnn_Tensor_t * p_qnn_tensor = ggmlqnn_create_general_tensor(instance, graph_handle, tensor, nullptr, + qnn_tensor_type, qnn_data_type, + ggml_n_dims(tensor), dimensions, + nullptr, 0); + return p_qnn_tensor; +} + +// ================================================================================================= +// section-6: hwaccel approach through QNN: offload GGML op to QNN backend +// ================================================================================================= +/* + * provide a general skeleton to offload ggml op to QNN backend: perform element-wise + * operation on 1/2 input tensors and 1 output tensors +*/ +static void ggmlqnn_compute_elementwise(ggml_backend_hexagon_context * ctx, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + qnn_instance * instance = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_tensor2 = nullptr; + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + size_t qnn_op_index = ggmlhexagon_get_op_index(op); + const char * qnn_op_name = ggmlqnn_k_op_caps[qnn_op_index].qnn_op_name; + size_t input_param_count = ggmlqnn_k_op_caps[qnn_op_index].input_param_count; + const char * ggml_original_opname = ggml_op_name(op->op); + std::string ggml_op_name_string = std::string("ggml_") + ggml_original_opname; + const char * ggml_op_name = ggml_op_name_string.c_str(); + + std::string graph_name; + ggmlhexagon_get_opkey_from_op(op, graph_name); + + int input_size = ggml_nbytes(src0); + if (nullptr != src1) + input_size += ggml_nbytes(src1); + hexagon_perf op_perf(graph_name, ggml_original_opname, input_size, ggml_nbytes(dst)); + op_perf.start(); + + bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == HEXAGON_BACKEND_QNNNPU; + if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) { + //retrieve computational resource from cached QNN graph + qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_ptensors_t & ptensors = std::get<1>(graph_item); + p_tensor0 = ptensors[0]; + if (2 == input_param_count) { + p_tensor1 = ptensors[1]; + p_tensor2 = ptensors[2]; + } else { + //now p_tensor1 is nullptr + p_tensor2 = ptensors[1]; + } + } else { + GGML_ASSERT(instance->get_device_id() == ctx->device); + GGMLHEXAGON_LOG_VERBOSE("graph name %s", 
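The skeleton above builds a QNN graph once per operation key and then replays it: ggmlhexagon_get_opkey_from_op() encodes the op type and tensor shapes into graph_name, and qnn_singlenode_graph_map caches the finalized graph together with its tensor pointers. A reduced standalone model of that cache pattern:

    #include <string>
    #include <tuple>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    // Reduced model of qnn_singlenode_graph_map: one finalized graph plus its
    // tensors, keyed by a string encoding op type and shapes, so a graph is
    // built once and re-executed on every later call with the same shapes.
    using graph_handle_t = void *;
    using cached_graph_t = std::tuple<graph_handle_t, std::vector<int>>;

    static cached_graph_t & get_or_build(std::unordered_map<std::string, cached_graph_t> & cache,
                                         const std::string & key) {
        auto it = cache.find(key);
        if (it != cache.end()) {
            return it->second;          // cache hit: reuse the graph and its tensors
        }
        // cache miss: build, finalize, then remember (ints stand in for Qnn_Tensor_t *)
        cached_graph_t item = { nullptr, { 0, 1, 2 } };
        return cache.emplace(key, std::move(item)).first->second;
    }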
graph_name.c_str()); + //create QNN graph + error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), + g_hexagon_appcfg.vtcm_size_in_mb, + g_hexagon_appcfg.hvx_threads); + if (QNN_SUCCESS != error) { + GGMLHEXAGON_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + graph_handle = instance->get_qnn_graph_handle(); + + //GGMLHEXAGON_LOG_DEBUG("graph_handle %p", graph_handle); + //create computational tensor + p_tensor0 = ggmlqnn_create_compute_tensor(instance, graph_handle, src0, QNN_TENSOR_TYPE_APP_WRITE); + if (2 == input_param_count) { + p_tensor1 = ggmlqnn_create_compute_tensor(instance, graph_handle, src1, QNN_TENSOR_TYPE_APP_WRITE); + } + p_tensor2 = ggmlqnn_create_compute_tensor(instance, graph_handle, dst, QNN_TENSOR_TYPE_APP_READ); + + //compose QNN graph + qnn_tensors_t input_tensors; + input_tensors.reserve(input_param_count); + input_tensors.push_back(*p_tensor0); + if (2 == input_param_count) { + input_tensors.push_back(*p_tensor1); + } + Qnn_Tensor_t output_tensors[] = { + *p_tensor2 + }; + Qnn_OpConfig_t op_config = ggmlqnn_create_op_config(ggml_op_name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + qnn_op_name, nullptr, 0, + input_tensors.data(), + input_param_count, output_tensors, + 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, op_config)); + //finalize QNN graph + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); + + //cache QNN graph + qnn_ptensors_t qnn_elementwise_tensors; + qnn_elementwise_tensors.reserve(input_param_count + 1); + + qnn_elementwise_tensors.push_back(p_tensor0); + if (2 == input_param_count) { + qnn_elementwise_tensors.push_back(p_tensor1); + } + qnn_elementwise_tensors.push_back(p_tensor2); + auto graph_item = std::make_tuple(graph_handle, qnn_elementwise_tensors); + ctx->qnn_singlenode_graph_map[graph_name] = graph_item; + } + + if (enable_npu_rpc) { + uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*p_tensor0)->memHandle)); + GGMLHEXAGON_LOG_DEBUG("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0); + if (nullptr != qnn_buffer_0) { + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + } + + if (2 == input_param_count) { + uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*p_tensor1)->memHandle)); + GGMLHEXAGON_LOG_DEBUG("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1); + if (nullptr != qnn_buffer_1) { + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + } + } + } else { + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + if (2 == input_param_count) { + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + } + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + } + + qnn_tensors_t input_tensors; + input_tensors.reserve(input_param_count); + input_tensors.push_back(*p_tensor0); + if (2 == input_param_count) { + input_tensors.push_back(*p_tensor1); + } + Qnn_Tensor_t output_tensors[] = { + *p_tensor2 + }; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + input_tensors.data(), input_param_count, + output_tensors, 1, + nullptr, nullptr)); + if (enable_npu_rpc) { + uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); + if (nullptr != qnn_buffer_2) { + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } + } + + op_perf.info(); +} + +/* + * this function is AI-assisted code from 
Grok 3 for purpose of offload 4d matrix mulmat to QNN backend + * various UT has verified and succeed but failed in CT of test-backend-ops + * + * the logic of ggmlqnn_compute_mul_mat_4d is similar to ggmlqnn_compute_mul_mat but much more complicated + * than ggmlqnn_compute_mul_mat, so it's a standalone function. + * it will be combined with ggmlqnn_compute_mul_mat in the future + */ +static void ggmlqnn_compute_mul_mat_4d(ggml_backend_hexagon_context * ctx, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + qnn_instance * instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + GGML_ASSERT(ggml_n_dims(src0) == 4 && ggml_n_dims(src1) == 4); + + hexagon_perf op_perf("ggmlqnn_compute_mul_mat_4d"); + op_perf.start(); + + std::string graph_name; + ggmlhexagon_get_opkey_from_op(op, graph_name); + GGMLHEXAGON_LOG_DEBUG("graph name %s\n", graph_name.c_str()); + + ggmlhexagon_print_tensors_info(__func__, ctx, src0, src1, dst); + + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_reshape0_out = nullptr; + Qnn_Tensor_t * p_tile0_out = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_permute1_out = nullptr; + Qnn_Tensor_t * p_reshape1_out = nullptr; + Qnn_Tensor_t * p_matmul_out = nullptr; + Qnn_Tensor_t * p_reshape2_out = nullptr; + + if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) { + qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_ptensors_t & tensors = std::get<1>(graph_item); + p_tensor0 = tensors[0]; + p_reshape0_out = tensors[1]; + p_tile0_out = tensors[2]; + p_tensor1 = tensors[3]; + p_permute1_out = tensors[4]; + p_reshape1_out = tensors[5]; + p_matmul_out = tensors[6]; + p_reshape2_out = tensors[7]; + } else { + CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), NULL, &graph_handle)); + + // Define dimensions + uint32_t K = src0->ne[0]; // Inner dimension + uint32_t M = src0->ne[1]; // Rows of src0 + uint32_t N = src1->ne[1]; // Columns of src1 + uint32_t B0 = src0->ne[2] * src0->ne[3]; // src0 batch + uint32_t B1 = src1->ne[2] * src1->ne[3]; // src1 batch (drives output) + + // Validate K only + GGML_ASSERT(src0->ne[0] == src1->ne[0]); // K must match + + // src0: [K, M, H0, B0] -> QNN: [B0, H0, M, K] + uint32_t src0_dims[] = {static_cast(src0->ne[3]), static_cast(src0->ne[2]), + static_cast(src0->ne[1]), static_cast(src0->ne[0]) + }; + p_tensor0 = ggmlqnn_create_general_tensor(instance, graph_handle, src0, "input0", + QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, + src0_dims, nullptr, 0); + + // Reshape src0 to [B0, M, K] + uint32_t reshape0_out_dims[] = {B0, M, K}; + p_reshape0_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "reshape0_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + reshape0_out_dims, nullptr, 0); + + Qnn_Tensor_t reshape0_inputs[] = {*p_tensor0}; + Qnn_Tensor_t reshape0_outputs[] = {*p_reshape0_out}; + Qnn_OpConfig_t reshape0_op = ggmlqnn_create_op_config("reshape0", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape0_inputs, 1, reshape0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape0_op)); + + // Tile src0 to match B1: [B0, M, 
K] -> [B1, M, K] + uint32_t tile0_out_dims[] = {B1, M, K}; + p_tile0_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "tile0_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + tile0_out_dims, nullptr, 0); + + uint32_t tile_multiples[] = {B1 / B0, 1, 1}; + uint32_t tile_dims[] = {3}; + Qnn_Tensor_t * p_tile_multiples = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "tile_multiples", + QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, + tile_dims, tile_multiples, sizeof(tile_multiples)); + + Qnn_Param_t tile_params[] = {{.paramType = QNN_PARAMTYPE_TENSOR, .name = "multiples", .tensorParam = *p_tile_multiples}}; + Qnn_Tensor_t tile0_inputs[] = {*p_reshape0_out}; + Qnn_Tensor_t tile0_outputs[] = {*p_tile0_out}; + Qnn_OpConfig_t tile0_op = ggmlqnn_create_op_config("tile0", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TILE, tile_params, 1, + tile0_inputs, 1, tile0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, tile0_op)); + + // src1: [N, K, H1, B1] -> QNN: [B1, H1, N, K] + uint32_t src1_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), + static_cast(src1->ne[1]), static_cast(src1->ne[0]) + }; + p_tensor1 = ggmlqnn_create_general_tensor(instance, graph_handle, src1, "input1", + QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, + src1_dims, nullptr, 0); + + + // Permute src1 to [B1, H1, K, N] + uint32_t perm_data[] = {0, 1, 3, 2}; + uint32_t perm_dims[] = {4}; + Qnn_Tensor_t * p_perm = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "perm", + QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, + perm_dims, perm_data, sizeof(perm_data)); + + uint32_t permute1_out_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), + static_cast(src1->ne[0]), static_cast(src1->ne[1]) + }; + p_permute1_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "permute1_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, + permute1_out_dims, nullptr, 0); + + Qnn_Param_t permute1_params[] = {{.paramType = QNN_PARAMTYPE_TENSOR, .name = "perm", .tensorParam = *p_perm}}; + Qnn_Tensor_t permute1_inputs[] = {*p_tensor1}; + Qnn_Tensor_t permute1_outputs[] = {*p_permute1_out}; + Qnn_OpConfig_t permute1_op = ggmlqnn_create_op_config("permute1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, permute1_params, 1, + permute1_inputs, 1, permute1_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, permute1_op)); + + // Reshape src1 to [B1, K, N] + uint32_t reshape1_out_dims[] = {B1, K, N}; + p_reshape1_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "reshape1_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + reshape1_out_dims, nullptr, 0); + + Qnn_Tensor_t reshape1_inputs[] = {*p_permute1_out}; + Qnn_Tensor_t reshape1_outputs[] = {*p_reshape1_out}; + Qnn_OpConfig_t reshape1_op = ggmlqnn_create_op_config("reshape1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape1_inputs, 1, reshape1_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape1_op)); + + // MatMul: [B1, M, K] x [B1, K, N] -> [B1, M, N] + uint32_t matmul_out_dims[] = {B1, M, N}; + p_matmul_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "matmul_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + matmul_out_dims, nullptr, 0); + + Qnn_Tensor_t matmul_inputs[] = {*p_tile0_out, *p_reshape1_out}; + Qnn_Tensor_t matmul_outputs[] = {*p_matmul_out}; + Qnn_OpConfig_t matmul_op = ggmlqnn_create_op_config("matmul", 
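The node chain composed above reproduces ggml's broadcast mul_mat with stock QNN ops: src0 is flattened and tiled up to src1's batch, src1 is transposed and flattened, and a single batched MatMul produces the result. A compact restatement of the shape flow (hypothetical checker, with B0 and B1 as defined above):

    #include <cassert>
    #include <cstdint>

    // Shape flow of the 4D mul_mat graph above:
    //   src0 -> Reshape [B0,M,K] -> Tile x(B1/B0) -> [B1,M,K]
    //   src1 -> Transpose(perm {0,1,3,2}) -> Reshape -> [B1,K,N]
    //   MatMul -> [B1,M,N] -> Reshape back to the dst layout.
    static void check_mul_mat_4d_shapes(uint32_t M, uint32_t N, uint32_t K,
                                        uint32_t B0, uint32_t B1) {
        assert(B1 % B0 == 0 && "broadcast requires src1 batch to be a multiple of src0 batch");
        uint32_t tiled0[3]    = { B1, M, K };  // after Reshape + Tile on src0
        uint32_t reshaped1[3] = { B1, K, N };  // after Transpose + Reshape on src1
        uint32_t out[3]       = { B1, M, N };  // batched MatMul result
        assert(tiled0[2] == reshaped1[1]);     // the inner K dimensions must agree
        (void) out;
    }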
QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, nullptr, 0, + matmul_inputs, 2, matmul_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); + + // Output: [N, M, H1, B1] -> QNN: [B1, H1, M, N] + uint32_t reshape2_out_dims[] = {static_cast(dst->ne[3]), static_cast(dst->ne[2]), + static_cast(dst->ne[1]), static_cast(dst->ne[0]) + }; + p_reshape2_out = ggmlqnn_create_general_tensor(instance, graph_handle, dst, "output", + QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, 4, + reshape2_out_dims, nullptr, 0); + + Qnn_Tensor_t reshape2_inputs[] = {*p_matmul_out}; + Qnn_Tensor_t reshape2_outputs[] = {*p_reshape2_out}; + Qnn_OpConfig_t reshape2_op = ggmlqnn_create_op_config("reshape2", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape2_inputs, 1, reshape2_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape2_op)); + + // Finalize + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); + + // Cache + qnn_ptensors_t ggml_op_mulmat_tensors = {p_tensor0, p_reshape0_out, p_tile0_out, p_tensor1, + p_permute1_out, p_reshape1_out, p_matmul_out, p_reshape2_out + }; + ctx->qnn_singlenode_graph_map[graph_name] = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); + } + + // Execute + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; + QNN_VER_PTR(*p_reshape2_out)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; + + Qnn_Tensor_t input_tensors[] = {*p_tensor0, *p_tensor1}; + Qnn_Tensor_t output_tensors[] = {*p_reshape2_out}; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors, 2, output_tensors, 1, NULL, NULL)); + + op_perf.info(); +} + +/* + * @brief performs matrix multiplication with FP32 & quantized weights and floating-point inputs + * using the QNN backend. this function performs matrix multiplication of the input tensor + * `src1` and the weight tensor `src0`, handling transposing, and quantization as needed, + * and stores the result in the destination tensor `dst`. + * + there are two key-points in properly handling how to offload mulmat to the QNN + 1. transpose + a 3x2 f32 matrix which means 3 rows and 2 columns. in ggml, it could be created from: + struct ggml_tensor* matrix = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); + which like this: + +---+---+ + | 0 | 1 | + +---+---+ + | 2 | 3 | + +---+---+ + | 4 | 5 | + +---+---+ + with + ne[0] = 2 + ne[1] = 3 + there are different dimension order between ggml tensor and qnn tensor + + 2. QNN's MatMul can only support input tensors with rank >= 2 + + in the all, there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose + operation when offloading mulmat to QNN backend. this implementation will handle transpose + in func ggmlqnn_create_general_tensor() + + * @param ctx the context of backend + * @param op the destination tensor where the result of the matrix multiplication will be stored. + * + * @note the logic of ggmlqnn_compute_mul_mat is similar to ggmlqnn_compute_op_two_tensors but much more complicated + * than ggmlqnn_compute_op_two_tensors. so it's a standalone function. accordingly, this is another + * typical skeleton for offload other ggml ops to QNN backend. MUL_MAT take most of the compute + * time (about 95%).so to speed up llama inference, should focus on this func. 
+
+/*
+ * @brief performs matrix multiplication with FP32 or quantized weights and floating-point inputs
+ *        using the QNN backend. this function multiplies the input tensor `src1` by the weight
+ *        tensor `src0`, handling transposition and dequantization as needed, and stores the
+ *        result in the destination tensor `dst`.
+ *
+   there are two key points in properly offloading mulmat to QNN:
+   1. transpose
+      a 3x2 f32 matrix means 3 rows and 2 columns. in ggml it could be created as:
+      struct ggml_tensor * matrix = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
+      which looks like this:
+      +---+---+
+      | 0 | 1 |
+      +---+---+
+      | 2 | 3 |
+      +---+---+
+      | 4 | 5 |
+      +---+---+
+      with
+      ne[0] = 2
+      ne[1] = 3
+      the dimension order differs between a ggml tensor and a QNN tensor
+
+   2. QNN's MatMul only supports input tensors with rank >= 2
+
+   all in all, there is a gap between ggml mulmat and QNN mulmat, so a transpose operation is
+   required when offloading mulmat to the QNN backend. this implementation handles the transpose
+   in ggmlqnn_create_general_tensor()
+
+ * @param ctx the context of the backend
+ * @param op the destination tensor where the result of the matrix multiplication will be stored
+ *
+ * @note the logic of ggmlqnn_compute_mul_mat is similar to ggmlqnn_compute_op_two_tensors but much
+ *       more complicated, so it is a standalone function and another typical skeleton for
+ *       offloading other ggml ops to the QNN backend. MUL_MAT takes most of the compute time
+ *       (about 95%), so speeding up llama inference means focusing on this function. there are
+ *       three kinds of MUL_MAT to compute:
+ *       mul_mat_f32:     both src0 and src1 are F32, naturally handled by the QNN backend
+ *       mul_mat_f16_f32: src0 is F16 and src1 is F32, convert src0 to F32 as src0', then src0' * src1
+ *       mul_mat_q_f32:   src0 is quantized (Q4_0, Q4_1, Q6_K...) and src1 is F32, dequantize src0
+ *                        to F32 as src0', then src0' * src1
+*/
+static void ggmlqnn_compute_mul_mat(ggml_backend_hexagon_context * ctx, ggml_tensor * op) {
+    Qnn_ErrorHandle_t error            = QNN_SUCCESS;
+    qnn_instance * instance            = nullptr;
+    Qnn_GraphHandle_t graph_handle     = nullptr;
+    Qnn_Tensor_t * p_tensor0           = nullptr;
+    Qnn_Tensor_t * p_tensor1           = nullptr;
+    Qnn_Tensor_t * p_tensor2           = nullptr;
+    Qnn_Tensor_t * p_param_tensor      = nullptr;
+    Qnn_Tensor_t * p_tensor2_transpose = nullptr;
+    const ggml_tensor * src0           = op->src[0];
+    const ggml_tensor * src1           = op->src[1];
+    ggml_tensor * dst                  = op;
+
+    GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst);
+    instance = ctx->instance;
+    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
+
+    const enum ggml_type src0_type    = src0->type;
+    const uint32_t src0_rank          = ggml_n_dims(src0);
+    const uint32_t src1_rank          = ggml_n_dims(src1);
+    const char * ggml_original_opname = ggml_op_name(op->op);
+    ggmlhexagon_print_tensors_info(__func__, ctx, src0, src1, dst);
+
+    std::string graph_name;
+    ggmlhexagon_get_opkey_from_op(op, graph_name);
+
+    int input_size = ggml_nbytes(src0);
+    if (nullptr != src1)
+        input_size += ggml_nbytes(src1);
+    hexagon_perf op_perf(graph_name, ggml_original_opname, input_size, ggml_nbytes(dst));
+    op_perf.start();
+
+    GGML_ASSERT(src0_rank == src1_rank);
+    GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation, make QNN SDK happy
+    if (4 == src0_rank) {
+        return ggmlqnn_compute_mul_mat_4d(ctx, op);
+    }
+
+    void * wdata = ggmlhexagon_type_trait(ctx, op);
+    const size_t desired_size = ctx->desired_size;
+
+    if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) {
+        //retrieve computational resource from cached QNN graph
+        qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name];
+        graph_handle = std::get<0>(graph_item);
+        qnn_ptensors_t & tensors = std::get<1>(graph_item);
+        p_tensor0           = tensors[0];
+        p_tensor1           = tensors[1];
+        p_tensor2           = tensors[2];
+        p_param_tensor      = tensors[3];
+        p_tensor2_transpose = tensors[4];
+    } else {
+        //create QNN graph
+        GGMLHEXAGON_LOG_VERBOSE("graph name %s", graph_name.c_str());
+        error = instance->init_qnn_graph(graph_name, static_cast<QNNBackend>(ctx->device),
+                                         g_hexagon_appcfg.vtcm_size_in_mb,
+                                         g_hexagon_appcfg.hvx_threads);
+        if (QNN_SUCCESS != error) {
+            GGMLHEXAGON_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n",
+                                 graph_name.c_str(), error);
+            return;
+        }
+        graph_handle = instance->get_qnn_graph_handle();
+
+        //create computational tensor
+        p_tensor0 = ggmlqnn_create_general_tensor(instance, graph_handle, src0, nullptr,
+                                                  QNN_TENSOR_TYPE_APP_WRITE,
+                                                  QNN_DATATYPE_FLOAT_32, src0_rank,
+                                                  nullptr, nullptr, 0);
+        p_tensor1 = ggmlqnn_create_general_tensor(instance, graph_handle, src1, nullptr,
+                                                  QNN_TENSOR_TYPE_APP_WRITE,
+                                                  QNN_DATATYPE_FLOAT_32, src0_rank,
+                                                  nullptr, nullptr, 0);
+        p_tensor2 = ggmlqnn_create_general_tensor(instance, graph_handle, dst, nullptr,
+                                                  QNN_TENSOR_TYPE_APP_READ,
+                                                  QNN_DATATYPE_FLOAT_32, src0_rank,
+                                                  nullptr, nullptr, 0);
+
+        //create param tensor for offload 2d/3d/4d matrix multiplication
+        const uint32_t param_tensor_data[GGML_MAX_DIMS][GGML_MAX_DIMS] = {
+                {0},
+                {1, 0},
+                {0, 2, 1},
+                {0, 1, 3, 2},
+        };
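+        // note: row [src0_rank - 1] of param_tensor_data is the permutation that swaps the two
+        // innermost axes for that rank (e.g. {1, 0} for rank 2, {0, 2, 1} for rank 3); it feeds
+        // the transpose node below, which maps the MatMul output back to ggml's expected layout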
+        uint32_t param_tensor_dims[1] = {src0_rank};
+        p_param_tensor = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "param",
+                                                       QNN_TENSOR_TYPE_STATIC,
+                                                       QNN_DATATYPE_UINT_32, 1,
+                                                       param_tensor_dims,
+                                                       (void *) (param_tensor_data[src0_rank - 1]),
+                                                       src0_rank * sizeof(uint32_t));
+
+        //create transpose tensor
+        p_tensor2_transpose = ggmlqnn_create_general_tensor(instance, graph_handle, dst,
+                                                            "transpose",
+                                                            QNN_TENSOR_TYPE_NATIVE,
+                                                            QNN_DATATYPE_FLOAT_32, src0_rank,
+                                                            nullptr, nullptr, 0, true);
+
+        //compose QNN graph: add mulmat node
+        Qnn_Param_t out_0_params[] = {
+                {.paramType = QNN_PARAMTYPE_SCALAR, .name = QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, .scalarParam = {
+                        .dataType = QNN_DATATYPE_BOOL_8, .bool8Value = 1}}};
+        Qnn_Tensor_t out_0_inputs[]  = {*p_tensor0, *p_tensor1};
+        Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose};
+        Qnn_OpConfig_t out_0 = ggmlqnn_create_op_config("mulmat_opconfig",
+                                                        QNN_OP_PACKAGE_NAME_QTI_AISW,
+                                                        QNN_OP_MAT_MUL, out_0_params, 1,
+                                                        out_0_inputs, 2, out_0_outputs, 1);
+        CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, out_0));
+
+        //compose QNN graph: add transpose node
+        Qnn_Param_t out_trans1_0_params[] = {
+                {.paramType = QNN_PARAMTYPE_TENSOR, .name = "perm", .tensorParam = *p_param_tensor}};
+        Qnn_Tensor_t out_trans1_0_inputs[]  = {*p_tensor2_transpose};
+        Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2};
+        Qnn_OpConfig_t out_trans1_0 = ggmlqnn_create_op_config("mulmat_transpose_opconfig",
+                                                               QNN_OP_PACKAGE_NAME_QTI_AISW,
+                                                               QNN_OP_TRANSPOSE,
+                                                               out_trans1_0_params, 1,
+                                                               out_trans1_0_inputs, 1,
+                                                               out_trans1_0_outputs, 1);
+        CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, out_trans1_0));
+
+        //finalize QNN graph
+        CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr));
+
+        //cache QNN graph
+        qnn_ptensors_t ggml_op_mulmat_tensors;
+        ggml_op_mulmat_tensors.reserve(5);
+        ggml_op_mulmat_tensors.push_back(p_tensor0);
+        ggml_op_mulmat_tensors.push_back(p_tensor1);
+        ggml_op_mulmat_tensors.push_back(p_tensor2);
+        ggml_op_mulmat_tensors.push_back(p_param_tensor);
+        ggml_op_mulmat_tensors.push_back(p_tensor2_transpose);
+        auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors);
+        ctx->qnn_singlenode_graph_map[graph_name] = graph_item;
+    }
+
+    if (src0_type != GGML_TYPE_F32) {
+        QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast<uint32_t>(desired_size)};
+    } else {
+        QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)};
+    }
+    QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)};
+    QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)};
+
+    Qnn_Tensor_t tensor_inputs[] = {
+            *p_tensor0,
+            *p_tensor1
+    };
+    Qnn_Tensor_t tensor_outputs[] = {
+            *p_tensor2
+    };
+    CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle,
+                                                        tensor_inputs, 2,
+                                                        tensor_outputs, 1,
+                                                        nullptr, nullptr));
+    op_perf.info();
+}
+
+static void ggmlqnn_compute_repeat(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) {
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(dst);
+}
+
+static void ggmlqnn_compute_div(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) {
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(dst);
+}
+
+static void ggmlqnn_compute_leaky_relu(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) {
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(dst);
+}
+
+static void ggmlqnn_compute_concat(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) {
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(dst);
+}
+
+static void 
ggmlqnn_compute_arange(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_sqr(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_clamp(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_scale(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_argsort(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_norm(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_group_norm(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_acc(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_sum_rows(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_upsample_nearest2d(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_pad(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_pool2d(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_dup(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_rms_norm(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_im2col(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_timestep_embedding(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_cpy(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + ggmlqnn_compute_dup(ctx, dst); +} + +static void ggmlqnn_compute_softmax(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_get_rows(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +static void ggmlqnn_compute_rope(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +// ================================================================================================= +// section-7: cDSP helper function +// ================================================================================================= +static const char * ggmlhexagon_get_dsp_name(int domain_id) { + switch (domain_id) { + case HEXAGON_ADSP: + return "Hexagon-aDSP"; + case HEXAGON_MDSP: + return "Hexagon-mDSP"; + case HEXAGON_SDSP: + return "Hexagon-sDSP"; + case HEXAGON_CDSP: + return "Hexagon-cDSP"; + case HEXAGON_CDSP1: + return "Hexagon-cDSP1"; + default: + return "Hexagon-unknown"; + } +} + +static int ggmlhexagon_pd_status_notifier_callback(void * context, int domain, int session, remote_rpc_status_flags_t status){ + int error = AEE_SUCCESS; + switch (status){ + case 
FASTRPC_USER_PD_UP:
+            GGMLHEXAGON_LOG_DEBUG("PD is up\n");
+            break;
+        case FASTRPC_USER_PD_EXIT:
+            GGMLHEXAGON_LOG_DEBUG("PD closed\n");
+            break;
+        case FASTRPC_USER_PD_FORCE_KILL:
+            GGMLHEXAGON_LOG_DEBUG("PD force kill\n");
+            break;
+        case FASTRPC_USER_PD_EXCEPTION:
+            GGMLHEXAGON_LOG_DEBUG("PD exception\n");
+            break;
+        case FASTRPC_DSP_SSR:
+            GGMLHEXAGON_LOG_DEBUG("DSP SSR\n");
+            break;
+        default:
+            error = AEE_EBADITEM;
+            break;
+    }
+    return error;
+}
+
+static domain * ggmlhexagon_get_domain(int domain_id) {
+    int size = sizeof(hexagon_supported_domains) / sizeof(domain);
+
+    for (int i = 0; i < size; i++) {
+        if (hexagon_supported_domains[i].id == domain_id)
+            return &hexagon_supported_domains[i];
+    }
+
+    return nullptr;
+}
+
+static bool ggmlhexagon_is_cdsp(int domain_id) {
+    return (domain_id == HEXAGON_CDSP) || (domain_id == HEXAGON_CDSP1);
+}
+
+static bool ggmlhexagon_is_valid_domain_id(int domain_id, int compute_only) {
+    int size = sizeof(hexagon_supported_domains) / sizeof(domain);
+
+    if (0 != compute_only) {
+        return ggmlhexagon_is_cdsp(domain_id);
+    }
+
+    for (int i = 0; i < size; i++) {
+        if (hexagon_supported_domains[i].id == domain_id)
+            return true;
+    }
+
+    return false;
+}
+
+static int ggmlhexagon_get_domains_info(const char * domain_type, int * num_domains, fastrpc_domain ** domains_info) {
+    int hexagon_err = AEE_SUCCESS;
+    int ss_info     = 0;
+    void * buffer   = nullptr;
+    ss_info = strcmp(domain_type, "NSP") ? HPASS : NSP;
+    system_req_payload req;
+    memset(&req, 0, sizeof(system_req_payload));
+    req.id = FASTRPC_GET_DOMAINS;
+    req.sys.domains = nullptr;
+    fastrpc_domain * domain = nullptr;
+
+    if (ss_info != 0) {
+        req.sys.flags = DOMAINS_LIST_FLAGS_SET_TYPE(req.sys.flags, ss_info);
+    } else {
+        req.sys.flags = 0;
+    }
+
+#ifdef _WIN32
+    hexagon_err = AEE_EUNSUPPORTED;
+    goto bail;
+#endif
+
+    hexagon_err = remote_system_request(&req);
+    if (hexagon_err != AEE_SUCCESS) {
+        GGMLHEXAGON_LOG_DEBUG("failure in remote_system_request call: %d", hexagon_err);
+        goto bail;
+    }
+    //allocate memory for domain-info array
+    req.sys.max_domains = req.sys.num_domains;
+    buffer = calloc(req.sys.num_domains, sizeof(fastrpc_domain));
+    if (nullptr == buffer) {
+        hexagon_err = AEE_ENOMEMORY;
+        GGMLHEXAGON_LOG_DEBUG("unable to allocate memory for req.sys.domains");
+        goto bail;
+    }
+    req.sys.domains = static_cast<fastrpc_domain *>(buffer);
+    hexagon_err = remote_system_request(&req);
+    if (hexagon_err != AEE_SUCCESS) {
+        GGMLHEXAGON_LOG_DEBUG("failure in remote_system_request call: %d.\n", hexagon_err);
+        goto bail;
+    }
+
+    for (int i = 0; i < req.sys.num_domains; i++) {
+        //verify that only domains of the requested type were returned
+        domain = &req.sys.domains[i];
+        if (domain->type != ss_info) {
+            hexagon_err = -1;
+            GGMLHEXAGON_LOG_DEBUG("incorrect data received from remote_system_request.\n");
+            goto bail;
+        }
+    }
+    *domains_info = req.sys.domains;
+    *num_domains  = req.sys.num_domains;
+
+bail:
+    if (hexagon_err && req.sys.domains) {
+        free(req.sys.domains);
+    }
+    return hexagon_err;
+}
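The DSP probes in this section all follow the same remote_handle_control(DSPRPC_GET_DSP_INFO, ...) pattern with a remote_dsp_capability request. A hedged sketch of that shared shape, reusing only types and constants that already appear in this file (query_dsp_capability is a hypothetical name, not part of the patch):

// hypothetical helper, not part of the patch: one query via the FastRPC capability API
static int query_dsp_capability(int domain_id, uint32_t attribute_id, uint32_t * out_value) {
    struct remote_dsp_capability cap = {(uint32_t)domain_id, attribute_id, 0};
    int err = remote_handle_control(DSPRPC_GET_DSP_INFO, &cap, sizeof(cap));
    if (err == AEE_SUCCESS) {
        *out_value = cap.capability; // the queried attribute's value
    }
    return err;
}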
+
+static int ggmlhexagon_get_dsp_support(int * domain) {
+    int hexagon_error = AEE_SUCCESS;
+    *domain = HEXAGON_CDSP;
+
+    if (remote_handle_control) {
+        struct remote_dsp_capability dsp_capability_domain = {HEXAGON_CDSP, DOMAIN_SUPPORT, 0};
+        hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability));
+        if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
+            GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device");
+            goto bail;
+        }
+
+        if (0 == dsp_capability_domain.capability) {
+            dsp_capability_domain.domain       = HEXAGON_ADSP;
+            dsp_capability_domain.attribute_ID = DOMAIN_SUPPORT;
+            dsp_capability_domain.capability   = 0;
+            hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability));
+            if (dsp_capability_domain.capability) {
+                *domain = HEXAGON_ADSP;
+            }
+        }
+
+        if (hexagon_error != AEE_SUCCESS) {
+            GGMLHEXAGON_LOG_DEBUG("get_dsp_support failed with error 0x%x", hexagon_error);
+            goto bail;
+        }
+    } else {
+        hexagon_error = AEE_EUNSUPPORTEDAPI;
+        GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device");
+    }
+
+bail:
+    return hexagon_error;
+}
+
+static int ggmlhexagon_get_vtcm_info(int domain, uint32_t attr, uint32_t * capability) {
+    int hexagon_error = AEE_SUCCESS;
+    *capability = 0;
+
+    if (attr != VTCM_PAGE && attr != VTCM_COUNT) {
+        hexagon_error = AEE_EBADPARM;
+        GGMLHEXAGON_LOG_DEBUG("unsupported attr, only VTCM_PAGE and VTCM_COUNT supported");
+        goto bail;
+    }
+
+    if (remote_handle_control) {
+        if (domain == HEXAGON_ADSP || domain == HEXAGON_CDSP) {
+            /*
+             * query the DSP for VTCM information
+             * since the ADSP does not have a dedicated VTCM, we expect the output to be 0
+             */
+            struct remote_dsp_capability dsp_capability_vtcm_dsp;
+            dsp_capability_vtcm_dsp.domain       = (uint32_t)domain;
+            dsp_capability_vtcm_dsp.attribute_ID = attr;
+            dsp_capability_vtcm_dsp.capability   = (uint32_t)0;
+            hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_vtcm_dsp, sizeof(struct remote_dsp_capability));
+            if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
+                GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device");
+                GGMLHEXAGON_LOG_DEBUG("running the use case without checking the capability");
+                hexagon_error = AEE_SUCCESS;
+                goto bail;
+            } else if (hexagon_error == AEE_SUCCESS) {
+                *capability = dsp_capability_vtcm_dsp.capability;
+            } else {
+                GGMLHEXAGON_LOG_DEBUG("get_vtcm_info failed with error 0x%x", hexagon_error);
+                goto bail;
+            }
+        } else {
+            hexagon_error = AEE_EUNSUPPORTED;
+            GGMLHEXAGON_LOG_DEBUG("unsupported domain %d", domain);
+            goto bail;
+        }
+    } else {
+        hexagon_error = AEE_EUNSUPPORTEDAPI;
+        GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device");
+    }
+
+bail:
+    return hexagon_error;
+}
+
+static bool ggmlhexagon_is_unsignedpd_supported(int domain_id) {
+    int hexagon_error = AEE_SUCCESS;
+    if (remote_handle_control) {
+        struct remote_dsp_capability dsp_capability_domain = {static_cast<uint32_t>(domain_id), UNSIGNED_PD_SUPPORT, 0};
+        hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability));
+        if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
+            GGMLHEXAGON_LOG_WARN("FastRPC Capability API is not supported on this device. falling back to signed PD");
+            return false;
+        }
+
+        if (hexagon_error) {
+            GGMLHEXAGON_LOG_WARN("error 0x%x: FastRPC Capability API failed. falling back to signed PD", hexagon_error);
+            return false;
+        }
+
+        if (dsp_capability_domain.capability == 1) {
+            return true;
+        }
+    } else {
+        hexagon_error = AEE_EUNSUPPORTEDAPI;
+        GGMLHEXAGON_LOG_WARN("remote_dsp_capability interface is not supported on this device. falling back to signed PD");
+        return false;
+    }
+
+    return false;
+}
+
+static bool ggmlhexagon_get_unsignedpd_support(void) {
+    return ggmlhexagon_is_unsignedpd_supported(HEXAGON_CDSP);
+}
+
+static bool ggmlhexagon_is_async_fastrpc_supported(int domain) {
+    int hexagon_error = AEE_SUCCESS;
+    if (remote_handle_control) {
+        if (domain == HEXAGON_CDSP) {
+            /*
+             * query the DSP for ASYNC_FASTRPC_SUPPORT information
+             * async FastRPC is supported only on the cDSP
+             */
+            struct remote_dsp_capability dsp_capability_async_support;
+            dsp_capability_async_support.domain       = (uint32_t)domain;
+            dsp_capability_async_support.attribute_ID = ASYNC_FASTRPC_SUPPORT;
+            dsp_capability_async_support.capability   = (uint32_t)0;
+            hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_async_support, sizeof(struct remote_dsp_capability));
+            if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
+                GGMLHEXAGON_LOG_WARN("FastRPC Capability API is not supported on this device");
+                hexagon_error = AEE_SUCCESS;
+                goto bail;
+            } else if (dsp_capability_async_support.capability == 1) {
+                return true;
+            }
+
+            if (hexagon_error != AEE_SUCCESS) {
+                GGMLHEXAGON_LOG_WARN("failed with error 0x%x", hexagon_error);
+                goto bail;
+            }
+        } else {
+            hexagon_error = AEE_EUNSUPPORTED;
+            GGMLHEXAGON_LOG_WARN("async FastRPC is not supported on domain %d", domain);
+            goto bail;
+        }
+    } else {
+        hexagon_error = AEE_EUNSUPPORTEDAPI;
+        GGMLHEXAGON_LOG_WARN("remote_dsp_capability interface is not supported on this device");
+    }
+
+bail:
+    return false;
+}
+
+static void ggmlhexagon_set_rpc_latency(remote_handle64 handle, int qos, int latency) {
+    int hexagon_error = AEE_SUCCESS;
+
+    if (remote_handle_control) {
+        struct remote_rpc_control_latency data;
+/*
+    qos          | latency
+    -----------------------
+    RPC_PM_QOS   | 100
+    RPC_POLL_QOS | 1000
+*/
+        data.enable  = qos;
+        data.latency = latency;
+        hexagon_error = remote_handle64_control(handle, DSPRPC_CONTROL_LATENCY, (void *)&data, sizeof(data));
+        if (hexagon_error != AEE_SUCCESS) {
GGMLHEXAGON_LOG_WARN("remote_session_control failed with 0x%x when setting thread priority\n", err); + } else { + GGMLHEXAGON_LOG_VERBOSE("thread priority set to %d\n", priority); + } + } else { + GGMLHEXAGON_LOG_WARN("cannot set thread priority\n"); + } + return err; +} + +static bool ggmlhexagon_is_status_notification_supported(int domain) { + int hexagon_error = AEE_SUCCESS; + + if (remote_handle_control) { + /* + * Query the DSP for STATUS_NOTIFICATION_SUPPORT information + * DSP User PD status notification Support + */ + struct remote_dsp_capability dsp_capability_status_notification_support; + dsp_capability_status_notification_support.domain = (uint32_t)domain; + dsp_capability_status_notification_support.attribute_ID = STATUS_NOTIFICATION_SUPPORT; + dsp_capability_status_notification_support.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_status_notification_support, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_WARN("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } else if (1 == dsp_capability_status_notification_support.capability) { + return true; + } + + if (hexagon_error != AEE_SUCCESS){ + GGMLHEXAGON_LOG_WARN("failed with error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_WARN("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return false; +} + +static int ggmlhexagon_get_hmx_support_info(int domain, uint32_t attr, uint32_t * capability) { + int hexagon_error = AEE_SUCCESS; + *capability = 0; + + if (attr != HMX_SUPPORT_SPATIAL && attr != HMX_SUPPORT_DEPTH) { + hexagon_error = AEE_EBADPARM; + GGMLHEXAGON_LOG_WARN("unsupported attr, only HMX_SUPPORT_SPATIAL and HMX_SUPPORT_DEPTH supported"); + goto bail; + } + + if (remote_handle_control) { + if (domain == HEXAGON_CDSP) { + /* + * Query the DSP for HMX SUPPORT information + * HMX is supported on CDSP only + */ + struct remote_dsp_capability dsp_capability_hmx_dsp; + dsp_capability_hmx_dsp.domain = (uint32_t)domain; + dsp_capability_hmx_dsp.attribute_ID = attr; + dsp_capability_hmx_dsp.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hmx_dsp, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } + else if (hexagon_error == AEE_SUCCESS) { + *capability = dsp_capability_hmx_dsp.capability; + } else { + GGMLHEXAGON_LOG_DEBUG("get_hmx_support_info failed with Error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTED; + GGMLHEXAGON_LOG_DEBUG("HMX support is not there for domain %d", domain); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return hexagon_error; +} + +static int ggmlhexagon_get_hvx_arch_ver(int domain, uint32_t * capability) { + int hexagon_error = AEE_SUCCESS; + *capability = 0; + if(remote_handle_control) { + /* + * Query the Hexagon processor architecture version information + */ + struct remote_dsp_capability dsp_capability_arch_ver; + dsp_capability_arch_ver.domain = (uint32_t)domain; + dsp_capability_arch_ver.attribute_ID = ARCH_VER; + 
dsp_capability_arch_ver.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_arch_ver, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } else if (hexagon_error == AEE_SUCCESS) { + *capability = dsp_capability_arch_ver.capability & 0xFF; + } else { + GGMLHEXAGON_LOG_DEBUG("get_hex_arch_ver failed with error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return hexagon_error; +} + +static int ggmlhexagon_get_hvx_support_info(int domain, uint32_t attr, uint32_t * capability) +{ + int hexagon_error = AEE_SUCCESS; + *capability = 0; + if (attr == HVX_SUPPORT_64B) { + hexagon_error = AEE_EBADPARM; + GGMLHEXAGON_LOG_DEBUG("latest targets have 128 byte HVX register, use HVX_SUPPORT_128B instead of HVX_SUPPORT_64B"); + goto bail; + } + + if (attr != HVX_SUPPORT_128B) { + hexagon_error = AEE_EBADPARM; + GGMLHEXAGON_LOG_DEBUG("unsupported attr. only HVX_SUPPORT_128B supported"); + goto bail; + } + + if (remote_handle_control) { + if (domain == HEXAGON_CDSP) { + /* + * Query the DSP for HVX SUPPORT information + * HVX is supported on CDSP only + */ + struct remote_dsp_capability dsp_capability_hvx_dsp; + dsp_capability_hvx_dsp.domain = (uint32_t)domain; + dsp_capability_hvx_dsp.attribute_ID = attr; + dsp_capability_hvx_dsp.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hvx_dsp, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF)==(AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } else if (hexagon_error == AEE_SUCCESS) { + *capability = dsp_capability_hvx_dsp.capability; + } else { + GGMLHEXAGON_LOG_DEBUG("failed with error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTED; + GGMLHEXAGON_LOG_DEBUG("HVX support is not available on domain %d", domain); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return hexagon_error; +} + +static int ggmlhexagon_request_status_notifications(int domain_id, void * context, notify_callback_fn call_back_fn) { + int hexagon_error = AEE_SUCCESS; + struct remote_rpc_notif_register notif; + bool status_notification_support; + + notif.context = context; + notif.domain = domain_id; + notif.notifier_fn = call_back_fn; + + status_notification_support = ggmlhexagon_is_status_notification_supported(domain_id); + if (status_notification_support) { + hexagon_error = remote_session_control(FASTRPC_REGISTER_STATUS_NOTIFICATIONS, (void*)¬if, sizeof(notif)); + if (hexagon_error != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("error 0x%x: remote_session_control failed to enable status notifications", hexagon_error); + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + } + + return hexagon_error; +} + +static int ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { + size_t candidate_size = 0; + uint8_t * rpc_buffer = nullptr; +#ifdef SD_USE_HEXAGON // for stable-diffusion.cpp + size_t probe_slots[] = {1024, 1536, 2000, 2048, 1024 + 2048, 4096}; +#else + size_t 
probe_slots[] = {1024, 1536, 2000, 2048};
+#endif
+    size_t probe_counts = sizeof(probe_slots) / sizeof(size_t);
+
+    if (nullptr == ctx)
+        return 1;
+
+    for (size_t idx = 0; idx < probe_counts; idx++) {
+#ifdef SD_USE_HEXAGON // for stable-diffusion.cpp
+        rpc_buffer = static_cast<uint8_t *>(rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (probe_slots[idx] * SIZE_IN_MB)));
+#else
+        rpc_buffer = static_cast<uint8_t *>(rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (probe_slots[idx] * SIZE_IN_MB)));
+#endif
+        if (nullptr == rpc_buffer) {
+            GGMLHEXAGON_LOG_DEBUG("alloc rpcmem %d (MiB) failed during rpc memory probe, reason: %s\n", probe_slots[idx], strerror(errno));
+            break;
+        } else {
+            candidate_size = probe_slots[idx];
+            rpcmem_free(rpc_buffer);
+            rpc_buffer = nullptr;
+        }
+    }
+    ctx->rpc_mempool_capacity = candidate_size * SIZE_IN_MB;
+    GGMLHEXAGON_LOG_DEBUG("rpc memory capacity %ld(%d MiB) for device %d",
+                          ctx->rpc_mempool_capacity, ctx->rpc_mempool_capacity / SIZE_IN_MB, ctx->device);
+    if (ggmlhexagon_is_llamabench_running()) {
+        GGMLHEXAGON_LOG_VERBOSE("capacity of rpc memory %d MiB", ctx->rpc_mempool_capacity / SIZE_IN_MB);
+    } else {
+        GGMLHEXAGON_LOG_INFO("capacity of rpc memory %d MiB", ctx->rpc_mempool_capacity / SIZE_IN_MB);
+    }
+
+    if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
+        GGML_ASSERT(ctx->rpc_mempool_capacity > (8 * SIZE_IN_MB));
+        ctx->rpc_mempool_len = ctx->rpc_mempool_capacity - (8 * SIZE_IN_MB);
+#ifdef SD_USE_HEXAGON // use rpcmem_alloc2 to alloc 2+ GiB memory, it's a workaround to make stablediffusion.cpp happy
+        ctx->rpc_mempool = rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_TRY_MAP_STATIC, ctx->rpc_mempool_len);
+#else
+        //FIXME: it seems there is an unknown issue with a 2+ GiB memory pool
+        ctx->rpc_mempool = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_TRY_MAP_STATIC, ctx->rpc_mempool_len);
+#endif
+        if (nullptr == ctx->rpc_mempool) {
+            GGMLHEXAGON_LOG_WARN("alloc rpc memorypool %ld(%d MiB) failed", ctx->rpc_mempool_len, ctx->rpc_mempool_capacity / SIZE_IN_MB);
+            return 2;
+        } else {
+            GGMLHEXAGON_LOG_DEBUG("alloc rpc memorypool %p successfully %ld(%d MiB)",
+                                  ctx->rpc_mempool, ctx->rpc_mempool_len,
+                                  ctx->rpc_mempool_len / SIZE_IN_MB);
+        }
+        ctx->rpc_mempool_handle = rpcmem_to_fd(ctx->rpc_mempool);
+        GGMLHEXAGON_LOG_DEBUG("rpc mempool handle %d", ctx->rpc_mempool_handle);
+        remote_register_buf(ctx->rpc_mempool, ctx->rpc_mempool_len, ctx->rpc_mempool_handle);
+    }
+
+    return 0;
+}
+
+static void ggmlhexagon_deinit_rpcmempool(ggml_backend_hexagon_context * ctx) {
+    if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
+        if (ctx->rpc_mempool) {
+            //deregister rpc memory pool
+            remote_register_buf(ctx->rpc_mempool, ctx->rpc_mempool_len, -1);
+            GGMLHEXAGON_LOG_DEBUG("free rpc mempool %p", ctx->rpc_mempool);
+            rpcmem_free(ctx->rpc_mempool);
+            ctx->rpc_mempool          = nullptr;
+            ctx->rpc_mempool_len      = 0;
+            ctx->rpc_mempool_capacity = 0;
+        }
+    }
+}
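ggmlhexagon_init_rpcmempool above sizes the RPC memory pool by probing: it walks an ascending list of candidate sizes and keeps the largest allocation that succeeds. A standalone sketch of that strategy, with malloc standing in for rpcmem_alloc (illustration only, not part of the patch):

#include <cstdio>
#include <cstdlib>

// try each candidate size in ascending order and keep the largest one that succeeds
static size_t probe_max_alloc_mb(const size_t * slots_mb, size_t count) {
    size_t best = 0;
    for (size_t i = 0; i < count; i++) {
        void * buf = malloc(slots_mb[i] * 1024 * 1024);
        if (buf == NULL) {
            break;          // first failure ends the probe, keep the last success
        }
        best = slots_mb[i]; // remember the size, then release the probe buffer
        free(buf);
    }
    return best;
}

int main() {
    size_t slots[] = {1024, 1536, 2000, 2048};
    printf("largest probed size: %zu MiB\n", probe_max_alloc_mb(slots, 4));
    return 0;
}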
+
+static void ggmlhexagon_probe_dspinfo(ggml_backend_hexagon_context * ctx) {
+    uint32_t dsp_version = 0;
+    ggmlhexagon_get_hvx_arch_ver(ctx->domain_id, &dsp_version);
+
+    if (dsp_version == 0x68 || dsp_version == 0x69 || dsp_version == 0x73 || dsp_version == 0x75 || dsp_version == 0x79) {
+        if (ggmlhexagon_is_llamabench_running()) {
+            GGMLHEXAGON_LOG_VERBOSE("dsp arch version 0x%x", dsp_version);
+        } else {
+            GGMLHEXAGON_LOG_INFO("dsp arch version 0x%x", dsp_version);
+        }
+        //0x68 -> 68, 0x69 -> 69, 0x73 -> 73, 0x75 -> 75, 0x79 -> 79
+        size_t htp_arch = ggmlhexagon_htparch_hex_to_decimal(dsp_version);
+        GGMLHEXAGON_LOG_DEBUG("dsp arch version %d", htp_arch);
+        struct qcom_socinfo * socinfo = ggmlhexagon_get_socinfo_from_socmodel(htp_arch);
+        if (nullptr != socinfo) {
+            //got the full description of the SoC when the hwaccel approach is HWACCEL_CDSP
+            if (ggmlhexagon_is_llamabench_running()) {
+                GGMLHEXAGON_LOG_VERBOSE("device info: %s, %s", socinfo->soc_desc, ggmlhexagon_get_htparch_desc(htp_arch));
+            } else {
+                GGMLHEXAGON_LOG_INFO("device info: %s, %s", socinfo->soc_desc, ggmlhexagon_get_htparch_desc(htp_arch));
+            }
+        }
+    } else {
+        GGMLHEXAGON_LOG_WARN("error: dsp arch version 0x%x is not supported", dsp_version);
+    }
+
+    uint32_t vtcm_count = 0;
+    uint32_t vtcm_page  = 0;
+    ggmlhexagon_get_vtcm_info(ctx->domain_id, VTCM_COUNT, &vtcm_count);
+    ggmlhexagon_get_vtcm_info(ctx->domain_id, VTCM_PAGE,  &vtcm_page);
+
+    uint32_t hmx_depth   = 0;
+    uint32_t hmx_spatial = 0;
+    ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_DEPTH,   &hmx_depth);
+    ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_SPATIAL, &hmx_spatial);
+
+    uint32_t hvx_support_128b = 0;
+    ggmlhexagon_get_hvx_support_info(ctx->domain_id, HVX_SUPPORT_128B, &hvx_support_128b);
+
+    if (ggmlhexagon_is_llamabench_running()) {
+        //make llama-bench happy
+        GGMLHEXAGON_LOG_VERBOSE("vtcm_count %d", vtcm_count);
+        GGMLHEXAGON_LOG_VERBOSE("vtcm_page %d", vtcm_page);
+        GGMLHEXAGON_LOG_VERBOSE("hmx_depth %d", hmx_depth);
+        GGMLHEXAGON_LOG_VERBOSE("hmx_spatial %d", hmx_spatial);
+        GGMLHEXAGON_LOG_VERBOSE("hvx_support_128b %d", hvx_support_128b);
+        GGMLHEXAGON_LOG_VERBOSE("unsigned pd supported %d", ggmlhexagon_get_unsignedpd_support());
+        GGMLHEXAGON_LOG_VERBOSE("async fastrpc supported %d", ggmlhexagon_is_async_fastrpc_supported(ctx->domain_id));
+    } else {
+        GGMLHEXAGON_LOG_INFO("vtcm_count %d", vtcm_count);
+        GGMLHEXAGON_LOG_INFO("vtcm_page %d", vtcm_page);
+        GGMLHEXAGON_LOG_INFO("hmx_depth %d", hmx_depth);
+        GGMLHEXAGON_LOG_INFO("hmx_spatial %d", hmx_spatial);
+        GGMLHEXAGON_LOG_INFO("hvx_support_128b %d", hvx_support_128b);
+        GGMLHEXAGON_LOG_INFO("unsigned pd supported %d", ggmlhexagon_get_unsignedpd_support());
+        GGMLHEXAGON_LOG_INFO("async fastrpc supported %d", ggmlhexagon_is_async_fastrpc_supported(ctx->domain_id));
+    }
+}
+
+static void ggmlhexagon_deinit_cdsp(ggml_backend_hexagon_context * ctx) {
+    int hexagon_error = AEE_SUCCESS;
+    if (ggmlhexagon_is_llamabench_running()) {
+        GGMLHEXAGON_LOG_VERBOSE("enter %s", __func__);
+    } else {
+        GGMLHEXAGON_LOG_INFO("enter %s", __func__);
+    }
+    if (0 != ctx->ggmlop_handle) {
+        hexagon_error = ggmlop_dsp_close(ctx->ggmlop_handle);
+        if (AEE_SUCCESS != hexagon_error) {
+            GGMLHEXAGON_LOG_WARN("error 0x%x: failed to close ggmlop dsp handle", hexagon_error);
+        }
+        ctx->ggmlop_handle = 0;
+    }
+
+    ggmlhexagon_deinit_rpcmempool(ctx);
+
+    ctx->domain_id = -1;
+    if (ggmlhexagon_is_llamabench_running()) {
+        GGMLHEXAGON_LOG_VERBOSE("leave %s", __func__);
+    } else {
+        GGMLHEXAGON_LOG_INFO("leave %s", __func__);
+    }
+}
+
+static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) {
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+
+    int hexagon_error = AEE_SUCCESS;
+
+    int domain_id            = HEXAGON_CDSP;
+    const char * domain_type = "NSP";
+
+    int unsignedpd_flag           = 1;
+    bool is_unsignedpd_enabled    = false;
+    int use_logical_id            = 0;
+    int core_id                   = -1;
+    fastrpc_domain * domains_info = NULL;
+    int num_domains               = -1;
+
+    domain * my_domain = NULL;
+    char * 
uri = NULL; + + char * ggmlop_domain_uri = NULL; + int ggmlop_domain_uri_len = 0; + + if (nullptr == ctx) + return 1; + GGMLHEXAGON_LOG_DEBUG("init Hexagon cDSP with backend %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); + if (0 != ctx->ggmlop_handle) { + GGMLHEXAGON_LOG_DEBUG("already init Hexagon cDSP with backend %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); + return 0; + } + ctx->ggmlop_handle = 0; + + if (-1 == domain_id) { + if (nullptr != domain_type) { + if ((strcmp(domain_type, "NSP") != 0 && strcmp(domain_type, "HPASS") != 0)) { + GGMLHEXAGON_LOG_WARN("invalid domain_type %s. possible values are NSP or HPASS", domain_type); + goto bail; + } else { + hexagon_error = ggmlhexagon_get_domains_info(domain_type, &num_domains, &domains_info); + if (hexagon_error == AEE_EUNSUPPORTED) { + GGMLHEXAGON_LOG_DEBUG("API is not supported on this target so cannot get domains info from the device. falling back to legacy approach of using default domain id"); + hexagon_error = ggmlhexagon_get_dsp_support(&domain_id); + if (hexagon_error != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("error: 0x%x, defaulting to cDSP domain", hexagon_error); + } + } else if (hexagon_error != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("error in getting domains information"); + goto bail; + } else { + if (core_id != -1) { + if (core_id < 0 || core_id >= num_domains) { + GGMLHEXAGON_LOG_DEBUG("invalid core_id = %d for %s. core_id should be between 0 to %d", core_id, domain_type, num_domains - 1); + hexagon_error = AEE_EBADPARM; + goto bail; + } + } else { + core_id = 0; + } + use_logical_id = 1; + domain_id = domains_info[core_id].id; + } + } + } else { + GGMLHEXAGON_LOG_DEBUG("DSP domain is not provided, retrieving DSP information using Remote APIs"); + hexagon_error = ggmlhexagon_get_dsp_support(&domain_id); + if (hexagon_error != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("error: 0x%x, defaulting to cDSP domain", hexagon_error); + } + } + } + + if (0 == use_logical_id) { + if (!ggmlhexagon_is_valid_domain_id(domain_id, 0)) { + hexagon_error = AEE_EBADPARM; + GGMLHEXAGON_LOG_DEBUG("error 0x%x: invalid domain %d", hexagon_error, domain_id); + goto bail; + } + + my_domain = ggmlhexagon_get_domain(domain_id); + if (nullptr == my_domain) { + GGMLHEXAGON_LOG_DEBUG("unable to get domain struct %d", domain_id); + goto bail; + } + uri = my_domain->uri; + } + GGMLHEXAGON_LOG_DEBUG("temporary domain uri=%s\n", uri); + + if (1 == unsignedpd_flag) { + is_unsignedpd_enabled = ggmlhexagon_is_unsignedpd_supported(domain_id); + if (!is_unsignedpd_enabled) { + GGMLHEXAGON_LOG_DEBUG("overriding user request for unsigned PD, only signed offload is allowed on domain %d", domain_id); + unsignedpd_flag = 0; + } + } + + ctx->domain_id = domain_id; + if (ggmlhexagon_is_llamabench_running()) { + GGMLHEXAGON_LOG_VERBOSE("using Hexagon domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + GGMLHEXAGON_LOG_VERBOSE("unsignedpd_enabled %d", is_unsignedpd_enabled); + } else { + GGMLHEXAGON_LOG_INFO("using Hexagon domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + GGMLHEXAGON_LOG_INFO("unsignedpd_enabled %d", is_unsignedpd_enabled); + } + if (is_unsignedpd_enabled) { + if (remote_session_control) { + struct remote_rpc_control_unsigned_module data; + data.enable = 1; + data.domain = domain_id; + hexagon_error = remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, (void *)&data, sizeof(data)); + GGMLHEXAGON_LOG_DEBUG("remote_session_control returned %d for configuring unsigned PD 
success", hexagon_error); + if (AEE_SUCCESS != hexagon_error) { + GGMLHEXAGON_LOG_WARN("error 0x%x: remote_session_control failed", hexagon_error); + } + } else { + GGMLHEXAGON_LOG_DEBUG("unsigned PD not supported on this device"); + hexagon_error = AEE_EUNSUPPORTED; + GGMLHEXAGON_LOG_DEBUG("error 0x%x: remote_session_control interface is not supported on this device", hexagon_error); + } + } + + hexagon_error = ggmlhexagon_request_status_notifications(domain_id, (void *)STATUS_CONTEXT, ggmlhexagon_pd_status_notifier_callback); + if (AEE_SUCCESS != hexagon_error) { + if (AEE_EUNSUPPORTEDAPI != hexagon_error) { + GGMLHEXAGON_LOG_WARN("error 0x%x: hexagon_request_status_notifications failed", hexagon_error); + } + GGMLHEXAGON_LOG_WARN("error 0x%x: failed to compute on domain %d", hexagon_error, domain_id); + goto bail; + } + ggmlhexagon_set_priority(domain_id, 160); + + ggmlop_domain_uri_len = strlen(ggmlop_URI) + MAX_DOMAIN_NAMELEN; + ggmlop_domain_uri = (char *)malloc(ggmlop_domain_uri_len); + if (NULL == ggmlop_domain_uri) { + goto bail; + } + snprintf(ggmlop_domain_uri, ggmlop_domain_uri_len, "%s%s", ggmlop_URI, uri); + GGMLHEXAGON_LOG_DEBUG("ggmlop domain uri:%s", ggmlop_domain_uri); + hexagon_error = ggmlop_dsp_open(ggmlop_domain_uri, &ctx->ggmlop_handle); + if (AEE_SUCCESS == hexagon_error) { + if (ggmlhexagon_is_llamabench_running()) { + GGMLHEXAGON_LOG_VERBOSE("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + GGMLHEXAGON_LOG_VERBOSE("only support offload GGML_OP_ADD and GGML_OP_MUL_MAT to cDSP currently"); + } else { + GGMLHEXAGON_LOG_INFO("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + GGMLHEXAGON_LOG_INFO("only support offload GGML_OP_ADD and GGML_OP_MUL_MAT to cDSP currently"); + } + ggmlhexagon_probe_dspinfo(ctx); + //FIXME: re-use this function to pass thread_counts info to code on cDSP side before fully understand qidl mechanism + //ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_VCORNER_TURBO_PLUS, 40, 1, g_hexagon_appcfg.thread_counts); + //backward compatible with previous codes on cDSP side + ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_VCORNER_TURBO_PLUS, 40, g_hexagon_appcfg.mulmat_algotype, g_hexagon_appcfg.thread_counts); + ggmlhexagon_set_rpc_latency(ctx->ggmlop_handle, RPC_POLL_QOS, 100); + int result = ggmlhexagon_init_rpcmempool(ctx); + if (0 != result) { + GGMLHEXAGON_LOG_INFO("failed to init rpc mempool"); + goto bail; + } + } else { + GGMLHEXAGON_LOG_INFO("error 0x%x: failed to open domain %d(%s)", hexagon_error, domain_id, + ggmlhexagon_get_dsp_name(domain_id)); + goto bail; + } + + //make sure test-backend-ops get the correct backend name when hwaccel approach is 2(HWACCEL_CDSP) + memcpy(g_hexagon_mgr[ctx->device].name, "Hexagon-cDSP", strlen("Hexagon-cDSP")); + + if (NULL != ggmlop_domain_uri) { + free(ggmlop_domain_uri); + ggmlop_domain_uri = NULL; + } + return 0; + +bail: + if (ggmlop_domain_uri) { + free(ggmlop_domain_uri); + } + + ggmlhexagon_deinit_cdsp(ctx); + + return -1; +} + +static void ggmlhexagon_compute(ggml_backend_hexagon_context * ctx, struct ggml_tensor * op) { + //skip sanity check because already checked in other place + struct dsptensor dsptensor_0; + struct dsptensor dsptensor_1; + struct dsptensor dsptensor_2; + std::string op_name; + const char * ggml_opname = ggml_op_name(op->op); + ggmlhexagon_get_opkey_from_op(op, op_name); + + int hexagon_error = AEE_SUCCESS; + ggmlhexagon_op_func_t op_func = nullptr; + size_t input_tensor_count = 2; + + ggml_tensor * src0 
+
+static void ggmlhexagon_compute(ggml_backend_hexagon_context * ctx, struct ggml_tensor * op) {
+    //skip sanity check because it was already checked elsewhere
+    struct dsptensor dsptensor_0;
+    struct dsptensor dsptensor_1;
+    struct dsptensor dsptensor_2;
+    std::string op_name;
+    const char * ggml_opname = ggml_op_name(op->op);
+    ggmlhexagon_get_opkey_from_op(op, op_name);
+
+    int hexagon_error = AEE_SUCCESS;
+    ggmlhexagon_op_func_t op_func = nullptr;
+    size_t input_tensor_count = 2;
+
+    ggml_tensor * src0 = op->src[0];
+    ggml_tensor * src1 = op->src[1];
+    ggml_tensor * dst  = op;
+
+    int input_size = ggml_nbytes(src0);
+    if (nullptr != src1)
+        input_size += ggml_nbytes(src1);
+    hexagon_perf op_perf(op_name, ggml_opname, input_size, ggml_nbytes(dst));
+    op_perf.start();
+
+    input_tensor_count = ggmlhexagon_k_op_caps[ggmlhexagon_get_op_index(op)].input_param_count;
+    op_func = ggmlhexagon_k_op_caps[ggmlhexagon_get_op_index(op)].dsp_op_func;
+    if (nullptr == op_func) {
+        GGMLHEXAGON_LOG_DEBUG("op GGML_OP_%s and dsp func %s not supported on cDSP", ggml_op_name(op->op), ggmlhexagon_k_op_caps[ggmlhexagon_get_op_index(op)].hexagon_op_name);
+        return;
+    }
+
+    //FIXME: try to fully understand the technical details of qidl:
+    //       qidl is a binary tool that generates the complicated and hard-to-customize bridge-layer code
+    //       between the ARM AP and the cDSP. the mechanism in qidl/FastRPC is very similar to the mechanism in a TEE.
+    //       try to find a better/more efficient approach to exchange the necessary data between the ARM AP side
+    //       and the cDSP side. manually modifying the important data structure ggml_tensor in ggml.h makes no sense
+    //       and is not acceptable.
+    std::chrono::high_resolution_clock::time_point start_time = std::chrono::high_resolution_clock::now();
+    dsptensor_0.data     = src0->data;
+    dsptensor_0.data_len = ggml_nbytes(src0);
+    dsptensor_0.type     = src0->type;
+
+    dsptensor_0.ne[0] = src0->ne[0];
+    dsptensor_0.ne[1] = src0->ne[1];
+    dsptensor_0.ne[2] = src0->ne[2];
+    dsptensor_0.ne[3] = src0->ne[3];
+
+    dsptensor_0.nb[0] = src0->nb[0];
+    dsptensor_0.nb[1] = src0->nb[1];
+    dsptensor_0.nb[2] = src0->nb[2];
+    dsptensor_0.nb[3] = src0->nb[3];
+
+    if (2 == input_tensor_count) {
+        GGML_ASSERT(nullptr != src1);
+        dsptensor_1.data     = src1->data;
+        dsptensor_1.type     = src1->type;
+        dsptensor_1.data_len = ggml_nbytes(src1);
+
+        dsptensor_1.ne[0] = src1->ne[0];
+        dsptensor_1.ne[1] = src1->ne[1];
+        dsptensor_1.ne[2] = src1->ne[2];
+        dsptensor_1.ne[3] = src1->ne[3];
+
+        dsptensor_1.nb[0] = src1->nb[0];
+        dsptensor_1.nb[1] = src1->nb[1];
+        dsptensor_1.nb[2] = src1->nb[2];
+        dsptensor_1.nb[3] = src1->nb[3];
+    }
+
+    dsptensor_2.data     = dst->data;
+    dsptensor_2.data_len = ggml_nbytes(dst);
+    dsptensor_2.type     = dst->type;
+
+    dsptensor_2.ne[0] = dst->ne[0];
+    dsptensor_2.ne[1] = dst->ne[1];
+    dsptensor_2.ne[2] = dst->ne[2];
+    dsptensor_2.ne[3] = dst->ne[3];
+
+    dsptensor_2.nb[0] = dst->nb[0];
+    dsptensor_2.nb[1] = dst->nb[1];
+    dsptensor_2.nb[2] = dst->nb[2];
+    dsptensor_2.nb[3] = dst->nb[3];
+
+    memcpy(dsptensor_2.op_params, dst->op_params, GGML_MAX_OP_PARAMS / sizeof(int32_t));
+    std::chrono::high_resolution_clock::time_point end_time = std::chrono::high_resolution_clock::now();
+    std::chrono::nanoseconds duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time);
+    GGMLHEXAGON_LOG_DEBUG("pack duration %llu ns", (unsigned long long)duration.count());
+
+    hexagon_error = op_func(ctx->ggmlop_handle, &dsptensor_0, &dsptensor_1, &dsptensor_2);
+    if (AEE_SUCCESS != hexagon_error) {
+        GGMLHEXAGON_LOG_WARN("ggmlop %s computation failed on cdsp", ggml_op_name(op->op));
+    }
+
+    op_perf.info();
+    return;
+}
+
+// =================================================================================================
+//  section-8: implementation of ggml-hexagon backend according to specification in ggml backend subsystem
+// =================================================================================================
+static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const struct ggml_tensor * op_tensor) {
+    ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context 
*)dev->context; + GGML_UNUSED(ctx); + if (op_tensor->op == GGML_OP_NONE) { + return true; + } + + if (!ggmlhexagon_k_op_caps[ggmlhexagon_get_op_index(op_tensor)].supported) { + return false; + } + + const ggml_tensor * src0 = op_tensor->src[0]; + const ggml_tensor * src1 = op_tensor->src[1]; + const int src0_rank = ggml_n_dims(src0); + const int64_t ne00 = src0->ne[0]; + int src1_rank = 0; + if (nullptr != src1) { + src1_rank = ggml_n_dims(src1); + } + switch (op_tensor->op) { + case GGML_OP_ADD: + { + //TODO:workaround approach to fix HWACCEL_CDSP can't works in ASR inference and LLM inference + // with some LLM models in a standard Android APP + if (ne00 < 1024) { + return false; + } + + if (!ggml_are_same_shape(src0, src1)) { + return false; + } + return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + } + case GGML_OP_MUL_MAT: + { + ggmlhexagon_dump_op_info(op_tensor); + if (src0_rank != src1_rank) + return false; + if (src0_rank != 2) + return false; + + if (1 == g_hexagon_appcfg.enable_q_mulmat) { + if (1 == g_hexagon_appcfg.enable_all_q_mulmat) { + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && (src1->type == GGML_TYPE_F32); + } + + return (src0->type == GGML_TYPE_F32 + || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0 + || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K + ) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + } else { + return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && + (op_tensor->type == GGML_TYPE_F32); + } + } + case GGML_OP_SOFT_MAX:{ + if (!ggml_is_contiguous(op_tensor)) + return false; + if (!ggml_are_same_shape(src0, op_tensor)) + return false; + } + case GGML_OP_RMS_NORM: + case GGML_OP_POOL_2D: + { + + ggmlhexagon_dump_op_info(op_tensor); + } + default: + break; + } + return false; +} + +static bool ggmlhexagon_can_handle_op_through_qnn(ggml_backend_dev_t dev, const struct ggml_tensor * op_tensor) { + ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)dev->context; + if (op_tensor->op == GGML_OP_NONE) { + return true; + } + + if (!ggmlqnn_k_op_caps[ggmlhexagon_get_op_index(op_tensor)].supported) { + return false; + } + + struct ggml_tensor * src0 = op_tensor->src[0]; + struct ggml_tensor * src1 = op_tensor->src[1]; + const int64_t ne00 = src0->ne[0]; + const int src0_rank = ggml_n_dims(src0); + int src1_rank = 0; + if (nullptr != src1) { + src1_rank = ggml_n_dims(src1); + } + + switch (op_tensor->op) { + case GGML_OP_ADD: + case GGML_OP_SUB: + { + if (!ggml_are_same_shape(src0, src1)) { + return false; + } + + if (ne00 < 32) + return false; + + return ggmlhexagon_same_types(ctx, op_tensor); + } + + case GGML_OP_DIV: + case GGML_OP_MUL: { + if (ctx->device == HEXAGON_BACKEND_QNNNPU) + return false; + + if (!ggml_are_same_shape(src0, src1)) { + return false; + } + + if ((src0_rank != 2) || (src1_rank != 2)) //TODO: 3D and 4D matrix mul + return false; + + return ggmlhexagon_same_types(ctx, op_tensor); + } + case GGML_OP_MUL_MAT: + { + ggmlhexagon_dump_op_info(op_tensor); + if (src0_rank != src1_rank) // make QNN SDK happy + return false; + + if (src0_rank != 2) { + // FIXME: there are some limitations for mulmat in QNN SDK: rank >= 2. 
+ return false; + } + + if (ctx->device == HEXAGON_BACKEND_QNNNPU) { + if (1 == g_hexagon_appcfg.enable_q_mulmat) { + return (src0->type == GGML_TYPE_F32 + || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0 + || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K + ) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + } else { + return (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && op_tensor->type == GGML_TYPE_F32); + } + } else { + if (1 == g_hexagon_appcfg.enable_q_mulmat) { + return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) + && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + } else { + return (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && op_tensor->type == GGML_TYPE_F32); + } + } + } + case GGML_OP_LOG: + { + if (ctx->device == HEXAGON_BACKEND_QNNNPU) + return false; + } + case GGML_OP_SQRT: + default: + return ggmlhexagon_same_types(ctx, op_tensor); + } +} + +static bool ggmlhexagon_compute_forward(ggml_backend_t backend, struct ggml_tensor * dst) { + ggmlqnn_op_func_t func = nullptr; + ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)backend->context; + + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + ggmlhexagon_compute(ctx, dst); + return true; + } + + switch (dst->op) { + case GGML_OP_REPEAT: + ggmlqnn_compute_repeat(ctx, dst); + break; + case GGML_OP_GET_ROWS: + ggmlqnn_compute_get_rows(ctx, dst); + break; + case GGML_OP_DUP: + ggmlqnn_compute_dup(ctx, dst); + break; + case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_SQRT: + case GGML_OP_LOG: + func = ggmlqnn_compute_elementwise; + break; + case GGML_OP_ACC: + ggmlqnn_compute_acc(ctx, dst); + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(dst)) { + case GGML_UNARY_OP_GELU: + break; + case GGML_UNARY_OP_SILU: + break; + case GGML_UNARY_OP_GELU_QUICK: + break; + case GGML_UNARY_OP_TANH: + break; + case GGML_UNARY_OP_RELU: + break; + case GGML_UNARY_OP_HARDSIGMOID: + break; + case GGML_UNARY_OP_HARDSWISH: + break; + default: + return false; + } + break; + case GGML_OP_NORM: + ggmlqnn_compute_norm(ctx, dst); + break; + case GGML_OP_GROUP_NORM: + ggmlqnn_compute_group_norm(ctx, dst); + break; + case GGML_OP_CONCAT: + ggmlqnn_compute_concat(ctx, dst); + break; + case GGML_OP_UPSCALE: + ggmlqnn_compute_upsample_nearest2d(ctx, dst); + break; + case GGML_OP_PAD: + ggmlqnn_compute_pad(ctx, dst); + break; + case GGML_OP_ARANGE: + ggmlqnn_compute_arange(ctx, dst); + break; + case GGML_OP_TIMESTEP_EMBEDDING: + ggmlqnn_compute_timestep_embedding(ctx, dst); + break; + case GGML_OP_LEAKY_RELU: + ggmlqnn_compute_leaky_relu(ctx, dst); + break; + case GGML_OP_RMS_NORM: + ggmlqnn_compute_rms_norm(ctx, dst); + break; + case GGML_OP_MUL_MAT: + ggmlqnn_compute_mul_mat(ctx, dst); + break; + case GGML_OP_MUL_MAT_ID: + return false; + case GGML_OP_SCALE: + ggmlqnn_compute_scale(ctx, dst); + break; + case GGML_OP_SQR: + ggmlqnn_compute_sqr(ctx, dst); + break; + case GGML_OP_CLAMP: + ggmlqnn_compute_clamp(ctx, dst); + break; + case GGML_OP_CPY: + ggmlqnn_compute_cpy(ctx, dst); + break; + case GGML_OP_CONT: + ggmlqnn_compute_dup(ctx, dst); + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + break; + case GGML_OP_SOFT_MAX: + ggmlqnn_compute_softmax(ctx, dst); + break; + case GGML_OP_ROPE: + ggmlqnn_compute_rope(ctx, dst); + break; + case GGML_OP_IM2COL: + ggmlqnn_compute_im2col(ctx, 
dst); + break; + case GGML_OP_POOL_2D: + ggmlqnn_compute_pool2d(ctx, dst); + break; + case GGML_OP_SUM_ROWS: + ggmlqnn_compute_sum_rows(ctx, dst); + break; + case GGML_OP_ARGSORT: + ggmlqnn_compute_argsort(ctx, dst); + break; + default: + return false; + } + + if (nullptr != func) + func(ctx, dst); + + return true; +} + +struct ggml_backend_hexagon_buffer_context { + ~ggml_backend_hexagon_buffer_context() { + if (buffer) { + if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + //do nothing here because rpc mempool was used for HWACCEL_CDSP + } else { + ggml_aligned_free(buffer, 0); + } + } + } + + void * buffer = nullptr; + size_t buffer_size = 0; + + struct ggml_backend_hexagon_context * backend_ctx = nullptr; +}; + +static void ggml_backend_hexagon_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_hexagon_buffer_context * ctx = (ggml_backend_hexagon_buffer_context *)buffer->context; + delete ctx; +} + +static void * ggml_backend_hexagon_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_hexagon_buffer_context * ctx = (ggml_backend_hexagon_buffer_context *)buffer->context; + return ctx->buffer; +} + +static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + ggml_backend_hexagon_buffer_context * ctx = (ggml_backend_hexagon_buffer_context *)buffer->context; + GGML_UNUSED(tensor); + GGML_UNUSED(ctx); + return GGML_STATUS_SUCCESS; +} + +static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * tensor, const void * data, + size_t offset, size_t size) { + GGML_UNUSED(buffer); + + memcpy((char *)tensor->data + offset, data, size); +} + +static void ggml_backend_hexagon_buffer_memset_tensor(ggml_backend_buffer_t buffer, + struct ggml_tensor * tensor, + uint8_t value, size_t offset, size_t size) { + GGML_UNUSED(buffer); + memset((char *)tensor->data + offset, value, size); +} + +static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor * tensor, + void * data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + memcpy(data, (const char *)tensor->data + offset, size); +} + +static bool ggml_backend_hexagon_buffer_cpy_tensor(ggml_backend_buffer_t buffer, + const struct ggml_tensor * src, + struct ggml_tensor * dst) { + GGML_UNUSED(buffer); + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + + return false; +} + +static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_hexagon_buffer_context * ctx = (ggml_backend_hexagon_buffer_context *)buffer->context; + memset(ctx->buffer, value, ctx->buffer_size); +} + +static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = { + /* .free_buffer = */ ggml_backend_hexagon_buffer_free_buffer, + /* .get_base = */ ggml_backend_hexagon_buffer_get_base, + /* .init_tensor = */ ggml_backend_hexagon_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_hexagon_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_hexagon_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_hexagon_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_hexagon_buffer_cpy_tensor, + /* .clear = */ ggml_backend_hexagon_buffer_clear, + /* .reset = */ nullptr, +}; + +static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + if ((g_hexagon_appcfg.hwaccel_approach == 
HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
+        return "hexagon-ion-buffer";
+    }
+
+    return "hexagon-normal-buffer";
+}
+
+static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer(
+        ggml_backend_buffer_type_t buft, size_t size) {
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);
+    struct ggml_backend_hexagon_context * ctx = static_cast<ggml_backend_hexagon_context *>(buft->context);
+    GGML_ASSERT(nullptr != ctx);
+    ggml_backend_hexagon_buffer_context * buffer_ctx = new ggml_backend_hexagon_buffer_context;
+
+    size_t size_page = 0;
+#if defined(__ANDROID__) || defined(__linux__)
+    size_page = sysconf(_SC_PAGESIZE);
+#else
+    SYSTEM_INFO systeminfo;
+    GetSystemInfo(&systeminfo);
+    size_page = systeminfo.dwPageSize;
+#endif
+    size_t size_aligned = size;
+    if (0 != (size_aligned % size_page)) {
+        size_aligned += (size_page - (size_aligned % size_page));
+    }
+
+    if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
+        GGMLHEXAGON_LOG_DEBUG("device %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device));
+        GGML_ASSERT(nullptr != ctx->rpc_mempool);
+        GGMLHEXAGON_LOG_DEBUG("size %ld(%d MiB), rpc_mempool_usage %ld(%d MiB), rpc_mempool_len %ld(%d MiB)",
+                              size, size / SIZE_IN_MB, ctx->rpc_mempool_usage, ctx->rpc_mempool_usage / SIZE_IN_MB,
+                              ctx->rpc_mempool_len, ctx->rpc_mempool_len / SIZE_IN_MB);
+        GGML_ASSERT(size + ctx->rpc_mempool_usage <= ctx->rpc_mempool_len);
+        buffer_ctx->buffer = (static_cast<char *>(ctx->rpc_mempool)) + ctx->rpc_mempool_usage;
+        GGMLHEXAGON_LOG_DEBUG("buffer_ctx->buffer %p", buffer_ctx->buffer);
+        GGML_ASSERT(nullptr != buffer_ctx->buffer);
+        ctx->rpc_mempool_usage += size_aligned;
+    } else {
+        buffer_ctx->buffer = ggml_aligned_malloc(size_aligned);
+    }
+    buffer_ctx->buffer_size = size_aligned;
+    if (nullptr == buffer_ctx->buffer) {
+        GGMLHEXAGON_LOG_WARN("%s: failed to allocate %d MiB\n", __func__, size / SIZE_IN_MB);
+        return nullptr;
+    } else {
+        //GGMLHEXAGON_LOG_DEBUG("%s: succeeded to allocate %d MiB\n", __func__, size / SIZE_IN_MB);
+    }
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__);
+    return ggml_backend_buffer_init(buft, ggml_backend_hexagon_buffer_interface, buffer_ctx, size);
+}
+
+/**
+ * @param buft   pointer to the buffer type context
+ * @return       alignment requirement in bytes
+ */
+static size_t ggml_backend_hexagon_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(buft);
+    if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
+        return 128;
+    } else {
+        return 32;
+    }
+}
+
+static size_t ggml_backend_hexagon_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
+    struct ggml_backend_hexagon_context * ctx = static_cast<ggml_backend_hexagon_context *>(buft->context);
+    GGML_ASSERT(nullptr != ctx);
+    if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
+        GGML_ASSERT(ctx->rpc_mempool_len > (8 * SIZE_IN_MB));
+        return ctx->rpc_mempool_len - (8 * SIZE_IN_MB);
+    } else {
+        //TODO: this is an experimental value for LLM models
+        return (1024 * SIZE_IN_MB);
+    }
+}
+
+static bool ggml_backend_buft_is_hexagon(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_hexagon_buffer_type_name;
+}
+
+static bool ggml_backend_hexagon_buffer_is_host(ggml_backend_buffer_type_t buft) {
+    struct ggml_backend_hexagon_context * ctx = static_cast<ggml_backend_hexagon_context *>(buft->context);
+    GGML_ASSERT(nullptr != ctx);
+    GGML_UNUSED(ctx);
+    return true;
+}
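ggml_backend_hexagon_buffer_type_alloc_buffer above rounds every request up to a whole number of pages before carving it out of the ION pool. A standalone sketch of that round-up (illustration only, not part of the patch):

#include <cassert>
#include <cstddef>
#include <cstdio>

// round a request up to a multiple of the page size, as the allocator above does
static size_t align_to_page(size_t size, size_t page) {
    if (size % page != 0) {
        size += page - (size % page);
    }
    return size;
}

int main() {
    assert(align_to_page(5000, 4096) == 8192);
    assert(align_to_page(8192, 4096) == 8192); // already-aligned sizes are unchanged
    printf("ok\n");
    return 0;
}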
ggml_backend_hexagon_name(ggml_backend_t backend) {
+    ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)backend->context;
+    return g_hexagon_mgr[ctx->device].name;
+}
+
+static void ggml_backend_hexagon_free(ggml_backend_t backend) {
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);
+    ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)backend->context;
+
+    qnn_instance * instance = (qnn_instance *)g_hexagon_mgr[ctx->device].instance;
+    if (nullptr != instance) {
+        for (auto & [graph_name, graph_res] : ctx->qnn_singlenode_graph_map) {
+            auto & graph_handle = std::get<0>(graph_res);
+            auto & ptensors     = std::get<1>(graph_res);
+            for (auto & tensor : ptensors) {
+                ggmlqnn_free_qnntensor(tensor);
+            }
+            GGML_UNUSED(graph_handle);
+            GGMLHEXAGON_LOG_DEBUG("graph handle %p", graph_handle);
+            GGMLHEXAGON_LOG_DEBUG("clean up graph:%s", graph_name.c_str());
+        }
+
+        ctx->qnn_singlenode_graph_map.clear();
+
+        instance->qnn_finalize();
+        delete instance;
+        g_hexagon_mgr[ctx->device].instance = nullptr;
+    }
+
+    if (nullptr != g_hexagon_mgr[ctx->device].backend) {
+        //print timestamp and dsp information before deinit of the cdsp, useful for troubleshooting
+        ggmlhexagon_print_running_timestamp(ctx);
+        if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+            ggmlhexagon_deinit_cdsp(ctx);
+        }
+
+        delete backend;
+        g_hexagon_mgr[ctx->device].backend = nullptr;
+    }
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__);
+}
+
+static enum ggml_status ggmlhexagon_backend_graph_compute_general(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    enum ggml_status result = GGML_STATUS_SUCCESS;
+    ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)backend->context;
+    GGML_UNUSED(ctx);
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE
+            || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW
+            || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+            continue;
+        }
+        bool ok = ggmlhexagon_compute_forward(backend, node);
+        if (!ok) {
+            GGMLHEXAGON_LOG_DEBUG("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+        }
+    }
+
+    return result;
+}
+
+static const char * ggml_backend_hexagon_device_get_name(ggml_backend_dev_t dev) {
+    struct ggml_backend_hexagon_context * ctx = static_cast<ggml_backend_hexagon_context *>(dev->context);
+    if (nullptr == ctx) {
+        GGMLHEXAGON_LOG_ERROR("ctx should not be null here, pls check why");
+        return "unknown";
+    }
+    return ctx->name;
+}
+
+static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev_t dev) {
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);
+    struct ggml_backend_hexagon_context * ctx = static_cast<ggml_backend_hexagon_context *>(dev->context);
+    static char hexagon_device_desc[GGMLHEXAGON_TMPBUF_LEN];
+    if (nullptr == ctx) {
+        GGMLHEXAGON_LOG_ERROR("ctx should not be null here, pls check why");
+        return "unknown";
+    }
+
+    if (0 == strncmp(ctx->name, "qnn-npu", 7)) {
+        const char * soc_info = ggmlhexagon_get_socmodel_desc(ctx->socinfo.soc_model);
+        const char * htp_arch = ggmlhexagon_get_htparch_desc(ctx->socinfo.htp_arch);
+        std::string dev_desc = std::string(ctx->desc)
+                + std::string(soc_info) + "_" + std::string(htp_arch)
+                + "," + std::string(ctx->socinfo.soc_desc);
+        //snprintf truncates safely if the composed description exceeds the buffer
+        snprintf(hexagon_device_desc, GGMLHEXAGON_TMPBUF_LEN, "%s", dev_desc.c_str());
+        return hexagon_device_desc;
+    } else {
+        return ctx->desc;
+    }
+}
+
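+//Caller-side view (illustrative sketch, assuming the generic ggml-backend
+//device API): the callback below is what ultimately services
+//    size_t free = 0, total = 0;
+//    ggml_backend_dev_memory(dev, &free, &total);
+//so both output parameters must always be written before returning.
+static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t *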
total) {
+    struct ggml_backend_hexagon_context * ctx = static_cast<ggml_backend_hexagon_context *>(dev->context);
+    if ((nullptr == ctx) || (ctx->device > HEXAGON_BACKEND_GGML)) {
+        GGMLHEXAGON_LOG_ERROR("pls check params");
+        *free  = 0;
+        *total = 0;
+        return;
+    }
+
+    if (HEXAGON_BACKEND_QNNCPU == ctx->device || HEXAGON_BACKEND_GGML == ctx->device) {
+        *total = ggmlhexagon_get_system_total_memory_in_bytes();
+        *free  = ggmlhexagon_get_system_free_memory_in_bytes();
+    } else if (HEXAGON_BACKEND_QNNGPU == ctx->device) {
+        //TODO: probe GPU info in Qualcomm Adreno GPU
+        *total = ggmlhexagon_get_system_total_memory_in_bytes();
+        *free  = ggmlhexagon_get_system_free_memory_in_bytes();
+    } else if (HEXAGON_BACKEND_QNNNPU == ctx->device) {
+        size_t rpc_ion_memsize = 0;
+        size_t rpc_ion_usage   = 0;
+        GGML_ASSERT(nullptr != ctx->instance);
+        rpc_ion_memsize = ctx->instance->get_rpcmem_capacity();
+        rpc_ion_usage   = ctx->instance->get_rpcmem_usage();
+        *total = rpc_ion_memsize;
+        *free  = (rpc_ion_memsize - rpc_ion_usage);
+        GGMLHEXAGON_LOG_DEBUG("rpc memsize %zu MiB", rpc_ion_memsize / SIZE_IN_MB);
+        GGMLHEXAGON_LOG_DEBUG("rpc usage %zu MiB\n\n", rpc_ion_usage / SIZE_IN_MB);
+    } else if (HEXAGON_BACKEND_CDSP == ctx->device) {
+        size_t rpc_ion_memsize = ctx->rpc_mempool_capacity;
+        size_t rpc_ion_usage   = ctx->rpc_mempool_usage;
+        *total = rpc_ion_memsize;
+        *free  = (rpc_ion_memsize - rpc_ion_usage);
+        GGMLHEXAGON_LOG_DEBUG("rpc memsize %zu MiB", rpc_ion_memsize / SIZE_IN_MB);
+        GGMLHEXAGON_LOG_DEBUG("rpc usage %zu MiB\n\n", rpc_ion_usage / SIZE_IN_MB);
+    }
+}
+
+static enum ggml_backend_dev_type ggml_backend_hexagon_device_get_type(ggml_backend_dev_t dev) {
+    struct ggml_backend_hexagon_context * ctx = static_cast<ggml_backend_hexagon_context *>(dev->context);
+
+    if (HEXAGON_BACKEND_QNNCPU == ctx->device)
+        return GGML_BACKEND_DEVICE_TYPE_ACCEL;
+    else if (HEXAGON_BACKEND_QNNGPU == ctx->device)
+        return GGML_BACKEND_DEVICE_TYPE_ACCEL;
+    else if (HEXAGON_BACKEND_QNNNPU == ctx->device)
+        return GGML_BACKEND_DEVICE_TYPE_ACCEL;
+    else if (HEXAGON_BACKEND_CDSP == ctx->device)
+        return GGML_BACKEND_DEVICE_TYPE_GPU;
+    else
+        return GGML_BACKEND_DEVICE_TYPE_CPU;
+}
+
+static void ggml_backend_hexagon_device_get_props(ggml_backend_dev_t dev,
+                                                  struct ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_hexagon_device_get_name(dev);
+    props->description = ggml_backend_hexagon_device_get_description(dev);
+    props->type        = ggml_backend_hexagon_device_get_type(dev);
+    ggml_backend_hexagon_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = {
+        /* .async                = */ false,
+        /* .host_buffer          = */ true,
+        /* .buffer_from_host_ptr = */ false,
+        /* .events               = */ false,
+    };
+
+    if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
+        //don't use system memory in this scenario
+        props->caps.host_buffer = false;
+    }
+}
+
+static ggml_backend_t ggml_backend_hexagon_device_init_backend(ggml_backend_dev_t dev, const char * params) {
+    GGML_UNUSED(dev);
+    GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__);
+    int dev_index = 0;
+
+    //case-1: test-backend-ops or other similar scenarios: user code calls
+    //        ggml_backend_dev_init(dev, reinterpret_cast<const char *>(i)) directly
+    ggmlhexagon_load_cfg();
+    if (!ggmlhexagon_check_valid_appcfg()) {
+        return nullptr;
+    }
+
+    if (nullptr == params) {
+        GGMLHEXAGON_LOG_DEBUG("program specified param is nullptr");
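+        //no explicit request from the caller: fall back to the backend id from
+        //the config file (values <= 0 select the first/default device)
+        dev_index = (g_hexagon_appcfg.hexagon_backend > 0) ?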
g_hexagon_appcfg.hexagon_backend : 0;
+        if (dev_index >= GGML_HEXAGON_MAX_DEVICES) {
+            GGMLHEXAGON_LOG_INFO("assume the default ggml backend");
+            return nullptr;
+        }
+    } else {
+        GGMLHEXAGON_LOG_VERBOSE("program specified param is not nullptr");
+        //user's program calls ggml_backend_hexagon_device_init_backend directly
+        dev_index = (int)(intptr_t)params;
+        if (dev_index < 0) {
+            GGMLHEXAGON_LOG_VERBOSE("it shouldn't happen\n");
+            //test-thread-safety might be running at the moment, or an invalid value was passed from user's program
+            dev_index = HEXAGON_BACKEND_QNNCPU; //0
+        }
+        if (dev_index > GGML_HEXAGON_MAX_DEVICES) {
+            dev_index = HEXAGON_BACKEND_GGML; //4
+        }
+        g_hexagon_appcfg.hexagon_backend = dev_index;
+        GGMLHEXAGON_LOG_VERBOSE("program specified dev_index %d\n", dev_index);
+    }
+    GGMLHEXAGON_LOG_DEBUG("hexagon_backend=%d", dev_index);
+    ggml_backend_t hexagon_backend = ggml_backend_hexagon_init(dev_index, g_hexagon_appcfg.runtime_libpath);
+    GGMLHEXAGON_LOG_DEBUG("leave %s\n", __func__);
+
+    return hexagon_backend;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_hexagon_buffer_type(size_t device_index) {
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+    GGMLHEXAGON_LOG_DEBUG("enter %s, device_index %zu", __func__, device_index);
+    if (device_index >= GGML_HEXAGON_MAX_DEVICES) {
+        GGMLHEXAGON_LOG_DEBUG("ggml_backend_hexagon_buffer_type error: device_index:%zu is out of range [0, %d]\n",
+                              device_index, GGML_HEXAGON_MAX_DEVICES - 1);
+        return nullptr;
+    }
+
+    if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+        //cover the following special case:
+        //    toggling back and forth frequently between cDSP and ggml in a standard
+        //    Android app or in the same running process
+        if (device_index != (size_t)(g_hexagon_appcfg.hexagon_backend)) {
+            GGMLHEXAGON_LOG_INFO("device_index %zu, backend %d", device_index, g_hexagon_appcfg.hexagon_backend);
+            g_hexagon_appcfg.hexagon_backend = device_index;
+        }
+    }
+
+    static struct ggml_backend_buffer_type ggml_backend_hexagon_buffer_types[GGML_HEXAGON_MAX_DEVICES];
+    static bool ggml_backend_hexagon_buffer_type_initialized = false;
+    if (!ggml_backend_hexagon_buffer_type_initialized) {
+        for (int i = 0; i < GGML_HEXAGON_MAX_DEVICES; i++) {
+            ggml_backend_hexagon_buffer_types[i] = {
+                /* .iface = */ {
+                    /* .get_name       = */ ggml_backend_hexagon_buffer_type_name,
+                    /* .alloc_buffer   = */ ggml_backend_hexagon_buffer_type_alloc_buffer,
+                    /* .get_alignment  = */ ggml_backend_hexagon_buffer_type_get_alignment,
+                    /* .get_max_size   = */ ggml_backend_hexagon_buffer_type_get_max_size,
+                    /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
+                    /* .is_host        = */ ggml_backend_hexagon_buffer_is_host
+                },
+                /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_hexagon_reg(), i),
+                /* .context = */ &g_hexagon_mgr[device_index],
+            };
+        }
+        ggml_backend_hexagon_buffer_type_initialized = true;
+    }
+
+    if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+        GGML_ASSERT(HEXAGON_BACKEND_CDSP == g_hexagon_appcfg.hexagon_backend);
+        //FIXME: this is a workaround to cover the following special case:
+        //    toggling back and forth frequently between cDSP and ggml in a standard
+        //    Android app or in the same running process; there is an unknown issue
+        //    with this workaround when toggling back and forth frequently
+        int result = ggmlhexagon_init_dsp(&g_hexagon_mgr[HEXAGON_BACKEND_CDSP]);
+        if (0 != result) {
+            GGMLHEXAGON_LOG_INFO("init hexagon dsp failure");
+            return nullptr;
+        }
+    }
+
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__);
+    return 
&ggml_backend_hexagon_buffer_types[device_index];
+}
+
+static const char * ggml_backend_hexagon_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(buft);
+    return "Hexagon_Host";
+}
+
+static const char * ggml_backend_hexagon_host_buffer_name(ggml_backend_buffer_t buffer) {
+    GGML_UNUSED(buffer);
+    return "Hexagon_Host";
+}
+
+static void ggml_backend_hexagon_host_buffer_free(ggml_backend_buffer_t buffer) {
+    if (0 == g_hexagon_appcfg.enable_pinned_memory) {
+        ggml_aligned_free(buffer->context, 0);
+    } else {
+        rpcmem_free(buffer->context);
+    }
+}
+
+static void * ggml_hexagon_host_malloc(ggml_backend_buffer_type_t buft, size_t size) {
+    if (0 == g_hexagon_appcfg.enable_pinned_memory) {
+        return ggml_aligned_malloc(size);
+    } else {
+        //TODO: there is no corresponding API in the existing Hexagon SDK, so try to
+        //      re-use the camera ion heap as pinned memory
+        return rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, ION_CAMERA_HEAP_ID | RPCMEM_TRY_MAP_STATIC, size);
+    }
+}
+
+static ggml_backend_buffer_t ggml_backend_hexagon_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * host_ptr = ggml_hexagon_host_malloc(buft, size);
+
+    if (nullptr == host_ptr) {
+        GGMLHEXAGON_LOG_INFO("failed to allocate host buffer");
+        //TODO: keep this assertion until a better approach is found to release the
+        //      "correct" host buffer in ggml_backend_hexagon_host_buffer_free
+        GGML_ASSERT(nullptr != host_ptr);
+        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
+    } else {
+        GGMLHEXAGON_LOG_INFO("allocated host buffer %zu MiB", size / SIZE_IN_MB);
+    }
+
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(host_ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_hexagon_host_buffer_free;
+
+    return buffer;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_hexagon_host_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_hexagon_buffer_type_host = {
+        /* .iface = */ {
+            /* .get_name       = */ ggml_backend_hexagon_host_buffer_type_name,
+            /* .alloc_buffer   = */ ggml_backend_hexagon_host_buffer_type_alloc_buffer,
+            /* .get_alignment  = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+            /* .get_max_size   = */ nullptr,
+            /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+            /* .is_host        = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+        },
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_hexagon_reg(), 0),
+        /* .context = */ nullptr,
+    };
+
+    return &ggml_backend_hexagon_buffer_type_host;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_host_buffer_type(ggml_backend_dev_t dev) {
+    GGML_UNUSED(dev);
+    return ggml_backend_hexagon_host_buffer_type();
+}
+
+static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_buffer_type(ggml_backend_dev_t dev) {
+    ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)dev->context;
+    return ggml_backend_hexagon_buffer_type(ctx->device);
+}
+
+static ggml_backend_buffer_t ggml_backend_hexagon_device_buffer_from_host_ptr(ggml_backend_dev_t dev,
+                                 void * ptr, size_t size, size_t max_tensor_size) {
+    return ggml_backend_cpu_buffer_from_ptr(ptr, size);
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(max_tensor_size);
+}
+
+static bool ggml_backend_hexagon_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
+        if (ggml_backend_buft_is_hexagon(buft)) {
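+            //both contexts are ggml_backend_hexagon_context instances; the buffer
+            //type is only usable by this device when both refer to the same device id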
+            ggml_backend_hexagon_context * dev_ctx  = (ggml_backend_hexagon_context *)dev->context;
+            ggml_backend_hexagon_context * buft_ctx = (ggml_backend_hexagon_context *)buft->context;
+            return buft_ctx->device == dev_ctx->device;
+        }
+    }
+
+    return ggml_backend_buft_is_host(buft);
+}
+
+static struct ggml_backend_device_i ggml_backend_hexagon_device_interface = {
+    /* .get_name             = */ ggml_backend_hexagon_device_get_name,
+    /* .get_description      = */ ggml_backend_hexagon_device_get_description,
+    /* .get_memory           = */ ggml_backend_hexagon_device_get_memory,
+    /* .get_type             = */ ggml_backend_hexagon_device_get_type,
+    /* .get_props            = */ ggml_backend_hexagon_device_get_props,
+    /* .init_backend         = */ ggml_backend_hexagon_device_init_backend,
+    /* .get_buffer_type      = */ ggml_backend_hexagon_device_get_buffer_type,
+    /* .get_host_buffer_type = */ ggml_backend_hexagon_device_get_host_buffer_type,
+    /* .buffer_from_host_ptr = */ ggml_backend_hexagon_device_buffer_from_host_ptr,
+    /* .supports_op          = */ nullptr,
+    /* .supports_buft        = */ ggml_backend_hexagon_device_supports_buft,
+    /* .offload_op           = */ nullptr,
+    /* .event_new            = */ nullptr,
+    /* .event_free           = */ nullptr,
+    /* .event_synchronize    = */ nullptr,
+};
+
+static ggml_backend_i ggml_backend_hexagon_interface = {
+    /* .get_name           = */ ggml_backend_hexagon_name,
+    /* .free               = */ ggml_backend_hexagon_free,
+    /* .set_tensor_async   = */ nullptr,
+    /* .get_tensor_async   = */ nullptr,
+    /* .cpy_tensor_async   = */ nullptr,
+    /* .synchronize        = */ nullptr,
+    /* .graph_plan_create  = */ nullptr,
+    /* .graph_plan_free    = */ nullptr,
+    /* .graph_plan_update  = */ nullptr,
+    /* .graph_plan_compute = */ nullptr,
+    /* .graph_compute      = */ nullptr,
+    /* .event_record       = */ nullptr,
+    /* .event_wait         = */ nullptr,
+};
+
+//FIXME: this guid is not meaningful
+static ggml_guid_t ggml_backend_hexagon_guid() {
+    static ggml_guid guid = {
+        0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81,
+        0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09
+    };
+    return &guid;
+}
+
+bool ggml_backend_is_hexagon(ggml_backend_t backend) {
+    return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_hexagon_guid());
+}
+
+static void ggml_backend_hexagon_set_n_threads(ggml_backend_t backend, int n_threads) {
+    GGML_ASSERT(ggml_backend_is_hexagon(backend));
+
+    struct ggml_backend_hexagon_context * ctx = (struct ggml_backend_hexagon_context *)backend->context;
+    ctx->n_threads = n_threads;
+}
+
+int ggml_backend_hexagon_get_device_count() {
+    if (g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) {
+        //there is only one backend device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP
+        return 1;
+    } else {
+        //QNN-CPU, QNN-GPU, QNN-NPU
+        return GGML_HEXAGON_MAX_DEVICES - 1;
+    }
+}
+
+struct ggml_backend_hexagon_reg_context {
+    std::vector<ggml_backend_dev_t> devices;
+};
+
+static const char * ggml_backend_hexagon_reg_get_name(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+
+    //return the accurate backend name rather than "ggml-hexagon" so that NPU
+    //performance comparisons through llama-bench are clearer
+    if (HEXAGON_BACKEND_QNNNPU == g_hexagon_appcfg.hexagon_backend)
+        return "QNN-NPU";
+
+    if (HEXAGON_BACKEND_QNNGPU == g_hexagon_appcfg.hexagon_backend)
+        return "QNN-GPU";
+
+    if (HEXAGON_BACKEND_QNNCPU == g_hexagon_appcfg.hexagon_backend)
+        return "QNN-CPU";
+
+    if (HEXAGON_BACKEND_CDSP == g_hexagon_appcfg.hexagon_backend)
+        return "Hexagon-cDSP";
+
+    return "ggml";
+}
+
+static size_t ggml_backend_hexagon_reg_get_device_count(ggml_backend_reg_t reg) {
+    GGML_UNUSED(reg);
+    if 
(HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+        GGML_ASSERT(g_hexagon_appcfg.hexagon_backend == HEXAGON_BACKEND_CDSP);
+        //there is only one backend device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP
+        return 1;
+    } else {
+        //QNN-CPU, QNN-GPU, QNN-NPU
+        return GGML_HEXAGON_MAX_DEVICES - 1;
+    }
+}
+
+static ggml_backend_dev_t ggml_backend_hexagon_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    GGMLHEXAGON_LOG_DEBUG("index %zu", index);
+    ggml_backend_hexagon_reg_context * ctx = (ggml_backend_hexagon_reg_context *)reg->context;
+    if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+        GGML_ASSERT(g_hexagon_appcfg.hexagon_backend == HEXAGON_BACKEND_CDSP);
+        //there is only one backend device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP
+        return ctx->devices[0];
+    } else {
+        GGML_ASSERT(index < ctx->devices.size());
+        return ctx->devices[index];
+    }
+}
+
+static void * ggml_backend_hexagon_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    GGML_UNUSED(reg);
+
+    if (nullptr == name)
+        return nullptr;
+
+    const char * slot_name = "ggml_backend_set_n_threads";
+    if (0 == memcmp(name, slot_name, strlen(slot_name))) {
+        return (void *)ggml_backend_hexagon_set_n_threads;
+    }
+
+    return nullptr;
+}
+
+static const ggml_backend_reg_i ggml_backend_hexagon_reg_interface = {
+    /* .get_name         = */ ggml_backend_hexagon_reg_get_name,
+    /* .get_device_count = */ ggml_backend_hexagon_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_hexagon_reg_get_device,
+    /* .get_proc_address = */ ggml_backend_hexagon_reg_get_proc_address,
+};
+
+ggml_backend_reg_t ggml_backend_hexagon_reg() {
+    static ggml_backend_reg reg;
+    //TODO: the existing code can't cover the following special case:
+    //    toggling back and forth frequently between QNN-NPU, cDSP and ggml in a
+    //    standard Android app or in the same running process;
+    //    supporting it is easy but would significantly increase the size of the APK
+    static bool initialized = false;
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);
+
+    //case-2: normal scenario, such as llama-cli or a UI application
+    ggmlhexagon_load_cfg();
+    if (!ggmlhexagon_check_valid_appcfg()) {
+        return nullptr;
+    }
+
+    {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+        if (!initialized) {
+            ggml_backend_hexagon_reg_context * ctx = new ggml_backend_hexagon_reg_context;
+
+            for (int i = 0; i < ggml_backend_hexagon_get_device_count(); i++) {
+                if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+                    ggml_backend_hexagon_device_interface.supports_op = ggmlhexagon_can_handle_op_through_cdsp;
+                } else {
+                    ggml_backend_hexagon_device_interface.supports_op = ggmlhexagon_can_handle_op_through_qnn;
+                }
+
+                if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
+                    if (0 == g_hexagon_appcfg.enable_pinned_memory) {
+                        //don't use system memory in this scenario
+                        ggml_backend_hexagon_device_interface.get_host_buffer_type = nullptr;
+                    }
+                }
+
+                GGMLHEXAGON_LOG_DEBUG("create backend device for device %d", i);
+                ggml_backend_dev_t dev = new ggml_backend_device{
+                    /* .iface   = */ ggml_backend_hexagon_device_interface,
+                    /* .reg     = */ &reg,
+                    /* .context = */ &g_hexagon_mgr[i]
+                };
+                if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+                    //there is only one backend device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP,
+                    //so its context is g_hexagon_mgr[HEXAGON_BACKEND_CDSP] rather than g_hexagon_mgr[0]
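+                    //(the registry exposes a single device in this mode, and g_hexagon_mgr
+                    // is indexed by backend id, so the CDSP slot must be used)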
+                    //attention here:
+                    dev->context = &g_hexagon_mgr[HEXAGON_BACKEND_CDSP];
+                }
+
+                ctx->devices.push_back(dev);
+
+                //initialize the cDSP rpc memory pool here because ggml's backend subsystem needs it
+                if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+                    GGML_ASSERT(HEXAGON_BACKEND_CDSP == g_hexagon_appcfg.hexagon_backend);
+                    int result = ggmlhexagon_init_dsp(&g_hexagon_mgr[HEXAGON_BACKEND_CDSP]);
+                    if (0 != result) {
+                        GGMLHEXAGON_LOG_INFO("init hexagon dsp failure");
+                        return nullptr;
+                    }
+                }
+            }
+
+            reg = ggml_backend_reg {
+                /* .api_version = */ GGML_BACKEND_API_VERSION,
+                /* .iface       = */ ggml_backend_hexagon_reg_interface,
+                /* .context     = */ ctx
+            };
+        }
+
+        initialized = true;
+    }
+    GGMLHEXAGON_LOG_DEBUG("leave ggml_backend_hexagon_reg");
+
+    return &reg;
+}
+
+const char * ggml_backend_hexagon_get_devname(size_t dev_num) {
+    switch (dev_num) {
+        case HEXAGON_BACKEND_QNNCPU:
+            return "HEXAGON_BACKEND_QNN_CPU";
+        case HEXAGON_BACKEND_QNNGPU:
+            return "HEXAGON_BACKEND_QNN_GPU";
+        case HEXAGON_BACKEND_QNNNPU:
+            return "HEXAGON_BACKEND_QNN_NPU";
+        case HEXAGON_BACKEND_CDSP:
+            return "HEXAGON_BACKEND_CDSP";
+        case HEXAGON_BACKEND_GGML:
+            return "ggml"; //"fake" hexagon backend, used to compare performance between the hexagon backend and the default ggml backend
+        default:
+            return "unknown";
+    }
+}
+
+static qnn_instance * ggmlqnn_init_qnn_instance(size_t device, const char * qnn_lib_path) {
+    int result = 0;
+    GGMLHEXAGON_LOG_VERBOSE("device=%zu, hwaccel approach=%d(%s)", device, g_hexagon_appcfg.hwaccel_approach,
+                            ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach));
+
+    qnn_instance * instance = nullptr;
+    instance = new qnn_instance(qnn_lib_path, g_hexagon_mgr[device].lib, "");
+    result = instance->qnn_init(nullptr);
+    if (0 != result) {
+        GGMLHEXAGON_LOG_WARN("failed to init qnn subsystem with qnn backend %s\n",
+                             ggml_backend_hexagon_get_devname(device));
+        delete instance;
+        return nullptr;
+    }
+    qnn_interface qnn_interface = instance->get_qnn_interface();
+    if (!qnn_interface.is_loaded()) {
+        GGMLHEXAGON_LOG_WARN("qnn subsystem failure\n");
+        delete instance;
+        return nullptr;
+    }
+
+    std::string device_name = ggml_backend_hexagon_get_devname(device);
+    GGMLHEXAGON_LOG_VERBOSE("qnn device name %s", device_name.c_str());
+    g_hexagon_mgr[device].instance             = instance;
+    g_hexagon_mgr[device].raw_interface        = instance->get_qnn_raw_interface();
+    g_hexagon_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface();
+
+    return instance;
+}
+
+/**
+ *
+ * @param device          0: HEXAGON_BACKEND_QNNCPU 1: HEXAGON_BACKEND_QNNGPU 2: HEXAGON_BACKEND_QNNNPU 3: HEXAGON_BACKEND_CDSP 4: ggml
+ * @param runtime_libpath binary runtime library path, such as "/data/local/tmp/" on Android, or as specified in user's code
+ * @return                ggml backend instance on success, nullptr on failure
+ */
+ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * runtime_libpath) {
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);
+    if (nullptr == runtime_libpath)
+        return nullptr;
+
+    //case-3: user's code calls ggml_backend_hexagon_init() directly
+    ggmlhexagon_load_cfg();
+    if (!ggmlhexagon_check_valid_appcfg()) {
+        return nullptr;
+    }
+
+    GGMLHEXAGON_LOG_DEBUG("device %zu", device);
+    GGMLHEXAGON_LOG_DEBUG("runtime libpath %s", runtime_libpath);
+    if (device >= GGML_HEXAGON_MAX_DEVICES) {
+        GGMLHEXAGON_LOG_ERROR("invalid device %zu", device);
+        return nullptr;
+    }
+
+    if (0 != memcmp(runtime_libpath, g_hexagon_appcfg.runtime_libpath, strlen(g_hexagon_appcfg.runtime_libpath))) {
+        //re-set the runtime libpath
+        ggmlhexagon_set_runtime_path(device, runtime_libpath);
+    }
+
+    if (nullptr != g_hexagon_mgr[device].backend) {
+        GGMLHEXAGON_LOG_DEBUG("backend %zu(%s) already loaded", device,
+                              ggml_backend_hexagon_get_devname(device));
+        GGMLHEXAGON_LOG_DEBUG("leave %s", __func__);
+        return g_hexagon_mgr[device].backend;
+    }
+
+    //don't initialize QNN when the hwaccel approach is to offload ggml ops to the Hexagon cDSP directly
+    if (HWACCEL_CDSP != g_hexagon_appcfg.hwaccel_approach) {
+        qnn_instance * instance = ggmlqnn_init_qnn_instance(device, runtime_libpath);
+        if (nullptr == instance)
+            return nullptr;
+    }
+    ggml_backend_hexagon_interface.graph_compute = ggmlhexagon_backend_graph_compute_general;
+    ggml_backend_t hexagon_backend = new ggml_backend{
+        /* .guid    = */ ggml_backend_hexagon_guid(),
+        /* .iface   = */ ggml_backend_hexagon_interface,
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_hexagon_reg(), device),
+        /* .context = */ &g_hexagon_mgr[device]
+    };
+
+    g_hexagon_mgr[device].backend = hexagon_backend;
+    if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) {
+        int result = ggmlhexagon_init_dsp(&g_hexagon_mgr[device]);
+        if (0 != result) {
+            GGMLHEXAGON_LOG_INFO("init hexagon dsp failure");
+            ggml_backend_hexagon_free(hexagon_backend);
+            return nullptr;
+        }
+    } else {
+        //get the full description of the SoC when the hwaccel approach is HWACCEL_QNN and the backend is HEXAGON_BACKEND_QNNNPU
+        GGMLHEXAGON_LOG_VERBOSE("device name %s", ggml_backend_hexagon_device_get_description(hexagon_backend->device));
+    }
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__);
+
+    return hexagon_backend;
+}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_hexagon_reg)
diff --git a/ggml/src/ggml-hexagon/kernels/Makefile b/ggml/src/ggml-hexagon/kernels/Makefile
new file mode 100755
index 0000000000000..7ac0ba5dbd754
--- /dev/null
+++ b/ggml/src/ggml-hexagon/kernels/Makefile
@@ -0,0 +1,53 @@
+#the following vars are already defined in CMakeLists.txt
+#HTP_ARCH_VERSION=v79
+#DEBUG_FLAG=-DNDEBUG -Wall
+#HEXAGON_SDK_PATH=/opt/qcom/Hexagon_SDK/6.2.0.1
+
+HEXAGON_COMPUTE=compute${HTP_ARCH_VERSION}
+HEXAGON_CC=${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang
+HEXAGON_CXX=${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang
+
+TARGET=libggmldsp-skel.so
+
+$(info HEXAGON_SDK_PATH:${HEXAGON_SDK_PATH})
+$(info HTP_ARCH_VERSION:${HTP_ARCH_VERSION})
+$(info DEBUG_FLAG:${DEBUG_FLAG})
+$(info HEXAGON_COMPUTE:${HEXAGON_COMPUTE})
+
+INCS=-I${HEXAGON_SDK_PATH}/incs -I${HEXAGON_SDK_PATH}/libs/qprintf/inc -I${HEXAGON_SDK_PATH}/incs/stddef -I${HEXAGON_SDK_PATH}/ipc/fastrpc/incs -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/inc -I${HEXAGON_SDK_PATH}/libs/atomic/inc -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc -I${HEXAGON_SDK_PATH}/rtos/qurt/${HEXAGON_COMPUTE}/include/posix -I${HEXAGON_SDK_PATH}/rtos/qurt/${HEXAGON_COMPUTE}/include/qurt/
+
+CFLAGS=-m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic -D__V_DYNAMIC__ -mhvx -mhvx-length=128B ${INCS} -fno-finite-math-only
+
+LDFLAGS=-m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads -fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${TARGET}
+
+#SRCS = $(wildcard *.c)
+SRCS = ggml-dsp.c skel.c entry.c add.c mulmat.c
+OBJS = $(patsubst %.c, %.o, $(SRCS))
+OBJS += dot.o
+OBJS += worker_pool.o
+
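+#NOTE: dot.o is assembled from dot.S via the %.o:%.S pattern rule below;
+#      worker_pool.o is assumed to come from the worker_pool sources shipped
+#      with the Hexagon SDK (it is not built from a file in this directory)
+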
+ALL:$(OBJS) + ${HEXAGON_CC} ${LDFLAGS} -o ${TARGET} -Wl,--start-group ${OBJS} -Wl,--end-group + @ls -l ${TARGET} + /bin/cp -fv ${TARGET} ../../../../out/android/bin/ + /bin/cp -fv ${TARGET} ../../../../out/android/bin/libggmldsp-skel${HTP_ARCH_VERSION}.so + /bin/rm -f *.so + +%.o:%.c + @echo "${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $<" + ${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $< + @echo "\n" + +%.o:%.S + @echo "${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $<" + ${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $< + @echo "\n" + +%.o:%.cpp + @echo "${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $<" + ${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $< + @echo "\n" + +clean: + rm -f *.o + /bin/rm -f *.so diff --git a/ggml/src/ggml-hexagon/kernels/add.c b/ggml/src/ggml-hexagon/kernels/add.c new file mode 100644 index 0000000000000..25a2d73e23536 --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/add.c @@ -0,0 +1,143 @@ +#include "ggml-dsp.h" + +static inline void l2fetch(const void * p, uint32_t stride, + uint32_t width, uint32_t height, + uint32_t dir) { + uint64_t control = HEXAGON_V64_CREATE_H(dir, stride, width, height); + __asm__ __volatile__ (" l2fetch(%0,%1) " : :"r"(p),"r"(control)); +} + +static inline void ggmlhexagon_dsp_add_f32(const int n, float * GGML_RESTRICT z, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y) { + HVX_Vector * va; + HVX_Vector * vb; + HVX_Vector * vc; + HVX_Vector qf32; + const size_t FLOATS_PER_VECTOR = 128 / sizeof(float); + const size_t block = n / FLOATS_PER_VECTOR; + const size_t left = n % FLOATS_PER_VECTOR; + const size_t blocks = block * FLOATS_PER_VECTOR; + + if ((((uintptr_t)z | (uintptr_t)x | (uintptr_t)y) % ALIGN_128_BYTE) != 0) { + GGMLHEXAGON_LOG_DEBUG("memaddress mismatch alignment 128 bytes z:%p x:%p y:%p", z, x, y); + for (size_t i = 0; i < n; ++i) + z[i] = x[i] + y[i]; + + return; + } + + va = (HVX_Vector *)x; + vb = (HVX_Vector *)y; + vc = (HVX_Vector *)z; + //unroll is better but need more carefully check for various cases and I think DSP also don't like branch predication + for (size_t i = 0; i < block; ++i) { + l2fetch(va + VLEN, VLEN, VLEN, 1, 0); + l2fetch(vb + VLEN, VLEN, VLEN, 1, 0); + //*vc++ = Q6_Vsf_vadd_VsfVsf(*va++, *vb++); + qf32 = Q6_Vqf32_vadd_VsfVsf(*va++, *vb++); + *vc++ = Q6_Vsf_equals_Vqf32(qf32); + } + + if (left > 0) { + for (size_t i = 0; i < left; ++i) + z[i + blocks] = x[i + blocks] + y[i + blocks]; + } +} + +static void ggml_compute_forward_add_f32( + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + uint64_t start_time = ggml_time_us(); + + memcpy(dst->ne, src1->ne, 16); + memcpy(dst->nb, src1->nb, 16); + ggmlhexagon_dump_tensor(src0, 1); + ggmlhexagon_dump_tensor(src1, 1); + ggmlhexagon_dump_tensor(dst, 1); + + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + + const int rank = ggml_n_dims(src0); + if (1 == rank) { + //element-wise addition with vector + const size_t len = src0->ne[0]; + float * dst_ptr = (float *) (dst->data); + float * src0_ptr = (float *) (src0->data); + float * src1_ptr = (float *) (src1->data); + ggmlhexagon_dsp_add_f32(len, dst_ptr, src0_ptr, src1_ptr); + return; + } + + const int ith = 0; + const int nth = 1; + + const int nr = ggml_nrows(src0); + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT( nb0 == 
sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + const int dr = (nr + nth - 1)/nth; + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + if (nb10 == sizeof(float)) { + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int32_t i03 = ir/(ne02*ne01); + const int32_t i02 = (ir - i03*ne02*ne01)/ne01; + const int32_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int32_t i13 = i03 % ne13; + const int32_t i12 = i02 % ne12; + const int32_t i11 = i01 % ne11; + const int32_t nr0 = ne00 / ne10; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + for (int32_t r = 0; r < nr0; ++r) { + ggmlhexagon_dsp_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); + } + } + } else { + // src1 is not contiguous + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int32_t i03 = ir/(ne02*ne01); + const int32_t i02 = (ir - i03*ne02*ne01)/ne01; + const int32_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int32_t i13 = i03 % ne13; + const int32_t i12 = i02 % ne12; + const int32_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + for (int32_t i0 = 0; i0 < ne0; ++i0) { + const int32_t i10 = i0 % ne10; + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); + + dst_ptr[i0] = src0_ptr[i0] + *src1_ptr; + } + } + } + + uint64_t end_time = ggml_time_us(); + uint64_t duration = (end_time - start_time); + GGMLHEXAGON_LOG_DEBUG("duration %llu us", duration); +#if !GGMLHEXAGON_DEBUG + UNUSED(duration); +#endif + + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); +} + +//FIXME: why failed with test-backend-ops when disable ion rpc mempool +int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__); + ggml_compute_forward_add_f32(src0, src1, dst); + GGMLHEXAGON_LOG_DEBUG("leave %s\n", __func__); + return 0; +} diff --git a/ggml/src/ggml-hexagon/kernels/dot.S b/ggml/src/ggml-hexagon/kernels/dot.S new file mode 100755 index 0000000000000..2031a6001519b --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/dot.S @@ -0,0 +1,136 @@ +/**============================================================================= +@file + qhblas_f_vector_dot_af.S + +@brief + Calculates dot product of two input float vectors. + + Function prototype + + int32_t qhblas_f_vector_dot_af(float_a8_t *input_1, float_a8_t *input_2, float *output, uint32_t size); + + Reference C code + + int32_t qhblas_f_vector_dot_af(float_a8_t *input_1, float_a8_t *input_2, float *output, uint32_t size) + { + if ((input_1 == NULL) || (input_2 == NULL) || (output == NULL) || (size == 0)) + { + return -1; + } + + float dot = 0; + for (uint32_t i = 0; i < size; ++i) + { + dot += input_1[i] * input_2[i]; + } + + *output = dot; + return 0; + } + +Copyright (c) 2019 Qualcomm Technologies Incorporated. +All Rights Reserved. Qualcomm Proprietary and Confidential. 
+=============================================================================**/ + +/*============================================================================*/ + + .p2align 2 + .p2align 4,,15 + .global qhblas_f_vector_dot_af + .type qhblas_f_vector_dot_af, @function + +/*============================================================================*/ + +#define DC_PREFETCH_AHEAD 64 // number of bytes for DCFETCH +#define L2_PREFETCH_AHEAD 256 // number of bytes for L2FETCH +#define L2FETCH_CONFIG 0x0100FF00+(L2_PREFETCH_AHEAD/256) // [stride = 256 : width = 255 : height = bytes/256] +#define L2_PREFETCH_ELEMS L2_PREFETCH_AHEAD/8 // number of elements to prefetch with L2FETCH + +/*============================================================================*/ + +qhblas_f_vector_dot_af: +{ + p0 = !cmp.eq(r0,#0) // input_1 != NULL + p0 = !cmp.eq(r1,#0) // input_2 != NULL + p0 = !cmp.eq(r2,#0) // output != NULL + p0 = cmp.gtu(r3,#0) // size > 0 + if (!p0.new) jump:nt .L_ret +} +{ + r10 = #0 + r3 = lsr(r3,#1) // size / 2 + p1 = tstbit(r3,#0) // check for odd size + if(cmp.eq(r3.new,#0)) jump:nt .L_do_one +} +{ + r7:6 = #0 + r9:8 = #0 + r5 = add(r3,#7) // (size / 2) + 7 + p2 = cmp.gtu(r3,#L2_PREFETCH_ELEMS) // check whether we can do l2fetch +} +{ + r5 = lsr(r5,#3) // ceil(size / 2) + r14 = mux(p2,r3,#0) // set l2fetch counter +} +{ + r13:12 = combine(##L2FETCH_CONFIG,#8) // set l2fetch config and max number of iterations for .L_loop_do_two + loop1(.L_prefetch_loop_do_two,r5) +} + .falign +.L_prefetch_loop_do_two: +{ + dcfetch(r0+#DC_PREFETCH_AHEAD) // prefetch ahead for input_1 + r5 = min(r12,r3) // min(8, size / 2) +} +{ + dcfetch(r1+#DC_PREFETCH_AHEAD) // prefetch ahead for input_2 + loop0(.L_loop_do_two,r5) + p2 = cmp.eq(r3,r14) // check whether to do l2fetch + if (!p2.new) jump:t .L_loop_do_two +} +{ + r5 = add(r3,#-L2_PREFETCH_ELEMS) // number of elements left to prefetch ahead + r15 = add(r0,#L2_PREFETCH_AHEAD) // input_1 addr for l2fetch +} +{ + p2 = cmp.gtu(r5,#L2_PREFETCH_ELEMS) // check whether we can continue to do l2fetch + r15 = add(r1,#L2_PREFETCH_AHEAD) // input_2 addr for l2fetch + l2fetch(r15,r13) +} +{ + if (p2) r14 = add(r14,#-L2_PREFETCH_ELEMS) // adjust l2fetch counter + if (!p2) r14 = #0 // there are no more bytes left to prefetch ahead + l2fetch(r15,r13) +} + .falign +.L_loop_do_two: +{ + r7:6 = memd(r0++#8) + r9:8 = memd(r1++#8) + r10 += sfmpy(r7,r9) +} +{ + r10 += sfmpy(r6,r8) + r3 = add(r3,#-1) // adjust (size / 2) +}:endloop0:endloop1 +{ + r10 += sfmpy(r7,r9) + if (!p1) jump:nt .L_ret +} + .falign +.L_do_one: +{ + r4 = memw(r0) + r5 = memw(r1) +} +{ + r10 += sfmpy(r4,r5) +} + .falign +.L_ret: +{ + if (p0) memw(r2) = r10 + r0 = mux(p0,#0,#-1) + jumpr r31 +} + .size qhblas_f_vector_dot_af, .-qhblas_f_vector_dot_af diff --git a/ggml/src/ggml-hexagon/kernels/entry.c b/ggml/src/ggml-hexagon/kernels/entry.c new file mode 100644 index 0000000000000..8af93ea1d3082 --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/entry.c @@ -0,0 +1,115 @@ +#include "ggml-dsp.h" + +static int32 g_thread_counts = 1; + +int ggmlop_dsp_open(const char * uri, remote_handle64 * handle) { + void * tptr = NULL; + GGMLHEXAGON_LOG_DEBUG("uri %s", uri); + tptr = (void *)malloc(1); + GGML_ASSERT(NULL != tptr); + *handle = (remote_handle64)tptr; + + GGMLHEXAGON_LOG_DEBUG("api_version = 0x%x", qurt_api_version()); + GGMLHEXAGON_LOG_DEBUG("hvx units = 0x%d", qurt_hvx_get_units()); + qurt_arch_version_t vers; + qurt_sysenv_get_arch_version(&vers); + GGMLHEXAGON_LOG_DEBUG("arch_version=0x%x", 
vers.arch_version); + + qurt_sysenv_app_heap_t aheap; + qurt_sysenv_get_app_heap(&aheap); + GGMLHEXAGON_LOG_DEBUG("aheap.heap_base=0x%x, aheap.heap_limit=0x%x", aheap.heap_base, aheap.heap_limit); + + qurt_sysenv_max_hthreads_t mhwt; + qurt_sysenv_get_max_hw_threads(&mhwt); + GGMLHEXAGON_LOG_DEBUG("max hardware threads counts=%d", mhwt.max_hthreads); + g_thread_counts = mhwt.max_hthreads; + + return 0; +} + +int ggmlop_dsp_close(remote_handle64 handle) { + if (handle) + free((void*)handle); + + return 0; +} + +AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 latency, int32 mulmat_algo, int32 thread_counts) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__); + HAP_power_request_t request; + memset(&request, 0, sizeof(HAP_power_request_t)); + request.type = HAP_power_set_apptype; + request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS; + + GGMLHEXAGON_LOG_DEBUG("user specified thread_counts %d", thread_counts); + if (thread_counts > 1) + g_thread_counts = (thread_counts > g_thread_counts) ? g_thread_counts : thread_counts; + else + g_thread_counts = 1; + GGMLHEXAGON_LOG_DEBUG("real thread_counts %d", g_thread_counts); + + void * ggmop_ctx = (void*)(handle); + int retval = HAP_power_set(ggmop_ctx, &request); + if (retval) { + GGMLHEXAGON_LOG_DEBUG("failed first power vote"); + return AEE_EFAILED; + } + + //configure clocks & DCVS mode + memset(&request, 0, sizeof(HAP_power_request_t)); + request.type = HAP_power_set_DCVS_v2; + request.dcvs_v2.dcvs_enable = TRUE; + request.dcvs_v2.dcvs_params.target_corner = (HAP_dcvs_voltage_corner_t)power_level; + if (mulmat_algo) { + request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_DISABLE; + request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_DISABLE; + } else { + request.dcvs_v2.dcvs_params.min_corner = request.dcvs_v2.dcvs_params.target_corner; + request.dcvs_v2.dcvs_params.max_corner = request.dcvs_v2.dcvs_params.target_corner; + } + request.dcvs_v2.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; + request.dcvs_v2.set_dcvs_params = TRUE; + request.dcvs_v2.set_latency = TRUE; + request.dcvs_v2.latency = latency; + retval = HAP_power_set(ggmop_ctx, &request); + if (retval) { + GGMLHEXAGON_LOG_DEBUG("failed to vote for performance mode"); + return AEE_EFAILED; + } + + memset(&request, 0, sizeof(HAP_power_request_t)); + request.type = HAP_power_set_HVX; + request.hvx.power_up = TRUE; + retval = HAP_power_set(ggmop_ctx, &request); + if (retval) { + GGMLHEXAGON_LOG_DEBUG("failed to vote for HVX power"); + return AEE_EFAILED; + } + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); + return AEE_SUCCESS; +} + +// ================================================================================================= +// implementation of ggml-hexagon kernel, it's better to put every hexagon-kernel to a single file +// ================================================================================================= +int ggmlop_dsp_softmax(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); + return 0; +} + +int ggmlop_dsp_rmsnorm(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); + return 0; +} + +int ggmlop_dsp_pool2d(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + GGMLHEXAGON_LOG_DEBUG("leave 
%s", __func__ ); + return 0; +} + +int ggmlop_get_thread_counts(void) { + return g_thread_counts; +} diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c new file mode 100644 index 0000000000000..b64209971a0dc --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2025 The ggml authors + * + * Qualcomm Hexagon SDK and reference tech guides could be found at: + * https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools + * + * this single-source-file or self-contained file is implementation of ggml-dsp: + * - a customized tiny ggml running on Qualcomm Hexagon cDSP + * - ported from original ggml + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#include "ggml-dsp.h" + +void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...) 
{ +#if !GGMLHEXAGON_DEBUG + return; +#endif + static char s_ggmlhexagon_log_internal_buf[GGMLHEXAGON_LOGBUF_LEN]; + va_list args; + va_start(args, format); + int len_prefix = snprintf(s_ggmlhexagon_log_internal_buf, GGMLHEXAGON_LOGBUF_LEN, "[%s, %d]: ", + func, line); + int len = vsnprintf(s_ggmlhexagon_log_internal_buf + len_prefix, + GGMLHEXAGON_LOGBUF_LEN - len_prefix, format, args); + if (len < (GGMLHEXAGON_LOGBUF_LEN - len_prefix)) { + FARF(ALWAYS, "%s\n", s_ggmlhexagon_log_internal_buf); + } + va_end(args); +} + +void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor) { +#if !GGMLHEXAGON_DEBUG + return; +#endif + float value = 0; + char tmpbuf[GGMLHEXAGON_LOGBUF_LEN]; + size_t buflen = 0; + if (tensor->type == GGML_TYPE_F32) { + memset(tmpbuf, 0, GGMLHEXAGON_LOGBUF_LEN); + for (int h = 0; h < tensor->ne[3]; h++) { + for (int i = 0; i < tensor->ne[2]; i++) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + value = ((float *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + + j * tensor->ne[0] + k]; + buflen += snprintf(tmpbuf + buflen, GGMLHEXAGON_LOGBUF_LEN - buflen, "%-4.2f\t", value); + } + buflen += snprintf(tmpbuf + buflen, GGMLHEXAGON_LOGBUF_LEN - buflen, "\n"); + } + } + } + GGMLHEXAGON_LOG_DEBUG("\n%s\n", tmpbuf); + } + + GGMLHEXAGON_LOG_DEBUG("\n"); +} + +void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_data) { + GGMLHEXAGON_LOG_DEBUG("ne = %5d x %5d x %5d x %5d , nb = (%5zi, %5zi, %5zi, %5zi)\n", + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], + tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[3]); + + if ((1 == dump_tensor_data) && (ggml_nbytes(tensor) < 320)) { + ggmlhexagon_dump_tensor_elements(tensor); + } +} + +size_t ggml_row_size(enum ggml_type type, int64_t ne) { + return 4*ne; +} + +size_t ggml_nbytes(const struct ggml_tensor * tensor) { + size_t nbytes; + const size_t blck_size = 1; + if (blck_size == 1) { + nbytes = 4; + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; + } + } else { + nbytes = tensor->ne[0]*tensor->nb[0]/blck_size; + for (int i = 1; i < GGML_MAX_DIMS; ++i) { + nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; + } + } + + return nbytes; +} + +bool ggml_is_empty(const struct ggml_tensor * tensor) { + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + if (tensor->ne[i] == 0) { + return true; + } + } + return false; +} + +bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return ggml_is_empty(t0) ? 
ggml_is_empty(t1) :
+            (t1->ne[0]%t0->ne[0] == 0) &&
+            (t1->ne[1]%t0->ne[1] == 0) &&
+            (t1->ne[2]%t0->ne[2] == 0) &&
+            (t1->ne[3]%t0->ne[3] == 0);
+}
+
+bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+    return
+        (t0->ne[0] == t1->ne[0]) &&
+        (t0->ne[1] == t1->ne[1]) &&
+        (t0->ne[2] == t1->ne[2]) &&
+        (t0->ne[3] == t1->ne[3]);
+}
+
+int64_t ggml_nrows(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
+}
+
+bool ggml_is_transposed(const struct ggml_tensor * tensor) {
+    return tensor->nb[0] > tensor->nb[1];
+}
+
+bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
+    size_t next_nb = 4; //sizeof(float): this tiny ggml-dsp only supports fp32
+    if (tensor->ne[0] != 1 && tensor->nb[0] != next_nb) {
+        return false;
+    }
+    next_nb *= tensor->ne[0];
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        if (tensor->ne[i] != 1) {
+            if (i > n) {
+                if (tensor->nb[i] != next_nb) {
+                    return false;
+                }
+                next_nb *= tensor->ne[i];
+            } else {
+                // this dimension does not need to be contiguous
+                next_nb = tensor->ne[i]*tensor->nb[i];
+            }
+        }
+    }
+    return true;
+}
+
+int64_t ggml_nelements(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
+}
+
+static bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
+    return ggml_is_contiguous_n(tensor, 0);
+}
+
+bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+    return ggml_is_contiguous_0(tensor);
+}
+
+int ggml_n_dims(const struct ggml_tensor * tensor) {
+    for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
+        if (tensor->ne[i] > 1) {
+            return i + 1;
+        }
+    }
+    return 1;
+}
+
+void ggml_abort(const char * file, int line, const char * fmt, ...) {
+    GGMLHEXAGON_LOG_DEBUG("enter ggml_abort");
+    abort();
+}
+
+static inline uint64 hexagon_perf_get_time_us(void) {
+    unsigned long long count;
+    asm volatile (" %0 = c31:30 " : "=r"(count));
+    //the c31:30 counter ticks at 19.2 MHz, so cycles * 10 / 192 yields microseconds
+    return (uint64)(count) * 10ull / 192ull;
+}
+
+int64_t ggml_time_ms(void) {
+    return hexagon_perf_get_time_us() / 1000;
+}
+
+int64_t ggml_time_us(void) {
+    return hexagon_perf_get_time_us();
+}
diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.h b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h
new file mode 100644
index 0000000000000..103b46b8ee7fc
--- /dev/null
+++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h
@@ -0,0 +1,168 @@
+#pragma once
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdarg.h>
+
+#include "HAP_perf.h"
+#include "HAP_farf.h"
+#include "HAP_power.h"
+#include "HAP_vtcm_mgr.h"
+#include "HAP_compute_res.h"
+
+#include "qurt.h"
+#include "AEEStdErr.h"
+#include "hexagon_types.h"
+#include "hexagon_protos.h"
+
+#include "skel.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ggml_tensor dsptensor
+
+#define GGML_MAX_DIMS   4
+
+#define ALIGN_128_BYTE  128
+
+#define VLEN            128
+
+#define GGML_UNUSED(x)  (void)(x)
+
+#define UNUSED          GGML_UNUSED
+
+#define GGML_PAD(x, n)  (((x) + (n) - 1) & ~((n) - 1))
+
+#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
+
+#define GGML_ASSERT(x)  if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) + +#if UINTPTR_MAX == 0xFFFFFFFF +#define GGML_MEM_ALIGN 4 +#else +#define GGML_MEM_ALIGN 16 +#endif + +#define GGML_API extern + +#ifdef __cplusplus +// restrict not standard in C++ +# if defined(__GNUC__) +# define GGML_RESTRICT __restrict__ +# elif defined(__clang__) +# define GGML_RESTRICT __restrict +# elif defined(_MSC_VER) +# define GGML_RESTRICT __restrict +# else +# define GGML_RESTRICT +# endif +#else +# if defined (_MSC_VER) && (__STDC_VERSION__ < 201112L) +# define GGML_RESTRICT __restrict +# else +# define GGML_RESTRICT restrict +# endif +#endif + +#ifndef __cplusplus +#ifndef static_assert + #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) + #define static_assert(cond, msg) _Static_assert(cond, msg) + #else + #define static_assert(cond, msg) struct global_scope_noop_trick + #endif +#endif +#endif // __cplusplus + + +//NPU performance will be slower when enable GGMLHEXAGON_DEBUG +#ifdef NDEBUG +#define GGMLHEXAGON_DEBUG 0 +#else +#define GGMLHEXAGON_DEBUG 1 +#endif + +#define GGMLHEXAGON_LOGBUF_LEN 4096 +#define GGMLHEXAGON_TMPBUF_LEN 256 +#if GGMLHEXAGON_DEBUG +#define GGMLHEXAGON_LOG_DEBUG(...) ggmlhexagon_log_internal(GGMLHEXAGON_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define GGMLHEXAGON_LOG_DEBUG(...) +#endif + +#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \ + const type prefix##0 = (pointer)->array[0]; \ + GGML_UNUSED(prefix##0); +#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \ + const type prefix##1 = (pointer)->array[1]; \ + GGML_UNUSED(prefix##1); +#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \ + const type prefix##2 = (pointer)->array[2]; \ + GGML_UNUSED(prefix##2); +#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \ + const type prefix##3 = (pointer)->array[3]; \ + GGML_UNUSED(prefix##3); + +#define GGML_TENSOR_UNARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + +#define GGML_TENSOR_BINARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + +#define GGML_TENSOR_BINARY_OP_LOCALS01 \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) + +enum ggmlhexagon_log_level { + GGMLHEXAGON_LOG_LEVEL_NONE = 0, + GGMLHEXAGON_LOG_LEVEL_DEBUG = 1, +}; + +enum ggml_type { + GGML_TYPE_F32 = 0, +}; + +typedef double ggml_float; + +GGML_API int64_t ggml_time_ms(void); +GGML_API int64_t ggml_time_us(void); + +GGML_API size_t ggml_nbytes(const struct ggml_tensor * tensor); +GGML_API int64_t ggml_nrows(const struct ggml_tensor * tensor); +GGML_API int ggml_n_dims(const struct ggml_tensor * tensor); +GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); +GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...); +GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1); +GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct 
ggml_tensor * t1);
+
+GGML_API void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor);
+GGML_API void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_data);
+GGML_API void ggmlhexagon_log_internal(int level, const char * file, const char * func, int line, const char * format, ...);
+
+GGML_API int  ggmlop_get_thread_counts(void);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/ggml-hexagon/kernels/mulmat.c b/ggml/src/ggml-hexagon/kernels/mulmat.c
new file mode 100644
index 0000000000000..f34b6f8b09b4e
--- /dev/null
+++ b/ggml/src/ggml-hexagon/kernels/mulmat.c
@@ -0,0 +1,288 @@
+#include "ggml-dsp.h"
+
+// 128 byte vectors
+#define VSIZE_BYTES 128
+#define VSIZE_WORDS (VSIZE_BYTES/4)
+
+union ui32f { int32_t i; float f; };
+
+// create a vector of floats from a float
+static __attribute__((always_inline)) HVX_Vector create_sfv_from_sf(float value) {
+    union ui32f cvt;
+    cvt.f = value;
+    HVX_Vector tmp = Q6_V_vsplat_R(cvt.i);
+    return tmp;
+}
+
+// create a vector of qf32's from a float
+static __attribute__((always_inline)) HVX_Vector create_qf32v_from_sf(float value) {
+    HVX_Vector tmp = Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_vsplat_R(0), create_sfv_from_sf(value));
+    return tmp;
+}
+
+// convert a qf32 vector to a float vector
+static __attribute__((always_inline)) HVX_Vector convert_qf32v_to_fltv(HVX_Vector vect) {
+    HVX_Vector tmp = Q6_Vsf_equals_Vqf32(vect);
+    return tmp;
+}
+
+// get the lowest float from a vector of floats
+static __attribute__((always_inline)) float get_flt0_from_fltv(HVX_Vector vect) {
+    union ui32f cvt;
+    cvt.i = vect[0];
+    return cvt.f;
+}
+
+// get the lowest float from a vector of qf32's
+static __attribute__((always_inline)) float get_flt0_from_qf32v(HVX_Vector vect) {
+    union ui32f cvt;
+    HVX_Vector tmp = convert_qf32v_to_fltv(vect);
+    cvt.i = tmp[0];
+    return cvt.f;
+}
+
+static void vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x,
+                        size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    // scalar
+    ggml_float sumf = 0.0;
+    for (int i = 0; i < n; ++i) {
+        sumf += (ggml_float) (x[i] * y[i]);
+    }
+    *s = sumf;
+}
+
+static void ggml_compute_forward_mul_mat_one_chunk(const ggml_tensor * src0, const ggml_tensor * src1,
+                                                   struct ggml_tensor * dst,
+                                                   const enum ggml_type type,
+                                                   const int32_t num_rows_per_vec_dot,
+                                                   const int32_t ir0_start, const int32_t ir0_end,
+                                                   const int32_t ir1_start, const int32_t ir1_end) {
+    ggmlhexagon_dump_tensor(src0, 0);
+    ggmlhexagon_dump_tensor(src1, 0);
+    ggmlhexagon_dump_tensor(dst, 0);
+
+    dst->ne[0] = src0->ne[1];
+    dst->ne[1] = src1->ne[1];
+    dst->ne[2] = src1->ne[2];
+    dst->ne[3] = src1->ne[3];
+
+    dst->nb[0] = 4;
+    dst->nb[1] = dst->nb[0] * dst->ne[0];
+    dst->nb[2] = dst->nb[1] * dst->ne[1];
+    dst->nb[3] = dst->nb[2] * dst->ne[2];
+    ggmlhexagon_dump_tensor(dst, 0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const bool src1_cont = ggml_is_contiguous(src1);
+
+    // broadcast factors
+    const int32_t r2 = ne12 / ne02;
+    const int32_t r3 = ne13 / ne03;
+
+    if (ir0_start >= ir0_end || ir1_start >= ir1_end) {
+        return;
+    }
+
+    const void * wdata    = src1->data;
+    const size_t row_size = 4 * ne10;
+
+    assert(ne12 % ne02 == 0);
+    assert(ne13 % ne03 == 0);
+
+    // block-tiling attempt
+    const int32_t blck_0 = 16;
+    const int32_t blck_1 = 16;
+
+    const size_t src1_col_stride = src1_cont ? row_size : nb11;
+
+    // attempt to reduce false-sharing (does not seem to make a difference)
+    // 16 * 2, accounting for mmla kernels
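+    // tmp holds one blck_0-wide strip of results per row pass: 16 results for
+    // up to 2 rows when an mmla-style kernel handles 2 rows at once, hence 32
+    float 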
+static void ggml_compute_forward_mul_mat_one_chunk(const ggml_tensor *src0, const ggml_tensor *src1,
+                                                   struct ggml_tensor *dst,
+                                                   const enum ggml_type type,
+                                                   const int32_t num_rows_per_vec_dot,
+                                                   const int32_t ir0_start, const int32_t ir0_end,
+                                                   const int32_t ir1_start, const int32_t ir1_end) {
+    ggmlhexagon_dump_tensor(src0, 0);
+    ggmlhexagon_dump_tensor(src1, 0);
+    ggmlhexagon_dump_tensor(dst, 0);
+
+    dst->ne[0] = src0->ne[1];
+    dst->ne[1] = src1->ne[1];
+    dst->ne[2] = src1->ne[2];
+    dst->ne[3] = src1->ne[3];
+
+    dst->nb[0] = 4;
+    dst->nb[1] = dst->nb[0] * dst->ne[0];
+    dst->nb[2] = dst->nb[1] * dst->ne[1];
+    dst->nb[3] = dst->nb[2] * dst->ne[2];
+    ggmlhexagon_dump_tensor(dst, 0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const bool src1_cont = ggml_is_contiguous(src1);
+
+    // broadcast factors
+    const int32_t r2 = ne12 / ne02;
+    const int32_t r3 = ne13 / ne03;
+
+    if (ir0_start >= ir0_end || ir1_start >= ir1_end) {
+        return;
+    }
+
+    const void * wdata = src1->data;
+    const size_t row_size = 4 * ne10;
+
+    assert(ne12 % ne02 == 0);
+    assert(ne13 % ne03 == 0);
+
+    // block-tiling attempt
+    const int32_t blck_0 = 16;
+    const int32_t blck_1 = 16;
+
+    const size_t src1_col_stride = src1_cont ? row_size : nb11;
+
+    // attempt to reduce false-sharing (does not seem to make a difference)
+    // 16 * 2, accounting for mmla kernels
+    float tmp[32];
+
+    for (int32_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
+        for (int32_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
+            for (int32_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) {
+                const int32_t i13 = (ir1 / (ne12 * ne1));
+                const int32_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
+                const int32_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
+
+                // broadcast src0 into src1
+                const int32_t i03 = i13 / r3;
+                const int32_t i02 = i12 / r2;
+
+                const int32_t i1 = i11;
+                const int32_t i2 = i12;
+                const int32_t i3 = i13;
+
+                const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03);
+
+                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                // the original src1 data pointer, so we should index using the indices directly
+                const char * src1_col = (const char*)wdata +
+                    (src1_cont
+                     ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
+                     : (i11 * nb11 + i12 * nb12 + i13 * nb13));
+                float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
+
+                for (int32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
+                    vec_dot_f32(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0),
+                                (float*)(src0_row + ir0 * nb01), (num_rows_per_vec_dot > 1 ? nb01 : 0),
+                                (float*)src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
+                }
+
+                for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
+                    memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
+                }
+            }
+        }
+    }
+}
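+// Worked instance of the chunk scheduling in ggmlop_dsp_mulmat_singlethread
+// below, with assumed sizes (not taken from the patch): nr0 = 40, nr1 = 20,
+// chunk_size = 16:
+//   nchunk0 = ceil(40/16) = 3, nchunk1 = ceil(20/16) = 2  -> 6 chunks
+//   dr0 = ceil(40/3) = 14 rows, dr1 = ceil(20/2) = 10 cols per chunk
+//   chunk c covers rows [14*(c%3), min(14*(c%3) + 14, 40)) and
+//                  cols [10*(c/3), min(10*(c/3) + 10, 20)),
+//   so e.g. chunk 5 is rows [28,40) x cols [10,20).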
+static int ggmlop_dsp_mulmat_singlethread(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);
+    ggmlhexagon_dump_tensor(src0, 0);
+    ggmlhexagon_dump_tensor(src1, 0);
+    ggmlhexagon_dump_tensor(dst, 0);
+
+    dst->ne[0] = src0->ne[1];
+    dst->ne[1] = src1->ne[1];
+    dst->ne[2] = src1->ne[2];
+    dst->ne[3] = src1->ne[3];
+
+    dst->nb[0] = 4;
+    dst->nb[1] = dst->nb[0] * dst->ne[0];
+    dst->nb[2] = dst->nb[1] * dst->ne[1];
+    dst->nb[3] = dst->nb[2] * dst->ne[2];
+    ggmlhexagon_dump_tensor(dst, 0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    int32_t const vec_dot_num_rows = 1;
+
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == 4);
+    GGML_ASSERT(nb10 == 4);
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+#if 0 // naive algorithm for fp32, can pass various cases in UT
+    {
+        //ggml_dump_tensor(src0);
+        //ggml_dump_tensor(src1);
+
+        float * a = (float*)src0->data;
+        float * b = (float*)src1->data;
+        float * c = (float*)dst->data;
+        int M = src0->ne[1];
+        int K = src0->ne[0];
+        int N = src1->ne[1];
+        float sum = 0;
+        for (int i = 0; i < M; i++) {
+            for (int j = 0; j < N; j++) {
+                sum = 0;
+                for (int h = 0; h < K; h++) {
+                    sum += a[i * K + h] * b[h * N + j];
+                }
+                c[i * N + j] = sum;
+            }
+        }
+        return 0;
+    }
+#endif
+
+    // This is the size of the first dimension of the result, so we can iterate that way.
+    // (see the ASSERTs above, these are the same numbers)
+    const int32_t nr0 = ne0;
+
+    // This is the size of the rest of the dimensions of the result
+    const int32_t nr1 = ne1 * ne2 * ne3;
+
+    // Now select a reasonable chunk size.
+    int chunk_size = 16;
+
+    // We need to step up the size if it's small
+    if (nr0 == 1 || nr1 == 1) {
+        chunk_size = 64;
+    }
+
+    // distribute the work across the inner or outer loop based on which one is larger
+    // the number of chunks in the 0/1 dim: CEIL(nr0/chunk_size), CEIL(nr1/chunk_size)
+    int32_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
+    int32_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
+
+    // If the chunking is poor, scrap the whole plan and process everything as one chunk.
+    // (In the multi-threaded CPU backend this re-chunks by thread, which was measured to
+    // perform better on NUMA systems; see https://github.com/ggml-org/llama.cpp/pull/6915.
+    // In theory chunking should be just as useful on NUMA and non-NUMA systems, but testing disagreed.)
+    if (nchunk0 * nchunk1 < 4) {
+        nchunk0 = 1;
+        nchunk1 = 1;
+    }
+
+    // The number of elements in each chunk
+    const int32_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
+    const int32_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
+
+    // single-threaded port of the CPU scheduler: walk every chunk sequentially
+    int current_chunk = 0;
+
+    while (current_chunk < nchunk0 * nchunk1) {
+        const int32_t ith0 = current_chunk % nchunk0;
+        const int32_t ith1 = current_chunk / nchunk0;
+
+        const int32_t ir0_start = dr0 * ith0;
+        const int32_t ir0_end = MIN(ir0_start + dr0, nr0);
+
+        const int32_t ir1_start = dr1 * ith1;
+        const int32_t ir1_end = MIN(ir1_start + dr1, nr1);
+
+        // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
+        int32_t num_rows_per_vec_dot = vec_dot_num_rows;
+
+        // these checks are needed to avoid crossing dim1 boundaries
+        // can be optimized, but the logic would become more complicated, so keeping it like this for simplicity
+        if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
+            num_rows_per_vec_dot = 1;
+        }
+        ggml_compute_forward_mul_mat_one_chunk(src0, src1, dst, src0->type, num_rows_per_vec_dot,
+                                               ir0_start, ir0_end, ir1_start, ir1_end);
+
+        if (1 >= nchunk0 * nchunk1) {
+            break;
+        }
+        current_chunk++;
+    }
+
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__);
+    return 0;
+}
+
+// TODO: the multi-threaded path is still a stub; it logs and returns without computing anything
+static int ggmlop_dsp_mulmat_multithread(remote_handle64 h, const struct dsptensor * src0, const struct dsptensor * src1, dsptensor * dst) {
+    GGMLHEXAGON_LOG_DEBUG("enter %s", __func__);
+    GGMLHEXAGON_LOG_DEBUG("leave %s", __func__);
+    return 0;
+}
+
+int ggmlop_dsp_mulmat(remote_handle64 h, const struct dsptensor * src0, const struct dsptensor * src1, dsptensor * dst) {
+    if (ggmlop_get_thread_counts() > 1) {
+        return ggmlop_dsp_mulmat_multithread(h, src0, src1, dst);
+    } else {
+        return ggmlop_dsp_mulmat_singlethread(h, src0, src1, dst);
+    }
+}
diff --git a/ggml/src/ggml-hexagon/kernels/skel.c b/ggml/src/ggml-hexagon/kernels/skel.c
new file mode 100644
index 0000000000000..b216d66a654ab
--- /dev/null
+++ b/ggml/src/ggml-hexagon/kernels/skel.c
@@ -0,0 +1,621 @@
+//qidl copyright
+//qidl nested=false
+#include "skel.h"
+
+#include
+#ifndef _WIN32
+#include "HAP_farf.h"
+#endif //_WIN32 for HAP_farf
+#ifndef _ALLOCATOR_H
+#define _ALLOCATOR_H
+
+#include
+#include
+
+typedef struct _heap _heap;
+struct
_heap { + _heap* pPrev; + const char* loc; + uint64_t buf; +}; + +typedef struct _allocator { + _heap* pheap; + uint8_t* stack; + uint8_t* stackEnd; + int nSize; +} _allocator; + +_ATTRIBUTE_UNUSED +static __inline int _heap_alloc(_heap** ppa, const char* loc, int size, void** ppbuf) { + _heap* pn = 0; + pn = MALLOC((size_t)size + sizeof(_heap) - sizeof(uint64_t)); + if(pn != 0) { + pn->pPrev = *ppa; + pn->loc = loc; + *ppa = pn; + *ppbuf = (void*)&(pn->buf); + return 0; + } else { + return -1; + } +} +#define _ALIGN_SIZE(x, y) (((x) + (y-1)) & ~(y-1)) + +_ATTRIBUTE_UNUSED +static __inline int _allocator_alloc(_allocator* me, + const char* loc, + int size, + unsigned int al, + void** ppbuf) { + if(size < 0) { + return -1; + } else if (size == 0) { + *ppbuf = 0; + return 0; + } + if((_ALIGN_SIZE((uintptr_t)me->stackEnd, al) + (size_t)size) < (uintptr_t)me->stack + (size_t)me->nSize) { + *ppbuf = (uint8_t*)_ALIGN_SIZE((uintptr_t)me->stackEnd, al); + me->stackEnd = (uint8_t*)_ALIGN_SIZE((uintptr_t)me->stackEnd, al) + size; + return 0; + } else { + return _heap_alloc(&me->pheap, loc, size, ppbuf); + } +} + +_ATTRIBUTE_UNUSED +static __inline void _allocator_deinit(_allocator* me) { + _heap* pa = me->pheap; + while(pa != 0) { + _heap* pn = pa; + const char* loc = pn->loc; + (void)loc; + pa = pn->pPrev; + FREE(pn); + } +} + +_ATTRIBUTE_UNUSED +static __inline void _allocator_init(_allocator* me, uint8_t* stack, int stackSize) { + me->stack = stack; + me->stackEnd = stack + stackSize; + me->nSize = stackSize; + me->pheap = 0; +} + + +#endif // _ALLOCATOR_H + +#ifndef SLIM_H +#define SLIM_H + +#include + +//a C data structure for the idl types that can be used to implement +//static and dynamic language bindings fairly efficiently. +// +//the goal is to have a minimal ROM and RAM footprint and without +//doing too many allocations. A good way to package these things seemed +//like the module boundary, so all the idls within one module can share +//all the type references. + + +#define PARAMETER_IN 0x0 +#define PARAMETER_OUT 0x1 +#define PARAMETER_INOUT 0x2 +#define PARAMETER_ROUT 0x3 +#define PARAMETER_INROUT 0x4 + +//the types that we get from idl +#define TYPE_OBJECT 0x0 +#define TYPE_INTERFACE 0x1 +#define TYPE_PRIMITIVE 0x2 +#define TYPE_ENUM 0x3 +#define TYPE_STRING 0x4 +#define TYPE_WSTRING 0x5 +#define TYPE_STRUCTURE 0x6 +#define TYPE_UNION 0x7 +#define TYPE_ARRAY 0x8 +#define TYPE_SEQUENCE 0x9 + +//these require the pack/unpack to recurse +//so it's a hint to those languages that can optimize in cases where +//recursion isn't necessary. 
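+//for example, an array of structures is tagged TYPE_COMPLEX_ARRAY =
+//(0x10 | TYPE_ARRAY) below, so a marshaller only has to test the 0x10 bit
+//to decide whether it must recurse into the element type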
+#define TYPE_COMPLEX_STRUCTURE (0x10 | TYPE_STRUCTURE) +#define TYPE_COMPLEX_UNION (0x10 | TYPE_UNION) +#define TYPE_COMPLEX_ARRAY (0x10 | TYPE_ARRAY) +#define TYPE_COMPLEX_SEQUENCE (0x10 | TYPE_SEQUENCE) + + +typedef struct Type Type; + +#define INHERIT_TYPE\ + int32_t nativeSize; /*in the simple case its the same as wire size and alignment*/\ + union {\ + struct {\ + const uintptr_t p1;\ + const uintptr_t p2;\ + } _cast;\ + struct {\ + uint32_t iid;\ + uint32_t bNotNil;\ + } object;\ + struct {\ + const Type *arrayType;\ + int32_t nItems;\ + } array;\ + struct {\ + const Type *seqType;\ + int32_t nMaxLen;\ + } seqSimple; \ + struct {\ + uint32_t bFloating;\ + uint32_t bSigned;\ + } prim; \ + const SequenceType* seqComplex;\ + const UnionType *unionType;\ + const StructType *structType;\ + int32_t stringMaxLen;\ + uint8_t bInterfaceNotNil;\ + } param;\ + uint8_t type;\ + uint8_t nativeAlignment\ + +typedef struct UnionType UnionType; +typedef struct StructType StructType; +typedef struct SequenceType SequenceType; +struct Type { + INHERIT_TYPE; +}; + +struct SequenceType { + const Type * seqType; + uint32_t nMaxLen; + uint32_t inSize; + uint32_t routSizePrimIn; + uint32_t routSizePrimROut; +}; + +//byte offset from the start of the case values for +//this unions case value array. it MUST be aligned +//at the alignment requrements for the descriptor +// +//if negative it means that the unions cases are +//simple enumerators, so the value read from the descriptor +//can be used directly to find the correct case +typedef union CaseValuePtr CaseValuePtr; +union CaseValuePtr { + const uint8_t* value8s; + const uint16_t* value16s; + const uint32_t* value32s; + const uint64_t* value64s; +}; + +//these are only used in complex cases +//so I pulled them out of the type definition as references to make +//the type smaller +struct UnionType { + const Type *descriptor; + uint32_t nCases; + const CaseValuePtr caseValues; + const Type * const *cases; + int32_t inSize; + int32_t routSizePrimIn; + int32_t routSizePrimROut; + uint8_t inAlignment; + uint8_t routAlignmentPrimIn; + uint8_t routAlignmentPrimROut; + uint8_t inCaseAlignment; + uint8_t routCaseAlignmentPrimIn; + uint8_t routCaseAlignmentPrimROut; + uint8_t nativeCaseAlignment; + uint8_t bDefaultCase; +}; + +struct StructType { + uint32_t nMembers; + const Type * const *members; + int32_t inSize; + int32_t routSizePrimIn; + int32_t routSizePrimROut; + uint8_t inAlignment; + uint8_t routAlignmentPrimIn; + uint8_t routAlignmentPrimROut; +}; + +typedef struct Parameter Parameter; +struct Parameter { + INHERIT_TYPE; + uint8_t mode; + uint8_t bNotNil; +}; + +#define SLIM_IFPTR32(is32,is64) (sizeof(uintptr_t) == 4 ? 
(is32) : (is64)) +#define SLIM_SCALARS_IS_DYNAMIC(u) (((u) & 0x00ffffff) == 0x00ffffff) + +typedef struct Method Method; +struct Method { + uint32_t uScalars; //no method index + int32_t primInSize; + int32_t primROutSize; + int maxArgs; + int numParams; + const Parameter * const *params; + uint8_t primInAlignment; + uint8_t primROutAlignment; +}; + +typedef struct Interface Interface; + +struct Interface { + int nMethods; + const Method * const *methodArray; + int nIIds; + const uint32_t *iids; + const uint16_t* methodStringArray; + const uint16_t* methodStrings; + const char* strings; +}; + + +#endif //SLIM_H + + +#ifndef _GGMLOP_SLIM_H +#define _GGMLOP_SLIM_H +#include + +#ifndef __QAIC_SLIM +#define __QAIC_SLIM(ff) ff +#endif +#ifndef __QAIC_SLIM_EXPORT +#define __QAIC_SLIM_EXPORT +#endif + +static const Type types[5]; +static const Type* const typeArrays[7] = {&(types[0]),&(types[1]),&(types[1]),&(types[0]),&(types[2]),&(types[0]),&(types[3])}; +static const StructType structTypes[1] = {{0x7,&(typeArrays[0]),0x70,0x4,0x6c,0x4,0x4,0x4}}; +static const Type types[5] = {{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4},{0x10,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x4}}, 8,0x4},{0x40,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x10}}, 8,0x4},{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)&(types[4]),(const uintptr_t)0x0}}, 9,SLIM_IFPTR32(0x4,0x8)},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4}}; +static const Parameter parameters[6] = {{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),0,0},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4,0,0},{SLIM_IFPTR32(0x74,0x80),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x74,0x80),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),3,0}}; +static const Parameter* const parameterArrays[9] = {(&(parameters[4])),(&(parameters[4])),(&(parameters[5])),(&(parameters[3])),(&(parameters[3])),(&(parameters[3])),(&(parameters[0])),(&(parameters[1])),(&(parameters[2]))}; +static const Method methods[4] = {{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[6])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[8])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x1,0x0,0x0,0x0),0xc,0x0,3,3,(&(parameterArrays[3])),0x4,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0xe4,0x6c,3,3,(&(parameterArrays[0])),0x4,0x4}}; +static const Method* const methodArrays[8] = {&(methods[0]),&(methods[1]),&(methods[2]),&(methods[3]),&(methods[3]),&(methods[3]),&(methods[3]),&(methods[3])}; +static const char strings[167] = "dsp_setclocks\0dsp_rmsnorm\0dsp_softmax\0dcvs_enable\0power_level\0dsp_pool2d\0dsp_mulmat\0op_params\0dsp_add\0latency\0flags\0close\0src1\0data\0type\0src0\0open\0dst\0uri\0op\0nb\0ne\0h\0"; +static const uint16_t methodStrings[134] = 
{62,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,14,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,26,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,73,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,94,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,0,50,102,38,142,151,164,116,164}; +static const uint16_t methodStringsArrays[8] = {129,132,125,100,75,50,25,0}; +__QAIC_SLIM_EXPORT const Interface __QAIC_SLIM(ggmlop_slim) = {8,&(methodArrays[0]),0,0,&(methodStringsArrays [0]),methodStrings,strings}; +#endif //_GGMLOP_SLIM_H +extern int adsp_mmap_fd_getinfo(int, uint32_t *); +#ifdef __cplusplus +extern "C" { +#endif +_ATTRIBUTE_VISIBILITY uint32_t ggmldsp_skel_handle_invoke_qaic_version = 10048; +_ATTRIBUTE_VISIBILITY char ggmldsp_skel_handle_invoke_uri[79+1]="file:///libggmldsp-skel.so?ggmldsp_skel_handle_invoke&_modver=1.0&_idlver=0.0.1"; +static __inline int _skel_pack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { + int _nErr = 0; + remote_arg* _praROutPostStart = _praROutPost; + remote_arg** _ppraROutPostStart = _ppraROutPost; + _ppraROutPost = &_praROutPost; + _COPY(_primROut, 0, _rout0, 0, 4); + _COPY(_primROut, 4, _rout1, 0, 16); + _COPY(_primROut, 20, _rout2, 0, 16); + _COPY(_primROut, 36, _rout3, 0, 4); + _COPY(_primROut, 40, _rout4, 0, 64); + _COPY(_primROut, 104, _rout5, 0, 4); + _ppraROutPostStart[0] += (_praROutPost - _praROutPostStart) +1; + return _nErr; +} +static __inline int _skel_unpack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { + int _nErr = 0; + remote_arg* _praInStart = _praIn; + remote_arg** _ppraInStart = _ppraIn; + remote_arg* _praROutStart = _praROut; + remote_arg** _ppraROutStart = _ppraROut; + _ppraIn = &_praIn; + _ppraROut = &_praROut; + _COPY(_rout6Len, 0, _primIn, 0, 4); + _QAIC_ASSERT(_nErr, ((_praROut[0].buf.nLen / 4)) >= (size_t)(_rout6Len[0])); + _rout6[0] = _praROut[0].buf.pv; + _ppraInStart[0] += (_praIn - _praInStart) + 0; + _ppraROutStart[0] += (_praROut - _praROutStart) +1; + _QAIC_CATCH(_nErr) {} + return _nErr; +} +static __inline int _skel_unpack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], 
_ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint32_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[16], _ATTRIBUTE_UNUSED uint32_t _in5[1], _ATTRIBUTE_UNUSED char* _in6[1], _ATTRIBUTE_UNUSED uint32_t _in6Len[1]) { + int _nErr = 0; + remote_arg* _praInStart = _praIn; + remote_arg** _ppraInStart = _ppraIn; + remote_arg* _praROutStart = _praROut; + remote_arg** _ppraROutStart = _ppraROut; + _ppraIn = &_praIn; + _ppraROut = &_praROut; + _COPY(_in0, 0, _primIn, 0, 4); + _COPY(_in1, 0, _primIn, 4, 16); + _COPY(_in2, 0, _primIn, 20, 16); + _COPY(_in3, 0, _primIn, 36, 4); + _COPY(_in4, 0, _primIn, 40, 64); + _COPY(_in5, 0, _primIn, 104, 4); + _COPY(_in6Len, 0, _primIn, 108, 4); + _QAIC_ASSERT(_nErr, ((_praIn[0].buf.nLen / 4)) >= (size_t)(_in6Len[0])); + _in6[0] = _praIn[0].buf.pv; + _ppraInStart[0] += (_praIn - _praInStart) + 1; + _ppraROutStart[0] += (_praROut - _praROutStart) +0; + _QAIC_CATCH(_nErr) {} + return _nErr; +} +static __inline int _skel_method(int (*_pfn)(remote_handle64, const dsptensor*, const dsptensor*, dsptensor*), remote_handle64 _h, uint32_t _sc, remote_arg* _pra) { + remote_arg* _praEnd = 0; + uintptr_t _in0[SLIM_IFPTR32(29, 16)] = {0}; + uintptr_t _in1[SLIM_IFPTR32(29, 16)] = {0}; + uintptr_t _rout2[SLIM_IFPTR32(29, 16)] = {0}; + uint32_t* _primIn= 0; + int _numIn[1] = {0}; + uint32_t* _primROut= 0; + int _numInH[1] = {0}; + int _numROut[1] = {0}; + remote_arg* _praIn = 0; + remote_arg* _praROut = 0; + remote_arg* _praROutPost = 0; + remote_arg** _ppraROutPost = &_praROutPost; + _allocator _al[1] = {{0}}; + remote_arg** _ppraIn = &_praIn; + remote_arg** _ppraROut = &_praROut; + remote_arg* _praHIn = 0; + remote_arg** _ppraHIn = &_praHIn; + remote_arg* _praHROut = 0; + remote_arg** _ppraHROut = &_praHROut; + int _nErr = 0; + _praEnd = ((_pra + REMOTE_SCALARS_INBUFS(_sc)) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc) + REMOTE_SCALARS_OUTHANDLES(_sc)); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INBUFS(_sc)>=1); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTBUFS(_sc)>=1); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INHANDLES(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTHANDLES(_sc)==0); + _QAIC_ASSERT(_nErr, (_pra + ((1 + 1) + (((0 + 0) + 0) + 0))) <= _praEnd); + _numIn[0] = (REMOTE_SCALARS_INBUFS(_sc) - 1); + _QAIC_ASSERT(_nErr, _pra[0].buf.nLen >= 228); + _primIn = _pra[0].buf.pv; + _QAIC_ASSERT(_nErr, _pra[(_numIn[0] + 1)].buf.nLen >= 108); + _primROut = _pra[(_numIn[0] + 1)].buf.pv; + _numInH[0] = REMOTE_SCALARS_INHANDLES(_sc); + _numROut[0] = REMOTE_SCALARS_OUTBUFS(_sc); + _praIn = (_pra + 1); + _praROut = (_praIn + _numIn[0] + 1); + _praROutPost = _praROut; + _allocator_init(_al, 0, 0); + if(_praHIn == 0) + { + _praHIn = ((_praROut + _numROut[0]) + 1); + } + if(_praHROut == 0) + (_praHROut = _praHIn + _numInH[0] + 0); + _TRY(_nErr, _skel_unpack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 0), 0, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), (uint32_t*)&(((uint32_t*)_in0)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[27]), (char**)&(((uint64_t*)_in0)[14])), 
SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[28]), (uint32_t*)&(((uint32_t*)_in0)[30])))); + _TRY(_nErr, _skel_unpack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 112), 0, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), (uint32_t*)&(((uint32_t*)_in1)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[27]), (char**)&(((uint64_t*)_in1)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[28]), (uint32_t*)&(((uint32_t*)_in1)[30])))); + _TRY(_nErr, _skel_unpack(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 224), ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); + _TRY(_nErr, _pfn(_h, (const dsptensor*)_in0, (const dsptensor*)_in1, (dsptensor*)_rout2)); + _TRY(_nErr, _skel_pack((_praROutPost + 0), _ppraROutPost, ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); + _QAIC_CATCH(_nErr) {} + _allocator_deinit(_al); + return _nErr; +} +static __inline int _skel_method_1(int (*_pfn)(remote_handle64, int32, int32, int32, int32), remote_handle64 _h, uint32_t _sc, remote_arg* _pra) { + remote_arg* _praEnd = 0; + uint32_t _in0[1] = {0}; + uint32_t _in1[1] = {0}; + uint32_t _in2[1] = {0}; + uint32_t _in3[1] = {0}; + uint32_t* _primIn= 0; + int _nErr = 0; + _praEnd = ((_pra + REMOTE_SCALARS_INBUFS(_sc)) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc) + REMOTE_SCALARS_OUTHANDLES(_sc)); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INBUFS(_sc)==1); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTBUFS(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INHANDLES(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTHANDLES(_sc)==0); + _QAIC_ASSERT(_nErr, (_pra + ((1 + 0) + (((0 + 0) + 0) + 0))) <= _praEnd); + _QAIC_ASSERT(_nErr, _pra[0].buf.nLen >= 12); + _primIn = _pra[0].buf.pv; + _COPY(_in0, 0, _primIn, 0, 4); + _COPY(_in1, 0, _primIn, 4, 4); + _COPY(_in2, 0, _primIn, 8, 4); + _COPY(_in3, 0, _primIn, 12, 4); + _TRY(_nErr, _pfn(_h, (int32)*_in0, (int32)*_in1, (int32)*_in2, (int32)*_in3)); + _QAIC_CATCH(_nErr) {} + return _nErr; +} +static __inline int _skel_method_2(int (*_pfn)(remote_handle64), uint32_t _sc, remote_arg* _pra) { + remote_arg* _praEnd = 0; + remote_handle64 _in0[1] = {0}; + remote_arg* _praRHandleIn = _pra + REMOTE_SCALARS_INBUFS(_sc) + REMOTE_SCALARS_OUTBUFS(_sc); + int _nErr = 0; + _praEnd = ((_pra + REMOTE_SCALARS_INBUFS(_sc)) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc) + REMOTE_SCALARS_OUTHANDLES(_sc)); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INBUFS(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTBUFS(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INHANDLES(_sc)==1); + 
_QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTHANDLES(_sc)==0);
+ _QAIC_ASSERT(_nErr, (_pra + ((0 + 0) + (((1 + 0) + 0) + 0))) <= _praEnd);
+ _COPY(_in0, 0, &(_praRHandleIn[0].h64), 0, sizeof(remote_handle64));
+ _TRY(_nErr, _pfn((remote_handle64)*_in0));
+ _QAIC_CATCH(_nErr) {}
+ return _nErr;
+}
+// parses "major.minor.patch" from both strings; *result is 1 when the stub
+// version is not newer than the skel version, and -1 otherwise
+static __inline int _compare_versions(char* stub_ver, char* skel_ver, int* result) {
+ unsigned long int major_stub = 0, minor_stub = 0, patch_stub = 0;
+ unsigned long int major_skel = 0, minor_skel = 0, patch_skel = 0;
+ char *saveptr1 = NULL;
+ char *token1 = NULL;
+ char *saveptr2 = NULL;
+ char *token2 = NULL;
+ int i = 0;
+ for (i=0, token1 = strtok_r(stub_ver, ".", &saveptr1); i<3 && token1 != NULL; i++, token1 = strtok_r(NULL, ".", &saveptr1))
+ {
+  unsigned long int tn = strtoul(token1, NULL, 10);
+  if (tn > 999)
+  {
+   *result = -1;
+   return 0;
+  }
+  else
+  {
+   if(i==0) major_stub = tn;
+   if(i==1) minor_stub = tn;
+   if(i==2) patch_stub = tn;
+  }
+ }
+ for (i=0, token2 = strtok_r(skel_ver, ".", &saveptr2); i<3 && token2 != NULL; i++, token2 = strtok_r(NULL, ".", &saveptr2))
+ {
+  unsigned long int tn = strtoul(token2, NULL, 10);
+  if (tn > 999)
+  {
+   *result = -1;
+   return 0;
+  }
+  else
+  {
+   if(i==0) major_skel = tn;
+   if(i==1) minor_skel = tn;
+   if(i==2) patch_skel = tn;
+  }
+ }
+ if(major_stub < major_skel)
+ {
+  *result = 1;
+  return 0;
+ }
+ else if(major_stub == major_skel)
+ {
+  if((minor_stub < minor_skel) || (minor_stub == minor_skel && patch_skel >= patch_stub))
+  {
+   *result = 1;
+   return 0;
+  }
+ }
+ *result = -1;
+ return 0;
+}
+static __inline int _stub_skel_version_check(char* _in0, int* resVal) {
+ int _nErr = 0;
+ char* p = strstr(_in0, "_idlver=");
+ if(!p)
+ {
+  *resVal = -1;
+  return 0;
+ }
+ p += 8;
+ int i=0, len=0, comVer=0, num_delimit=0, updtInxStub=0, updtInxSkel=0;
+ for(i=0; i<strlen(p); i++)
+ {
+  if(num_delimit > 2)
+  {
+   *resVal = -1;
+   return 0;
+  }
+  if ((p[i]>='0' && p[i]<='9') || (p[i]=='.'))
+  {
+   len++;
+   if(p[i]=='.')
+   {
+    num_delimit++;
+   }
+  }
+  else if(p[i]=='&')
+  {
+   break;
+  }
+  else
+  {
+   *resVal = -1;
+   return 0;
+  }
+ }
+ char* stubVer = (char*)MALLOC(len+1);
+ _QAIC_ASSERT(_nErr, stubVer!=NULL);
+ for(i=0; i<len; i++)
+ {
+  if ((p[i]>='0' && p[i]<='9') || (p[i]=='.'))
+  {
+   stubVer[updtInxStub] = p[i];
+   updtInxStub++;
+  }
+  else if(p[i]=='&')
+  {
+   break;
+  }
+ }
+ stubVer[len] = '\0';
+ char* skelVer = (char*)MALLOC(strlen(IDL_VERSION)+1);
+ _QAIC_ASSERT(_nErr, skelVer!=NULL);
+ for(i=0; i< strlen(IDL_VERSION); i++)
+ {
+  skelVer[updtInxSkel] = IDL_VERSION[i];
+  updtInxSkel++;
+ }
+ skelVer[strlen(IDL_VERSION)] = '\0';
+ _TRY(_nErr, _compare_versions(stubVer, skelVer, &comVer));
+ *resVal = 0;
+ if (comVer==-1)
+ {
+  *resVal = -1;
+ }
+ FREE(stubVer);
+ FREE(skelVer);
+ _QAIC_CATCH(_nErr) {}
+ return 0;
+}
+static __inline int _skel_method_3(int (*_pfn)(const char*, remote_handle64*), uint32_t _sc, remote_arg* _pra) {
+ remote_arg* _praEnd = 0;
+ char* _in0[1] = {0};
+ uint32_t _in0Len[1] = {0};
+ remote_handle64 _rout1[1] = {0};
+ uint32_t* _primIn = 0;
+ remote_arg* _praRHandleROut = _pra + REMOTE_SCALARS_INBUFS(_sc) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc);
+ remote_arg* _praIn = 0;
+ int _nErr = 0;
+ _praEnd = ((_pra + REMOTE_SCALARS_INBUFS(_sc)) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc) + REMOTE_SCALARS_OUTHANDLES(_sc));
+ _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INBUFS(_sc)==2);
+ _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTBUFS(_sc)==0);
+ _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INHANDLES(_sc)==0);
+ _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTHANDLES(_sc)==1);
+ _QAIC_ASSERT(_nErr, (_pra + ((2 + 0) + (((0 + 1) + 0) + 0))) <= _praEnd);
+ _QAIC_ASSERT(_nErr, _pra[0].buf.nLen >= 4);
+ _primIn = _pra[0].buf.pv;
+ _COPY(_in0Len, 0, _primIn, 0, 4);
+ _praIn = (_pra + 1);
+ _QAIC_ASSERT(_nErr,
((_praIn[0].buf.nLen / 1)) >= (size_t)(_in0Len[0])); + _in0[0] = _praIn[0].buf.pv; + _QAIC_ASSERT(_nErr, (_in0Len[0] > 0) && (_in0[0][(_in0Len[0] - 1)] == 0)); + int resVal; + _TRY(_nErr, _stub_skel_version_check(*_in0, &resVal)); + if(resVal==-1) + { + return AEE_ESTUBSKELVERMISMATCH; + } + _TRY(_nErr, _pfn((const char*)*_in0, (remote_handle64*)_rout1)); + _COPY(&(_praRHandleROut[0].h64), 0, _rout1, 0, sizeof(remote_handle64)); + _QAIC_CATCH(_nErr) {} + return _nErr; +} +__QAIC_SKEL_EXPORT int __QAIC_SKEL(ggmldsp_skel_handle_invoke)(remote_handle64 _h, uint32_t _sc, remote_arg* _pra) __QAIC_SKEL_ATTRIBUTE { + switch(REMOTE_SCALARS_METHOD(_sc)){ + case 0: + return _skel_method_3(__QAIC_IMPL(ggmlop_dsp_open), _sc, _pra); + case 1: + return _skel_method_2(__QAIC_IMPL(ggmlop_dsp_close), _sc, _pra); + case 2: + return _skel_method_1(__QAIC_IMPL(ggmlop_dsp_setclocks), _h, _sc, _pra); + case 3: + return _skel_method(__QAIC_IMPL(ggmlop_dsp_add), _h, _sc, _pra); + case 4: + return _skel_method(__QAIC_IMPL(ggmlop_dsp_mulmat), _h, _sc, _pra); + case 5: + return _skel_method(__QAIC_IMPL(ggmlop_dsp_softmax), _h, _sc, _pra); + case 6: + return _skel_method(__QAIC_IMPL(ggmlop_dsp_rmsnorm), _h, _sc, _pra); + case 7: + return _skel_method(__QAIC_IMPL(ggmlop_dsp_pool2d), _h, _sc, _pra); + } + return AEE_EUNSUPPORTED; +} diff --git a/ggml/src/ggml-hexagon/kernels/skel.h b/ggml/src/ggml-hexagon/kernels/skel.h new file mode 100644 index 0000000000000..f77e8101d14df --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/skel.h @@ -0,0 +1,285 @@ +#ifndef _SKEL_H +#define _SKEL_H +//qidl copyright +//qidl nested=false +#include +#include +#include +#include + + +#ifndef __QAIC_HEADER +#define __QAIC_HEADER(ff) ff +#endif //__QAIC_HEADER + +#ifndef __QAIC_HEADER_EXPORT +#define __QAIC_HEADER_EXPORT +#endif // __QAIC_HEADER_EXPORT + +#ifndef __QAIC_HEADER_ATTRIBUTE +#define __QAIC_HEADER_ATTRIBUTE +#endif // __QAIC_HEADER_ATTRIBUTE + +#ifndef __QAIC_IMPL +#define __QAIC_IMPL(ff) ff +#endif //__QAIC_IMPL + +#ifndef __QAIC_IMPL_EXPORT +#define __QAIC_IMPL_EXPORT +#endif // __QAIC_IMPL_EXPORT + +#ifndef __QAIC_IMPL_ATTRIBUTE +#define __QAIC_IMPL_ATTRIBUTE +#endif // __QAIC_IMPL_ATTRIBUTE +#ifndef _QAIC_ENV_H +#define _QAIC_ENV_H + +#include +#ifdef _WIN32 +#include "qtest_stdlib.h" +#else +#define MALLOC malloc +#define FREE free +#endif + +#ifdef __GNUC__ +#ifdef __clang__ +#pragma GCC diagnostic ignored "-Wunknown-pragmas" +#else +#pragma GCC diagnostic ignored "-Wpragmas" +#endif +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wunused-function" +#endif + +#ifndef _ATTRIBUTE_UNUSED + +#ifdef _WIN32 +#define _ATTRIBUTE_UNUSED +#else +#define _ATTRIBUTE_UNUSED __attribute__ ((unused)) +#endif + +#endif // _ATTRIBUTE_UNUSED + +#ifndef _ATTRIBUTE_VISIBILITY + +#ifdef _WIN32 +#define _ATTRIBUTE_VISIBILITY +#else +#define _ATTRIBUTE_VISIBILITY __attribute__ ((visibility("default"))) +#endif + +#endif // _ATTRIBUTE_VISIBILITY + +#ifndef __QAIC_REMOTE +#define __QAIC_REMOTE(ff) ff +#endif //__QAIC_REMOTE + +#ifndef __QAIC_HEADER +#define __QAIC_HEADER(ff) ff +#endif //__QAIC_HEADER + +#ifndef __QAIC_HEADER_EXPORT +#define __QAIC_HEADER_EXPORT +#endif // __QAIC_HEADER_EXPORT + +#ifndef __QAIC_HEADER_ATTRIBUTE +#define __QAIC_HEADER_ATTRIBUTE +#endif // __QAIC_HEADER_ATTRIBUTE + +#ifndef __QAIC_IMPL +#define __QAIC_IMPL(ff) ff +#endif //__QAIC_IMPL + +#ifndef __QAIC_IMPL_EXPORT +#define __QAIC_IMPL_EXPORT +#endif // __QAIC_IMPL_EXPORT + 
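+// the __QAIC_* wrappers above and below are build-time customization hooks:
+// predefining one renames every matching generated symbol without editing
+// this file, e.g. (hypothetical prefix) compiling skel.c with
+// -D'__QAIC_IMPL(ff)=my_##ff' makes the skel dispatcher call
+// my_ggmlop_dsp_open() and friends instead of ggmlop_dsp_open()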
+#ifndef __QAIC_IMPL_ATTRIBUTE +#define __QAIC_IMPL_ATTRIBUTE +#endif // __QAIC_IMPL_ATTRIBUTE + +#ifndef __QAIC_STUB +#define __QAIC_STUB(ff) ff +#endif //__QAIC_STUB + +#ifndef __QAIC_STUB_EXPORT +#define __QAIC_STUB_EXPORT +#endif // __QAIC_STUB_EXPORT + +#ifndef __QAIC_STUB_ATTRIBUTE +#define __QAIC_STUB_ATTRIBUTE +#endif // __QAIC_STUB_ATTRIBUTE + +#ifndef __QAIC_SKEL +#define __QAIC_SKEL(ff) ff +#endif //__QAIC_SKEL__ + +#ifndef __QAIC_SKEL_EXPORT +#define __QAIC_SKEL_EXPORT +#endif // __QAIC_SKEL_EXPORT + +#ifndef __QAIC_SKEL_ATTRIBUTE +#define __QAIC_SKEL_ATTRIBUTE +#endif // __QAIC_SKEL_ATTRIBUTE + +#ifdef __QAIC_DEBUG__ + #ifndef __QAIC_DBG_PRINTF__ + #include + #define __QAIC_DBG_PRINTF__( ee ) do { printf ee ; } while(0) + #endif +#else + #define __QAIC_DBG_PRINTF__( ee ) (void)0 +#endif + + +#define _OFFSET(src, sof) ((void*)(((char*)(src)) + (sof))) + +#define _COPY(dst, dof, src, sof, sz) \ + do {\ + struct __copy { \ + char ar[sz]; \ + };\ + *(struct __copy*)_OFFSET(dst, dof) = *(struct __copy*)_OFFSET(src, sof);\ + } while (0) + +#define _COPYIF(dst, dof, src, sof, sz) \ + do {\ + if(_OFFSET(dst, dof) != _OFFSET(src, sof)) {\ + _COPY(dst, dof, src, sof, sz); \ + } \ + } while (0) + +_ATTRIBUTE_UNUSED +static __inline void _qaic_memmove(void* dst, void* src, int size) { + int i = 0; + for(i = 0; i < size; ++i) { + ((char*)dst)[i] = ((char*)src)[i]; + } +} + +#define _MEMMOVEIF(dst, src, sz) \ + do {\ + if(dst != src) {\ + _qaic_memmove(dst, src, sz);\ + } \ + } while (0) + + +#define _ASSIGN(dst, src, sof) \ + do {\ + dst = OFFSET(src, sof); \ + } while (0) + +#define _STD_STRLEN_IF(str) (str == 0 ? 0 : strlen(str)) + +#include "AEEStdErr.h" + +#ifdef _WIN32 +#define _QAIC_FARF(level, msg, ...) (void)0 +#else +#define _QAIC_FARF(level, msg, ...) (void)0 +#endif //_WIN32 for _QAIC_FARF + +#define _TRY(ee, func) \ + do { \ + if (AEE_SUCCESS != ((ee) = func)) {\ + __QAIC_DBG_PRINTF__((__FILE__ ":%d:error:%d:%s\n", __LINE__, (int)(ee),#func));\ + goto ee##bail;\ + } \ + } while (0) + +#define _TRY_FARF(ee, func) \ + do { \ + if (AEE_SUCCESS != ((ee) = func)) {\ + goto ee##farf##bail;\ + } \ + } while (0) + +#define _QAIC_CATCH(exception) exception##bail: if (exception != AEE_SUCCESS) + +#define _CATCH_FARF(exception) exception##farf##bail: if (exception != AEE_SUCCESS) + +#define _QAIC_ASSERT(nErr, ff) _TRY(nErr, 0 == (ff) ? AEE_EBADPARM : AEE_SUCCESS) + +#ifdef __QAIC_DEBUG__ +#define _QAIC_ALLOCATE(nErr, pal, size, alignment, pv) _TRY(nErr, _allocator_alloc(pal, __FILE_LINE__, size, alignment, (void**)&pv));\ + _QAIC_ASSERT(nErr,pv || !(size)) +#else +#define _QAIC_ALLOCATE(nErr, pal, size, alignment, pv) _TRY(nErr, _allocator_alloc(pal, 0, size, alignment, (void**)&pv));\ + _QAIC_ASSERT(nErr,pv || !(size)) +#endif + + +#endif // _QAIC_ENV_H + +#ifdef __cplusplus +extern "C" { +#endif +#if !defined(__QAIC_STRING1_OBJECT_DEFINED__) && !defined(__STRING1_OBJECT__) +#define __QAIC_STRING1_OBJECT_DEFINED__ +#define __STRING1_OBJECT__ +typedef struct _cstring1_s { + char* data; + int dataLen; +} _cstring1_t; + +#endif /* __QAIC_STRING1_OBJECT_DEFINED__ */ +/// Enabling stub-skel mismatch check feature in the auto-gen files. +/// Please refer to the IDL documentation for more details on the feature. +/// It is fully supported only on Kailua and later targets. 
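+/// For example, the stub advertises itself with a uri of the form
+/// "file:///libggmldsp-skel.so?ggmldsp_skel_handle_invoke&_modver=1.0&_idlver=0.0.1"
+/// (see ggmldsp_skel_handle_invoke_uri in skel.c); on open, _stub_skel_version_check()
+/// parses the _idlver key and fails the call with AEE_ESTUBSKELVERMISMATCH when the
+/// stub's idl version is newer than this skel's IDL_VERSION below.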
+#define IDL_VERSION "0.0.1" +typedef struct dsptensor dsptensor; +struct dsptensor { + int32_t type; + int32_t ne[4]; + int32_t nb[4]; + int32_t op; + int32_t op_params[16]; + int32_t flags; + void * data; + int data_len; +}; +/** + * Opens the handle in the specified domain. If this is the first + * handle, this creates the session. Typically this means opening + * the device, aka open("/dev/adsprpc-smd"), then calling ioctl + * device APIs to create a PD on the DSP to execute our code in, + * then asking that PD to dlopen the .so and dlsym the skel function. + * + * @param uri, _URI"&_dom=aDSP" + * _URI is a QAIC generated uri, or + * "file:///?_skel_handle_invoke&_modver=1.0" + * If the _dom parameter is not present, _dom=DEFAULT is assumed + * but not forwarded. + * Reserved uri keys: + * [0]: first unamed argument is the skel invoke function + * _dom: execution domain name, _dom=mDSP/aDSP/DEFAULT + * _modver: module version, _modver=1.0 + * _*: any other key name starting with an _ is reserved + * Unknown uri keys/values are forwarded as is. + * @param h, resulting handle + * @retval, 0 on success + */ +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_open)(const char* uri, remote_handle64* h) __QAIC_HEADER_ATTRIBUTE; +/** + * Closes a handle. If this is the last handle to close, the session + * is closed as well, releasing all the allocated resources. + + * @param h, the handle to close + * @retval, 0 on success, should always succeed + */ +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_close)(remote_handle64 h) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT AEEResult __QAIC_HEADER(ggmlop_dsp_setclocks)(remote_handle64 _h, int32 power_level, int32 latency, int32 mulmat_algotype, int32 thread_counts) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_add)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_mulmat)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_softmax)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_rmsnorm)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_pool2d)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; + +#ifdef __cplusplus +} +#endif +#endif //_SKEL_H diff --git a/ggml/src/ggml-hexagon/kernels/stub.c b/ggml/src/ggml-hexagon/kernels/stub.c new file mode 100644 index 0000000000000..7936c43cd6d77 --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/stub.c @@ -0,0 +1,463 @@ +//qidl copyright +//qidl nested=false +#include "skel.h" +#include +#ifndef _WIN32 +#include "HAP_farf.h" +#include +#endif //_WIN32 for HAP_farf +#ifndef _ALLOCATOR_H +#define _ALLOCATOR_H + +#include +#include + +typedef struct _heap _heap; +struct _heap { + _heap* pPrev; + const char* loc; + uint64_t buf; +}; + +typedef struct _allocator { + _heap* pheap; + uint8_t* stack; + uint8_t* stackEnd; + int nSize; +} _allocator; + +_ATTRIBUTE_UNUSED +static __inline int _heap_alloc(_heap** ppa, const char* loc, int size, void** ppbuf) { + _heap* pn = 0; + pn = MALLOC((size_t)size + sizeof(_heap) - sizeof(uint64_t)); + if(pn != 0) { + pn->pPrev = *ppa; + pn->loc = loc; + *ppa = pn; + 
*ppbuf = (void*)&(pn->buf); + return 0; + } else { + return -1; + } +} +#define _ALIGN_SIZE(x, y) (((x) + (y-1)) & ~(y-1)) + +_ATTRIBUTE_UNUSED +static __inline int _allocator_alloc(_allocator* me, + const char* loc, + int size, + unsigned int al, + void** ppbuf) { + if(size < 0) { + return -1; + } else if (size == 0) { + *ppbuf = 0; + return 0; + } + if((_ALIGN_SIZE((uintptr_t)me->stackEnd, al) + (size_t)size) < (uintptr_t)me->stack + (size_t)me->nSize) { + *ppbuf = (uint8_t*)_ALIGN_SIZE((uintptr_t)me->stackEnd, al); + me->stackEnd = (uint8_t*)_ALIGN_SIZE((uintptr_t)me->stackEnd, al) + size; + return 0; + } else { + return _heap_alloc(&me->pheap, loc, size, ppbuf); + } +} + +_ATTRIBUTE_UNUSED +static __inline void _allocator_deinit(_allocator* me) { + _heap* pa = me->pheap; + while(pa != 0) { + _heap* pn = pa; + const char* loc = pn->loc; + (void)loc; + pa = pn->pPrev; + FREE(pn); + } +} + +_ATTRIBUTE_UNUSED +static __inline void _allocator_init(_allocator* me, uint8_t* stack, int stackSize) { + me->stack = stack; + me->stackEnd = stack + stackSize; + me->nSize = stackSize; + me->pheap = 0; +} + + +#endif // _ALLOCATOR_H + +#ifndef SLIM_H +#define SLIM_H + +#include + +//a C data structure for the idl types that can be used to implement +//static and dynamic language bindings fairly efficiently. +// +//the goal is to have a minimal ROM and RAM footprint and without +//doing too many allocations. A good way to package these things seemed +//like the module boundary, so all the idls within one module can share +//all the type references. + + +#define PARAMETER_IN 0x0 +#define PARAMETER_OUT 0x1 +#define PARAMETER_INOUT 0x2 +#define PARAMETER_ROUT 0x3 +#define PARAMETER_INROUT 0x4 + +//the types that we get from idl +#define TYPE_OBJECT 0x0 +#define TYPE_INTERFACE 0x1 +#define TYPE_PRIMITIVE 0x2 +#define TYPE_ENUM 0x3 +#define TYPE_STRING 0x4 +#define TYPE_WSTRING 0x5 +#define TYPE_STRUCTURE 0x6 +#define TYPE_UNION 0x7 +#define TYPE_ARRAY 0x8 +#define TYPE_SEQUENCE 0x9 + +//these require the pack/unpack to recurse +//so it's a hint to those languages that can optimize in cases where +//recursion isn't necessary. +#define TYPE_COMPLEX_STRUCTURE (0x10 | TYPE_STRUCTURE) +#define TYPE_COMPLEX_UNION (0x10 | TYPE_UNION) +#define TYPE_COMPLEX_ARRAY (0x10 | TYPE_ARRAY) +#define TYPE_COMPLEX_SEQUENCE (0x10 | TYPE_SEQUENCE) + + +typedef struct Type Type; + +#define INHERIT_TYPE\ + int32_t nativeSize; /*in the simple case its the same as wire size and alignment*/\ + union {\ + struct {\ + const uintptr_t p1;\ + const uintptr_t p2;\ + } _cast;\ + struct {\ + uint32_t iid;\ + uint32_t bNotNil;\ + } object;\ + struct {\ + const Type *arrayType;\ + int32_t nItems;\ + } array;\ + struct {\ + const Type *seqType;\ + int32_t nMaxLen;\ + } seqSimple; \ + struct {\ + uint32_t bFloating;\ + uint32_t bSigned;\ + } prim; \ + const SequenceType* seqComplex;\ + const UnionType *unionType;\ + const StructType *structType;\ + int32_t stringMaxLen;\ + uint8_t bInterfaceNotNil;\ + } param;\ + uint8_t type;\ + uint8_t nativeAlignment\ + +typedef struct UnionType UnionType; +typedef struct StructType StructType; +typedef struct SequenceType SequenceType; +struct Type { + INHERIT_TYPE; +}; + +struct SequenceType { + const Type * seqType; + uint32_t nMaxLen; + uint32_t inSize; + uint32_t routSizePrimIn; + uint32_t routSizePrimROut; +}; + +//byte offset from the start of the case values for +//this unions case value array. 
it MUST be aligned +//at the alignment requrements for the descriptor +// +//if negative it means that the unions cases are +//simple enumerators, so the value read from the descriptor +//can be used directly to find the correct case +typedef union CaseValuePtr CaseValuePtr; +union CaseValuePtr { + const uint8_t* value8s; + const uint16_t* value16s; + const uint32_t* value32s; + const uint64_t* value64s; +}; + +//these are only used in complex cases +//so I pulled them out of the type definition as references to make +//the type smaller +struct UnionType { + const Type *descriptor; + uint32_t nCases; + const CaseValuePtr caseValues; + const Type * const *cases; + int32_t inSize; + int32_t routSizePrimIn; + int32_t routSizePrimROut; + uint8_t inAlignment; + uint8_t routAlignmentPrimIn; + uint8_t routAlignmentPrimROut; + uint8_t inCaseAlignment; + uint8_t routCaseAlignmentPrimIn; + uint8_t routCaseAlignmentPrimROut; + uint8_t nativeCaseAlignment; + uint8_t bDefaultCase; +}; + +struct StructType { + uint32_t nMembers; + const Type * const *members; + int32_t inSize; + int32_t routSizePrimIn; + int32_t routSizePrimROut; + uint8_t inAlignment; + uint8_t routAlignmentPrimIn; + uint8_t routAlignmentPrimROut; +}; + +typedef struct Parameter Parameter; +struct Parameter { + INHERIT_TYPE; + uint8_t mode; + uint8_t bNotNil; +}; + +#define SLIM_IFPTR32(is32,is64) (sizeof(uintptr_t) == 4 ? (is32) : (is64)) +#define SLIM_SCALARS_IS_DYNAMIC(u) (((u) & 0x00ffffff) == 0x00ffffff) + +typedef struct Method Method; +struct Method { + uint32_t uScalars; //no method index + int32_t primInSize; + int32_t primROutSize; + int maxArgs; + int numParams; + const Parameter * const *params; + uint8_t primInAlignment; + uint8_t primROutAlignment; +}; + +typedef struct Interface Interface; + +struct Interface { + int nMethods; + const Method * const *methodArray; + int nIIds; + const uint32_t *iids; + const uint16_t* methodStringArray; + const uint16_t* methodStrings; + const char* strings; +}; + + +#endif //SLIM_H + + +#ifndef _GGMLOP_SLIM_H +#define _GGMLOP_SLIM_H +#include + +#ifndef __QAIC_SLIM +#define __QAIC_SLIM(ff) ff +#endif +#ifndef __QAIC_SLIM_EXPORT +#define __QAIC_SLIM_EXPORT +#endif + +static const Type types[5]; +static const Type* const typeArrays[7] = {&(types[0]),&(types[1]),&(types[1]),&(types[0]),&(types[2]),&(types[0]),&(types[3])}; +static const StructType structTypes[1] = {{0x7,&(typeArrays[0]),0x70,0x4,0x6c,0x4,0x4,0x4}}; +static const Type types[5] = {{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4},{0x10,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x4}}, 8,0x4},{0x40,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x10}}, 8,0x4},{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)&(types[4]),(const uintptr_t)0x0}}, 9,SLIM_IFPTR32(0x4,0x8)},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4}}; +static const Parameter parameters[6] = {{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),0,0},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4,0,0},{SLIM_IFPTR32(0x74,0x80),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x74,0x80),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),3,0}}; +static const Parameter* const parameterArrays[9] = 
{(&(parameters[4])),(&(parameters[4])),(&(parameters[5])),(&(parameters[3])),(&(parameters[3])),(&(parameters[3])),(&(parameters[0])),(&(parameters[1])),(&(parameters[2]))}; +static const Method methods[4] = {{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[6])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[8])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x1,0x0,0x0,0x0),0xc,0x0,3,3,(&(parameterArrays[3])),0x4,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0xe4,0x6c,3,3,(&(parameterArrays[0])),0x4,0x4}}; +static const Method* const methodArrays[8] = {&(methods[0]),&(methods[1]),&(methods[2]),&(methods[3]),&(methods[3]),&(methods[3]),&(methods[3]),&(methods[3])}; +static const char strings[167] = "dsp_setclocks\0dsp_rmsnorm\0dsp_softmax\0dcvs_enable\0power_level\0dsp_pool2d\0dsp_mulmat\0op_params\0dsp_add\0latency\0flags\0close\0src1\0data\0type\0src0\0open\0dst\0uri\0op\0nb\0ne\0h\0"; +static const uint16_t methodStrings[134] = {62,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,14,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,26,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,73,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,94,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,0,50,102,38,142,151,164,116,164}; +static const uint16_t methodStringsArrays[8] = {129,132,125,100,75,50,25,0}; +__QAIC_SLIM_EXPORT const Interface __QAIC_SLIM(ggmlop_slim) = {8,&(methodArrays[0]),0,0,&(methodStringsArrays [0]),methodStrings,strings}; +#endif //_GGMLOP_SLIM_H + + +#ifdef __cplusplus +extern "C" { +#endif +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_open)(const char* uri, remote_handle64* h) __QAIC_STUB_ATTRIBUTE { + return __QAIC_REMOTE(remote_handle64_open)(uri, h); +} +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_close)(remote_handle64 h) __QAIC_STUB_ATTRIBUTE { + return __QAIC_REMOTE(remote_handle64_close)(h); +} +static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint32_t _in0[1], uint32_t _in1[1], uint32_t _in2[1], uint32_t _in3[1]) { + remote_arg _pra[1] = {0}; + uint32_t _primIn[4]= {0}; + int _nErr = 0; + _pra[0].buf.pv = (void*)_primIn; + _pra[0].buf.nLen = sizeof(_primIn); + _COPY(_primIn, 0, _in0, 0, 4); + _COPY(_primIn, 4, _in1, 0, 4); + _COPY(_primIn, 8, _in2, 0, 4); + _COPY(_primIn, 12,_in3, 0, 4); + _TRY_FARF(_nErr, __QAIC_REMOTE(remote_handle64_invoke)(_handle, REMOTE_SCALARS_MAKEX(0, _mid, 1, 0, 0, 0), _pra)); + _CATCH_FARF(_nErr) { + _QAIC_FARF(RUNTIME_ERROR, "ERROR 0x%x: handle=0x%"PRIx64", scalar=0x%x, method ID=%d: %s failed\n", _nErr , _handle, REMOTE_SCALARS_MAKEX(0, _mid, 1, 0, 0, 0), _mid, __func__); + } + return _nErr; +} +__QAIC_STUB_EXPORT AEEResult __QAIC_STUB(ggmlop_dsp_setclocks)(remote_handle64 _handle, int32 power_level, int32 latency, int32 mulmat_algotype, int32 threads) __QAIC_STUB_ATTRIBUTE { + uint32_t _mid = 2; + return _stub_method(_handle, _mid, (uint32_t*)&power_level, (uint32_t*)&latency, (uint32_t*)&mulmat_algotype, (uint32_t*)&threads); +} +static __inline int _stub_unpack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], 
_ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { + int _nErr = 0; + remote_arg* _praROutPostStart = _praROutPost; + remote_arg** _ppraROutPostStart = _ppraROutPost; + _ppraROutPost = &_praROutPost; + _COPY(_rout0, 0, _primROut, 0, 4); + _COPY(_rout1, 0, _primROut, 4, 16); + _COPY(_rout2, 0, _primROut, 20, 16); + _COPY(_rout3, 0, _primROut, 36, 4); + _COPY(_rout4, 0, _primROut, 40, 64); + _COPY(_rout5, 0, _primROut, 104, 4); + _ppraROutPostStart[0] += (_praROutPost - _praROutPostStart) +1; + return _nErr; +} +static __inline int _stub_pack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { + int _nErr = 0; + remote_arg* _praInStart = _praIn; + remote_arg** _ppraInStart = _ppraIn; + remote_arg* _praROutStart = _praROut; + remote_arg** _ppraROutStart = _ppraROut; + _ppraIn = &_praIn; + _ppraROut = &_praROut; + _COPY(_primIn, 0, _rout6Len, 0, 4); + _praROut[0].buf.pv = _rout6[0]; + _praROut[0].buf.nLen = (4 * _rout6Len[0]); + _ppraInStart[0] += (_praIn - _praInStart) + 0; + _ppraROutStart[0] += (_praROut - _praROutStart) +1; + return _nErr; +} +static __inline int _stub_pack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint32_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[16], _ATTRIBUTE_UNUSED uint32_t _in5[1], _ATTRIBUTE_UNUSED char* _in6[1], _ATTRIBUTE_UNUSED uint32_t _in6Len[1]) { + int _nErr = 0; + remote_arg* _praInStart = _praIn; + remote_arg** _ppraInStart = _ppraIn; + remote_arg* _praROutStart = _praROut; + remote_arg** _ppraROutStart = _ppraROut; + _ppraIn = &_praIn; + _ppraROut = &_praROut; + _COPY(_primIn, 0, _in0, 0, 4); + _COPY(_primIn, 4, _in1, 0, 16); + _COPY(_primIn, 20, _in2, 0, 16); + _COPY(_primIn, 36, _in3, 0, 4); + _COPY(_primIn, 40, _in4, 0, 64); + _COPY(_primIn, 104, _in5, 0, 4); + _COPY(_primIn, 108, _in6Len, 0, 4); + _praIn[0].buf.pv = (void*) _in6[0]; + _praIn[0].buf.nLen = (4 * _in6Len[0]); + _ppraInStart[0] += (_praIn - _praInStart) + 1; + _ppraROutStart[0] += (_praROut - _praROutStart) +0; + return _nErr; +} +static __inline void _count(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t 
_rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { + _numIn[0] += 0; + _numROut[0] += 1; + _numInH[0] += 0; + _numROutH[0] += 0; +} +static __inline void _count_1(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint32_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[16], _ATTRIBUTE_UNUSED uint32_t _in5[1], _ATTRIBUTE_UNUSED char* _in6[1], _ATTRIBUTE_UNUSED uint32_t _in6Len[1]) { + _numIn[0] += 1; + _numROut[0] += 0; + _numInH[0] += 0; + _numROutH[0] += 0; +} +static __inline int _stub_method_1(remote_handle64 _handle, uint32_t _mid, uintptr_t _in0[SLIM_IFPTR32(29, 16)], uintptr_t _in1[SLIM_IFPTR32(29, 16)], uintptr_t _rout2[SLIM_IFPTR32(29, 16)]) { + remote_arg* _pra = 0; + int _numIn[1] = {0}; + int _numROut[1] = {0}; + int _numInH[1] = {0}; + int _numROutH[1] = {0}; + _allocator _al[1] = {{0}}; + uint32_t _primIn[57]= {0}; + uint32_t _primROut[27]= {0}; + remote_arg* _praIn = 0; + remote_arg* _praROut = 0; + remote_arg* _praROutPost = 0; + remote_arg** _ppraROutPost = &_praROutPost; + remote_arg** _ppraIn = &_praIn; + remote_arg** _ppraROut = &_praROut; + remote_arg* _praHIn = 0; + remote_arg** _ppraHIn = &_praHIn; + remote_arg* _praHROut = 0; + remote_arg** _ppraHROut = &_praHROut; + int _nErr = 0; + _numIn[0] = 0; + _numROut[0] = 0; + _numInH[0] = 0; + _numROutH[0] = 0; + _count_1(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), (uint32_t*)&(((uint32_t*)_in0)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[27]), (char**)&(((uint64_t*)_in0)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[28]), (uint32_t*)&(((uint32_t*)_in0)[30]))); + _count_1(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), (uint32_t*)&(((uint32_t*)_in1)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[27]), (char**)&(((uint64_t*)_in1)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[28]), (uint32_t*)&(((uint32_t*)_in1)[30]))); + _count(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30]))); + if(_numIn[0]>=255){ + return AEE_EUNSUPPORTED; + } + if(_numROut[0]>=255){ + return AEE_EUNSUPPORTED; + } + _allocator_init(_al, 0, 0); + _QAIC_ALLOCATE(_nErr, _al, ((((((((_numIn[0] + _numROut[0]) + _numInH[0]) + _numROutH[0]) + 1) + 1) + 0) + 0) * sizeof(_pra[0])), 4, _pra); + _QAIC_ASSERT(_nErr, _pra); + _pra[0].buf.pv = (void*)_primIn; + _pra[0].buf.nLen = sizeof(_primIn); + _pra[(_numIn[0] + 1)].buf.pv = (void*)_primROut; + _pra[(_numIn[0] + 1)].buf.nLen = sizeof(_primROut); + _praIn = (_pra + 1); + _praROut = (_praIn + _numIn[0] + 1); + _praROutPost = _praROut; + if(_praHIn == 0) + { + _praHIn = ((_praROut + _numROut[0]) + 1); + } + if(_praHROut == 0) + (_praHROut = _praHIn + _numInH[0] + 0); + 
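+ // wire layout implied by the offsets below: _primIn (228 bytes, see
+ // uint32_t _primIn[57] above) carries two 112-byte dsptensor headers at
+ // byte offsets 0 and 112 (type, ne[4], nb[4], op, op_params[16], flags,
+ // data_len) plus dst's 4-byte data length at offset 224; dst's own
+ // 108-byte header comes back in _primROut. the scalars word passed to
+ // remote_handle64_invoke() packs the method id _mid together with these
+ // buffer counts, and the skel side recovers it with REMOTE_SCALARS_METHOD().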
_TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 0), 0, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), (uint32_t*)&(((uint32_t*)_in0)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[27]), (char**)&(((uint64_t*)_in0)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[28]), (uint32_t*)&(((uint32_t*)_in0)[30])))); + _TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 112), 0, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), (uint32_t*)&(((uint32_t*)_in1)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[27]), (char**)&(((uint64_t*)_in1)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[28]), (uint32_t*)&(((uint32_t*)_in1)[30])))); + _TRY(_nErr, _stub_pack(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 224), ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); + _QAIC_ASSERT(_nErr, (_numInH[0] + 0) <= 15); + _QAIC_ASSERT(_nErr, (_numROutH[0] + 0) <= 15); + _TRY_FARF(_nErr, __QAIC_REMOTE(remote_handle64_invoke)(_handle, REMOTE_SCALARS_MAKEX(0, _mid, (_numIn[0] + 1), (_numROut[0] + 1), (_numInH[0] + 0), (_numROutH[0] + 0)), _pra)); + _TRY(_nErr, _stub_unpack((_praROutPost + 0), _ppraROutPost, ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); + _QAIC_CATCH(_nErr) {} + _CATCH_FARF(_nErr) { + _QAIC_FARF(RUNTIME_ERROR, "ERROR 0x%x: handle=0x%"PRIx64", scalar=0x%x, method ID=%d: %s failed\n", _nErr , _handle, REMOTE_SCALARS_MAKEX(0, _mid, (_numIn[0] + 1), (_numROut[0] + 1), (_numInH[0] + 0), (_numROutH[0] + 0)), _mid, __func__); + } + _allocator_deinit(_al); + return _nErr; +} +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_add)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { + uint32_t _mid = 3; + return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); +} +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_mulmat)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { + uint32_t _mid = 4; + return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); +} +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_softmax)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { + uint32_t _mid = 5; + return _stub_method_1(_handle, 
_mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst);
+}
+__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_rmsnorm)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE {
+  uint32_t _mid = 6;
+  return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst);
+}
+__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_pool2d)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE {
+  uint32_t _mid = 7;
+  return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst);
+}
diff --git a/ggml/src/ggml-hexagon/kernels/worker_pool.cpp b/ggml/src/ggml-hexagon/kernels/worker_pool.cpp
new file mode 100755
index 0000000000000..8186edcf18a95
--- /dev/null
+++ b/ggml/src/ggml-hexagon/kernels/worker_pool.cpp
@@ -0,0 +1,475 @@
+/**=============================================================================
+
+@file
+   worker_pool.cpp
+
+@brief
+   Utility providing a multi-priority thread worker pool for
+   multi-threaded computer vision (or other compute) applications.
+
+Copyright (c) 2019-2020 Qualcomm Technologies Incorporated.
+All Rights Reserved. Qualcomm Proprietary and Confidential.
+
+Export of this technology or software is regulated by the U.S.
+Government. Diversion contrary to U.S. law prohibited.
+
+All ideas, data and information contained in or disclosed by
+this document are confidential and proprietary information of
+Qualcomm Technologies Incorporated and all rights therein are expressly reserved.
+By accepting this material the recipient agrees that this material
+and the information contained therein are held in confidence and in
+trust and will not be used, copied, reproduced in whole or in part,
+nor its contents revealed in any manner to others without the express
+written permission of Qualcomm Technologies Incorporated.
+
+=============================================================================**/
+
+/*===========================================================================
+    INCLUDE FILE
+===========================================================================*/
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include "worker_pool.h"
+
+#ifndef _DEBUG
+#define _DEBUG
+#endif
+#include "HAP_farf.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include "qurt.h"
+#include "hexagon_protos.h"
+
+void worker_pool_constructor(void) __attribute__((constructor));
+void worker_pool_destructor(void) __attribute__((destructor));
+
+#ifdef __cplusplus
+}
+#endif
+
+/*===========================================================================
+    DEFINE
+===========================================================================*/
+#define WORKER_THREAD_STACK_SZ (2 * 16384)
+#define WORKER_KILL_SIGNAL     31                    // signal to kill the worker threads
+#define NUM_JOB_SLOTS          (MAX_NUM_WORKERS + 1) // max queued jobs, slightly more than the number of workers
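+
+// Typical client usage of this pool (illustrative sketch only; my_job and
+// njobs are placeholders, not part of this file):
+//
+//     static void my_job(void* data) {
+//         worker_synctoken_t* token = (worker_synctoken_t*) data;
+//         // ... per-job work ...
+//         worker_pool_synctoken_jobdone(token);   // each job must release the token
+//     }
+//
+//     worker_pool_context_t ctx = NULL;
+//     worker_pool_init(&ctx);
+//     worker_synctoken_t token;
+//     worker_pool_synctoken_init(&token, njobs);
+//     for (unsigned int i = 0; i < njobs; i++) {
+//         worker_pool_job_t job = {my_job, &token};
+//         worker_pool_submit(ctx, job);
+//     }
+//     worker_pool_synctoken_wait(&token);          // returns once all njobs called jobdone
+//     worker_pool_deinit(&ctx);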
+#define LOWEST_USABLE_QURT_PRIO 254
+
+/*===========================================================================
+    TYPEDEF
+===========================================================================*/
+// internal structure kept in thread-local storage per instance of worker pool
+typedef struct
+{
+    qurt_anysignal_t empty_jobs;               // available job nodes
+    qurt_anysignal_t queued_jobs;              // jobs that are waiting for a worker
+    qurt_mutex_t empty_jobs_mutex;             // mutex for multiple threads trying to send a job
+    qurt_mutex_t queued_jobs_mutex;            // mutex for multiple threads trying to acquire a job
+    unsigned int job_queue_mask;               // mask for job queue nodes
+    unsigned int num_workers;                  // number of workers in this pool
+    worker_pool_job_t job[NUM_JOB_SLOTS];      // list of job descriptors
+    qurt_thread_t thread[MAX_NUM_WORKERS];     // thread IDs of the workers
+    void * stack[MAX_NUM_WORKERS];             // thread stack pointers
+} worker_pool_t;
+
+// internal structure containing OS primitives to sync caller with all its spawned jobs.
+typedef union
+{
+    worker_synctoken_t raw;
+    struct
+    {
+        unsigned int atomic_countdown;
+        unsigned int reserved;                 // reserved to align next element to 8 bytes
+        qurt_sem_t sem;
+    } sync;
+} internal_synctoken_t;
+
+/*===========================================================================
+    GLOBAL VARIABLES (per PD)
+===========================================================================*/
+// initialized in constructor
+unsigned int num_workers = 1;
+unsigned int num_hvx128_contexts = 0;
+
+/*===========================================================================
+    STATIC VARIABLES
+===========================================================================*/
+
+static worker_pool_context_t static_context = NULL;
+
+/*===========================================================================
+    LOCAL FUNCTION
+===========================================================================*/
+// the main workloop for each of the worker threads.
+static void worker_pool_main(void* context)
+{
+    // local pointer to owning pool's context
+    worker_pool_t *me = (worker_pool_t *) context;
+
+    // some local vars to reduce dereferencing inside loop
+    qurt_anysignal_t *signal = &me->queued_jobs;
+    unsigned int mask = me->job_queue_mask;
+    qurt_mutex_t *mutex = &me->queued_jobs_mutex;
+
+    while(1)
+    {
+        qurt_mutex_lock(mutex); // mutex only allows 1 thread to wait on signal at a time. QuRT restriction.
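+        // Claim protocol: wake on any raised bit in the mask, pick the lowest
+        // set bit (Q6_R_ct0_R counts trailing zeros), copy the job descriptor,
+        // recycle the slot to empty_jobs, then run the callback outside the lock.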
+        (void) qurt_anysignal_wait(signal, mask);                            // wait for a job
+        unsigned int sig_rx = Q6_R_ct0_R(mask & qurt_anysignal_get(signal)); // count trailing 0's to choose flagged job
+        if (sig_rx < NUM_JOB_SLOTS)                                          // if real job
+        {
+            worker_pool_job_t job = me->job[sig_rx];                         // local copy of job descriptor
+            (void) qurt_anysignal_clear(signal, (1 << sig_rx));              // clear the queued job signal
+            (void) qurt_anysignal_set(&me->empty_jobs, (1 << sig_rx));       // send node back to empty list
+            qurt_mutex_unlock(mutex);                                        // unlock the mutex
+            job.fptr(job.dptr);                                              // issue the callback
+        }
+        else if (WORKER_KILL_SIGNAL == sig_rx)
+        {
+            // don't clear the kill signal, leave it for all the workers to see, and exit
+            qurt_mutex_unlock(mutex);
+            qurt_thread_exit(0);
+        }
+        else
+        {
+            FARF(HIGH, "Worker pool received invalid job %d", sig_rx);
+            qurt_mutex_unlock(mutex);
+        }
+    }
+}
+
+void worker_pool_constructor()
+{
+    FARF(HIGH, "In worker_pool constructor");
+    qurt_sysenv_max_hthreads_t num_threads;
+    if (QURT_EOK != qurt_sysenv_get_max_hw_threads(&num_threads))
+    {
+        num_workers = MAX_NUM_WORKERS; // couldn't get the number of HW threads from QuRT, default to MAX_NUM_WORKERS
+        FARF(HIGH, "Failed to get number of threads. Defaulting to %u", num_workers);
+    }
+    else
+    {
+        num_workers = num_threads.max_hthreads;
+    }
+
+    /* Verify that the number of hw threads isn't greater than the max supported number of hw threads.
+       Max threads is used as a constant value for array sizes. */
+    if (num_workers > MAX_NUM_WORKERS)
+    {
+        num_workers = MAX_NUM_WORKERS;
+        FARF(HIGH, "Limiting number of threads to maximum supported value %u", num_workers);
+    }
+
+    num_hvx128_contexts = (qurt_hvx_get_units() >> 8) & 0xFF;
+
+    /* initialize static worker_pool for clients who pass NULL as context. */
+    if (worker_pool_init(&static_context) != AEE_SUCCESS)
+    {
+        FARF(ERROR, "Could not initialize default worker pool");
+    }
+}
+
+AEEResult worker_pool_init_with_stack_size(worker_pool_context_t *context, int stack_size)
+{
+    int nErr = 0;
+
+    if (stack_size <= 0)
+    {
+        FARF(ERROR, "Stack size must be positive");
+        return AEE_EBADPARM;
+    }
+
+    if (NULL == context)
+    {
+        FARF(ERROR, "NULL context passed to worker_pool_init().");
+        return AEE_EBADPARM;
+    }
+
+    // Allocations
+    int size = (stack_size * num_workers) + (sizeof(worker_pool_t));
+    unsigned char *mem_blob = (unsigned char*)malloc(size);
+    if (!mem_blob)
+    {
+        FARF(ERROR, "Could not allocate memory for worker pool!");
+        return AEE_ENOMEMORY;
+    }
+
+    worker_pool_t *me = (worker_pool_t *)(mem_blob + stack_size * num_workers);
+
+    // name for the first worker, useful in debugging threads
+    char name[19];
+    snprintf(name, 12, "0x%8x:", (int)me);
+    strcat(name, "worker0");
+    me->num_workers = num_workers;
+    // initializations
+    for (unsigned int i = 0; i < me->num_workers; i++)
+    {
+        me->stack[i] = NULL;
+        me->thread[i] = 0;
+    }
+
+    // initialize job queue
+    qurt_anysignal_init(&(me->queued_jobs));
+    qurt_anysignal_init(&(me->empty_jobs));
+    qurt_mutex_init(&(me->empty_jobs_mutex));
+    qurt_mutex_init(&(me->queued_jobs_mutex));
+    me->job_queue_mask = (1 << NUM_JOB_SLOTS) - 1;                    // set a bit for each job node, number of job nodes = num_workers + 1
+    (void) qurt_anysignal_set(&(me->empty_jobs), me->job_queue_mask); // fill the empty pool.
+    me->job_queue_mask |= (1 << WORKER_KILL_SIGNAL);                  // add the kill signal to the mask.
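+    // The kill bit is part of the mask the workers wait on, but it is only
+    // raised in worker_pool_deinit(); since workers never clear it, setting
+    // it once wakes and terminates every worker in turn.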
+
+    // launch the workers
+    qurt_thread_attr_t attr;
+    qurt_thread_attr_init(&attr);
+
+    for (unsigned int i = 0; i < me->num_workers; i++)
+    {
+        // set up stack
+        me->stack[i] = mem_blob;
+        mem_blob += stack_size;
+        qurt_thread_attr_set_stack_addr(&attr, me->stack[i]);
+        qurt_thread_attr_set_stack_size(&attr, stack_size);
+
+        // set up name
+        qurt_thread_attr_set_name(&attr, name);
+        name[17] = (name[17] + 1);
+        // name threads context:worker0, context:worker1, ... (recycle at 9, but num threads should be less than that anyway)
+        if (name[17] > '9') name[17] = '0';
+        // set up priority - by default, match the creating thread's prio
+        int prio = qurt_thread_get_priority(qurt_thread_get_id());
+
+        // If the creating thread's priority is below 64, launch the static worker pool at priority 64 instead.
+        if (context == &static_context && prio < 64) prio = 64;
+
+        if (prio < 1) prio = 1;
+        if (prio > LOWEST_USABLE_QURT_PRIO) prio = LOWEST_USABLE_QURT_PRIO;
+
+        qurt_thread_attr_set_priority(&attr, prio);
+
+        // launch
+        nErr = qurt_thread_create(&(me->thread[i]), &attr, worker_pool_main, (void *)me);
+        if (nErr)
+        {
+            FARF(ERROR, "Could not launch worker threads!");
+            worker_pool_deinit((worker_pool_context_t*)&me);
+            return AEE_EQURTTHREADCREATE;
+        }
+    }
+    *context = (worker_pool_context_t*)me;
+    return AEE_SUCCESS;
+}
+
+AEEResult worker_pool_init(worker_pool_context_t *context)
+{
+    return worker_pool_init_with_stack_size(context, WORKER_THREAD_STACK_SZ);
+}
+
+
+// clean up worker pool
+void worker_pool_deinit(worker_pool_context_t *context)
+{
+    worker_pool_t *me = (worker_pool_t*)*context;
+
+    // if no worker pool exists, there is nothing to clean up.
+    if (NULL == me)
+    {
+        return;
+    }
+
+    // de-initializations
+    (void) qurt_anysignal_set(&(me->empty_jobs), (1 << WORKER_KILL_SIGNAL));  // notify to stop new jobs.
+    (void) qurt_anysignal_set(&(me->queued_jobs), (1 << WORKER_KILL_SIGNAL)); // kill worker pool.
+    for (unsigned int i = 0; i < me->num_workers; i++)                        // wait for workers to die
+    {
+        if (me->thread[i])
+        {
+            int status;
+            (void) qurt_thread_join(me->thread[i], &status);
+        }
+    }
+
+    // release resources
+    qurt_mutex_destroy(&(me->empty_jobs_mutex));
+    qurt_mutex_destroy(&(me->queued_jobs_mutex));
+    qurt_anysignal_destroy(&(me->queued_jobs));
+    qurt_anysignal_destroy(&(me->empty_jobs));
+    // free allocated memory (stacks and pool struct were allocated as a single buffer starting at stack[0])
+    if (me->stack[0]) free(me->stack[0]);
+    // Assign NULL to the freed context so that further references to it fail.
+    *context = NULL;
+}
+
+// submit a job to the pool.
+AEEResult worker_pool_submit(worker_pool_context_t context, worker_pool_job_t job)
+{
+    worker_pool_t *me = (worker_pool_t*)context;
+
+    // if NULL is passed as worker_pool_context, try to use default static worker_pool
+    if (NULL == me)
+    {
+        if (static_context == NULL)
+        {
+            FARF(HIGH, "No default static worker pool found");
+            return AEE_ERESOURCENOTFOUND;
+        }
+        FARF(MEDIUM, "Using default static worker pool");
+        me = (worker_pool_t*)static_context;
+    }
+
+    // if a worker thread tries to submit a job, call it in-context to avoid recursion deadlock.
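+    // (Running such a job inline avoids the case where every worker blocks
+    // waiting on jobs that only the workers themselves could execute.)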
+ unsigned int i; + qurt_thread_t id = qurt_thread_get_id(); + for (i = 0; i < me->num_workers; i++) + { + if (id == me->thread[i]) + { + job.fptr(job.dptr); // issue the callback in caller's context + return AEE_SUCCESS; + } + } + + // local vars to reduce dereferencing + qurt_mutex_t *mutex = &me->empty_jobs_mutex; + qurt_anysignal_t *signal = &me->empty_jobs; + unsigned int mask = me->job_queue_mask; + + qurt_mutex_lock(mutex); // lock empty queue + (void) qurt_anysignal_wait(signal, mask); // wait for an empty job node + unsigned int bitfield = qurt_anysignal_get(signal); + + // check if pool is being killed and return early + if (bitfield & (1 << WORKER_KILL_SIGNAL)) + { + qurt_mutex_unlock(mutex); + return AEE_ENOMORE; + } + + // send the job to the queue. + unsigned int sig_rx = Q6_R_ct0_R(mask & bitfield); // count trailing 0's to find first avail node + me->job[sig_rx] = job; // copy job descriptor + (void) qurt_anysignal_clear(signal, (1 << sig_rx)); // clear the empty job node flag + (void) qurt_anysignal_set(&me->queued_jobs, (1 << sig_rx)); // notify of pending job + qurt_mutex_unlock(mutex); // unlock the mutex + + return 0; +} + +void worker_pool_destructor() +{ + FARF(HIGH, "In worker_pool destructor"); + + worker_pool_deinit(&static_context); +} + +/*=========================================================================== + GLOBAL FUNCTION +===========================================================================*/ +// initialize a synctoken - caller will wait on the synctoken and each job will release it. +// caller wakes when all jobs have released. +void worker_pool_synctoken_init(worker_synctoken_t *token, unsigned int njobs) +{ + // cast input to usable struct + internal_synctoken_t *internal_token = (internal_synctoken_t *) token; + + // initialize atomic counter and semaphore + internal_token->sync.atomic_countdown = njobs; + qurt_sem_init_val(&internal_token->sync.sem, 0); +} + +// worker job responsible for calling this function to count down completed jobs. +void worker_pool_synctoken_jobdone(worker_synctoken_t *token) +{ + // cast input to usable struct + internal_synctoken_t *internal_token = (internal_synctoken_t *) token; + + // count down atomically, and raise semaphore if last job. + if (0 == worker_pool_atomic_dec_return(&internal_token->sync.atomic_countdown)) + { + (void) qurt_sem_up(&internal_token->sync.sem); + } +} + +// job submitter waits on this function for all jobs to complete. +void worker_pool_synctoken_wait(worker_synctoken_t *token) +{ + // cast input to usable struct + internal_synctoken_t *internal_token = (internal_synctoken_t *) token; + + // Wait for all jobs to finish and raise the semaphore + (void) qurt_sem_down(&internal_token->sync.sem); + + // clean up the semaphore + (void) qurt_sem_destroy(&internal_token->sync.sem); +} + +AEEResult worker_pool_set_thread_priority(worker_pool_context_t context, unsigned int prio) +{ + worker_pool_t *me = (worker_pool_t*)context; + + // if no worker pool exists, return error. 
+    if (NULL == me)
+    {
+        return AEE_ENOMORE;
+    }
+
+    int result = AEE_SUCCESS;
+    if (prio < 1) prio = 1;
+    if (prio > LOWEST_USABLE_QURT_PRIO) prio = LOWEST_USABLE_QURT_PRIO;
+    for (unsigned int i = 0; i < me->num_workers; i++)
+    {
+        int res = qurt_thread_set_priority(me->thread[i], (unsigned short)prio);
+        if (0 != res)
+        {
+            result = AEE_EBADPARM;
+            FARF(ERROR, "QURT failed to set priority of thread %d, ERROR = %d", me->thread[i], res);
+        }
+    }
+    return result;
+}
+
+AEEResult worker_pool_retrieve_threadID(worker_pool_context_t context, unsigned int* threadIDs) {
+
+    worker_pool_t *me = (worker_pool_t*)context;
+    if (me == NULL)
+    {
+        FARF(ERROR, "Context NULL in RetrieveThreadID");
+        return AEE_EBADPARM;
+    }
+
+    for (unsigned int i = 0; i < me->num_workers; i++)
+    {
+        threadIDs[i] = me->thread[i];
+        FARF(MEDIUM, "Inside RetrieveThreadID threadIDs[%d] is %d", i, threadIDs[i]);
+    }
+    return AEE_SUCCESS;
+}
+
+
+AEEResult worker_pool_get_thread_priority(worker_pool_context_t context, unsigned int *prio)
+{
+    worker_pool_t *me = (worker_pool_t*)context;
+
+    // if NULL is passed as context, share static_context's priority.
+    if (NULL == me)
+    {
+        if (static_context == NULL)
+            return AEE_ENOMORE;
+        FARF(HIGH, "Using default static worker pool");
+        me = (worker_pool_t*)static_context;
+    }
+
+    int priority = qurt_thread_get_priority(me->thread[0]);
+    if (priority > 0)
+    {
+        *prio = priority;
+        return 0;
+    }
+    else
+    {
+        *prio = 0;
+        return AEE_EBADSTATE;
+    }
+}
diff --git a/ggml/src/ggml-hexagon/kernels/worker_pool.h b/ggml/src/ggml-hexagon/kernels/worker_pool.h
new file mode 100755
index 0000000000000..701cbf6215f43
--- /dev/null
+++ b/ggml/src/ggml-hexagon/kernels/worker_pool.h
@@ -0,0 +1,329 @@
+#ifndef WORKER_H
+#define WORKER_H
+
+/**=============================================================================
+
+@file
+   worker_pool.h
+
+@brief
+   Utility providing a thread worker pool for multi-threaded computer vision
+   (or other compute) applications.
+
+Copyright (c) 2019-2020 Qualcomm Technologies Incorporated.
+All Rights Reserved. Qualcomm Proprietary and Confidential.
+
+Export of this technology or software is regulated by the U.S.
+Government. Diversion contrary to U.S. law prohibited.
+
+All ideas, data and information contained in or disclosed by
+this document are confidential and proprietary information of
+Qualcomm Technologies Incorporated and all rights therein are expressly reserved.
+By accepting this material the recipient agrees that this material
+and the information contained therein are held in confidence and in
+trust and will not be used, copied, reproduced in whole or in part,
+nor its contents revealed in any manner to others without the express
+written permission of Qualcomm Technologies Incorporated.
+
+=============================================================================**/
+//==============================================================================
+// Defines
+//==============================================================================
+/// Macro marking functions as visible when built into a shared library.
+#define WORKERPOOL_API __attribute__ ((visibility ("default")))
+
+//==============================================================================
+// Include Files
+//==============================================================================
+
+#include <AEEStdDef.h>
+#include <AEEStdErr.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*===========================================================================
+    TYPEDEF
+===========================================================================*/
+/// signature of callbacks to be invoked by worker threads
+typedef void (*worker_callback_t)(void*);
+
+/// Typedef of worker_pool context
+typedef void* worker_pool_context_t;
+
+/// descriptor for requested callback
+typedef struct
+{
+    /// function pointer
+    worker_callback_t fptr;
+    /// data pointer
+    void* dptr;
+} worker_pool_job_t;
+
+/// opaque client view of synchronization token for job submitter and workers. Internals hidden in implementation.
+typedef struct
+{
+    /// opaque array to store synchronization token for job
+    unsigned int dummy[8]; // large enough to hold a counter and a semaphore
+} worker_synctoken_t __attribute__((aligned(8)));
+
+/*===========================================================================
+    CONSTANTS
+===========================================================================*/
+/// Maximum supported number of worker threads.
+#define MAX_NUM_WORKERS 8
+/// Number of workers
+WORKERPOOL_API extern unsigned int num_workers;
+/// Maximum number of HVX 128-byte units available
+WORKERPOOL_API extern unsigned int num_hvx128_contexts;
+
+//==============================================================================
+// Declarations
+//==============================================================================
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Initialize a worker pool. Should be called by each control thread that
+///   requires its own worker pool.
+///
+///
+/// @param *context
+///   pointer to worker_pool_context_t variable.
+///
+/// @return
+///   0 - success.
+///   any other value - failure.
+//---------------------------------------------------------------------------
+WORKERPOOL_API AEEResult
+worker_pool_init(worker_pool_context_t *context);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Initialize a worker pool with a custom stack size for the worker threads.
+///   Should be called by each control thread that requires its own worker pool.
+///
+///
+/// @param *context
+///   pointer to worker_pool_context_t variable.
+/// @param stack_size
+///   stack size of each worker thread.
+///
+/// @return
+///   0 - success.
+///   any other value - failure.
+//---------------------------------------------------------------------------
+WORKERPOOL_API AEEResult
+worker_pool_init_with_stack_size(worker_pool_context_t *context, int stack_size);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Kill worker threads and release worker pool resources. Must be called
+///   when the pool owner no longer requires the pool.
+///
+///
+/// @param *context
+///   worker_pool_context_t.
+///
+//---------------------------------------------------------------------------
+WORKERPOOL_API void
+worker_pool_deinit(worker_pool_context_t *context);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Function to determine if there is an established worker pool available to
+///   the calling thread.
+///   This is an optional call: if no pool is available but jobs are submitted
+///   anyway, everything still works, with jobs running in the client's context
+///   (instead of a worker's context).
+///
+///
+/// @param context
+///   worker_pool_context_t.
+///
+/// @return
+///   0 - no worker pool available.
+///   any other value - worker pool available.
+//---------------------------------------------------------------------------
+WORKERPOOL_API AEEResult
+worker_pool_available(worker_pool_context_t context);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Submit a job to the worker pool.
+///
+///
+/// @param context
+///   worker pool context where job is to be submitted.
+///
+/// @param job
+///   callback function pointer and data.
+///
+/// @return
+///   0 - success.
+///   any other value - failure.
+//---------------------------------------------------------------------------
+WORKERPOOL_API AEEResult
+worker_pool_submit(worker_pool_context_t context, worker_pool_job_t job);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Initialize a synchronization token for job submitter and workers to use.
+///   Each worker callback must be given access to the token to release it, and
+///   the job submitter will wait for all jobs to release the token. Internals
+///   are hidden from the client.
+///
+///
+/// @param token
+///   pointer to the synctoken structure.
+///
+/// @param njobs
+///   number of jobs that will be releasing the token
+//---------------------------------------------------------------------------
+WORKERPOOL_API void
+worker_pool_synctoken_init(worker_synctoken_t *token, unsigned int njobs);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Needs to be called by the worker in the callback before exiting. The
+///   token must be available to the callback via the data pointer given
+///   to the callback during job submission.
+///
+///
+/// @param token
+///   pointer to the synctoken structure held by the job submitter
+//---------------------------------------------------------------------------
+WORKERPOOL_API void
+worker_pool_synctoken_jobdone(worker_synctoken_t *token);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Job submitter calls this function after submitting all jobs to await
+///   their completion.
+///
+///
+/// @param token
+///   pointer to the synctoken structure
+//---------------------------------------------------------------------------
+WORKERPOOL_API void
+worker_pool_synctoken_wait(worker_synctoken_t *token);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Set the thread priority of the worker threads. The specified priority will
+///   be applied to all threads in the default worker pool. The threads
+///   that service boosted and background job requests will also be adjusted to be relative
+///   to the new default thread priority.
+///
+///
+/// @param context
+///   worker pool context whose workers' priorities are to be changed.
+///
+/// @param prio
+///   desired priority. 1 is the highest priority allowed. 255 is the lowest priority allowed.
+///
+/// @return
+///   0 - success.
+///   any other value - failure.
+//---------------------------------------------------------------------------
+WORKERPOOL_API AEEResult
+worker_pool_set_thread_priority(worker_pool_context_t context, unsigned int prio);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Query the thread priority of the default worker threads. This will return
+///   the current priority for one of the workers, which are all created
+///   with the same priority. If a user callback has changed one or more worker
+///   threads independently, there is no guarantee on which worker's priority is
+///   returned by this function.
+///
+///
+/// @param context
+///   worker pool context whose workers' priority is queried.
+///
+/// @param prio
+///   pointer where the current priority is written. 1 is the highest priority
+///   allowed. 255 is the lowest priority allowed.
+///
+/// @return
+///   0 - success.
+///   any other value - failure.
+//---------------------------------------------------------------------------
+WORKERPOOL_API AEEResult
+worker_pool_get_thread_priority(worker_pool_context_t context, unsigned int *prio);
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Utility inline to atomically increment a variable. Useful in
+///   synchronizing jobs among worker threads, in cases where all
+///   job-related info can be determined by the job number.
+///
+///
+/// @param target
+///   pointer to the variable being incremented
+///
+/// @return
+///   the value after incrementing
+//---------------------------------------------------------------------------
+static inline unsigned int
+worker_pool_atomic_inc_return(unsigned int *target)
+{
+    unsigned int result;
+    __asm__ __volatile__(
+        "1: %0 = memw_locked(%2)\n"
+        "   %0 = add(%0, #1)\n"
+        "   memw_locked(%2, p0) = %0\n"
+        "   if !p0 jump 1b\n"
+        : "=&r" (result), "+m" (*target)
+        : "r" (target)
+        : "p0");
+    return result;
+}
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Utility inline to atomically decrement a variable.
+///
+///
+/// @param target
+///   pointer to the variable being decremented
+///
+/// @return
+///   the value after decrementing
+//---------------------------------------------------------------------------
+static inline unsigned int
+worker_pool_atomic_dec_return(unsigned int *target)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1: %0 = memw_locked(%2)\n"
+        "   %0 = add(%0, #-1)\n"
+        "   memw_locked(%2, p0) = %0\n"
+        "   if !p0 jump 1b\n"
+        : "=&r" (result), "+m" (*target)
+        : "r" (target)
+        : "p0");
+    return result;
+}
+
+//---------------------------------------------------------------------------
+/// @brief
+///   Queries and returns the thread IDs of all the active threads in the worker pool.
+///
+///
+/// @param context
+///   worker pool context whose workers' IDs are requested.
+///
+/// @param threadIDs
+///   pointer to an array created by the user where the thread IDs will be written.
+///
+/// @return
+///   0 - success.
+/// 0E - Invalid parameter +//--------------------------------------------------------------------------- +WORKERPOOL_API AEEResult +worker_pool_retrieve_threadID(worker_pool_context_t context, unsigned int* threadIDs); +#ifdef __cplusplus +} +#endif + +#endif // #ifndef WORKER_H diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt index 1fe8fe3b8d079..e29df98560e07 100644 --- a/ggml/src/ggml-hip/CMakeLists.txt +++ b/ggml/src/ggml-hip/CMakeLists.txt @@ -113,6 +113,10 @@ if (GGML_HIP_ROCWMMA_FATTN) add_compile_definitions(GGML_HIP_ROCWMMA_FATTN) endif() +if (GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 OR ${hip_VERSION} VERSION_GREATER_EQUAL 7.0) + add_compile_definitions(GGML_HIP_ROCWMMA_FATTN_GFX12) +endif() + if (NOT GGML_CUDA_FA) add_compile_definitions(GGML_CUDA_NO_FA) endif() diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 89b59d9aadc7e..57761644f431a 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -32,6 +32,8 @@ extern "C" { #endif +void ggml_print_backtrace(void); + #ifndef MIN # define MIN(a, b) ((a) < (b) ? (a) : (b)) #endif @@ -315,203 +317,81 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1); GGML_API void * ggml_aligned_malloc(size_t size); GGML_API void ggml_aligned_free(void * ptr, size_t size); -// FP16 to FP32 conversion - -// 16-bit float -// on Arm, we use __fp16 -// on x86, we use uint16_t -// -// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616 -// for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843 -// -#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) - #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - - #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - - static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - __fp16 tmp; - memcpy(&tmp, &h, sizeof(ggml_fp16_t)); - return (float)tmp; - } - - static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - ggml_fp16_t res; - __fp16 tmp = f; - memcpy(&res, &tmp, sizeof(ggml_fp16_t)); - return res; - } - -#elif defined(__F16C__) - - #ifdef _MSC_VER - #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) - #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) - #else - #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) - #endif - -#elif defined(__POWER9_VECTOR__) - - #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - /* the inline asm below is about 12% faster than the lookup method */ - #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) - #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) - - static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - float f; - double d; - __asm__( - "mtfprd %0,%2\n" - "xscvhpdp %0,%0\n" - "frsp %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=f"(f): - /* in */ "r"(h)); - return f; - } - - static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - double d; - ggml_fp16_t r; - __asm__( /* xscvdphp can work on double or single precision */ - "xscvdphp %0,%2\n" - "mffprd %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=r"(r): - /* in */ "f"(f)); - return r; - } - -#elif defined(__riscv) && defined(__riscv_zfhmin) +// FP16 <-> FP32 +// 
ref: https://github.com/Maratyszcza/FP16 - static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - float f; - __asm__( - "fmv.h.x %[f], %[h]\n\t" - "fcvt.s.h %[f], %[f]" - : [f] "=&f" (f) - : [h] "r" (h) - ); - return f; - } +static inline float fp32_from_bits(uint32_t w) { + union { + uint32_t as_bits; + float as_value; + } fp32; + fp32.as_bits = w; + return fp32.as_value; +} - static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - ggml_fp16_t res; - __asm__( - "fcvt.h.s %[f], %[f]\n\t" - "fmv.x.h %[h], %[f]" - : [h] "=&r" (res) - : [f] "f" (f) - ); - return res; - } +static inline uint32_t fp32_to_bits(float f) { + union { + float as_value; + uint32_t as_bits; + } fp32; + fp32.as_value = f; + return fp32.as_bits; +} - #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) - #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + const uint32_t w = (uint32_t) h << 16; + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t two_w = w + w; + const uint32_t exp_offset = UINT32_C(0xE0) << 23; +#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) + const float exp_scale = 0x1.0p-112f; #else + const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); +#endif + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; - // FP16 <-> FP32 - // ref: https://github.com/Maratyszcza/FP16 - - static inline float fp32_from_bits(uint32_t w) { - union { - uint32_t as_bits; - float as_value; - } fp32; - fp32.as_bits = w; - return fp32.as_value; - } - - static inline uint32_t fp32_to_bits(float f) { - union { - float as_value; - uint32_t as_bits; - } fp32; - fp32.as_value = f; - return fp32.as_bits; - } - - static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - const uint32_t w = (uint32_t) h << 16; - const uint32_t sign = w & UINT32_C(0x80000000); - const uint32_t two_w = w + w; - - const uint32_t exp_offset = UINT32_C(0xE0) << 23; - #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) - const float exp_scale = 0x1.0p-112f; - #else - const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); - #endif - const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; - - const uint32_t magic_mask = UINT32_C(126) << 23; - const float magic_bias = 0.5f; - const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; - - const uint32_t denormalized_cutoff = UINT32_C(1) << 27; - const uint32_t result = sign | - (two_w < denormalized_cutoff ? 
fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); - return fp32_from_bits(result); - } + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; - static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) - const float scale_to_inf = 0x1.0p+112f; - const float scale_to_zero = 0x1.0p-110f; - #else - const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); - const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); - #endif - float base = (fabsf(f) * scale_to_inf) * scale_to_zero; - - const uint32_t w = fp32_to_bits(f); - const uint32_t shl1_w = w + w; - const uint32_t sign = w & UINT32_C(0x80000000); - uint32_t bias = shl1_w & UINT32_C(0xFF000000); - if (bias < UINT32_C(0x71000000)) { - bias = UINT32_C(0x71000000); - } + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} - base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; - const uint32_t bits = fp32_to_bits(base); - const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); - const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); - const uint32_t nonsign = exp_bits + mantissa_bits; - return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { +#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) + const float scale_to_inf = 0x1.0p+112f; + const float scale_to_zero = 0x1.0p-110f; +#else + const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); +#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); } - #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - -#endif // defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) - -// precomputed f32 table for f16 (256 KB) -// defined in ggml.c, initialized in ggml_init() -GGML_API float ggml_table_f32_f16[1 << 16]; - -// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, -// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. -// This is also true for POWER9. 
-#if !defined(GGML_FP16_TO_FP32) -inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { - uint16_t s; - memcpy(&s, &f, sizeof(uint16_t)); - return ggml_table_f32_f16[s]; + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); } -#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) -#endif +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) -#if !defined(GGML_FP32_TO_FP16) +#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) -#endif /** * Converts brain16 to float32. diff --git a/ggml/src/ggml-metal/CMakeLists.txt b/ggml/src/ggml-metal/CMakeLists.txt index e222327809c31..77187efc1756d 100644 --- a/ggml/src/ggml-metal/CMakeLists.txt +++ b/ggml/src/ggml-metal/CMakeLists.txt @@ -44,21 +44,22 @@ if (GGML_METAL_EMBED_LIBRARY) set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp") add_custom_command( - OUTPUT ${METALLIB_EMBED_ASM} + OUTPUT "${METALLIB_EMBED_ASM}" COMMAND echo "Embedding Metal library" - COMMAND sed -e '/__embed_ggml-common.h__/r ${METALLIB_COMMON}' -e '/__embed_ggml-common.h__/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED_TMP} - COMMAND sed -e '/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}' -e '/\#include \"ggml-metal-impl.h\"/d' < ${METALLIB_SOURCE_EMBED_TMP} > ${METALLIB_SOURCE_EMBED} - COMMAND echo ".section __DATA,__ggml_metallib" > ${METALLIB_EMBED_ASM} - COMMAND echo ".globl _ggml_metallib_start" >> ${METALLIB_EMBED_ASM} - COMMAND echo "_ggml_metallib_start:" >> ${METALLIB_EMBED_ASM} - COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM} - COMMAND echo ".globl _ggml_metallib_end" >> ${METALLIB_EMBED_ASM} - COMMAND echo "_ggml_metallib_end:" >> ${METALLIB_EMBED_ASM} + COMMAND sed -e "/__embed_ggml-common.h__/r ${METALLIB_COMMON}" -e "/__embed_ggml-common.h__/d" < "${METALLIB_SOURCE}" > "${METALLIB_SOURCE_EMBED_TMP}" + COMMAND sed -e "/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}" -e "/\#include \"ggml-metal-impl.h\"/d" < "${METALLIB_SOURCE_EMBED_TMP}" > "${METALLIB_SOURCE_EMBED}" + COMMAND echo ".section __DATA,__ggml_metallib" > "${METALLIB_EMBED_ASM}" + COMMAND echo ".globl _ggml_metallib_start" >> "${METALLIB_EMBED_ASM}" + COMMAND echo "_ggml_metallib_start:" >> "${METALLIB_EMBED_ASM}" + COMMAND echo .incbin "\"${METALLIB_SOURCE_EMBED}\"" >> "${METALLIB_EMBED_ASM}" + COMMAND echo ".globl _ggml_metallib_end" >> "${METALLIB_EMBED_ASM}" + COMMAND echo "_ggml_metallib_end:" >> "${METALLIB_EMBED_ASM}" DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h COMMENT "Generate assembly for embedded Metal library" + VERBATIM ) - target_sources(ggml-metal PRIVATE ${METALLIB_EMBED_ASM}) + target_sources(ggml-metal PRIVATE "${METALLIB_EMBED_ASM}") else() if (GGML_METAL_SHADER_DEBUG) # custom command to do the following: diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index f78e7eee553b6..d8d30cc0b41ca 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -48,22 +48,28 @@ int mtl_device_ref_count; id mtl_library; + NSLock * mtl_lock; + bool 
has_simdgroup_reduction; bool has_simdgroup_mm; bool has_residency_sets; bool has_bfloat; bool use_bfloat; + size_t max_size; + char name[128]; } g_ggml_ctx_dev_main = { /*.mtl_device =*/ nil, /*.mtl_device_ref_count =*/ 0, /*.mtl_library =*/ nil, + /*.mtl_lock =*/ nil, /*.has_simdgroup_reduction =*/ false, /*.has_simdgroup_mm =*/ false, /*.has_residency_sets =*/ false, /*.has_bfloat =*/ false, /*.use_bfloat =*/ false, + /*.max_size =*/ 0, /*.name =*/ "", }; @@ -71,6 +77,10 @@ static id ggml_backend_metal_device_acq(struct ggml_backend_metal_device_context * ctx) { assert(ctx != NULL); + if (ctx->mtl_lock == nil) { + ctx->mtl_lock = [[NSLock alloc] init]; + } + if (ctx->mtl_device == nil) { ctx->mtl_device = MTLCreateSystemDefaultDevice(); } @@ -94,6 +104,8 @@ ctx->use_bfloat = false; #endif + ctx->max_size = ctx->mtl_device.maxBufferLength; + strncpy(ctx->name, [[ctx->mtl_device name] UTF8String], sizeof(ctx->name) - 1); } @@ -110,6 +122,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte ctx->mtl_device_ref_count--; if (ctx->mtl_device_ref_count == 0) { + if (ctx->mtl_lock) { + [ctx->mtl_lock release]; + ctx->mtl_lock = nil; + } + if (ctx->mtl_library) { [ctx->mtl_library release]; ctx->mtl_library = nil; @@ -194,11 +211,14 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32, GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32_C4, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_C4, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_C4, GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW, GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4, GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16, @@ -498,6 +518,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_COS, GGML_METAL_KERNEL_TYPE_NEG, GGML_METAL_KERNEL_TYPE_SUM_ROWS, + GGML_METAL_KERNEL_TYPE_MEAN, GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, GGML_METAL_KERNEL_TYPE_ARGMAX, @@ -976,7 +997,7 @@ @implementation GGMLMetalClass struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context)); struct ggml_backend_metal_device_context * ctx_dev = dev->context; - id device = ggml_backend_metal_device_acq(ctx_dev); + id device = ctx_dev->mtl_device; GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); @@ -990,9 +1011,16 @@ @implementation GGMLMetalClass ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); // load library - if (ctx_dev->mtl_library == nil) { - ctx_dev->mtl_library = ggml_metal_load_library(device, ctx_dev->use_bfloat); + { + [ctx_dev->mtl_lock lock]; + + if (ctx_dev->mtl_library == nil) { + ctx_dev->mtl_library = ggml_metal_load_library(device, ctx_dev->use_bfloat); + } + + [ctx_dev->mtl_lock unlock]; } + id metal_library = ctx_dev->mtl_library; if (metal_library == nil) { GGML_LOG_ERROR("%s: error: metal library is nil\n", __func__); @@ -1150,11 +1178,14 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32, rwkv_wkv6_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32, rwkv_wkv7_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, 
has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32_C4, mul_mv_f32_f32_c4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32, mul_mv_bf16_f32, has_simdgroup_reduction && use_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_C4, mul_mv_bf16_f32_c4, use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW, mul_mv_bf16_f32_1row, has_simdgroup_reduction && use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4, mul_mv_bf16_f32_l4, has_simdgroup_reduction && use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16, mul_mv_bf16_bf16, has_simdgroup_reduction && use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, mul_mv_f16_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_C4, mul_mv_f16_f32_c4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, mul_mv_f16_f32_1row, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, mul_mv_f16_f32_l4, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, mul_mv_f16_f16, has_simdgroup_reduction); @@ -1454,6 +1485,7 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG, neg, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MEAN, mean, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, pool_2d_max_f32, true); @@ -1653,6 +1685,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_LOG: return false; // TODO: implement case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: case GGML_OP_SOFT_MAX: case GGML_OP_GROUP_NORM: return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]); @@ -2400,11 +2433,31 @@ static bool ggml_metal_encode_node( [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: { GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline; + id pipeline = nil; + + switch (dst->op) { + case GGML_OP_SUM_ROWS: + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline; + break; + case GGML_OP_MEAN: + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MEAN].pipeline; + break; + default: + GGML_ABORT("fatal error"); + } + + int nth = 32; // SIMD width + + while (nth < ne00 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) { + nth *= 2; + } + nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup); + nth = MIN(nth, ne00); ggml_metal_kargs_sum_rows args = { /*.ne00 =*/ ne00, @@ -2434,11 +2487,12 @@ static bool ggml_metal_encode_node( }; [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&args length:sizeof(args) atIndex:2]; + [encoder setBytes:&args length:sizeof(args) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; - [encoder 
dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_SOFT_MAX: { @@ -3063,14 +3117,23 @@ static bool ggml_metal_encode_node( nsg = 1; nr0 = 1; nr1 = 4; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline; + if (ne00 == 4) { + nr0 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32_C4].pipeline; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline; + } } break; case GGML_TYPE_F16: { nsg = 1; nr0 = 1; if (src1t == GGML_TYPE_F32) { - if (ne11 * ne12 < 4) { + if (ne00 == 4) { + nr0 = 32; + nr1 = 4; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_C4].pipeline; + } else if (ne11 * ne12 < 4) { pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline; } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline; @@ -3089,7 +3152,11 @@ static bool ggml_metal_encode_node( nsg = 1; nr0 = 1; if (src1t == GGML_TYPE_F32) { - if (ne11 * ne12 < 4) { + if (ne00 == 4) { + nr0 = 32; + nr1 = 4; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_C4].pipeline; + } else if (ne11 * ne12 < 4) { pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW].pipeline; } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4].pipeline; @@ -3733,6 +3800,7 @@ static bool ggml_metal_encode_node( nth *= 2; } + nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup); nth = MIN(nth, ne00/4); ggml_metal_kargs_rms_norm args = { @@ -3769,6 +3837,7 @@ static bool ggml_metal_encode_node( nth *= 2; } + nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup); nth = MIN(nth, ne00/4); ggml_metal_kargs_l2_norm args = { @@ -3841,6 +3910,7 @@ static bool ggml_metal_encode_node( nth *= 2; } + nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup); nth = MIN(nth, ne00/4); ggml_metal_kargs_norm args = { @@ -4766,6 +4836,8 @@ static bool ggml_metal_encode_node( GGML_ASSERT(nqptg % 8 == 0); GGML_ASSERT(ncpsg % 32 == 0); + const int is_q = ggml_is_quantized(src1->type) ? 
1 : 0;
+
     // 2*(2*ncpsg + nqptg)*(nsg)
     // ncpsg soft_max values + ncpsg mask values + a diagonal scaling matrix (in float)
     //
@@ -4773,7 +4845,7 @@
     // the shared memory needed for the simdgroups to load the KV cache
     // each thread loads (dequantizes) 16 head elements, there are 32 threads in th SG
     //
-#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*(2*ncpsg + nqptg)*(nsg)) + 16*32*(nsg))*(sizeof(float)/2), 16))
+#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(2*ne00 + 2*(2*ncpsg + nqptg)*(nsg)) + is_q*(16*32*(nsg)))*(sizeof(float)/2), 16))

     int64_t nsgmax = 2;

@@ -4810,9 +4882,9 @@
             // and store the soft_max values and the mask
             //
             // ne00*(nsg)
-            // each simdgroup has a full f16 head vector in shared mem to accumulate results
+            // each simdgroup has a full f32 head vector in shared mem to accumulate results
             //
-#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(GGML_PAD(ne00, 128) + 4*ncpsg*(nsg)) + ne20*(nsg))*(sizeof(float)/2), 16))
+#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(GGML_PAD(ne00, 128) + 4*ncpsg*(nsg)) + 2*ne20*(nsg))*(sizeof(float)/2), 16))

             int64_t nsgmax = 2;

             while (true) {
@@ -4925,8 +4997,39 @@
                 default: GGML_ABORT("not implemented");
             }

+            GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
+
+            // TODO: support
+            //const int32_t nk00 = ne00/ggml_blck_size(dst->type);
+            const int32_t nk00 = ne00;
+
+            int nth = 32; // SIMD width
+
+            while (nth < nk00 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) {
+                nth *= 2;
+            }
+
+            nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup);
+
+            // when rows are small, we can batch them together in a single threadgroup
+            int nrptg = 1;
+
+            // TODO: relax this constraint in the future
+            if (ggml_blck_size(src0->type) == 1 && ggml_blck_size(dst->type) == 1) {
+                if (nth > nk00) {
+                    nrptg = (nth + nk00 - 1)/nk00;
+                    nth   = nk00;
+
+                    if (nrptg*nth > (int) pipeline.maxTotalThreadsPerThreadgroup) {
+                        nrptg--;
+                    }
+                }
+            }
+
+            nth = MIN(nth, nk00);
+
             ggml_metal_kargs_cpy args = {
-                /*.ne00 =*/ ne00,
+                /*.ne00 =*/ nk00,
                 /*.ne01 =*/ ne01,
                 /*.ne02 =*/ ne02,
                 /*.ne03 =*/ ne03,
@@ -4949,11 +5052,7 @@
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
             [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];

-            GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
-            int nth = MIN(1024, ne00/ggml_blck_size(src0->type));
-
-            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-
+            [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nrptg - 1)/nrptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, nrptg, 1)];
         } break;
     case GGML_OP_SET:
         {
@@ -5259,7 +5358,6 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
     }

     ggml_backend_metal_buffer_rset_free(ctx);
-    ggml_backend_metal_device_rel(buffer->buft->device->context);

     if (ctx->owned) {
 #if TARGET_OS_OSX
@@ -5368,7 +5466,10 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
     }

     struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context;
-    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
+
+    GGML_ASSERT(ctx_dev->mtl_device != nil);
+
+    id<MTLDevice> device = ctx_dev->mtl_device;

     ctx->all_data = ggml_metal_host_malloc(size_aligned);
     ctx->all_size = size_aligned;
@@ -5391,14 +5492,12 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
     if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
         GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
         free(ctx);
-        ggml_backend_metal_device_rel(ctx_dev);
         return NULL;
     }

     if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
         GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
         free(ctx);
-        ggml_backend_metal_device_rel(ctx_dev);
         return NULL;
     }

@@ -5409,17 +5508,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba

 static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 32;
+
+    GGML_UNUSED(buft);
 }

 static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
-    id<MTLDevice> device = ggml_backend_metal_device_acq(buft->device->context);
-    const size_t max_size = device.maxBufferLength;
-    ggml_backend_metal_device_rel(buft->device->context);
+    const size_t max_size = ((struct ggml_backend_metal_device_context *)buft->device->context)->max_size;

     return max_size;
-
-    GGML_UNUSED(buft);
 }

 static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
@@ -5492,7 +5588,10 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
     }

     struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
-    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
+
+    GGML_ASSERT(ctx_dev->mtl_device != nil);
+
+    id<MTLDevice> device = ctx_dev->mtl_device;

     // the buffer fits into the max buffer size allowed by the device
     if (size_aligned <= device.maxBufferLength) {
@@ -5548,7 +5647,6 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
     if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
         GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
         free(ctx);
-        ggml_backend_metal_device_rel(ctx_dev);
         return NULL;
     }

@@ -5564,10 +5662,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
 }

 static void ggml_backend_metal_free(ggml_backend_t backend) {
-    struct ggml_backend_metal_context        * ctx     = backend->context;
-    struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
+    struct ggml_backend_metal_context * ctx = backend->context;

-    ggml_backend_metal_device_rel(ctx_dev);
     ggml_metal_free(ctx);

     free(backend);
@@ -5707,6 +5803,8 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
     struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;

+    GGML_ASSERT(ctx_dev->mtl_device != nil);
+
     return [ctx_dev->mtl_device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
 }

@@ -5726,10 +5824,7 @@ void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
 }

 static const char * ggml_backend_metal_device_get_description(ggml_backend_dev_t dev) {
-    // acq/rel just to populate ctx->name in case it hasn't been done yet
     struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)dev->context;
-    ggml_backend_metal_device_acq(ctx_dev);
-    ggml_backend_metal_device_rel(ctx_dev);

     return ctx_dev->name;
 }

@@ -5737,12 +5832,10 @@ void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
 static void ggml_backend_metal_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
     if (@available(macOS 10.12, iOS 16.0, *)) {
         struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)dev->context;
-        id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
+        id<MTLDevice> device = ctx_dev->mtl_device;

         *total = device.recommendedMaxWorkingSetSize;
         *free  = *total - device.currentAllocatedSize;
-
-        ggml_backend_metal_device_rel(ctx_dev);
     } else {
         *free = 1;
         *total = 1;
@@ -5820,7 +5913,10 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
     }

     struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)dev->context;
-    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
+
+    GGML_ASSERT(ctx_dev->mtl_device != nil);
+
+    id<MTLDevice> device = ctx_dev->mtl_device;

     // the buffer fits into the max buffer size allowed by the device
     if (size_aligned <= device.maxBufferLength) {
@@ -5876,7 +5972,6 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
     if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
         GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
         free(ctx);
-        ggml_backend_metal_device_rel(ctx_dev);
         return NULL;
     }

@@ -5890,8 +5985,9 @@ static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const
 }

 static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name ||
-           buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;
+    return
+        buft->iface.get_name == ggml_backend_metal_buffer_type_get_name ||
+        buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;

     GGML_UNUSED(dev);
 }

@@ -5976,8 +6072,19 @@ static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t r
     /* .get_proc_address = */ ggml_backend_metal_get_proc_address,
 };

+// called upon program exit
+static void ggml_metal_cleanup(void) {
+    ggml_backend_metal_device_rel(&g_ggml_ctx_dev_main);
+}
+
+// TODO: make thread-safe
 ggml_backend_reg_t ggml_backend_metal_reg(void) {
-    // TODO: make this thread-safe somehow?
+    ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main);
+
+    // register cleanup callback
+    // TODO: not ideal, but not sure if there is a better way to do this in Objective-C
+    atexit(ggml_metal_cleanup);
+
     {
         g_ggml_backend_metal_reg = (struct ggml_backend_reg) {
             /* .api_version = */ GGML_BACKEND_API_VERSION,
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 59899550ed38c..5f004a856bde6 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -993,31 +993,61 @@ kernel void kernel_neg(
     dst[tpig] = -src0[tpig];
 }

+template <bool norm>
 kernel void kernel_sum_rows(
+        constant ggml_metal_kargs_sum_rows & args,
         device const float * src0,
         device       float * dst,
-        constant ggml_metal_kargs_sum_rows & args,
-        uint3 tpig[[thread_position_in_grid]]) {
-    int64_t i3 = tpig.z;
-    int64_t i2 = tpig.y;
-    int64_t i1 = tpig.x;
+        threadgroup  float * shmem_f32 [[threadgroup(0)]],
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort  sgitg[[simdgroup_index_in_threadgroup]],
+        ushort  tiisg[[thread_index_in_simdgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    int64_t i3 = tgpig.z;
+    int64_t i2 = tgpig.y;
+    int64_t i1 = tgpig.x;

     if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) {
         return;
     }

+    if (sgitg == 0) {
+        shmem_f32[tiisg] = 0.0f;
+    }
+
     device const float * src_row = (device const float *) ((device const char *) src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03);
     device       float * dst_row = (device       float *) ((device       char *) dst  + i1*args.nb1  + i2*args.nb2  + i3*args.nb3);

-    float row_sum = 0;
+    float sumf = 0;
+
+    for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) {
+        sumf += src_row[i0];
+    }
+
+    sumf = simd_sum(sumf);
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);

-    for (int64_t i0 = 0; i0 < args.ne00; i0++) {
-        row_sum += src_row[i0];
+    if (tiisg == 0) {
+        shmem_f32[sgitg] = sumf;
     }

-    dst_row[0] = row_sum;
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    sumf = shmem_f32[tiisg];
+    sumf = simd_sum(sumf);
+
+    if (tpitg.x == 0) {
+        dst_row[0] = norm ? sumf / args.ne00 : sumf;
+    }
 }

+typedef decltype(kernel_sum_rows<false>) kernel_sum_rows_t;
+
+template [[host_name("kernel_sum_rows")]] kernel kernel_sum_rows_t kernel_sum_rows<false>;
+template [[host_name("kernel_mean")]]     kernel kernel_sum_rows_t kernel_sum_rows<true>;
+
 template <typename T>
 kernel void kernel_soft_max(
         device const char * src0,
@@ -2502,6 +2532,70 @@ template [[host_name("kernel_mul_mv_bf16_f32")]] kernel mul_mv_t kernel_mul_mv<bfloat, bfloat4, float, float4>;
 template [[host_name("kernel_mul_mv_bf16_bf16")]] kernel mul_mv_t kernel_mul_mv<bfloat, bfloat4, bfloat, bfloat4>;
 #endif

+template <typename T04, typename T14, typename args_t>
+void kernel_mul_mv_c4_impl(
+        args_t args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig,
+        ushort tiisg) {
+    const int r0 = tgpig.x*32 + tiisg;
+    const int rb = tgpig.y*N_MV_T_T;
+    const int im = tgpig.z;
+
+    if (r0 >= args.ne01) {
+        return;
+    }
+
+    const uint i12 = im%args.ne12;
+    const uint i13 = im/args.ne12;
+
+    const uint64_t offset0 = r0*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+
+    device const T04 * x = (device const T04 *) (src0 + offset0);
+
+    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1;
+
+    for (int row = 0; row < N_MV_T_T; ++row) {
+        int r1 = rb + row;
+        if (r1 >= args.ne11) {
+            break;
+        }
+
+        const uint64_t offset1 = r1*args.nb11 + (i12)*args.nb12 + (i13)*args.nb13;
+
+        device const T14 * y = (device const T14 *) (src1 + offset1);
+
+        dst_f32[(uint64_t)r1*args.ne0 + r0] = dot((float4) x[0], (float4) y[0]);
+    }
+}
+
+template <typename T04, typename T14>
+kernel void kernel_mul_mv_c4(
+        constant ggml_metal_kargs_mul_mv & args,
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        ushort tiisg[[thread_index_in_simdgroup]]) {
+    kernel_mul_mv_c4_impl<T04, T14, constant ggml_metal_kargs_mul_mv &>(
+        args,
+        src0,
+        src1,
+        dst,
+        tgpig,
+        tiisg);
+}
+
+typedef decltype(kernel_mul_mv_c4<float4, float4>) mul_mv_c4_t;
+
+template [[host_name("kernel_mul_mv_f32_f32_c4")]]  kernel mul_mv_c4_t kernel_mul_mv_c4<float4, float4>;
+template [[host_name("kernel_mul_mv_f16_f32_c4")]]  kernel mul_mv_c4_t kernel_mul_mv_c4<half4, float4>;
+#if defined(GGML_METAL_USE_BF16)
+template [[host_name("kernel_mul_mv_bf16_f32_c4")]] kernel mul_mv_c4_t kernel_mul_mv_c4<bfloat4, float4>;
+#endif
+
 template<typename T, typename T4>
 kernel void kernel_mul_mv_1row(
         constant ggml_metal_kargs_mul_mv & args,
@@ -3328,14 +3422,12 @@ kernel void kernel_flash_attn_ext(
     constexpr short NW = N_SIMDWIDTH;
     constexpr short SH = (2*C + Q); // shared memory per simdgroup (s_t == float)

-    const short TS = nsg*SH;    // shared memory size per query in (s_t == float)
-    const short T  = DK + 2*TS; // shared memory size per query in (half)
+    const short TS = nsg*SH;      // shared memory size per query in (s_t == float)
+    const short T  = 2*DK + 2*TS; // shared memory size per query in (half)

-    threadgroup q_t  * sq  = (threadgroup q_t  *) (shmem_f16 +              0*DK); // holds the query data
-    threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 +              0*DK); // same as above but in q4_t
-    threadgroup o_t  * so  = (threadgroup o_t  *) (shmem_f16 +              0*DK); // reuse query data for accumulation
-    threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 +              0*DK); // same as above but in o4_t
-    threadgroup s_t  * ss  = (threadgroup s_t  *) (shmem_f16 + 2*sgitg*SH + Q*DK); // scratch buffer for attention, mask and diagonal matrix
+    threadgroup q_t  * sq  = (threadgroup q_t  *) (shmem_f16 +                0*DK); // holds the query data
+    threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 +                0*DK); // same as above but in q4_t
+    threadgroup s_t  * ss  = (threadgroup s_t  *) (shmem_f16 + 2*sgitg*SH + 2*Q*DK); // scratch buffer for attention, mask and diagonal
matrix threadgroup k_t * sk = (threadgroup k_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // scratch buffer to load K in shared memory threadgroup k4x4_t * sk4x4 = (threadgroup k4x4_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // same as above but in k4x4_t @@ -3354,7 +3446,7 @@ kernel void kernel_flash_attn_ext( if (iq1 + j < args.ne01) { sq4[j*DK4 + i] = (q4_t) q4[i]; } else { - sq4[j*DK4 + i] = (q4_t) 0.0f; + sq4[j*DK4 + i] = 0; } } } @@ -3548,20 +3640,20 @@ kernel void kernel_flash_attn_ext( // O = diag(ms)*O { - s8x8_t mm; - simdgroup_load(mm, ss + 2*C, TS, 0, false); + s8x8_t ms; + simdgroup_load(ms, ss + 2*C, TS, 0, false); #pragma unroll(DV8) for (short i = 0; i < DV8; ++i) { - simdgroup_multiply(lo[i], mm, lo[i]); + simdgroup_multiply(lo[i], ms, lo[i]); } } // O = O + (Q*K^T)*V { for (short cc = 0; cc < C/8; ++cc) { - s8x8_t ms; - simdgroup_load(ms, ss + 8*cc, TS, 0, false); + s8x8_t vs; + simdgroup_load(vs, ss + 8*cc, TS, 0, false); if (is_same::value) { // we can read directly from global memory @@ -3572,7 +3664,7 @@ kernel void kernel_flash_attn_ext( v8x8_t mv; simdgroup_load(mv, pv + i*8, args.nb21/sizeof(v_t), 0, false); // TODO: use ne20 - simdgroup_multiply_accumulate(lo[i], ms, mv, lo[i]); + simdgroup_multiply_accumulate(lo[i], vs, mv, lo[i]); } } else { for (short ii = 0; ii < DV16; ii += 4) { @@ -3593,10 +3685,10 @@ kernel void kernel_flash_attn_ext( v8x8_t mv; simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false); - simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]); + simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], vs, mv, lo[2*(ii + k) + 0]); simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false); - simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]); + simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], vs, mv, lo[2*(ii + k) + 1]); } } else { if (ii + tx < DV16) { @@ -3611,10 +3703,10 @@ kernel void kernel_flash_attn_ext( v8x8_t mv; simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false); - simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]); + simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], vs, mv, lo[2*(ii + k) + 0]); simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false); - simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]); + simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], vs, mv, lo[2*(ii + k) + 1]); } } } @@ -3624,93 +3716,89 @@ kernel void kernel_flash_attn_ext( } // these are needed for reducing the results from the simdgroups (reuse the ss buffer) - for (short j = 0; j < Q; ++j) { - if (tiisg == 0) { - ss[j*TS + 0] = S[j]; - ss[j*TS + 1] = M[j]; - } + for (short j = tiisg; j < Q; j += NW) { + ss[j*TS + 0] = S[j]; + ss[j*TS + 1] = M[j]; } } - // reduce the warps sequentially - for (ushort sg = 1; sg < nsg; ++sg) { - float S = { 0.0f }; - float M = { -__FLT_MAX__/2 }; + threadgroup_barrier(mem_flags::mem_threadgroup); - threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup float * so = (threadgroup float *) (shmem_f16 + 0*DK); // reuse query data for accumulation + threadgroup float4 * so4 = (threadgroup float4 *) (shmem_f16 + 0*DK); - // each simdgroup stores its output to shared memory, reusing sq - if (sgitg == sg) { - for (short i = 0; i < DV8; ++i) { - simdgroup_store(lo[i], so + i*8, DV, 0, false); - } + // store result to shared memory in F32 + if (sgitg == 0) { + for (short i = 0; i < DV8; ++i) { + //simdgroup_store(lo[i], so + i*8, DV, 0, false); + simdgroup_float8x8 t(1.0f); + simdgroup_multiply(t, lo[i], t); + simdgroup_store(t, so + i*8, DV, 0, 
false); } + } - threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_threadgroup); - // the first simdgroup accumulates the results from the other simdgroups - if (sgitg == 0) { - for (short j = 0; j < Q; ++j) { - const float S0 = ss[j*TS + 0]; - const float S1 = ss[j*TS + sg*SH + 0]; + // reduce the warps sequentially + for (ushort sg = 1; sg < nsg; ++sg) { + if (sgitg == sg) { + for (short j = tiisg; j < Q; j += NW) { + const float S0 = ss[j*TS - 1*SH + 0]; + const float S1 = ss[j*TS + 0]; - const float M0 = ss[j*TS + 1]; - const float M1 = ss[j*TS + sg*SH + 1]; + const float M0 = ss[j*TS - 1*SH + 1]; + const float M1 = ss[j*TS + 1]; - M = max(M0, M1); + const float M = max(M0, M1); - const float ms0 = exp(M0 - M); - const float ms1 = exp(M1 - M); + float ms0 = exp(M0 - M); + float ms1 = exp(M1 - M); - S = S0*ms0 + S1*ms1; + const float S = S0*ms0 + S1*ms1; - if (tiisg == 0) { - ss[j*TS + 0] = S; - ss[j*TS + 1] = M; + ss[j*TS + 0] = S; + ss[j*TS + 1] = M; - ss[j*TS + 2*C + j ] = ms0; - ss[j*TS + 2*C + j + sg*SH] = ms1; - } + ss[j*TS + 2*C + j - 1*SH] = ms0; + ss[j*TS + 2*C + j ] = ms1; } + //simdgroup_barrier(mem_flags::mem_threadgroup); + // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1 { s8x8_t ms0; s8x8_t ms1; - simdgroup_load(ms0, ss + 2*C, TS, 0, false); - simdgroup_load(ms1, ss + 2*C + sg*SH, TS, 0, false); + simdgroup_load(ms0, ss + 2*C - 1*SH, TS, 0, false); + simdgroup_load(ms1, ss + 2*C, TS, 0, false); #pragma unroll(DV8) for (short i = 0; i < DV8; ++i) { - o8x8_t t; + simdgroup_float8x8 t; simdgroup_load (t, so + i*8, DV, 0, false); - simdgroup_multiply(t, ms1, t); + simdgroup_multiply(t, ms0, t); - simdgroup_multiply_accumulate(lo[i], ms0, lo[i], t); + simdgroup_multiply_accumulate(t, ms1, lo[i], t); + simdgroup_store(t, so + i*8, DV, 0, false); } } } - } - // store result to shared memory (reuse sq) - if (sgitg == 0) { - for (short i = 0; i < DV8; ++i) { - simdgroup_store(lo[i], so + i*8, DV, 0, false); - } + threadgroup_barrier(mem_flags::mem_threadgroup); } - device float4 * dst4 = (device float4 *) dst; + threadgroup s_t * sf = (threadgroup s_t *) (shmem_f16 + 2*(nsg-1)*SH + 2*Q*DK); // final rescale with 1/S and store to global memory - if (sgitg == 0) { - for (short j = 0; j < Q && iq1 + j < args.ne01; ++j) { - const float S = ss[j*TS + 0]; + for (short j = sgitg; j < Q && iq1 + j < args.ne01; j += nsg) { + const float S = 1.0f/sf[j*TS + 0]; - for (short i = tiisg; i < DV4; i += NW) { - dst4[((uint64_t)iq3*args.ne2*args.ne1 + iq2 + (uint64_t)(iq1 + j)*args.ne1)*DV4 + i] = (float4) so4[j*DV4 + i]/S; - } + device float4 * dst4 = (device float4 *) dst + ((uint64_t)iq3*args.ne2*args.ne1 + iq2 + (uint64_t)(iq1 + j)*args.ne1)*DV4; + + for (short i = tiisg; i < DV4; i += NW) { + dst4[i] = (float4) so4[j*DV4 + i]*S; } } } @@ -3719,12 +3807,22 @@ kernel void kernel_flash_attn_ext( // template to be able to explore different combinations // #define FA_TYPES \ - half, half4, simdgroup_half8x8, \ - half, half4x4, simdgroup_half8x8, \ - half, half4x4, simdgroup_half8x8, \ - float, simdgroup_float8x8, \ - float, simdgroup_float8x8, \ - half, half4, simdgroup_half8x8 + float, float4, simdgroup_float8x8, \ + half, half4x4, simdgroup_half8x8, \ + half, half4x4, simdgroup_half8x8, \ + float, simdgroup_float8x8, \ + float, simdgroup_float8x8, \ + half, half4, simdgroup_half8x8 + //float, float4, simdgroup_float8x8 + +#define FA_TYPES_BF \ + bfloat, bfloat4, simdgroup_bfloat8x8, \ + bfloat, bfloat4x4, simdgroup_bfloat8x8, \ + bfloat, bfloat4x4, simdgroup_bfloat8x8, \ 
+ float, simdgroup_float8x8, \ + float, simdgroup_float8x8, \ + half, half4, simdgroup_half8x8 + //float, float4, simdgroup_float8x8 typedef decltype(kernel_flash_attn_ext) flash_attn_ext_t; @@ -3739,15 +3837,15 @@ template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_at template [[host_name("kernel_flash_attn_ext_f16_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; #if defined(GGML_METAL_USE_BF16) -template [[host_name("kernel_flash_attn_ext_bf16_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; -template [[host_name("kernel_flash_attn_ext_bf16_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_h80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_h96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_h112")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_h128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_h192")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_h256")]] kernel flash_attn_ext_t kernel_flash_attn_ext; +template [[host_name("kernel_flash_attn_ext_bf16_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; #endif template [[host_name("kernel_flash_attn_ext_q4_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext; @@ -3801,6 +3899,7 @@ template [[host_name("kernel_flash_attn_ext_q8_0_h256")]] kernel flash_at template [[host_name("kernel_flash_attn_ext_q8_0_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext; #undef FA_TYPES +#undef FA_TYPES_BF template< typename q4_t, // query types in shared memory @@ -3847,12 +3946,12 @@ kernel void kernel_flash_attn_ext_vec( const short T = DK + nsg*SH; // shared memory size per query in (half) - //threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*DK); // holds the query data - threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*DK); // same as above but in q4_t - threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + sgitg*SH + Q*DK); // scratch buffer for attention - threadgroup s4_t * ss4 = (threadgroup s4_t *) (shmem_f16 + sgitg*SH + Q*DK); // same as above but in s4_t - threadgroup float * sm = (threadgroup float *) (shmem_f16 + sgitg*SH + 2*C + Q*DK); // scratch buffer for mask - threadgroup o4_t * sr4 = (threadgroup o4_t *) (shmem_f16 + sgitg*DV + Q*T); // scratch buffer for the results + 
//threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*DK); // holds the query data + threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*DK); // same as above but in q4_t + threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + sgitg*SH + Q*DK); // scratch buffer for attention + threadgroup s4_t * ss4 = (threadgroup s4_t *) (shmem_f16 + sgitg*SH + Q*DK); // same as above but in s4_t + threadgroup float * sm = (threadgroup float *) (shmem_f16 + sgitg*SH + 2*C + Q*DK); // scratch buffer for mask + threadgroup o4_t * sr4 = (threadgroup o4_t *) (shmem_f16 + 2*sgitg*DV + Q*T); // scratch buffer for the results // store the result for all queries in local memory (the O matrix from the paper) o4_t lo[DV4/NL]; @@ -4157,7 +4256,7 @@ kernel void kernel_flash_attn_ext_vec( half4, \ float, \ float, float4, \ - half4 + float4 typedef decltype(kernel_flash_attn_ext_vec) flash_attn_ext_vec_t; @@ -4271,11 +4370,16 @@ kernel void kernel_cpy( device const char * src0, device char * dst, uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { + ushort3 tptg[[threads_per_threadgroup]]) { const int i03 = tgpig[2]; const int i02 = tgpig[1]; - const int i01 = tgpig[0]; + const int i01 = tgpig[0]*tptg.y + tiitg/tptg.x; + + if (i01 >= args.ne01) { + return; + } const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00; @@ -4286,7 +4390,7 @@ kernel void kernel_cpy( device T1 * dst_data = (device T1 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - for (int64_t i00 = tpitg.x; i00 < args.ne00; i00 += ntg.x) { + for (int64_t i00 = tiitg%tptg.x; i00 < args.ne00; i00 += tptg.x) { device const T0 * src = (device T0 *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); dst_data[i00] = (T1) src[0]; } diff --git a/ggml/src/ggml-musa/mudnn.cuh b/ggml/src/ggml-musa/mudnn.cuh index a63be5755c79c..c30128561e810 100644 --- a/ggml/src/ggml-musa/mudnn.cuh +++ b/ggml/src/ggml-musa/mudnn.cuh @@ -1,7 +1,7 @@ #pragma once -#include "../include/ggml.h" -#include "../ggml-cuda/common.cuh" +#include "ggml-cuda/common.cuh" +#include "ggml.h" // Asynchronously copies data from src tensor to dst tensor using the provided context. // Returns a musaError_t indicating success or failure. 
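Note on the CPY launch-parameter change above: the encoder now batches several short rows into one threadgroup, and the kernel recovers its row index as i01 = tgpig[0]*tptg.y + tiitg/tptg.x. The following minimal, self-contained C sketch mirrors that host-side selection outside of Metal; the function name pick_cpy_launch and the max_threads parameter are illustrative stand-ins (the real code reads pipeline.maxTotalThreadsPerThreadgroup), and the block-size guard from the patch is assumed to hold (ggml_blck_size == 1 for both src and dst):

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    // Mirrors the nth/nrptg selection in the GGML_OP_CPY path: grow the
    // threadgroup width in powers of two up to the row width nk00, then,
    // when rows are shorter than a threadgroup, pack nrptg rows per group.
    static void pick_cpy_launch(int nk00, int max_threads, int *nth_out, int *nrptg_out) {
        int nth = 32; // SIMD width

        while (nth < nk00 && nth < max_threads) {
            nth *= 2;
        }

        nth = MIN(nth, max_threads);

        // when rows are small, batch several of them into a single threadgroup
        int nrptg = 1;

        if (nth > nk00) {
            nrptg = (nth + nk00 - 1)/nk00;
            nth   = nk00;

            if (nrptg*nth > max_threads) {
                nrptg--;
            }
        }

        *nth_out   = MIN(nth, nk00);
        *nrptg_out = nrptg;
    }

    int main(void) {
        int nth, nrptg;
        pick_cpy_launch(/*nk00=*/10, /*max_threads=*/1024, &nth, &nrptg);
        // a 10-element row yields nth = 10 threads and nrptg = 4 rows per
        // threadgroup, so the grid along x shrinks to (ne01 + 3)/4 groups
        printf("nth = %d, nrptg = %d\n", nth, nrptg);
        return 0;
    }

With nrptg > 1 the encoder dispatches (ne01 + nrptg - 1)/nrptg threadgroups of nth x nrptg threads, which is why the kernel guards with if (i01 >= args.ne01) for the tail group.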
diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt
index 9f930c70b7bb4..0e2a419649cea 100644
--- a/ggml/src/ggml-opencl/CMakeLists.txt
+++ b/ggml/src/ggml-opencl/CMakeLists.txt
@@ -80,6 +80,7 @@ set(GGML_OPENCL_KERNELS
     mul_mv_q4_0_f32_1d_8x_flat
     mul_mv_q4_0_f32_1d_16x_flat
     mul_mv_q6_k
+    mul_mv_id_q4_0_f32_8x_flat
     mul
     norm
     relu
@@ -95,6 +96,12 @@ set(GGML_OPENCL_KERNELS
     sub
     sum_rows
     transpose
+    concat
+    tsembd
+    upscale
+    tanh
+    pad
+    repeat
 )

 foreach (K ${GGML_OPENCL_KERNELS})
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 5dbe97ab2477d..96e8a8588dcb8 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -231,6 +231,71 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
     return { type, major, minor, patch };
 }

+// Profiling
+struct ProfilingInfo {
+    std::string op_name;
+    std::string kernel_name;
+
+    cl_kernel kernel;
+    cl_event evt;
+
+    cl_ulong cmd_queued;
+    cl_ulong cmd_submit;
+    cl_ulong cmd_start;
+    cl_ulong cmd_end;
+    cl_ulong overhead_start;
+    cl_ulong overhead_end;
+    // For the times below, see spec for clGetEventProfilingInfo
+    // The time kernel spent in cmd queue - SUBMIT - QUEUED
+    cl_ulong cmd_queued_duration_ns;
+    // The time kernel spent for submission - START - SUBMIT
+    cl_ulong cmd_submit_duration_ns;
+    // Kernel execution time in nanoseconds - END - START
+    cl_ulong cmd_duration_ns;
+    // The time for the kernel to complete - COMPLETE - END
+    cl_ulong cmd_complete_duration_ns;
+    // Total time to finish the kernel - COMPLETE - QUEUED
+    cl_ulong cmd_total_duration_ns;
+    // Global and local work sizes.
+    size_t global_size[3];
+    size_t local_size[3];
+    // Op output size.
+    size_t output_size[4];
+};
+
+static void populateProfilingInfo(
+        ProfilingInfo& info, cl_event evt, cl_kernel kernel, cl_uint work_dim,
+        size_t global_size[3], size_t local_size[3],
+        const ggml_tensor * tensor) {
+    info.op_name = tensor->name;
+    info.kernel  = kernel;
+    info.evt     = evt;
+
+    // 0 means not specified, e.g., 2D workgroup, or NULL for driver to choose
+    info.local_size[0] = 0;
+    info.local_size[1] = 0;
+    info.local_size[2] = 0;
+
+    info.global_size[0] = 0;
+    info.global_size[1] = 0;
+    info.global_size[2] = 0;
+
+    if (local_size) {
+        for (cl_uint i = 0; i < work_dim; ++i) {
+            info.local_size[i] = local_size[i];
+        }
+    }
+
+    for (cl_uint i = 0; i < work_dim; ++i) {
+        info.global_size[i] = global_size[i];
+    }
+
+    info.output_size[0] = tensor->ne[0];
+    info.output_size[1] = tensor->ne[1];
+    info.output_size[2] = tensor->ne[2];
+    info.output_size[3] = tensor->ne[3];
+}
+
 struct ggml_backend_opencl_context;

 // backend device context
@@ -254,6 +319,8 @@ struct ggml_backend_opencl_device_context {

 // backend context
 struct ggml_backend_opencl_context {
+    int ref_count;
+
     cl_device_id device;
     std::string device_name;

@@ -315,6 +382,13 @@ struct ggml_backend_opencl_context {
     cl_program program_softmax_4_f16;
     cl_program program_argsort_f32_i32;
     cl_program program_sum_rows_f32;
+    cl_program program_repeat;
+    cl_program program_pad;
+    cl_program program_tanh;
+    cl_program program_upscale;
+    cl_program program_concat;
+    cl_program program_tsembd;
+    cl_program program_mul_mv_id_q4_0_f32_8x_flat;

     cl_kernel kernel_add, kernel_add_row;
     cl_kernel kernel_mul, kernel_mul_row;
@@ -351,6 +425,118 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_im2col_f32, kernel_im2col_f16;
     cl_kernel kernel_argsort_f32_i32;
     cl_kernel kernel_sum_rows_f32;
+    cl_kernel kernel_repeat;
+    cl_kernel kernel_pad;
+    cl_kernel kernel_tanh_f32_nd;
+    cl_kernel kernel_tanh_f16_nd;
+    cl_kernel kernel_upscale;
+    cl_kernel kernel_upscale_bilinear;
+    cl_kernel kernel_concat_f32_contiguous;
+    cl_kernel kernel_concat_f32_non_contiguous;
+    cl_kernel kernel_timestep_embedding;
+    cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
+
+    std::vector<ProfilingInfo> profiling_info;
+
+    void write_profiling_info() {
+        FILE * fperf = fopen("cl_profiling.csv", "w");
+        if (!fperf) {
+            GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
+            return;
+        }
+
+        // Populate profiling info
+        for (ProfilingInfo & info : profiling_info) {
+            cl_ulong cmd_queued;
+            cl_ulong cmd_submit;
+            cl_ulong cmd_start;
+            cl_ulong cmd_end;
+            cl_ulong cmd_complete;
+
+            CL_CHECK(clWaitForEvents(1, &info.evt));
+            CL_CHECK(clGetEventProfilingInfo(
+                info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
+            CL_CHECK(clGetEventProfilingInfo(
+                info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
+            CL_CHECK(clGetEventProfilingInfo(
+                info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
+            CL_CHECK(clGetEventProfilingInfo(
+                info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
+            CL_CHECK(clGetEventProfilingInfo(
+                info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
+            CL_CHECK(clReleaseEvent(info.evt));
+
+            char kernel_name[512];
+            CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
+                sizeof(kernel_name), kernel_name, NULL));
+            info.kernel_name = kernel_name;
+
+            info.cmd_queued = cmd_queued;
+            info.cmd_submit = cmd_submit;
+            info.cmd_start  = cmd_start;
+            info.cmd_end    = cmd_end;
+
+            info.cmd_queued_duration_ns   = cmd_submit   - cmd_queued;
+            info.cmd_submit_duration_ns   = cmd_start    - cmd_submit;
+            info.cmd_duration_ns          = cmd_end      - cmd_start;
+            info.cmd_complete_duration_ns = cmd_complete - cmd_end;
+            info.cmd_total_duration_ns    = cmd_complete - cmd_queued;
+        }
+
+        // Dump a csv
+        float total_kernel_time = 0;
+        fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
+        for (const ProfilingInfo & info : profiling_info) {
+            total_kernel_time += info.cmd_duration_ns/1.e6f;
+            fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
+                info.op_name.c_str(), info.kernel_name.c_str(),
+                info.cmd_queued_duration_ns/1.e6f,
+                info.cmd_submit_duration_ns/1.e6f,
+                info.cmd_duration_ns/1.e6f,
+                info.cmd_complete_duration_ns/1.e6f,
+                info.cmd_total_duration_ns/1.e6f,
+                info.global_size[0], info.global_size[1], info.global_size[2],
+                info.local_size[0], info.local_size[1], info.local_size[2],
+                info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
+        }
+        fclose(fperf);
+
+        GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
+
+        // Dump a simple chrome trace
+        FILE* ftrace = fopen("cl_trace.json", "w");
+        if (!ftrace) {
+            GGML_LOG_ERROR("Failed to open cl_trace.json\n");
+            return;
+        }
+
+        fprintf(ftrace, "[\n");
+        for (const ProfilingInfo & info : profiling_info) {
+            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
+                info.kernel_name.c_str(), info.cmd_queued/1000);
+            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
+                info.kernel_name.c_str(), info.cmd_submit/1000);
+
+            fprintf(ftrace,
"{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", + info.kernel_name.c_str(), info.cmd_start/1000); + fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", + info.kernel_name.c_str(), info.cmd_end/1000); + } + fclose(ftrace); + } + + void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) { +#ifdef GGML_OPENCL_PROFILING + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + + profiling_info.emplace_back(); + populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor); +#else + GGML_UNUSED(tensor); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL)); +#endif + } #ifdef GGML_OPENCL_USE_ADRENO_KERNELS // Transpose kernels @@ -378,46 +564,19 @@ struct ggml_backend_opencl_context { cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096; cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096; #endif // GGML_OPENCL_USE_ADRENO_KERNELS -}; - -// All registered devices with a default device in the front. -static std::vector g_ggml_backend_opencl_devices; -// Profiling + void free() { + ref_count--; + if (ref_count == 0) { #ifdef GGML_OPENCL_PROFILING -struct ProfilingInfo { - std::string op_name; - std::string kernel_name; - - cl_kernel kernel; - cl_event evt; - - cl_ulong cmd_queued; - cl_ulong cmd_submit; - cl_ulong cmd_start; - cl_ulong cmd_end; - cl_ulong overhead_start; - cl_ulong overhead_end; - // For the times below, see spec for clGetEventProfilingInfo - // The time kernel spent in cmd queue - SUBMIT - QUEUED - cl_ulong cmd_queued_duration_ns; - // The time kernel spent for submission - START - SUBMIT - cl_ulong cmd_submit_duration_ns; - // Kernel execution time in nanoseconds - END - START - cl_ulong cmd_duration_ns; - // The time for the kernel to complete - COMPLETE - END - cl_ulong cmd_complete_duration_ns; - // Total time to finish the kernel - COMPELTE - QUEUED - cl_ulong cmd_total_duration_ns; - // Global and local work sizes. - size_t global_size[3]; - size_t local_size[3]; - // Op output size. - size_t output_size[4]; + write_profiling_info(); +#endif + } + } }; -std::vector g_profiling_info; -#endif +// All registered devices with a default device in the front. +static std::vector g_ggml_backend_opencl_devices; inline std::string read_file(const std::string &path) { std::ifstream ifs(path); @@ -1097,6 +1256,166 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } + // repeat + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "repeat.cl.h" + }; +#else + const std::string kernel_src = read_file("repeat.cl"); +#endif + if (!kernel_src.empty()) { + backend_ctx->program_repeat = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + CL_CHECK((backend_ctx->kernel_repeat = clCreateKernel(backend_ctx->program_repeat, "kernel_repeat", &err), err)); + GGML_LOG_CONT("."); + } else { + GGML_LOG_WARN("ggml_opencl: repeat kernel source not found or empty. 
Repeat operations will not be available.\n"); + backend_ctx->program_repeat = nullptr; + backend_ctx->kernel_repeat = nullptr; + } + } + + // pad + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "pad.cl.h" + }; +#else + const std::string kernel_src = read_file("pad.cl"); +#endif + if (!kernel_src.empty()) { + backend_ctx->program_pad = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + CL_CHECK((backend_ctx->kernel_pad = clCreateKernel(backend_ctx->program_pad, "kernel_pad", &err), err)); + GGML_LOG_CONT("."); + } else { + GGML_LOG_WARN("ggml_opencl: pad kernel source not found or empty. Pad operations will not be available.\n"); + backend_ctx->program_pad = nullptr; + backend_ctx->kernel_pad = nullptr; + } + } + + // tanh + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "tanh.cl.h" + }; +#else + const std::string kernel_src = read_file("tanh.cl"); +#endif + if (!kernel_src.empty()) { + backend_ctx->program_tanh = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + CL_CHECK((backend_ctx->kernel_tanh_f32_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f32_nd", &err), err)); + CL_CHECK((backend_ctx->kernel_tanh_f16_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f16_nd", &err), err)); + GGML_LOG_CONT("."); + } else { + GGML_LOG_WARN("ggml_opencl: tanh kernel source not found or empty. Tanh operation will not be available.\n"); + backend_ctx->program_tanh = nullptr; + backend_ctx->kernel_tanh_f32_nd = nullptr; + backend_ctx->kernel_tanh_f16_nd = nullptr; + } + } + + // upscale + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "upscale.cl.h" + }; +#else + const std::string kernel_src = read_file("upscale.cl"); +#endif + if (!kernel_src.empty()) { + backend_ctx->program_upscale = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + CL_CHECK((backend_ctx->kernel_upscale = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale", &err), err)); + if (backend_ctx->program_upscale) { + cl_int err_bilinear; + backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear); + if (err_bilinear != CL_SUCCESS) { + GGML_LOG_WARN("ggml_opencl: kernel_upscale_bilinear not found in upscale.cl. Bilinear upscale will not be available. Error: %d\n", err_bilinear); + backend_ctx->kernel_upscale_bilinear = nullptr; + } + } else { + backend_ctx->kernel_upscale_bilinear = nullptr; + } + GGML_LOG_CONT("."); + } else { + GGML_LOG_WARN("ggml_opencl: upscale kernel source not found or empty. 
Upscale operations will not be available.\n"); + backend_ctx->program_upscale = nullptr; + backend_ctx->kernel_upscale = nullptr; + backend_ctx->kernel_upscale_bilinear = nullptr; + } + } + + // concat + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "concat.cl.h" + }; +#else + + const std::string kernel_src = read_file("concat.cl"); +#endif + if (!kernel_src.empty()) { + backend_ctx->program_concat = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_concat_f32_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_contiguous", &err), err)); + CL_CHECK((backend_ctx->kernel_concat_f32_non_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_non_contiguous", &err), err)); + GGML_LOG_CONT("."); + } else { + GGML_LOG_WARN("ggml_opencl: concat kernel source not found or empty. Concat operations will not be available.\n"); + backend_ctx->program_concat = nullptr; + backend_ctx->kernel_concat_f32_contiguous = nullptr; + backend_ctx->kernel_concat_f32_non_contiguous = nullptr; + } + } + + // timestep_embedding + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "tsembd.cl.h" + }; +#else + + const std::string kernel_src = read_file("tsembd.cl"); +#endif + if (!kernel_src.empty()) { + backend_ctx->program_tsembd = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + CL_CHECK((backend_ctx->kernel_timestep_embedding = clCreateKernel(backend_ctx->program_tsembd, "kernel_timestep_embedding", &err), err)); + GGML_LOG_CONT("."); + } else { + GGML_LOG_WARN("ggml_opencl: timestep_embedding kernel source not found or empty. This op will not be available.\n"); + backend_ctx->program_tsembd = nullptr; + backend_ctx->kernel_timestep_embedding = nullptr; + } + } + + // mul_mv_id_q4_0_f32_8x_flat + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "mul_mv_id_q4_0_f32_8x_flat.cl.h" + }; +#else + const std::string kernel_src = read_file("mul_mv_id_q4_0_f32_8x_flat.cl"); +#endif + backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat, "kernel_mul_mv_id_q4_0_f32_8x_flat", &err), err)); + GGML_LOG_CONT("."); + } + // Adreno kernels #ifdef GGML_OPENCL_USE_ADRENO_KERNELS // transpose @@ -1492,6 +1811,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { backend_ctx->device = dev_ctx->device; backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; + // ref_count get increased in ggml_backend_opencl_device_init + // This function is also used to retrieve backend context, so we don't want + // to increase ref_count for each call. 
We only want to increase ref_count + // when the associated device is initialized + backend_ctx->ref_count = 0; + if (strstr(dev_ctx->device_name.c_str(), "Adreno") || strstr(dev_ctx->device_name.c_str(), "Qualcomm") || strstr(dev_ctx->device_version.c_str(), "Adreno")) { @@ -1664,93 +1989,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { return dev_ctx->backend_ctx; } -static void ggml_cl2_free(void) { -#ifdef GGML_OPENCL_PROFILING - FILE * fperf = fopen("cl_profiling.csv", "w"); - if (!fperf) { - GGML_LOG_ERROR("Failed to open cl_profiling.csv\n"); - return; - } +static void ggml_cl2_free(ggml_backend_t backend) { + ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context; + ctx->free(); - // Populate profiling info - for (ProfilingInfo & info : g_profiling_info) { - cl_ulong cmd_queued; - cl_ulong cmd_submit; - cl_ulong cmd_start; - cl_ulong cmd_end; - cl_ulong cmd_complete; - - CL_CHECK(clWaitForEvents(1, &info.evt)); - CL_CHECK(clGetEventProfilingInfo( - info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL)); - CL_CHECK(clGetEventProfilingInfo( - info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL)); - CL_CHECK(clGetEventProfilingInfo( - info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL)); - CL_CHECK(clGetEventProfilingInfo( - info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL)); - CL_CHECK(clGetEventProfilingInfo( - info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL)); - CL_CHECK(clReleaseEvent(info.evt)); - - char kernel_name[512]; - CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME, - sizeof(kernel_name), kernel_name, NULL)); - info.kernel_name = kernel_name; - - info.cmd_queued = cmd_queued; - info.cmd_submit = cmd_submit; - info.cmd_start = cmd_start; - info.cmd_end = cmd_end; - - info.cmd_queued_duration_ns = cmd_submit - cmd_queued; - info.cmd_submit_duration_ns = cmd_start - cmd_submit; - info.cmd_duration_ns = cmd_end - cmd_start; - info.cmd_complete_duration_ns = cmd_complete - cmd_end; - info.cmd_total_duration_ns = cmd_complete - cmd_queued; - } - - // Dump a csv - float total_kernel_time = 0; - fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n"); - for (const ProfilingInfo & info : g_profiling_info) { - total_kernel_time += info.cmd_duration_ns/1.e6f; - fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n", - info.op_name.c_str(), info.kernel_name.c_str(), - info.cmd_queued_duration_ns/1.e6f, - info.cmd_submit_duration_ns/1.e6f, - info.cmd_duration_ns/1.e6f, - info.cmd_complete_duration_ns/1.e6f, - info.cmd_total_duration_ns/1.e6f, - info.global_size[0], info.global_size[1], info.global_size[2], - info.local_size[0], info.local_size[1], info.local_size[2], - info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]); - } - fclose(fperf); - - GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time); - - // Dump a simple chrome trace - FILE* ftrace = fopen("cl_trace.json", "w"); - if (!ftrace) { - GGML_LOG_ERROR("Failed to open cl_trace.json\n"); - return; + // The CL context is shared by all backends, release it if all backends have been released + bool should_release_opencl = true; + for (auto device : g_ggml_backend_opencl_devices) { + ggml_backend_opencl_device_context * ctx_dev = 
(ggml_backend_opencl_device_context *) device.context;
+        if (ctx_dev->backend_ctx->ref_count > 0) {
+            should_release_opencl = false;
+        }
     }

-    fprintf(ftrace, "[\n");
-    for (const ProfilingInfo & info : g_profiling_info) {
-        fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
-            info.kernel_name.c_str(), info.cmd_queued/1000);
-        fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
-            info.kernel_name.c_str(), info.cmd_submit/1000);
-
-        fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
-            info.kernel_name.c_str(), info.cmd_start/1000);
-        fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
-            info.kernel_name.c_str(), info.cmd_end/1000);
+    if (should_release_opencl) {
+        CL_CHECK(clReleaseContext(ctx->context));
     }
-    fclose(ftrace);
-#endif
 }

//------------------------------------------------------------------------------
@@ -1834,9 +2088,7 @@ static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
 }

 static void ggml_backend_opencl_free(ggml_backend_t backend) {
-    ggml_cl2_free();
-
-    GGML_UNUSED(backend);
+    ggml_cl2_free(backend);
 }

 static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -1863,7 +2115,12 @@ static bool ggml_backend_opencl_cpy_tensor_async(ggml_backend_t backend, const g
 }

 static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
-    GGML_UNUSED(backend);
+    auto * backend_ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
+
+    cl_event evt;
+    CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, 0, nullptr, &evt));
+    CL_CHECK(clWaitForEvents(1, &evt));
+    CL_CHECK(clReleaseEvent(evt));
 }

 // Syncronizes the 'backend_ctx's device with others so that commands
@@ -1976,9 +2233,12 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             case GGML_UNARY_OP_SILU:
             case GGML_UNARY_OP_RELU:
             case GGML_UNARY_OP_GELU_QUICK:
-                return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
+                return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
             case GGML_UNARY_OP_SIGMOID:
                 return ggml_is_contiguous(op->src[0]);
+            case GGML_UNARY_OP_TANH:
+                return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
+                       (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
             default:
                 return false;
         }
@@ -1988,6 +2248,17 @@
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
             return true;
+        case GGML_OP_REPEAT:
+            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
+        case GGML_OP_PAD:
+            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
+                   op->src[0]->ne[3] == 1 && op->ne[3] == 1;
+        case GGML_OP_UPSCALE:
+            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
+        case GGML_OP_CONCAT:
+            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
         case GGML_OP_GROUP_NORM:
             return ggml_is_contiguous(op->src[0]);
         case GGML_OP_MUL_MAT:
@@ -2000,6 +2271,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]); } return false; + case GGML_OP_MUL_MAT_ID: + if (op->src[0]->type == GGML_TYPE_Q4_0) { + if (op->src[1]->type == GGML_TYPE_F32) { + return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]); + } + } + return false; case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: @@ -2052,7 +2330,7 @@ static ggml_backend_i ggml_backend_opencl_i = { /* .set_tensor_async = */ NULL, /* ggml_backend_opencl_set_tensor_async */ /* .get_tensor_async = */ NULL, /* ggml_backend_opencl_get_tensor_async */ /* .cpy_tensor_async = */ NULL, /* ggml_backend_opencl_cpy_tensor_async */ - /* .synchronize = */ NULL, /* ggml_backend_opencl_synchronize */ + /* .synchronize = */ ggml_backend_opencl_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, /* .graph_plan_update = */ NULL, @@ -2696,6 +2974,8 @@ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) { ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev); + // Getting a new reference to the backend, increase ref_count + backend_ctx->ref_count++; ggml_backend_t backend = new ggml_backend { /* .guid = */ ggml_backend_opencl_guid(), @@ -2956,31 +3236,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso #define dump_tensor(tensor) #endif -//------------------------------------------------------------------------------ -// Profiling utility -//------------------------------------------------------------------------------ -#ifdef GGML_OPENCL_PROFILING -static void populateProfilingInfo( - ProfilingInfo& info, cl_event evt, cl_kernel kernel, - size_t global_size[3], size_t local_size[3], - const ggml_tensor * tensor) { - info.op_name = tensor->name; - info.kernel = kernel; - info.evt = evt; - - info.local_size[0] = local_size[0]; - info.local_size[1] = local_size[1]; - info.local_size[2] = local_size[2]; - info.global_size[0] = global_size[0]; - info.global_size[1] = global_size[1]; - info.global_size[2] = global_size[2]; - info.output_size[0] = tensor->ne[0]; - info.output_size[1] = tensor->ne[1]; - info.output_size[2] = tensor->ne[2]; - info.output_size[3] = tensor->ne[3]; -} -#endif - //------------------------------------------------------------------------------ // Ops //------------------------------------------------------------------------------ @@ -3024,7 +3279,6 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c const cl_ulong nb2 = dst ? 
dst->nb[2] : 0; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -3068,15 +3322,7 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1}; size_t local_work_size[] = {1, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3118,7 +3364,6 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const const cl_ulong nb3 = dst ? dst->nb[3] : 0; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -3193,29 +3438,13 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } } @@ -3258,7 +3487,6 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const const cl_ulong nb3 = dst ? dst->nb[3] : 0; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -3333,29 +3561,13 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. 
} -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } } @@ -3395,7 +3607,6 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const const cl_ulong nb3 = dst->nb[3]; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -3458,29 +3669,13 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } } @@ -3520,7 +3715,6 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const const cl_ulong nb3 = dst->nb[3]; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -3583,29 +3777,13 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t 
local_work_size[] = {64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } } @@ -3618,7 +3796,6 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3645,15 +3822,7 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3665,7 +3834,6 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3692,15 +3860,7 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor 
* src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3712,7 +3872,6 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3744,15 +3903,7 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3764,7 +3915,6 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3789,15 +3939,7 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3809,7 +3951,6 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3841,15 +3982,7 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. 
} -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3861,7 +3994,6 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3893,15 +4025,7 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3913,7 +4037,6 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3954,15 +4077,7 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3974,7 +4089,6 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; //ggml_backend_opencl_device_context * dev_ctx = // (ggml_backend_opencl_device_context *)backend->device->context; @@ -4038,15 +4152,7 @@ static void 
ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c // This is local memory - the size depends on subgroup size. CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL)); -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4058,7 +4164,6 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -4097,15 +4202,487 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1}; size_t local_work_size[] = {(size_t)sgs, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); +} - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif +static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + UNUSED(src1); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong offset0_abs = extra0->offset + src0->view_offs; + cl_ulong offsetd_abs = extrad->offset + dst->view_offs; + + cl_kernel kernel; + if (dst->type == GGML_TYPE_F32) { + kernel = backend_ctx->kernel_tanh_f32_nd; + } else if (dst->type == GGML_TYPE_F16) { + kernel = backend_ctx->kernel_tanh_f16_nd; + } else { + GGML_ASSERT(false && "Unsupported type for ggml_cl_tanh"); + } + GGML_ASSERT(kernel != nullptr); + + const int ne00 = src0->ne[0]; const int ne01 = src0->ne[1]; const int ne02 = src0->ne[2]; const int ne03 = src0->ne[3]; + const cl_ulong nb00 = src0->nb[0]; const cl_ulong nb01 = src0->nb[1]; const cl_ulong nb02 = src0->nb[2]; const cl_ulong nb03 = src0->nb[3]; + + const int ne10 = dst->ne[0]; const int ne11 = dst->ne[1]; const int ne12 = dst->ne[2]; const int ne13 = dst->ne[3]; + const cl_ulong nb10 = dst->nb[0]; const cl_ulong nb11 = dst->nb[1]; const cl_ulong nb12 = dst->nb[2]; const cl_ulong nb13 = dst->nb[3]; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs)); + 
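/* args 2-3 bind the dst buffer and its absolute offset; args 4-11 pass the src shape (ne00..ne03) and byte strides (nb00..nb03), args 12-19 the dst shape and strides, so the _nd kernels can address non-contiguous views */ +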
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs)); + + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03)); + + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13)); + + size_t global_work_size[3]; + if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements + return; + } + global_work_size[0] = (size_t)ne10; + global_work_size[1] = (size_t)ne11; + global_work_size[2] = (size_t)ne12; + + size_t lws0 = 16, lws1 = 4, lws2 = 1; + if (ne10 < 16) lws0 = ne10; + if (ne11 < 4) lws1 = ne11; + if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1; + + while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2; + while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2; + while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2; + + + size_t local_work_size[] = {lws0, lws1, lws2}; + + size_t* local_work_size_ptr = local_work_size; + if (!backend_ctx->non_uniform_workgroups) { + if (global_work_size[0] % local_work_size[0] != 0 || + global_work_size[1] % local_work_size[1] != 0 || + global_work_size[2] % local_work_size[2] != 0) { + local_work_size_ptr = NULL; + } + } + if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); +} + +static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + GGML_ASSERT(dst->type == src0->type); + + UNUSED(src1_shape_def); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + if (backend_ctx->kernel_repeat == nullptr) { + GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__); + return; + } + + ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong off_src0 = extra_src0->offset + src0->view_offs; + cl_ulong off_dst = extra_dst->offset + dst->view_offs; + + const int src0_ne0 = src0->ne[0]; const int src0_ne1 = src0->ne[1]; const int src0_ne2 = src0->ne[2]; const int src0_ne3 = src0->ne[3]; + const cl_ulong src0_nb0 = src0->nb[0]; const cl_ulong src0_nb1 = src0->nb[1]; const cl_ulong src0_nb2 = src0->nb[2]; const cl_ulong src0_nb3 = src0->nb[3]; + + const int dst_ne0 = dst->ne[0]; const int dst_ne1 = dst->ne[1]; const int dst_ne2 = dst->ne[2]; const int dst_ne3 = dst->ne[3]; + const cl_ulong dst_nb0 = dst->nb[0]; const cl_ulong 
dst_nb1 = dst->nb[1]; const cl_ulong dst_nb2 = dst->nb[2]; const cl_ulong dst_nb3 = dst->nb[3]; + + cl_kernel kernel = backend_ctx->kernel_repeat; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra_dst->data_device)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &off_src0)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &src0_ne0)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &src0_ne1)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &src0_ne2)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &src0_ne3)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &src0_nb0)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &src0_nb1)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &src0_nb2)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &src0_nb3)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &dst_ne0)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &dst_ne1)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &dst_ne2)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dst_ne3)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &dst_nb0)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &dst_nb1)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &dst_nb2)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &dst_nb3)); + + size_t gws0 = dst_ne1 > 0 ? (size_t)dst_ne1 : 1; + size_t gws1 = dst_ne2 > 0 ? (size_t)dst_ne2 : 1; + size_t gws2 = dst_ne3 > 0 ? (size_t)dst_ne3 : 1; + + size_t global_work_size[] = { gws0, gws1, gws2 }; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst); +} + +static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + if (backend_ctx->kernel_pad == nullptr) { + GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__); + return; + } + + ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong off_src0 = extra_src0->offset + src0->view_offs; + cl_ulong off_dst = extra_dst->offset + dst->view_offs; + + const int s_ne0 = src0->ne[0]; + const int s_ne1 = src0->ne[1]; + const int s_ne2 = src0->ne[2]; + + const int d_ne0 = dst->ne[0]; + const int d_ne1 = dst->ne[1]; + const int d_ne2 = dst->ne[2]; + + cl_kernel kernel = backend_ctx->kernel_pad; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &s_ne0)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &s_ne1)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &s_ne2)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne0)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne1)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne2)); + + size_t 
lws0 = 64; + size_t gws0 = (( (size_t)d_ne0 + lws0 - 1 ) / lws0) * lws0; + + size_t global_work_size[] = { gws0, (size_t)d_ne1, (size_t)d_ne2 }; + size_t local_work_size[] = { lws0, 1, 1 }; + + size_t * local_work_size_ptr = local_work_size; + if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; + } + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); +} + +static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0); + cl_kernel kernel = nullptr; + + if (mode == GGML_SCALE_MODE_NEAREST) { + kernel = backend_ctx->kernel_upscale; + if (kernel == nullptr) { + GGML_LOG_WARN("%s: nearest upscale kernel not available, skipping OpenCL execution.\n", __func__); + return; + } + } else if (mode == GGML_SCALE_MODE_BILINEAR) { + kernel = backend_ctx->kernel_upscale_bilinear; + if (kernel == nullptr) { + GGML_LOG_WARN("%s: bilinear upscale kernel not available, skipping OpenCL execution.\n", __func__); + return; + } + } else { + GGML_LOG_WARN("%s: unsupported upscale mode %d, skipping OpenCL execution.\n", __func__, mode); + return; + } + + ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong off_src0 = extra_src0->offset + src0->view_offs; + cl_ulong off_dst = extra_dst->offset + dst->view_offs; + + const cl_ulong nb00 = src0->nb[0]; + const cl_ulong nb01 = src0->nb[1]; + const cl_ulong nb02 = src0->nb[2]; + const cl_ulong nb03 = src0->nb[3]; + + const int ne00_src = src0->ne[0]; + const int ne01_src = src0->ne[1]; + + const int ne10_dst = dst->ne[0]; + const int ne11_dst = dst->ne[1]; + const int ne12_dst = dst->ne[2]; + const int ne13_dst = dst->ne[3]; + + const float sf0 = (float)dst->ne[0] / src0->ne[0]; + const float sf1 = (float)dst->ne[1] / src0->ne[1]; + const float sf2 = (float)dst->ne[2] / src0->ne[2]; + const float sf3 = (float)dst->ne[3] / src0->ne[3]; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &nb00)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb02)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb03)); + + if (mode == GGML_SCALE_MODE_NEAREST) { + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne10_dst)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11_dst)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12_dst)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13_dst)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &sf0)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float), &sf1)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf2)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf3)); + } else if (mode == GGML_SCALE_MODE_BILINEAR) { + CL_CHECK(clSetKernelArg(kernel, 
8, sizeof(int), &ne00_src)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01_src)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10_dst)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11_dst)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12_dst)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13_dst)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf0)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf1)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(float), &sf2)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(float), &sf3)); + } + + + size_t dst_total_elements = (size_t)ne10_dst * ne11_dst * ne12_dst * ne13_dst; + if (dst_total_elements == 0) { + return; + } + size_t global_work_size[] = { dst_total_elements, 1, 1 }; + size_t local_work_size_pref = 256; + size_t local_work_size[] = { MIN(local_work_size_pref, dst_total_elements), 1, 1}; + + size_t * local_work_size_ptr = local_work_size; + if (dst_total_elements % local_work_size[0] != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; + } + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); +} + +static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(src1); + GGML_ASSERT(src1->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + cl_command_queue queue = backend_ctx->queue; + + if (backend_ctx->kernel_concat_f32_contiguous == nullptr || backend_ctx->kernel_concat_f32_non_contiguous == nullptr) { + GGML_LOG_WARN("%s: concat kernels not available, skipping OpenCL execution.\n", __func__); + return; + } + + ggml_tensor_extra_cl * extra0_cl = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extra1_cl = (ggml_tensor_extra_cl *)src1->extra; + ggml_tensor_extra_cl * extrad_cl = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong off_src0 = extra0_cl->offset + src0->view_offs; + cl_ulong off_src1 = extra1_cl->offset + src1->view_offs; + cl_ulong off_dst = extrad_cl->offset + dst->view_offs; + + const int32_t dim = ((const int32_t *) dst->op_params)[0]; + GGML_ASSERT(dim >= 0 && dim <= 3); + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) { + if (dim == 3) { + + size_t nbytes_src0 = ggml_nbytes(src0); + size_t nbytes_src1 = ggml_nbytes(src1); + + CL_CHECK(clEnqueueCopyBuffer(queue, extra0_cl->data_device, extrad_cl->data_device, + off_src0, off_dst, nbytes_src0, 0, NULL, NULL)); + CL_CHECK(clEnqueueCopyBuffer(queue, extra1_cl->data_device, extrad_cl->data_device, + off_src1, off_dst + nbytes_src0, nbytes_src1, 0, NULL, NULL)); + } else { + + cl_kernel kernel = backend_ctx->kernel_concat_f32_contiguous; + size_t global_work_size[3]; + + for (int i3 = 0; i3 < dst->ne[3]; ++i3) { + cl_ulong current_off_src0 = off_src0 + (i3 * src0->nb[3]); + cl_ulong current_off_src1 = off_src1 + (i3 * src1->nb[3]); + cl_ulong current_off_dst = off_dst + (i3 * dst->nb[3]); + + int d_ne00 = src0->ne[0]; int d_ne01 = src0->ne[1]; int d_ne02 = src0->ne[2]; + int d_ne10 = src1->ne[0]; int d_ne11 = src1->ne[1]; int d_ne12 = src1->ne[2]; + int d_ne0 = dst->ne[0]; int d_ne1 = dst->ne[1]; int d_ne2 = dst->ne[2]; + + 
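/* the contiguous kernel is launched once per i3 slice: args 6-14 carry the per-slice 3D shapes of src0, src1 and dst, and arg 15 the concat dimension */ +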
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &current_off_src0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &current_off_src1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &current_off_dst)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &d_ne00)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne01)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne02)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne10)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &d_ne11)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &d_ne12)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &d_ne0)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &d_ne1)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &d_ne2)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dim)); + + global_work_size[0] = d_ne0; + global_work_size[1] = d_ne1; + global_work_size[2] = d_ne2; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst); + } + } + } else { + cl_kernel kernel = backend_ctx->kernel_concat_f32_non_contiguous; + + long ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3]; + cl_ulong nb00 = src0->nb[0], nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3]; + + cl_ulong nb10 = src1->nb[0], nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3]; + + long d_ne0 = dst->ne[0], d_ne1 = dst->ne[1], d_ne2 = dst->ne[2], d_ne3 = dst->ne[3]; + cl_ulong d_nb0 = dst->nb[0], d_nb1 = dst->nb[1], d_nb2 = dst->nb[2], d_nb3 = dst->nb[3]; + + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_src1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &off_dst)); + + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(long), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(long), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(long), &ne02)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(long), &ne03)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03)); + + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13)); + + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(long), &d_ne0)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(long), &d_ne1)); + CL_CHECK(clSetKernelArg(kernel, 20, sizeof(long), &d_ne2)); + CL_CHECK(clSetKernelArg(kernel, 21, sizeof(long), &d_ne3)); + CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &d_nb0)); + CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &d_nb1)); + CL_CHECK(clSetKernelArg(kernel, 24, sizeof(cl_ulong), &d_nb2)); + CL_CHECK(clSetKernelArg(kernel, 25, sizeof(cl_ulong), &d_nb3)); + CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), 
&dim)); + + size_t global_work_size_nc[] = { d_ne1 > 0 ? (size_t)d_ne1 : 1, + d_ne2 > 0 ? (size_t)d_ne2 : 1, + d_ne3 > 0 ? (size_t)d_ne3 : 1 }; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_nc, NULL, dst); + } +} + +static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + if (backend_ctx->kernel_timestep_embedding == nullptr) { + GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__); + return; + } + + ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong off_src0 = extra_src0->offset + src0->view_offs; + cl_ulong off_dst = extra_dst->offset + dst->view_offs; + + const int logical_dim = dst->op_params[0]; + const int max_period = dst->op_params[1]; + const int dst_nb1_bytes = dst->nb[1]; + + cl_kernel kernel = backend_ctx->kernel_timestep_embedding; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &dst_nb1_bytes)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &logical_dim)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &max_period)); + + size_t gws0 = (size_t)(((logical_dim + 1) / 2) + 1); + + size_t gws1 = (size_t)src0->ne[0]; + + size_t global_work_size[] = {gws0, gws1, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst); } static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4120,7 +4697,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co const enum ggml_type src1t = src1 ? 
src1->type : GGML_TYPE_COUNT; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -4325,15 +4901,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co static_cast<size_t>(padded_height_B) }; - #ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_size_t, local_size_t, dst); - #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, NULL)); - #endif + backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst); } else { // no need to transpose B in other cases // create an image for B from sub_buffer @@ -4455,16 +5023,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co // enqueue kernel with profiling // <--------------------------------------------> // - #ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); - // enqueue kernel without profiling - #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); - #endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); // <--------------------------------------------> // // deallocate sub buffers and images @@ -4544,15 +5103,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co global_work_size[2] = (size_t)ne12*ne13; } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); return; } #else // GGML_OPENCL_SOA_Q @@ -4782,15 +5333,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } else if (src0t == GGML_TYPE_Q4_K) { GGML_ASSERT(false && "not implemented"); } else if (src0t == GGML_TYPE_Q3_K) { @@ -4801,31 +5344,136 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co 
size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } else { int64_t ny = (ne11 + nrows - 1)/nrows; size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + } +} - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); +static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(src1); + GGML_ASSERT(src1->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + const ggml_tensor * src2 = dst->src[2]; + GGML_ASSERT(src2); + GGML_ASSERT(src2->extra); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; + ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong offset1 = extra1->offset + src1->view_offs; + cl_ulong offset2 = extra2->offset + src2->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + +#ifdef GGML_OPENCL_SOA_Q + ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra; #endif + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne03 = src0->ne[3]; + + const cl_ulong nb00 = src0->nb[0]; + const cl_ulong nb02 = src0->nb[2]; + + const int ne10 = src1->ne[0]; + const int ne11 = src1->ne[1]; + const int ne12 = src1->ne[2]; + const int ne13 = src1->ne[3]; + + const cl_ulong nb11 = src1->nb[1]; + const cl_ulong nb12 = src1->nb[2]; + + const int ne20 = src2->ne[0]; + const int ne21 = src2->ne[1]; + + const cl_ulong nb21 = src2->nb[1]; + + const int ne0 = dst->ne[0]; + const int ne1 = dst->ne[1]; + + const int r2 = ne12/ne02; + const int r3 = ne13/ne03; + const int dst_rows = ne20*ne21; // ne20 = n_used_experts, ne21 = n_rows + + GGML_ASSERT(ne00 == ne10); + + int sgs = 32; // subgroup size + int nsg = 1; // number of subgroups + int nrows = 1; // number of rows in src1 + int ndst = 4; // number of values produced by each subgroup + + cl_kernel kernel; + + // subgroup mat vec + switch (src0->type) { + case GGML_TYPE_Q4_0: { + kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat; + + if (backend_ctx->gpu_family == INTEL) { + sgs = 16; + nsg = 1; + ndst = 8; + } else if (backend_ctx->gpu_family == ADRENO) { + sgs = 64; + nsg = 1; 
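+ /* Adreno: 64-wide subgroups (a "half" wave); each subgroup still produces 8 dst rows */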
+ ndst = 8; + } else { + GGML_ASSERT(false && "TODO: Unknown GPU"); + } + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb00)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb11)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb12)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne20)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne21)); + CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb21)); + CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r2)); + CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &r3)); + + break; + } + default: + GGML_ASSERT(false && "not implemented"); } + + int _ne1 = 1; + int ne123 = dst_rows; + + size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123}; + size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4838,7 +5486,6 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons GGML_ASSERT(ggml_is_contiguous(src0)); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; float scale; memcpy(&scale, dst->op_params, sizeof(scale)); @@ -4867,15 +5514,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4912,7 +5551,6 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const const enum ggml_type src1t = src1 ? 
src1->type : GGML_TYPE_COUNT; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -4977,15 +5615,7 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, src1); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1); } static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -5008,7 +5638,6 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr const int ne02 = src0 ? src0->ne[2] : 0; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -5032,15 +5661,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1}; size_t local_work_size[] = {64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } else { kernel = backend_ctx->kernel_diag_mask_inf; @@ -5060,15 +5681,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. 
} -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } } @@ -5088,7 +5701,6 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c } ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -5168,15 +5780,7 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -5188,7 +5792,6 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const GGML_ASSERT(dst->extra); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -5354,15 +5957,7 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -5377,7 +5972,6 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -5446,15 +6040,7 @@ static void 
ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC}; size_t local_work_size[] = {256, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -5469,7 +6055,6 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co GGML_ASSERT(ggml_is_contiguous(src0)); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -5501,15 +6086,7 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1}; size_t local_work_size[] = {(size_t)ne00_padded, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -5523,7 +6100,6 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c GGML_ASSERT(ggml_is_contiguous(src0)); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -5564,15 +6140,7 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } //------------------------------------------------------------------------------ @@ -5667,6 +6235,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor } func = ggml_cl_sigmoid; break; + case GGML_UNARY_OP_TANH: + if (!any_on_device) { + 
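/* no operand resident on the device: defer this op to another backend */ +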
return false; + } + func = ggml_cl_tanh; + break; default: return false; } break; @@ -5694,12 +6268,48 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor } func = ggml_cl_group_norm; break; + case GGML_OP_REPEAT: + if (!any_on_device) { + return false; + } + func = ggml_cl_repeat; + break; + case GGML_OP_PAD: + if (!any_on_device) { + return false; + } + ggml_cl_pad(backend, tensor->src[0], tensor); + return true; + case GGML_OP_UPSCALE: + if (!any_on_device) { + return false; + } + ggml_cl_upscale(backend, tensor->src[0], tensor); + return true; + case GGML_OP_CONCAT: + if (!any_on_device) { + return false; + } + func = ggml_cl_concat; + break; + case GGML_OP_TIMESTEP_EMBEDDING: + if (!any_on_device) { + return false; + } + ggml_cl_timestep_embedding(backend, tensor->src[0], tensor); + return true; case GGML_OP_MUL_MAT: if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) { return false; } func = ggml_cl_mul_mat; break; + case GGML_OP_MUL_MAT_ID: + if (!any_on_device) { + return false; + } + func = ggml_cl_mul_mat_id; + break; case GGML_OP_SCALE: if (!any_on_device) { return false; diff --git a/ggml/src/ggml-opencl/kernels/concat.cl b/ggml/src/ggml-opencl/kernels/concat.cl new file mode 100644 index 0000000000000..132758469c6fa --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/concat.cl @@ -0,0 +1,109 @@ +kernel void kernel_concat_f32_contiguous( + global const char * p_src0, ulong off_src0, + global const char * p_src1, ulong off_src1, + global char * p_dst, ulong off_dst, + int d_ne00, int d_ne01, int d_ne02, // src0->ne[0..2] for the slice + int d_ne10, int d_ne11, int d_ne12, // src1->ne[0..2] for the slice (d_ne1X must match d_ne0X on non-concat axes) + int d_ne0, int d_ne1, int d_ne2, // dst->ne[0..2] for the slice + int dim +) { + global const float * src0 = (global const float*)((global char*)p_src0 + off_src0); + global const float * src1 = (global const float*)((global char*)p_src1 + off_src1); + global float * dst = (global float*)((global char*)p_dst + off_dst); + + int i0 = get_global_id(0); // Index along dst's 0th dimension + int i1 = get_global_id(1); // Index along dst's 1st dimension + int i2 = get_global_id(2); // Index along dst's 2nd dimension + + if (i0 >= d_ne0 || i1 >= d_ne1 || i2 >= d_ne2) { + return; + } + + ulong dst_idx = (ulong)i2 * d_ne0 * d_ne1 + (ulong)i1 * d_ne0 + i0; + ulong src_idx; + + if (dim == 0) { + if (i0 < d_ne00) { // Data from src0 + src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0; + dst[dst_idx] = src0[src_idx]; + } else { // Data from src1 + src_idx = (ulong)i2 * d_ne10 * d_ne11 + (ulong)i1 * d_ne10 + (i0 - d_ne00); + dst[dst_idx] = src1[src_idx]; + } + } else if (dim == 1) { + if (i1 < d_ne01) { // Data from src0 + src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0; + dst[dst_idx] = src0[src_idx]; + } else { // Data from src1 + src_idx = (ulong)i2 * d_ne10 * d_ne11 + (ulong)(i1 - d_ne01) * d_ne10 + i0; + dst[dst_idx] = src1[src_idx]; + } + } else if (dim == 2) { + if (i2 < d_ne02) { // Data from src0 + src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0; + dst[dst_idx] = src0[src_idx]; + } else { // Data from src1 + + src_idx = (ulong)(i2 - d_ne02) * d_ne10 * d_ne11 + (ulong)i1 * d_ne10 + i0; + dst[dst_idx] = src1[src_idx]; + } + } +} + +kernel void kernel_concat_f32_non_contiguous( + global const char * p_src0, ulong off_src0, + global const char * p_src1, ulong off_src1, + global char * p_dst, ulong off_dst, + + long ne00, long ne01, long 
ne02, long ne03, + ulong nb00, ulong nb01, ulong nb02, ulong nb03, + + ulong nb10, ulong nb11, ulong nb12, ulong nb13, // Strides for src1 + + long d_ne0, long d_ne1, long d_ne2, long d_ne3, + ulong d_nb0, ulong d_nb1, ulong d_nb2, ulong d_nb3, + int dim +) { + global const char * src0_base = p_src0 + off_src0; + global const char * src1_base = p_src1 + off_src1; + global char * dst_base = p_dst + off_dst; + + long current_i1 = get_global_id(0); // Index for dst_dim_1 + long current_i2 = get_global_id(1); // Index for dst_dim_2 + long current_i3 = get_global_id(2); // Index for dst_dim_3 + + if (current_i1 >= d_ne1 || current_i2 >= d_ne2 || current_i3 >= d_ne3) { + return; + } + + global const float * x_val_ptr; + global float * y_val_ptr; + + for (long current_i0 = 0; current_i0 < d_ne0; ++current_i0) { + bool use_src0; + long s_i0 = current_i0, s_i1 = current_i1, s_i2 = current_i2, s_i3 = current_i3; + + if (dim == 0) { + use_src0 = (current_i0 < ne00); + if (!use_src0) { s_i0 = current_i0 - ne00; } + } else if (dim == 1) { + use_src0 = (current_i1 < ne01); + if (!use_src0) { s_i1 = current_i1 - ne01; } + } else if (dim == 2) { + use_src0 = (current_i2 < ne02); + if (!use_src0) { s_i2 = current_i2 - ne02; } + } else { // dim == 3 + use_src0 = (current_i3 < ne03); + if (!use_src0) { s_i3 = current_i3 - ne03; } + } + + if (use_src0) { + x_val_ptr = (global const float *)(src0_base + (ulong)s_i3*nb03 + (ulong)s_i2*nb02 + (ulong)s_i1*nb01 + (ulong)s_i0*nb00); + } else { + x_val_ptr = (global const float *)(src1_base + (ulong)s_i3*nb13 + (ulong)s_i2*nb12 + (ulong)s_i1*nb11 + (ulong)s_i0*nb10); + } + + y_val_ptr = (global float *)(dst_base + (ulong)current_i3*d_nb3 + (ulong)current_i2*d_nb2 + (ulong)current_i1*d_nb1 + (ulong)current_i0*d_nb0); + *y_val_ptr = *x_val_ptr; + } +} diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl new file mode 100644 index 0000000000000..7ccf41efbe918 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl @@ -0,0 +1,283 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#ifdef cl_intel_subgroups +#pragma OPENCL EXTENSION cl_intel_subgroups : enable +#else +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#endif + +#ifdef cl_intel_required_subgroup_size +#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable +#define INTEL_GPU 1 +#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) +#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32))) +#elif defined(cl_qcom_reqd_sub_group_size) +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#endif + +#define QK4_0 32 + +typedef char int8_t; +typedef uchar uint8_t; +typedef short int16_t; +typedef ushort uint16_t; +typedef int int32_t; +typedef uint uint32_t; + +//------------------------------------------------------------------------------ +// block_q4_0 +//------------------------------------------------------------------------------ +struct block_q4_0 +{ + half d; + uint8_t qs[QK4_0 / 2]; +}; + +// This function requires the original shuffled weights. +// As a reminder, the original weights are shuffled so that (q[0], q[16]) are +// packed together in a byte, so are (q[1], q[17]) and so on. 
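+// Illustrative scalar reference (not used by this kernel): with that shuffle,
+// byte k of qs packs q[k] in its low nibble and q[k+16] in its high nibble,
+// so a plain dequantization of one block is
+//   out[k]      = d * ((qs[k] & 0x0F) - 8);
+//   out[k + 16] = d * ((qs[k] >> 4)   - 8);
+// The flat dot product below gets the same result without per-value shifts:
+// the y values are pre-scaled by 1, 1/256, 1/16 and 1/4096 so each masked
+// 16-bit load already carries the right magnitude, and the constant -8 offset
+// is folded into the single sumy * -8.f term.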
+inline float block_q_4_0_dot_y_flat( + global uchar * x, + global half * dh, + float sumy, + float16 yl, + int il +) { + float d = *dh; + global ushort * qs = ((global ushort *)x + il/2); + float acc = 0.f; + + acc += yl.s0 * (qs[0] & 0x000F); + acc += yl.s1 * (qs[0] & 0x0F00); + acc += yl.s8 * (qs[0] & 0x00F0); + acc += yl.s9 * (qs[0] & 0xF000); + + acc += yl.s2 * (qs[1] & 0x000F); + acc += yl.s3 * (qs[1] & 0x0F00); + acc += yl.sa * (qs[1] & 0x00F0); + acc += yl.sb * (qs[1] & 0xF000); + + acc += yl.s4 * (qs[2] & 0x000F); + acc += yl.s5 * (qs[2] & 0x0F00); + acc += yl.sc * (qs[2] & 0x00F0); + acc += yl.sd * (qs[2] & 0xF000); + + acc += yl.s6 * (qs[3] & 0x000F); + acc += yl.s7 * (qs[3] & 0x0F00); + acc += yl.se * (qs[3] & 0x00F0); + acc += yl.sf * (qs[3] & 0xF000); + + return d * (sumy * -8.f + acc); +} + +// +// This variant outputs 8 values. +// +#undef N_DST +#undef N_SIMDGROUP +#undef N_SIMDWIDTH + +#ifdef INTEL_GPU +#define N_DST 8 // each SIMD group works on 8 rows +#define N_SIMDGROUP 1 // number of SIMD groups in a thread group +#define N_SIMDWIDTH 16 // subgroup size +#elif defined (ADRENO_GPU) +#define N_DST 8 +#define N_SIMDGROUP 1 +#define N_SIMDWIDTH 64 +#endif + +inline void mul_vec_q_n_f32_8x_flat( + global char * src0_q, + global half * src0_d, + global float * src1, + global float * dst, + int ne00, + int ne01, + int ne02, + int ne10, + int ne12, + int ne0, + int ne1, + int r2, + int r3 +) { + const ulong nb = ne00/QK4_0; + + int r0 = get_group_id(0); + int r1 = get_group_id(1); + int im = 0; + + int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST; + + int i12 = im%ne12; + int i13 = im/ne12; + + // The number of scales is the same as the number of blocks. + ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + // Each block contains QK4_0/2 uchars, hence offset for qs is as follows. 
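+    // Illustrative check (hypothetical sizes): ne00 = 4096 gives nb = 128
+    // blocks per row, so row first_row starts at scale index first_row*128
+    // and at byte index first_row*128*16; i.e. offset0_q is offset0_d scaled
+    // by the QK4_0/2 = 16 bytes each block occupies in src0_q.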
+ ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2; + + global uchar * x = (global uchar *) src0_q + offset0_q; + global half * d = (global half *) src0_d + offset0_d; + global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1; + + float16 yl; + float8 sumf = 0.f; + + int ix = get_sub_group_local_id()/2; + int il = 8*(get_sub_group_local_id()%2); + + global float * yb = y + ix*QK4_0 + il; + + for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) { + float sumy = 0.f; + + sumy += yb[0]; + sumy += yb[1]; + sumy += yb[2]; + sumy += yb[3]; + sumy += yb[4]; + sumy += yb[5]; + sumy += yb[6]; + sumy += yb[7]; + + sumy += yb[16]; + sumy += yb[17]; + sumy += yb[18]; + sumy += yb[19]; + sumy += yb[20]; + sumy += yb[21]; + sumy += yb[22]; + sumy += yb[23]; + + yl.s0 = yb[0]; + yl.s1 = yb[1]/256.f; + + yl.s2 = yb[2]; + yl.s3 = yb[3]/256.f; + + yl.s4 = yb[4]; + yl.s5 = yb[5]/256.f; + + yl.s6 = yb[6]; + yl.s7 = yb[7]/256.f; + + yl.s8 = yb[16]/16.f; + yl.s9 = yb[17]/4096.f; + + yl.sa = yb[18]/16.f; + yl.sb = yb[19]/4096.f; + + yl.sc = yb[20]/16.f; + yl.sd = yb[21]/4096.f; + + yl.se = yb[22]/16.f; + yl.sf = yb[23]/4096.f; + + sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il); + sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il); + sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il); + sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il); + + sumf.s4 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il); + sumf.s5 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il); + sumf.s6 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il); + sumf.s7 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il); + + yb += QK4_0 * (N_SIMDWIDTH/2); + } + + float8 tot = (float8)( + sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1), + sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3), + sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5), + sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7) + ); + + if (get_sub_group_local_id() == 0) { + if (first_row + 0 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0; + } + if (first_row + 1 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1; + } + if (first_row + 2 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2; + } + if (first_row + 3 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3; + } + + if (first_row + 4 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4; + } + if (first_row + 5 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5; + } + if (first_row + 6 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6; + } + if (first_row + 7 < ne01) { + dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7; + } + } +} + +#ifdef INTEL_GPU +REQD_SUBGROUP_SIZE_16 +#elif defined (ADRENO_GPU) +REQD_SUBGROUP_SIZE_64 +#endif +kernel void kernel_mul_mv_id_q4_0_f32_8x_flat( + global char * src0_q, + global half * src0_d, + global float * src1, + ulong offset1, + global char * src2, + ulong offset2, + global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne02, + ulong nb00, + ulong nb02, + int ne10, + int ne11, + int ne12, + ulong nb11, + ulong nb12, + int ne20, + int ne21, + ulong nb21, + int ne0, + int ne1, + int r2, + int r3 +) { + 
src1 = (global float *)((global char *)src1 + offset1); + src2 = (global char *)((global char *)src2 + offset2); + dst = (global float *)((global char *)dst + offsetd); + + const int iid1 = get_group_id(2)/ne20; + const int idx = get_group_id(2)%ne20; + + const int i02 = ((global int *)(src2 + iid1*nb21))[idx]; + + const int i11 = idx%ne11; + const int i12 = iid1; + + const int i1 = idx; + const int i2 = i12; + + global char * src0_q_cur = src0_q + (i02*nb02/nb00)*(QK4_0/2); + global half * src0_d_cur = src0_d + (i02*nb02/nb00); + global float * src1_cur = (global float *)((global char *) src1 + i11*nb11 + i12*nb12); + global float * dst_cur = dst + i1*ne0 + i2*ne1*ne0; + + mul_vec_q_n_f32_8x_flat(src0_q_cur, src0_d_cur, src1_cur, dst_cur, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3); +} diff --git a/ggml/src/ggml-opencl/kernels/pad.cl b/ggml/src/ggml-opencl/kernels/pad.cl new file mode 100644 index 0000000000000..747fa7febcc74 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/pad.cl @@ -0,0 +1,30 @@ +kernel void kernel_pad( + global const void * src0_ptr, + ulong src0_offset, + global void * dst_ptr, + ulong dst_offset, + int s_ne0, int s_ne1, int s_ne2, + int d_ne0, int d_ne1, int d_ne2 +) { + global const float * src0 = (global const float *)((global const char *)src0_ptr + src0_offset); + global float * dst = (global float *)((global char *)dst_ptr + dst_offset); + + int nidx = get_global_id(0); + int idx_d1 = get_group_id(1); + int idx_d2 = get_group_id(2); + + if (nidx >= d_ne0) { + return; + } + + int dst_el_offset = nidx + idx_d1 * d_ne0 + idx_d2 * d_ne0 * d_ne1; + + bool in_src_bounds = (nidx < s_ne0) && (idx_d1 < s_ne1) && (idx_d2 < s_ne2); + + if (in_src_bounds) { + int src_el_offset = nidx + idx_d1 * s_ne0 + idx_d2 * s_ne0 * s_ne1; + dst[dst_el_offset] = src0[src_el_offset]; + } else { + dst[dst_el_offset] = 0.0f; + } +} diff --git a/ggml/src/ggml-opencl/kernels/repeat.cl b/ggml/src/ggml-opencl/kernels/repeat.cl new file mode 100644 index 0000000000000..079498f5ab947 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/repeat.cl @@ -0,0 +1,39 @@ +kernel void kernel_repeat( + global const char * src0_data_in, + global char * dst_data_in, + ulong src0_offset, + ulong dst_offset, + int src0_ne0, int src0_ne1, int src0_ne2, int src0_ne3, + ulong src0_nb0, ulong src0_nb1, ulong src0_nb2, ulong src0_nb3, + int dst_ne0, int dst_ne1, int dst_ne2, int dst_ne3, + ulong dst_nb0, ulong dst_nb1, ulong dst_nb2, ulong dst_nb3 +) { + global const char * src0_data = src0_data_in + src0_offset; + global char * dst_data = dst_data_in + dst_offset; + + const int d3 = get_global_id(2); + const int d2 = get_global_id(1); + const int d1 = get_global_id(0); + + if (d3 >= dst_ne3 || d2 >= dst_ne2 || d1 >= dst_ne1) { + return; + } + + const int s3 = d3 % src0_ne3; + const int s2 = d2 % src0_ne2; + const int s1 = d1 % src0_ne1; + + const global char * p_src0_slice = src0_data + (ulong)s3*src0_nb3 + (ulong)s2*src0_nb2 + (ulong)s1*src0_nb1; + global char * p_dst_slice = dst_data + (ulong)d3*dst_nb3 + (ulong)d2*dst_nb2 + (ulong)d1*dst_nb1; + + for (int d0 = 0; d0 < dst_ne0; ++d0) { + // Determine source index for dimension 0 based on tiling/broadcasting. 
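+        // Worked example (illustrative): with src0_ne0 = 2 and dst_ne0 = 6,
+        // d0 = 0..5 maps to s0 = 0,1,0,1,0,1, i.e. the source row is tiled
+        // three times along dimension 0, as GGML_OP_REPEAT requires.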
+ const int s0 = d0 % src0_ne0; + + const global char * restrict current_src_el_ptr = p_src0_slice + (ulong)s0*src0_nb0; + global char * restrict current_dst_el_ptr = p_dst_slice + (ulong)d0*dst_nb0; + for (int k = 0; k < src0_nb0; ++k) { + current_dst_el_ptr[k] = current_src_el_ptr[k]; + } + } +} diff --git a/ggml/src/ggml-opencl/kernels/tanh.cl b/ggml/src/ggml-opencl/kernels/tanh.cl new file mode 100644 index 0000000000000..d9da86b148921 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/tanh.cl @@ -0,0 +1,63 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#ifdef cl_intel_required_subgroup_size +#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable +#define INTEL_GPU 1 +#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) +#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32))) +#elif defined(cl_qcom_reqd_sub_group_size) +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#endif + +kernel void kernel_tanh_f32_nd( + global void * p_src0_base, ulong off_src0_abs, + global void * p_dst_base, ulong off_dst_abs, + int ne00, int ne01, int ne02, int ne03, + ulong nb00, ulong nb01, ulong nb02, ulong nb03, + int ne10, int ne11, int ne12, int ne13, + ulong nb10, ulong nb11, ulong nb12, ulong nb13 +) { + int i0 = get_global_id(0); + int i1 = get_global_id(1); + int i2 = get_global_id(2); + + if (i0 < ne10 && i1 < ne11 && i2 < ne12) { + for (int i3 = 0; i3 < ne13; ++i3) { + ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03; + global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor); + + ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13; + global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor); + + *dst_val_ptr = tanh(*src_val_ptr); + } + } +} + +kernel void kernel_tanh_f16_nd( + global void * p_src0_base, ulong off_src0_abs, + global void * p_dst_base, ulong off_dst_abs, + int ne00, int ne01, int ne02, int ne03, + ulong nb00, ulong nb01, ulong nb02, ulong nb03, + int ne10, int ne11, int ne12, int ne13, + ulong nb10, ulong nb11, ulong nb12, ulong nb13 +) { + int i0 = get_global_id(0); + int i1 = get_global_id(1); + int i2 = get_global_id(2); + + if (i0 < ne10 && i1 < ne11 && i2 < ne12) { + for (int i3 = 0; i3 < ne13; ++i3) { + ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03; + global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor); + + ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13; + global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor); + + *dst_val_ptr = tanh(*src_val_ptr); + } + } +} diff --git a/ggml/src/ggml-opencl/kernels/tsembd.cl b/ggml/src/ggml-opencl/kernels/tsembd.cl new file mode 100644 index 0000000000000..4b1107f70ba7a --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/tsembd.cl @@ -0,0 +1,48 @@ +kernel void kernel_timestep_embedding( + global const void * p_timesteps, + ulong off_timesteps, + global void * p_dst, + ulong off_dst, + int dst_nb1_bytes, + int logical_dim, + int max_period +) { + int local_i; + int 
local_j; + int local_half_dim; + float local_timestep_val; + float local_freq; + float local_arg; + global float * local_embed_data_ptr; + global const float * local_timesteps_input_ptr; + global float * local_dst_output_base_ptr; + + local_timesteps_input_ptr = (global const float *)((global char *)p_timesteps + off_timesteps); + local_dst_output_base_ptr = (global float *)((global char *)p_dst + off_dst); + + local_i = get_global_id(1); + local_j = get_global_id(0); + + local_half_dim = logical_dim / 2; + local_embed_data_ptr = (global float *)((global char *)local_dst_output_base_ptr + local_i * dst_nb1_bytes); + + if (logical_dim % 2 != 0 && local_j == ((logical_dim + 1) / 2)) { + local_embed_data_ptr[logical_dim] = 0.0f; + } + + if (local_j >= local_half_dim) { + return; + } + + local_timestep_val = local_timesteps_input_ptr[local_i]; + + if (local_half_dim == 0) { + local_freq = 1.0f; + } else { + local_freq = exp(-log((float)max_period) * (float)local_j / (float)local_half_dim); + } + + local_arg = local_timestep_val * local_freq; + local_embed_data_ptr[local_j] = cos(local_arg); + local_embed_data_ptr[local_j + local_half_dim] = sin(local_arg); +} diff --git a/ggml/src/ggml-opencl/kernels/upscale.cl b/ggml/src/ggml-opencl/kernels/upscale.cl new file mode 100644 index 0000000000000..219d31dbb9248 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/upscale.cl @@ -0,0 +1,121 @@ +kernel void kernel_upscale( + global const void * p_src0, + ulong off_src0, + global void * p_dst, + ulong off_dst, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + int ne10, + int ne11, + int ne12, + int ne13, + float sf0, + float sf1, + float sf2, + float sf3 +) { + global const char * src_base = (global const char *)p_src0 + off_src0; + global float * dst_base = (global float *)((global char *)p_dst + off_dst); + + int index = get_global_id(0); + int dst_total_elements = ne10 * ne11 * ne12 * ne13; + + if (index >= dst_total_elements) { + return; + } + + int i10 = index % ne10; + int i11 = (index / ne10) % ne11; + int i12 = (index / (ne10 * ne11)) % ne12; + int i13 = index / (ne10 * ne11 * ne12); + + int i00 = (int)(i10 / sf0); + int i01 = (int)(i11 / sf1); + int i02 = (int)(i12 / sf2); + int i03 = (int)(i13 / sf3); + + ulong offset_src_element = (ulong)i03 * nb03 + (ulong)i02 * nb02 + (ulong)i01 * nb01 + (ulong)i00 * nb00; + global const float * src_element_ptr = (global const float *)(src_base + offset_src_element); + + dst_base[index] = *src_element_ptr; +} + +kernel void kernel_upscale_bilinear( + global const void * p_src0, + ulong off_src0, + global void * p_dst, + ulong off_dst, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + int ne00_src, + int ne01_src, + int ne10_dst, + int ne11_dst, + int ne12_dst, + int ne13_dst, + float sf0, + float sf1, + float sf2, + float sf3 +) { + global const char * src_base = (global const char *)p_src0 + off_src0; + global float * dst_base = (global float *)((global char *)p_dst + off_dst); + + int index = get_global_id(0); + int dst_total_elements = ne10_dst * ne11_dst * ne12_dst * ne13_dst; + + if (index >= dst_total_elements) { + return; + } + + int i10_dst = index % ne10_dst; + int i11_dst = (index / ne10_dst) % ne11_dst; + int i12_dst = (index / (ne10_dst * ne11_dst)) % ne12_dst; + int i13_dst = index / (ne10_dst * ne11_dst * ne12_dst); + + int i02_src = (int)(i12_dst / sf2); + int i03_src = (int)(i13_dst / sf3); + + const float pixel_offset = 0.5f; + + float y_src_f = ((float)i11_dst + pixel_offset) / sf1 - pixel_offset; + long y0_src = 
(long)floor(y_src_f); + long y1_src = y0_src + 1; + + y0_src = max(0L, min(y0_src, (long)ne01_src - 1)); + y1_src = max(0L, min(y1_src, (long)ne01_src - 1)); + + float dy = y_src_f - (float)y0_src; + dy = max(0.0f, min(dy, 1.0f)); + + float x_src_f = ((float)i10_dst + pixel_offset) / sf0 - pixel_offset; + long x0_src = (long)floor(x_src_f); + long x1_src = x0_src + 1; + + x0_src = max(0L, min(x0_src, (long)ne00_src - 1)); + x1_src = max(0L, min(x1_src, (long)ne00_src - 1)); + + float dx = x_src_f - (float)x0_src; + dx = max(0.0f, min(dx, 1.0f)); + + global const float * p_a = (global const float *)(src_base + (ulong)x0_src * nb00 + (ulong)y0_src * nb01 + (ulong)i02_src * nb02 + (ulong)i03_src * nb03); + global const float * p_b = (global const float *)(src_base + (ulong)x1_src * nb00 + (ulong)y0_src * nb01 + (ulong)i02_src * nb02 + (ulong)i03_src * nb03); + global const float * p_c = (global const float *)(src_base + (ulong)x0_src * nb00 + (ulong)y1_src * nb01 + (ulong)i02_src * nb02 + (ulong)i03_src * nb03); + global const float * p_d = (global const float *)(src_base + (ulong)x1_src * nb00 + (ulong)y1_src * nb01 + (ulong)i02_src * nb02 + (ulong)i03_src * nb03); + + const float val_a = *p_a; + const float val_b = *p_b; + const float val_c = *p_c; + const float val_d = *p_d; + + float result = val_a * (1.0f - dx) * (1.0f - dy) + + val_b * dx * (1.0f - dy) + + val_c * (1.0f - dx) * dy + + val_d * dx * dy; + + dst_base[index] = result; +} diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 84ec6dfe31bfc..e389a46dbed87 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -2425,8 +2425,6 @@ void dequantize_row_iq1_m(const block_iq1_m * GGML_RESTRICT x, float * GGML_REST } } -static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; - void dequantize_row_iq4_nl(const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK4_NL == 0); const int64_t nb = k / QK4_NL; diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 4f0abb5a60f48..f468f796d5773 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -53,6 +53,9 @@ struct socket_t { } }; +// macro for nicer error messages on server crash +#define RPC_STATUS_ASSERT(x) if (!(x)) GGML_ABORT("Remote RPC server crashed or returned malformed response") + // all RPC structures must be packed #pragma pack(push, 1) // ggml_tensor is serialized into rpc_tensor @@ -425,7 +428,7 @@ static bool send_rpc_cmd(const std::shared_ptr & sock, enum rpc_cmd cm static bool check_server_version(const std::shared_ptr & sock) { rpc_msg_hello_rsp response; bool status = send_rpc_cmd(sock, RPC_CMD_HELLO, nullptr, 0, &response, sizeof(response)); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); if (response.major != RPC_PROTO_MAJOR_VERSION || response.minor > RPC_PROTO_MINOR_VERSION) { fprintf(stderr, "RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch); return false; @@ -481,7 +484,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; rpc_msg_free_buffer_req request = {ctx->remote_ptr}; bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); delete ctx; } @@ -493,7 +496,7 @@ static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t 
buffer) { rpc_msg_buffer_get_base_req request = {ctx->remote_ptr}; rpc_msg_buffer_get_base_rsp response; bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_GET_BASE, &request, sizeof(request), &response, sizeof(response)); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); ctx->base_ptr = reinterpret_cast(response.base_ptr); return ctx->base_ptr; } @@ -545,7 +548,7 @@ static enum ggml_status ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_ request.tensor = serialize_tensor(tensor); bool status = send_rpc_cmd(ctx->sock, RPC_CMD_INIT_TENSOR, &request, sizeof(request), nullptr, 0); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); } return GGML_STATUS_SUCCESS; } @@ -560,7 +563,7 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm request.hash = fnv_hash((const uint8_t*)data, size); rpc_msg_set_tensor_hash_rsp response; bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, &request, sizeof(request), &response, sizeof(response)); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); if (response.result) { // the server has the same data, no need to send it return; @@ -573,7 +576,7 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset)); memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size); bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size()); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); } static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { @@ -583,7 +586,7 @@ static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, con request.offset = offset; request.size = size; bool status = send_rpc_cmd(ctx->sock, RPC_CMD_GET_TENSOR, &request, sizeof(request), data, size); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); } static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { @@ -601,7 +604,7 @@ static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con request.dst = serialize_tensor(dst); rpc_msg_copy_tensor_rsp response; bool status = send_rpc_cmd(ctx->sock, RPC_CMD_COPY_TENSOR, &request, sizeof(request), &response, sizeof(response)); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); return response.result; } @@ -609,7 +612,7 @@ static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; rpc_msg_buffer_clear_req request = {ctx->remote_ptr, value}; bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_CLEAR, &request, sizeof(request), nullptr, 0); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); } static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = { @@ -635,7 +638,7 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back rpc_msg_alloc_buffer_rsp response; auto sock = get_socket(buft_ctx->endpoint); bool status = send_rpc_cmd(sock, RPC_CMD_ALLOC_BUFFER, &request, sizeof(request), &response, sizeof(response)); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); if (response.remote_ptr != 0) { ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_rpc_buffer_interface, @@ -650,7 +653,7 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back static size_t get_alignment(const std::shared_ptr & sock) { 
rpc_msg_get_alignment_rsp response; bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALIGNMENT, nullptr, 0, &response, sizeof(response)); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); return response.alignment; } @@ -662,7 +665,7 @@ static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_typ static size_t get_max_size(const std::shared_ptr & sock) { rpc_msg_get_max_size_rsp response; bool status = send_rpc_cmd(sock, RPC_CMD_GET_MAX_SIZE, nullptr, 0, &response, sizeof(response)); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); return response.max_size; } @@ -683,7 +686,7 @@ static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_ty rpc_msg_get_alloc_size_rsp response; bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALLOC_SIZE, &request, sizeof(request), &response, sizeof(response)); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); return response.alloc_size; } else { @@ -761,7 +764,7 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g rpc_msg_graph_compute_rsp response; auto sock = get_socket(rpc_ctx->endpoint); bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_COMPUTE, input.data(), input.size(), &response, sizeof(response)); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); return (enum ggml_status)response.result; } @@ -835,7 +838,7 @@ bool ggml_backend_is_rpc(ggml_backend_t backend) { static void get_device_memory(const std::shared_ptr & sock, size_t * free, size_t * total) { rpc_msg_get_device_memory_rsp response; bool status = send_rpc_cmd(sock, RPC_CMD_GET_DEVICE_MEMORY, nullptr, 0, &response, sizeof(response)); - GGML_ASSERT(status); + RPC_STATUS_ASSERT(status); *free = response.free_mem; *total = response.total_mem; } diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index a2e26124802b2..efd78b912cc65 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -13,7 +13,7 @@ elseif(SUPPORTS_SYCL) If you expected the oneAPI Release compiler, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh") else() - message(FATAL_ERROR, "C++ compiler lacks SYCL support.") + message(FATAL_ERROR "C++ compiler lacks SYCL support.") endif() message(STATUS "SYCL found") #todo: AOT @@ -142,7 +142,7 @@ else() FetchContent_Declare( ONEMATH GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git - GIT_TAG c255b1b4c41e2ee3059455c1f96a965d6a62568a + GIT_TAG 8efe85f5aaebb37f1d8c503b7af66315feabf142 ) FetchContent_MakeAvailable(ONEMATH) # Create alias to match with find_package targets name @@ -170,7 +170,7 @@ else() target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA) elseif (GGML_SYCL_TARGET STREQUAL "AMD") if (NOT GGML_SYCL_DEVICE_ARCH) - message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.") + message(FATAL_ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.") endif() target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas) target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa") diff --git a/ggml/src/ggml-sycl/binbcast.cpp b/ggml/src/ggml-sycl/binbcast.cpp index 0a3883ae1eda5..741630dba342c 100644 --- a/ggml/src/ggml-sycl/binbcast.cpp +++ b/ggml/src/ggml-sycl/binbcast.cpp @@ -225,9 +225,9 @@ struct bin_bcast_sycl { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * - sycl::range<3>(1, 1, block_size), + sycl_parallel_for( + stream, + 
sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * sycl::range<3>(1, 1, block_size), sycl::range<3>(1, 1, block_size)), [=](sycl::nd_item<3> item_ct1) { k_bin_bcast_unravel( @@ -246,9 +246,8 @@ struct bin_bcast_sycl { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { k_bin_bcast(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02, s03, s11, s12, s13, diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 15ee9dc69d149..4e7449d06ecfe 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -149,8 +149,6 @@ typedef sycl::float2 dfloat2; #define MMVQ_MAX_BATCH_SIZE 8 -static const int8_t kvalues_iq4nl[16]={-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; - static int g_all_sycl_device_count = -1; static bool g_ggml_backend_sycl_buffer_type_initialized = false; @@ -201,7 +199,7 @@ struct sycl_device_info { // size_t smpb; // max. shared memory per block bool vmm; // virtual memory support size_t total_vram; - sycl_hw_info hw_info; + //sycl_hw_info hw_info; \\ device id and aarch, currently not used optimize_feature opt_feature; }; @@ -288,29 +286,6 @@ struct ggml_tensor_extra_gpu { void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams={}); -inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) { - optimize_feature opt; - - opt.reorder = - (arch == syclex::architecture::intel_gpu_dg1 || - arch == syclex::architecture::intel_gpu_acm_g10 || - arch == syclex::architecture::intel_gpu_acm_g11 || - arch == syclex::architecture::intel_gpu_acm_g12 || - arch == syclex::architecture::intel_gpu_pvc || - arch == syclex::architecture::intel_gpu_pvc_vg || - arch == syclex::architecture::intel_gpu_mtl_u || - arch == syclex::architecture::intel_gpu_mtl_s || - arch == syclex::architecture::intel_gpu_mtl_h || - arch == syclex::architecture::intel_gpu_arl_u || - arch == syclex::architecture::intel_gpu_arl_s || - arch == syclex::architecture::intel_gpu_arl_h || - arch == syclex::architecture::intel_gpu_bmg_g21 || - arch == syclex::architecture::intel_gpu_lnl_m - ); - - return opt; -} - namespace sycl_ex = sycl::ext::oneapi::experimental; struct ggml_backend_sycl_context { int device; @@ -515,9 +490,9 @@ constexpr size_t ceil_div(const size_t m, const size_t n) { bool gpu_has_xmx(sycl::device &dev); -template void debug_print_array(const std::string & prefix, const T array[N]) { +template std::string debug_get_array_str(const std::string & prefix, const T array[N]) { if (LIKELY(!g_ggml_sycl_debug)) { - return; + return ""; } std::stringstream ss; ss << prefix << "=["; @@ -528,29 +503,26 @@ template void debug_print_array(const std::string & prefix, con ss << array[N - 1]; } ss << "]"; - GGML_SYCL_DEBUG("%s", ss.str().c_str()); + return ss.str(); } -inline void debug_print_tensor(const std::string & prefix, const ggml_tensor * tensor, - const std::string & suffix = "") { - if (LIKELY(!g_ggml_sycl_debug)) { - return; - } - GGML_SYCL_DEBUG("%s=", prefix.c_str()); +inline std::string debug_get_tensor_str(const std::string &prefix, + const ggml_tensor *tensor, const std::string &suffix = "") { + std::stringstream ss; + if (LIKELY(!g_ggml_sycl_debug)) { return ss.str(); } + ss << prefix.c_str() << "="; if 
(tensor) { - GGML_SYCL_DEBUG("'%s':type=%s", tensor->name, ggml_type_name(tensor->type)); - debug_print_array(";ne", tensor->ne); - debug_print_array(";nb", tensor->nb); - if (!ggml_is_contiguous(tensor)) { - GGML_SYCL_DEBUG(";strided"); - } - if (ggml_is_permuted(tensor)) { - GGML_SYCL_DEBUG(";permuted"); - } + ss << "'" << tensor->name << "':type=" << ggml_type_name(tensor->type); + ss << debug_get_array_str(";ne", tensor->ne); + ss << debug_get_array_str(";nb", tensor->nb); + + if (!ggml_is_contiguous(tensor)) { ss << ";strided"; } + if (ggml_is_permuted(tensor)) { ss << ";permuted"; } } else { - GGML_SYCL_DEBUG("nullptr"); + ss << "nullptr"; } - GGML_SYCL_DEBUG("%s", suffix.c_str()); + ss << suffix; + return ss.str(); } // Use scope_op_debug_print to log operations coming from running a model @@ -566,10 +538,10 @@ struct scope_op_debug_print { return; } GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data()); - debug_print_tensor(" dst", dst); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" dst", dst).c_str()); if (dst) { for (std::size_t i = 0; i < num_src; ++i) { - debug_print_tensor("\tsrc" + std::to_string(i), dst->src[i]); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str("\tsrc" + std::to_string(i), dst->src[i]).c_str()); } } GGML_SYCL_DEBUG("%s\n", suffix.data()); diff --git a/ggml/src/ggml-sycl/concat.cpp b/ggml/src/ggml-sycl/concat.cpp index 7aa91c861d583..3501484a14611 100644 --- a/ggml/src/ggml-sycl/concat.cpp +++ b/ggml/src/ggml-sycl/concat.cpp @@ -89,33 +89,24 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst, sycl::range<3> gridDim(ne2, ne1, num_blocks); switch (dim) { case 0: - stream->parallel_for( - sycl::nd_range<3>(gridDim * - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1); - }); - break; + sycl_parallel_for(stream, + sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1); }); + break; case 1: - stream->parallel_for( - sycl::nd_range<3>(gridDim * - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1); - }); - break; + sycl_parallel_for(stream, + sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1); }); + break; // dim >=2 will be dispatched to the default path default: - stream->parallel_for( - sycl::nd_range<3>(gridDim * - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1); - }); - break; + sycl_parallel_for(stream, + sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1); }); + break; } } @@ -129,33 +120,29 @@ static void concat_f32_sycl_non_cont( int64_t ne2, int64_t ne3, uint64_t nb0, uint64_t nb1, uint64_t nb2, uint64_t nb3, int32_t dim) { sycl::range<3> gridDim(ne3, ne2, ne1); - stream->parallel_for( - sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - int64_t 
i3 = item_ct1.get_group(0); - int64_t i2 = item_ct1.get_group(1); - int64_t i1 = item_ct1.get_group(2); + sycl_parallel_for(stream, sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { + int64_t i3 = item_ct1.get_group(0); + int64_t i2 = item_ct1.get_group(1); + int64_t i1 = item_ct1.get_group(2); - int64_t o[4] = {0, 0, 0, 0}; - o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03)); + int64_t o[4] = { 0, 0, 0, 0 }; + o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03)); - const float *x; + const float * x; - for (int i0 = item_ct1.get_local_id(2); i0 < ne0; - i0 += item_ct1.get_local_range(2)) { + for (int i0 = item_ct1.get_local_id(2); i0 < ne0; i0 += item_ct1.get_local_range(2)) { if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { - x = (const float *)(src0 + (i3)*nb03 + (i2)*nb02 + (i1)*nb01 + - (i0)*nb00); + x = (const float *) (src0 + (i3) *nb03 + (i2) *nb02 + (i1) *nb01 + (i0) *nb00); } else { - x = (const float *)(src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 + - (i1 - o[1]) * nb11 + (i0 - o[0]) * nb10); + x = (const float *) (src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 + (i1 - o[1]) * nb11 + + (i0 - o[0]) * nb10); } float *y = (float *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0); *y = *x; - } - }); + } + }); } void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { diff --git a/ggml/src/ggml-sycl/conv.cpp b/ggml/src/ggml-sycl/conv.cpp index 475bd34a25d56..c2f991e8d64a7 100644 --- a/ggml/src/ggml-sycl/conv.cpp +++ b/ggml/src/ggml-sycl/conv.cpp @@ -59,16 +59,10 @@ static void conv_transpose_1d_f32_f32_sycl( const int num_blocks = (output_size + SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE; const sycl::range<3> block_dims(1, 1, SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE); const sycl::range<3> block_nums(1, 1, num_blocks); - stream->parallel_for( - sycl::nd_range<3>( - block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - conv_transpose_1d_kernel( - s0, output_size, - src0_ne0, src0_ne1, src0_ne2, - src1_ne0, dst_ne0, - src0, src1, dst, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + conv_transpose_1d_kernel(s0, output_size, src0_ne0, src0_ne1, src0_ne2, src1_ne0, dst_ne0, src0, src1, dst, + item_ct1); + }); } void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { diff --git a/ggml/src/ggml-sycl/convert.cpp b/ggml/src/ggml-sycl/convert.cpp index 75bac98e5fb64..0ef567122dddb 100644 --- a/ggml/src/ggml-sycl/convert.cpp +++ b/ggml/src/ggml-sycl/convert.cpp @@ -33,14 +33,11 @@ static void dequantize_block_sycl(const void *__restrict__ vx, { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>( - sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block(vx, y, k, item_ct1); - }); + sycl_parallel_for( + stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block(vx, y, k, item_ct1); }); } } @@ -53,24 +50,18 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int64_t k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - 
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 64), - sycl::range<3>(1, 1, 64)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q2_K(vx, y, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q2_K(vx, y, item_ct1); }); } #else { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q2_K(vx, y, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q2_K(vx, y, item_ct1); }); } #endif @@ -85,24 +76,18 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int64_t k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 64), - sycl::range<3>(1, 1, 64)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q3_K(vx, y, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q3_K(vx, y, item_ct1); }); } #else { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q3_K(vx, y, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q3_K(vx, y, item_ct1); }); } #endif } @@ -116,12 +101,9 @@ static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q4_0(vx, y, nb32, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q4_0(vx, y, nb32, item_ct1); }); } } @@ -135,13 +117,12 @@ static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int int constexpr WARP_K = WARP_SIZE * QK4_0; const int n_warp = (k + WARP_K - 1) / WARP_K; GGML_ASSERT(k % 2 == 0); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) * - sycl::range<3>(1, 1, WARP_SIZE), - sycl::range<3>(1, 1, WARP_SIZE)), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{ - dequantize_block_q4_0_reorder(vx, y, k, item_ct1); - }); - + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) * sycl::range<3>(1, 1, WARP_SIZE), + sycl::range<3>(1, 1, WARP_SIZE)), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_block_q4_0_reorder(vx, y, k, item_ct1); + }); } template @@ -153,12 +134,9 @@ static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k, 
dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q4_1(vx, y, nb32, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q4_1(vx, y, nb32, item_ct1); }); } } @@ -171,14 +149,13 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor scale_local_acc(sycl::range<1>(12), cgh); - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1); - }); + sycl_parallel_for( + cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1); + }); }); } } @@ -191,13 +168,13 @@ static void dequantize_row_q4_K_sycl_reorder(const void * vx, dst_t * y, const i dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); - stream->submit([&](sycl::handler & cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor scale_local_acc(sycl::range<1>(12), cgh); - cgh.parallel_for(sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)), - [=](sycl::nd_item<1> item_ct1) { - dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb); - }); + sycl_parallel_for<1>(cgh, sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)), + [=](sycl::nd_item<1> item_ct1) { + dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb); + }); }); } @@ -210,24 +187,18 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 64), - sycl::range<3>(1, 1, 64)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q5_K(vx, y, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q5_K(vx, y, item_ct1); }); } #else { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q5_K(vx, y, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q5_K(vx, y, item_ct1); }); } #endif @@ -242,29 +213,34 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 64), - sycl::range<3>(1, 1, 64)), - [=](sycl::nd_item<3> item_ct1) { - 
dequantize_block_q6_K(vx, y, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K(vx, y, item_ct1); }); } #else { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q6_K(vx, y, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K(vx, y, item_ct1); }); } #endif } +template +static void dequantize_row_q6_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) { + const int64_t nb = k / QK_K; + + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K_reorder(vx, y, item_ct1, nb); }); +} + template static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int64_t k, dpct::queue_ptr stream) { @@ -273,15 +249,10 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int64_t k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq1_s( - vx, y, item_ct1, iq1s_grid_gpu - ); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq1_s(vx, y, item_ct1, iq1s_grid_gpu); }); }); } } @@ -294,15 +265,10 @@ static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int64_t k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq1_m( - vx, y, item_ct1, iq1s_grid_gpu - ); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq1_m(vx, y, item_ct1, iq1s_grid_gpu); }); }); } } @@ -315,15 +281,12 @@ static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int64_t dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq2_xxs( - vx, y, item_ct1, iq2xxs_grid, - ksigns_iq2xs, kmask_iq2xs); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq2_xxs(vx, y, item_ct1, iq2xxs_grid, ksigns_iq2xs, 
kmask_iq2xs); + }); }); } } @@ -336,15 +299,12 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int64_t k dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq2_xs( - vx, y, item_ct1, iq2xs_grid, - ksigns_iq2xs, kmask_iq2xs); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq2_xs(vx, y, item_ct1, iq2xs_grid, ksigns_iq2xs, kmask_iq2xs); + }); }); } } @@ -357,13 +317,10 @@ static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int64_t k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq2_s(vx, y, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq2_s(vx, y, item_ct1); }); }); } } @@ -377,15 +334,12 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int64_t dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq3_xxs( - vx, y, item_ct1, iq3xxs_grid, - ksigns_iq2xs, kmask_iq2xs); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq3_xxs(vx, y, item_ct1, iq3xxs_grid, ksigns_iq2xs, kmask_iq2xs); + }); }); } } @@ -398,14 +352,10 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int64_t k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq3_s( - vx, y, item_ct1, kmask_iq2xs, iq3s_grid); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq3_s(vx, y, item_ct1, kmask_iq2xs, iq3s_grid); }); }); } } @@ -421,14 +371,11 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int64_t k dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq4_xs(vx, y, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for( + cgh, + 
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq4_xs(vx, y, item_ct1); }); }); } #endif @@ -442,14 +389,11 @@ static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq4_nl(vx, y, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for( + cgh, + sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq4_nl(vx, y, item_ct1); }); }); } } @@ -530,7 +474,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) { case GGML_TYPE_Q5_K: return dequantize_row_q5_K_sycl; case GGML_TYPE_Q6_K: - return dequantize_row_q6_K_sycl; + if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { + return dequantize_row_q6_K_sycl_reorder; + } else { + return dequantize_row_q6_K_sycl; + } case GGML_TYPE_IQ1_S: return dequantize_row_iq1_s_sycl; case GGML_TYPE_IQ1_M: @@ -587,7 +535,11 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) { case GGML_TYPE_Q5_K: return dequantize_row_q5_K_sycl; case GGML_TYPE_Q6_K: - return dequantize_row_q6_K_sycl; + if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { + return dequantize_row_q6_K_sycl_reorder; + } else { + return dequantize_row_q6_K_sycl; + } case GGML_TYPE_IQ1_S: return dequantize_row_iq1_s_sycl; case GGML_TYPE_IQ1_M: diff --git a/ggml/src/ggml-sycl/cpy.cpp b/ggml/src/ggml-sycl/cpy.cpp index 44487c25646d6..1ffd7f1226724 100644 --- a/ggml/src/ggml-sycl/cpy.cpp +++ b/ggml/src/ggml-sycl/cpy.cpp @@ -1,8 +1,12 @@ #include "cpy.hpp" #include +#include #include "dequantize.hpp" +#include "ggml-sycl/common.hpp" +#include "ggml-sycl/presets.hpp" +#include "ggml.h" static __dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) { if (x <= val[0]) { @@ -116,6 +120,15 @@ static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { } } +/* quantized type same copy */ +template +static void cpy_blck_q_q(const char * cxi, char * cdsti) { + const T * xi = (const T *) cxi; + T * dsti = (T *) cdsti; + *dsti = *xi; +} + + static void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) { float * cdstf = (float *) (cdsti); @@ -311,6 +324,34 @@ template static void cpy_blck_q_f32(const } } + +template +static void cpy_q_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, + const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, + const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, + const sycl::nd_item<3> & item_ct1) { + const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2)) * qk; + + if (i >= ne) { + return; + } + + const int i03 = i / (ne00 * ne01 * ne02); + const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01); + const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00; + const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00; + const int x_offset = (i00 / qk) * nb00 + i01 * nb01 + i02 * nb02 + i03 
* nb03; + + + const int i13 = i / (ne10 * ne11 * ne12); + const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11); + const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10; + const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10; + const int dst_offset = (i10 / qk) * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13; + + cpy_blck_q_q(cx + x_offset, cdst + dst_offset); +} + template static void cpy_f32_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, @@ -322,6 +363,7 @@ static void cpy_f32_q(const char * cx, char * cdst, const int ne, const int ne00 return; } + const int i03 = i / (ne00 * ne01 * ne02); const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01); const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00; @@ -371,7 +413,8 @@ static void ggml_cpy_f16_f32_sycl(const char * cx, char * cdst, const int ne, co { dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); - stream->parallel_for( + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { @@ -389,7 +432,8 @@ static void ggml_cpy_f32_f32_sycl(const char * cx, char * cdst, const int ne, co { dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); - stream->parallel_for( + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { @@ -407,7 +451,8 @@ static void ggml_cpy_f32_f16_sycl(const char * cx, char * cdst, const int ne, co { dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); - stream->parallel_for( + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { @@ -423,11 +468,11 @@ static void ggml_cpy_f32_q8_0_sycl(const char * cx, char * cdst, const int ne, c const int nb12, const int nb13, queue_ptr stream) { GGML_ASSERT(ne % QK8_0 == 0); const int num_blocks = ne / QK8_0; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); } static void ggml_cpy_q8_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, @@ -435,11 +480,11 @@ static void ggml_cpy_q8_0_f32_sycl(const char * cx, char * cdst, const int ne, c const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, queue_ptr stream) { const int num_blocks = ne; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_q_f32(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); + 
sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_q_f32(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); } static void ggml_cpy_f32_q4_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, @@ -448,11 +493,11 @@ static void ggml_cpy_f32_q4_0_sycl(const char * cx, char * cdst, const int ne, c const int nb12, const int nb13, queue_ptr stream) { GGML_ASSERT(ne % QK4_0 == 0); const int num_blocks = ne / QK4_0; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); } static void ggml_cpy_q4_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, @@ -460,8 +505,9 @@ static void ggml_cpy_q4_0_f32_sycl(const char * cx, char * cdst, const int ne, c const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, queue_ptr stream) { const int num_blocks = ne; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { cpy_q_f32, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); @@ -474,11 +520,11 @@ static void ggml_cpy_f32_q4_1_sycl(const char * cx, char * cdst, const int ne, c const int nb12, const int nb13, queue_ptr stream) { GGML_ASSERT(ne % QK4_1 == 0); const int num_blocks = ne / QK4_1; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); } static void ggml_cpy_q4_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, @@ -486,8 +532,9 @@ static void ggml_cpy_q4_1_f32_sycl(const char * cx, char * cdst, const int ne, c const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, queue_ptr stream) { const int num_blocks = ne; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { cpy_q_f32, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); @@ -500,11 +547,11 @@ static void ggml_cpy_f32_q5_0_sycl(const char * cx, char * cdst, const int ne, 
c const int nb12, const int nb13, queue_ptr stream) { GGML_ASSERT(ne % QK5_0 == 0); const int num_blocks = ne / QK5_0; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); } static void ggml_cpy_q5_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, @@ -512,8 +559,9 @@ static void ggml_cpy_q5_0_f32_sycl(const char * cx, char * cdst, const int ne, c const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, queue_ptr stream) { const int num_blocks = ne; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { cpy_q_f32, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); @@ -526,11 +574,11 @@ static void ggml_cpy_f32_q5_1_sycl(const char * cx, char * cdst, const int ne, c const int nb12, const int nb13, queue_ptr stream) { GGML_ASSERT(ne % QK5_1 == 0); const int num_blocks = ne / QK5_1; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); } static void ggml_cpy_q5_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, @@ -538,8 +586,9 @@ static void ggml_cpy_q5_1_f32_sycl(const char * cx, char * cdst, const int ne, c const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, queue_ptr stream) { const int num_blocks = ne; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { cpy_q_f32, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); @@ -552,11 +601,11 @@ static void ggml_cpy_f32_iq4_nl_sycl(const char * cx, char * cdst, const int ne, const int nb12, const int nb13, queue_ptr stream) { GGML_ASSERT(ne % QK4_NL == 0); const int num_blocks = ne / QK4_NL; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, - ne12, nb10, nb11, nb12, nb13, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), 
sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); } static void ggml_cpy_f16_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, @@ -567,7 +616,8 @@ static void ggml_cpy_f16_f16_sycl(const char * cx, char * cdst, const int ne, co { dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); - stream->parallel_for( + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { @@ -586,7 +636,8 @@ static void ggml_cpy_i16_i16_sycl(const char * cx, char * cdst, const int ne, co // dpct::has_capability_or_fail(stream->get_device(), // {sycl::aspect::fp16}); - stream->parallel_for( + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { @@ -605,7 +656,8 @@ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, co // dpct::has_capability_or_fail(stream->get_device(), // {sycl::aspect::fp16}); - stream->parallel_for( + sycl_parallel_for( + stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { @@ -615,10 +667,85 @@ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, co } } +static void ggml_cpy_q8_0_q8_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_q_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, + ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + + +static void ggml_cpy_q5_0_q5_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_q_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, + ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + + +static void ggml_cpy_q5_1_q5_1(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE); + + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, 
num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_q_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, + ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + + +static void ggml_cpy_q4_0_q4_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_q_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, + ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + + +static void ggml_cpy_q4_1_q4_1(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + + const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_q_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, + ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try { // Unlike other operators ggml_sycl_cpy takes 2 distinct tensors instead of a dst ggml_tensor and rely on its src field - scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0, - std::string(" src0 type=") + ggml_type_name(src0->type)); + scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0, debug_get_tensor_str("\tsrc0", src0)); const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne == ggml_nelements(src1)); @@ -632,8 +759,10 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co char * src0_ddc = (char *) src0->data; char * src1_ddc = (char *) src1->data; - - if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { + if ((src0->type == src1->type) && (ggml_is_contiguous(src0) && ggml_is_contiguous(src1))) { + GGML_SYCL_DEBUG("%s: memcpy path\n", __func__); + main_stream->memcpy(src1_ddc, src0_ddc, ggml_nbytes(src0)); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { @@ -684,6 +813,16 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) { ggml_cpy_f32_iq4_nl_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_Q8_0) { + ggml_cpy_q8_0_q8_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, 
nb13, main_stream); + } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_Q5_0) { + ggml_cpy_q5_0_q5_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_Q5_1) { + ggml_cpy_q5_1_q5_1(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_Q4_0) { + ggml_cpy_q4_0_q4_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_Q4_1) { + ggml_cpy_q4_1_q4_1(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else { GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); diff --git a/ggml/src/ggml-sycl/dequantize.hpp b/ggml/src/ggml-sycl/dequantize.hpp index 64e92f73f26c8..540539bb22381 100644 --- a/ggml/src/ggml-sycl/dequantize.hpp +++ b/ggml/src/ggml-sycl/dequantize.hpp @@ -538,6 +538,38 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri #endif } +template +static void dequantize_block_q6_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> & item_ct1, int64_t n_blocks) { + const int64_t ib = item_ct1.get_group(2); + + const int64_t tid = item_ct1.get_local_id(2); + const int64_t ip = tid / 32; // ip is 0 or 1 + const int64_t il = tid - 32 * ip; // 0...32 + const int64_t is = 8 * ip + il / 16; + + const uint8_t * base_ptr = static_cast(vx); + const auto ql_offset = ib * (QK_K / 2); + const auto qh_offset = (QK_K / 2) * n_blocks + (QK_K / 4) * ib; + const auto base_scales_offset = (QK_K / 2) * n_blocks + (QK_K / 4) * n_blocks + (QK_K / 16) * ib; + const auto base_d_offset = ((QK_K / 2) + (QK_K / 4) + (QK_K / 16)) * n_blocks; + const uint8_t * ql_ptr = base_ptr + ql_offset; + const uint8_t * qh_ptr = base_ptr + qh_offset; + const uint8_t * scales_ptr = base_ptr + base_scales_offset; + const ggml_half * d = (const ggml_half *) (base_ptr + base_d_offset) + ib; + + dst_t * y = yy + ib * QK_K + 128 * ip + il; + + const uint8_t * ql = ql_ptr + 64 * ip + il; + const uint8_t qh = *(qh_ptr + 32 * ip + il); + const int8_t * sc = reinterpret_cast(scales_ptr + is); + + y[0] = *d * sc[0] * ((int8_t) ((ql[0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = *d * sc[2] * ((int8_t) ((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); + y[64] = *d * sc[4] * ((int8_t) ((ql[0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); + y[96] = *d * sc[6] * ((int8_t) ((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); +} + template static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy, const sycl::nd_item<3> &item_ct1, diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp index 4f2760110c212..70579c0c3be11 100644 --- a/ggml/src/ggml-sycl/dmmv.cpp +++ b/ggml/src/ggml-sycl/dmmv.cpp @@ -208,12 +208,10 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, 
ncols, - nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols, nrows, item_ct1); + }); } } @@ -877,12 +875,11 @@ static void dequantize_mul_mat_vec_q4_0_sycl_reorder(const void *vx, const dfloa dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - dequantize_mul_mat_vec_reorder( - vx, y, dst, ncols, nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec_reorder(vx, y, dst, ncols, + nrows, item_ct1); + }); } } @@ -900,12 +897,10 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - dequantize_mul_mat_vec( - vx, y, dst, ncols, nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec(vx, y, dst, ncols, nrows, item_ct1); + }); } } @@ -921,12 +916,10 @@ static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - dequantize_mul_mat_vec( - vx, y, dst, ncols, nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec(vx, y, dst, ncols, nrows, item_ct1); + }); } } @@ -942,12 +935,10 @@ static void dequantize_mul_mat_vec_q5_0_sycl(const void *vx, const dfloat *y, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - dequantize_mul_mat_vec( - vx, y, dst, ncols, nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec(vx, y, dst, ncols, nrows, item_ct1); + }); } } @@ -963,12 +954,10 @@ static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - dequantize_mul_mat_vec( - vx, y, dst, ncols, nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec(vx, y, dst, ncols, nrows, item_ct1); + }); } } @@ -984,12 +973,10 @@ static void dequantize_mul_mat_vec_q8_0_sycl(const 
void *vx, const dfloat *y, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - dequantize_mul_mat_vec( - vx, y, dst, ncols, nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec(vx, y, dst, ncols, nrows, item_ct1); + }); } } @@ -1002,11 +989,10 @@ static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y, const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { - dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1); + }); } static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y, @@ -1018,11 +1004,10 @@ static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y, const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { - dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1); + }); } static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y, @@ -1034,11 +1019,10 @@ static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y, const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { - dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1); + }); } static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y, @@ -1047,11 +1031,10 @@ static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { - dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), + [=](sycl::nd_item<3> 
item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1); + }); } static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y, @@ -1063,11 +1046,10 @@ static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y, const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { - dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1); + }); } void ggml_sycl_op_dequantize_mul_mat_vec( diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index d538965b096bf..27c7278607832 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -13,10 +13,10 @@ #ifndef GGML_SYCL_DPCT_HELPER_HPP #define GGML_SYCL_DPCT_HELPER_HPP +#include #include #include #include -#include #ifdef GGML_SYCL_USE_INTEL_ONEMKL #include @@ -118,6 +118,36 @@ inline auto get_onemath_backend(sycl::queue& queue) #endif } +#ifdef SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS + namespace syclex = sycl::ext::oneapi::experimental; +#endif + +template +__dpct_inline__ void sycl_parallel_for(sycl::handler & cgh, sycl::nd_range nd_range, Func && func) { +#ifdef SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS + syclex::nd_launch(cgh, nd_range, func); +#else + cgh.parallel_for(nd_range, func); +#endif +} + +template +__dpct_inline__ void sycl_parallel_for(sycl::queue * q, sycl::nd_range nd_range, Func && func) { +#ifdef SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS + syclex::nd_launch(*q, nd_range, func); +#else + q->parallel_for(nd_range, func); +#endif +} + +template __dpct_inline__ void sycl_launch(sycl::queue * stream, Func && func) { +#ifdef SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS + syclex::submit(*stream, func); +#else + stream->submit(func); +#endif +} + namespace dpct { typedef sycl::queue *queue_ptr; diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index 5b7c4f0b4f003..c56924ce8322f 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -329,60 +329,51 @@ static void acc_f32_sycl(const float *x, const float *y, float *dst, const int ne12, const int nb1, const int nb2, const int offset, queue_ptr stream) { int num_blocks = (n_elements + SYCL_ACC_BLOCK_SIZE - 1) / SYCL_ACC_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset, - item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset, item_ct1); + }); } template static void gelu_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE; - stream->parallel_for( - 
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - gelu(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { gelu(x, dst, k, item_ct1); }); } template static void silu_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - silu(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { silu(x, dst, k, item_ct1); }); } template static void sgn_sycl(const T * x, T * dst, const int k, queue_ptr stream) { // hard code for now const int num_blocks = ceil_div(k, 256); - stream->parallel_for( - sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range(1, 1, 256)), sycl::range(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) { - sgn(x, dst, k, item_ct1); - }); + sycl_parallel_for( + stream, sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range(1, 1, 256)), sycl::range(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { sgn(x, dst, k, item_ct1); }); } template static void abs_sycl(const T * x, T * dst, const int k, queue_ptr stream) { // hard code for now const int num_blocks = ceil_div(k, 256); - stream->parallel_for( - sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) { - abs_op(x, dst, k, item_ct1); - }); + sycl_parallel_for( + stream, + sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { abs_op(x, dst, k, item_ct1); }); } @@ -390,23 +381,20 @@ template static void elu_sycl(const T * x, T * dst, const int k, queue_ptr stream) { // hard code for now const int num_blocks = ceil_div(k, 256); - stream->parallel_for( - sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) { - elu_op(x, dst, k, item_ct1); - }); + sycl_parallel_for( + stream, + sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), + [=](sycl::nd_item<3> item_ct1) { elu_op(x, dst, k, item_ct1); }); } template static void gelu_quick_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - gelu_quick(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { gelu_quick(x, dst, k, item_ct1); }); } @@ -414,169 +402,133 @@ template static void gelu_erf_sycl(const T *x, T *dst, const 
int k, queue_ptr stream) { const int num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - gelu_erf(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { gelu_erf(x, dst, k, item_ct1); }); } template static void tanh_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - tanh(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { tanh(x, dst, k, item_ct1); }); } template static void relu_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - relu(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { relu(x, dst, k, item_ct1); }); } template static void hardsigmoid_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE), + sycl_parallel_for( + stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - hardsigmoid(x, dst, k, item_ct1); - }); + [=](sycl::nd_item<3> item_ct1) { hardsigmoid(x, dst, k, item_ct1); }); } template static void hardswish_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE), + sycl_parallel_for( + stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - hardswish(x, dst, k, item_ct1); - }); + [=](sycl::nd_item<3> item_ct1) { hardswish(x, dst, k, item_ct1); }); } template static void exp_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - exp(x, dst, k, 
item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { exp(x, dst, k, item_ct1); }); } template static void log_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - log(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { log(x, dst, k, item_ct1); }); } template static void neg_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - neg(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { neg(x, dst, k, item_ct1); }); } template static void step_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - step(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { step(x, dst, k, item_ct1); }); } template static void sigmoid_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_SIGMOID_BLOCK_SIZE - 1) / SYCL_SIGMOID_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE), + sycl_parallel_for( + stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - sigmoid(x, dst, k, item_ct1); - }); + [=](sycl::nd_item<3> item_ct1) { sigmoid(x, dst, k, item_ct1); }); } template static void sqrt_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_SQRT_BLOCK_SIZE - 1) / SYCL_SQRT_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - sqrt(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { sqrt(x, dst, k, item_ct1); }); } template static void sin_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + 
SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - sin(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { sin(x, dst, k, item_ct1); }); } template static void cos_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cos(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { cos(x, dst, k, item_ct1); }); } template @@ -584,26 +536,20 @@ static void leaky_relu_sycl(const T *x, T *dst, const int k, const float negative_slope, queue_ptr stream) { const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - leaky_relu(x, dst, k, negative_slope, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { leaky_relu(x, dst, k, negative_slope, item_ct1); }); } template static void sqr_sycl(const T *x, T *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - sqr(x, dst, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { sqr(x, dst, k, item_ct1); }); } template @@ -614,9 +560,8 @@ static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01, int dst_size = ne10 * ne11 * ne12 * ne13; int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE; sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { + sycl_parallel_for<1>( + stream, sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { upscale(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1); }); } @@ -627,12 +572,10 @@ static void pad_sycl(const T *x, T *dst, const int ne00, const int ne1, const int ne2, queue_ptr stream) { int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE; sycl::range<3> gridDim(ne2, ne1, num_blocks); - stream->parallel_for( - sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE), - 
sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - pad(x, dst, ne0, ne00, ne01, ne02, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { pad(x, dst, ne0, ne00, ne01, ne02, item_ct1); }); } template @@ -640,13 +583,10 @@ static void clamp_sycl(const T *x, T *dst, const float min, const float max, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_CLAMP_BLOCK_SIZE - 1) / SYCL_CLAMP_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - clamp(x, dst, min, max, k, item_ct1); - }); + sycl_parallel_for(stream, + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { clamp(x, dst, min, max, k, item_ct1); }); } inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { diff --git a/ggml/src/ggml-sycl/gemm.hpp b/ggml/src/ggml-sycl/gemm.hpp index 6cbc7e0f6938c..5efe03d364b1b 100644 --- a/ggml/src/ggml-sycl/gemm.hpp +++ b/ggml/src/ggml-sycl/gemm.hpp @@ -65,6 +65,9 @@ class DnnlGemmWrapper { dnnl::primitive_attr primitive_attr; primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); +#ifdef GGML_SYCL_F16 + primitive_attr.set_fpmath_mode(dnnl::fpmath_mode::f16); +#endif auto a_mem = dnnl::memory(a_in_md, eng, const_cast(a)); auto b_mem = dnnl::memory(b_in_md, eng, const_cast(b)); diff --git a/ggml/src/ggml-sycl/getrows.cpp b/ggml/src/ggml-sycl/getrows.cpp index 4a7712781364e..9c76ffeb9508a 100644 --- a/ggml/src/ggml-sycl/getrows.cpp +++ b/ggml/src/ggml-sycl/getrows.cpp @@ -60,54 +60,6 @@ static void k_get_rows( dst_row[iybs + iqs + y_offset] = v.y(); } -template -static void k_get_rows_reorder( - const void * src0, const void *src0_dq, const int32_t * src1, dst_t * dst, - int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ - /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ - /*size_t s0,*/ size_t s1, size_t s2, size_t s3, - /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, - size_t s10, size_t s11, size_t s12, - const sycl::nd_item<3> &item_ct1/*, size_t s13*/) { - - const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) + - item_ct1.get_local_id(2)) * - 2; - const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1); - const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + - item_ct1.get_local_id(0)) / - ne12; - const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + - item_ct1.get_local_id(0)) % - ne12; - - if (i00 >= ne00) { - return; - } - auto ncols = ne00; - const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; - - dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; - - const int src0_off = i01 * ncols + i00; - const int ib = src0_off / QK4_0; // block index - const int iqs = (i00%qk)/qr; // x quant index - const int iybs = i00 - i00%qk; // dst block start index - const int y_offset = qr == 1 ? 
1 : qk/2; - - // dequantize - dfloat2 v; - dequantize_kernel_recorder((const void *)src0_dq, ib, (const void *)src0, src0_off/2, v); - - dst_row[iybs + iqs + 0] = v.x(); - dst_row[iybs + iqs + y_offset] = v.y(); - - GGML_UNUSED(nb01); - GGML_UNUSED(nb02); - GGML_UNUSED(nb03); -} - template static void k_get_rows_float( const src0_t * src0, const int32_t * src1, dst_t * dst, @@ -166,58 +118,15 @@ static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *sr GGML_ASSERT(ne00 % 2 == 0); - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_get_rows( - src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, - s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); - }); - - GGML_UNUSED(dst); - GGML_UNUSED(ctx); -} - -template -static void get_rows_sycl_reorder(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const void *src0_dd, - const int32_t *src1_dd, float *dst_dd, - queue_ptr stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - - const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE); - const int block_num_x = (ne00 + 2*SYCL_GET_ROWS_BLOCK_SIZE - 1) / (2*SYCL_GET_ROWS_BLOCK_SIZE); - const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x); - - // strides in elements - //const size_t s0 = nb0 / ggml_element_size(dst); - const size_t s1 = nb1 / ggml_element_size(dst); - const size_t s2 = nb2 / ggml_element_size(dst); - const size_t s3 = nb3 / ggml_element_size(dst); - - const size_t s10 = nb10 / ggml_element_size(src1); - const size_t s11 = nb11 / ggml_element_size(src1); - const size_t s12 = nb12 / ggml_element_size(src1); - //const size_t s13 = nb13 / ggml_element_size(src1); - - GGML_ASSERT(ne00 % 2 == 0); - - const uint8_t* src0_q = (const uint8_t*)src0_dd; - const size_t ncols = ne00; - const size_t nrows = ne01; - const sycl::half* src0_dq = (const sycl::half*)(src0_q + nrows * ncols / 2); - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{ - k_get_rows_reorder( - src0_dd, src0_dq, src1_dd, dst_dd, ne00, ne12, s1, s2, - s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + k_get_rows(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, s3, nb01, nb02, nb03, s10, s11, s12, + item_ct1); + }); GGML_UNUSED(dst); GGML_UNUSED(ctx); } - template static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, @@ -245,9 +154,8 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); }); @@ -277,13 +185,8 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { src1_i32, (float *)dst->data, ctx.stream()); break; case GGML_TYPE_Q4_0: - if (ctx.opt_feature.reorder && dst->op == GGML_OP_MUL_MAT) { - get_rows_sycl_reorder(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, - src1_i32, (float *)dst->data, 
ctx.stream()); - } else { - get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, - src1_i32, (float *)dst->data, ctx.stream()); - } + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); break; case GGML_TYPE_Q4_1: get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index bcd2ea5366f76..9cb36ae99e7f5 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -83,9 +83,7 @@ static ggml_sycl_device_info ggml_sycl_init() { info.devices[i].cc = 100 * prop.get_major_version() + 10 * prop.get_minor_version(); - info.devices[i].hw_info = get_device_hw_info(&device); - info.devices[i].opt_feature = check_gpu_optimize_feature(info.devices[i].hw_info.arch); - + info.devices[i].opt_feature.reorder = !device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu); info.max_work_group_sizes[i] = prop.get_max_work_group_size(); } @@ -195,7 +193,7 @@ static void ggml_check_sycl() try { if (!initialized) { g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); - g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1); + g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0); g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1); g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0); g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0); @@ -347,14 +345,15 @@ static enum ggml_status ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor, "\n"); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str()); ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context; if (tensor->view_src != NULL) { assert(tensor->view_src->buffer->buft == buffer->buft); return GGML_STATUS_SUCCESS; } - if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K) && !g_ggml_sycl_disable_optimize) { + if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K || tensor->type == GGML_TYPE_Q6_K) && + !g_ggml_sycl_disable_optimize) { ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; tensor->extra = extra; ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx. 
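// ---------------------------------------------------------------------------
// Annotation (not part of the patch): the GGML_TYPE_Q6_K case added to the
// init_tensor gate above opts Q6_K tensors in to the same "reorder"
// optimization already used for Q4_0 and Q4_K. The reordered buffer that
// dequantize_block_q6_K_reorder (see the dequantize.hpp hunk earlier in this
// patch) indexes into is a struct-of-arrays layout: the low quants of every
// block, then the high quants, then the scales, then the fp16 super-block
// scales. Below is a minimal sketch of that offset arithmetic, assuming
// QK_K == 256 and a 2-byte ggml_half as in ggml's k-quants; the names are
// illustrative only and do not exist in the patch.
#include <cstddef>
#include <cstdint>

namespace sketch {

constexpr int64_t QK_K = 256;  // assumed k-quant super-block size

struct q6_K_reorder_offsets {
    size_t ql;      // QK_K/2 bytes of low 4-bit quants for block ib
    size_t qh;      // QK_K/4 bytes of high 2-bit quants for block ib
    size_t scales;  // QK_K/16 signed 8-bit scales for block ib
    size_t d;       // 2-byte fp16 super-block scale for block ib
};

// Mirrors the ql/qh/scales/d offset computation in
// dequantize_block_q6_K_reorder: each field is laid out contiguously for all
// n_blocks blocks before the next field begins, so a block's pieces are found
// by adding a per-field base (a multiple of n_blocks) to a per-block stride.
inline q6_K_reorder_offsets offsets(int64_t ib, int64_t n_blocks) {
    return {
        /*ql    =*/ size_t(ib * (QK_K / 2)),
        /*qh    =*/ size_t((QK_K / 2) * n_blocks + (QK_K / 4) * ib),
        /*scales=*/ size_t(((QK_K / 2) + (QK_K / 4)) * n_blocks + (QK_K / 16) * ib),
        /*d     =*/ size_t(((QK_K / 2) + (QK_K / 4) + (QK_K / 16)) * n_blocks + ib * 2),
    };
}

}  // namespace sketch
// ---------------------------------------------------------------------------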
@@ -384,7 +383,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer, const void *data, size_t offset, size_t size) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; ggml_sycl_set_device(ctx->device); @@ -412,7 +411,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer, void *data, size_t offset, size_t size) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; @@ -443,8 +442,8 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer, ggml_tensor *dst) try { bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer); GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": dst=", dst); - debug_print_tensor(" src=", src); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str()); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str()); GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported); if (is_cpy_supported) { ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context; @@ -524,7 +523,7 @@ catch (sycl::exception const &exc) { static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value); ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context; SYCL_CHECK(ggml_sycl_set_device(ctx->device)); @@ -804,7 +803,7 @@ static enum ggml_status ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor, "\n"); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str()); GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; @@ -890,7 +889,7 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); @@ -946,7 +945,7 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, void *data, size_t offset, size_t size) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); // split tensors must 
always be set in their entirety at once GGML_ASSERT(offset == 0); @@ -1434,6 +1433,59 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, reinterpret_cast(y[ib].ds.y()) = sum; } +template +static __dpct_inline__ void quantize_and_reorder_q8_1(const float * __restrict__ x, void * reordered_q8_tensor, + const int kx, const int kx_padded, const sycl::nd_item<1> & it) { + /* + Quantizes and reorders the resultant q8 tensor in a per row fashion + Each sub-group calculates one quant block. i.e. QK8_1 quant values and the d and sum values + */ + + auto subgroup_id = it.get_group(0); + auto wi_id = it.get_local_id(0); + + const int num_blocks_per_row = kx / QK8_1; + auto row = subgroup_id / num_blocks_per_row; + auto col = subgroup_id % num_blocks_per_row; + + auto row_offset = row * (kx_padded / QK8_1) * sizeof(block_q8_1); + auto col_offset = QK8_1 * col + wi_id * ElementsPerWI; + + auto quant_ptr = (int8_t *) ((char *) reordered_q8_tensor + row_offset + col_offset); + auto ds_ptr = (sycl::half2 *) ((char *) reordered_q8_tensor + row_offset + kx + col * sizeof(sycl::half2)); + + sycl::vec wi_f32_vals; + sycl::vec quantized_values; + + auto float_ptr_offset = subgroup_id * QK8_1 + ElementsPerWI * wi_id; + wi_f32_vals = *reinterpret_cast *>(x + float_ptr_offset); + + float sum = 0.0f; + float amax = 0.0f; + +#pragma unroll(ElementsPerWI) + for (int i = 0; i < ElementsPerWI; i++) { + sum += wi_f32_vals[i]; + amax = sycl::fmax(amax, sycl::fabs(wi_f32_vals[i])); + quantized_values[i] = 0; + } + sum = sycl::reduce_over_group(it.get_group(), sum, sycl::plus()); + amax = sycl::reduce_over_group(it.get_group(), amax, sycl::maximum()); + float d = amax == 0 ? 1 : amax / 127; + +#pragma unroll(ElementsPerWI) + for (int i = 0; i < ElementsPerWI; i++) { + quantized_values[i] = sycl::round(wi_f32_vals[i] / d); + } + + d = amax == 0 ? 
0 : d; + + *reinterpret_cast *>(quant_ptr) = quantized_values; + if (wi_id == 0) { + *ds_ptr = sycl::half2(sycl::half(d), sycl::half(sum)); + } +} + static void mul_mat_p021_f16_f32( const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y, @@ -1718,23 +1770,30 @@ static void pool2d_nchw_kernel( o_ptr[cur_oh * ow + cur_ow] = res; } -static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx, - const int ky, const int kx_padded, - queue_ptr stream) { - const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE; - const sycl::range<3> num_blocks(1, ky, block_num_x); - int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE; - static_assert(QK8_1 % WARP_SIZE == 0); - const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE); - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); +static void quantize_row_q8_1_sycl(const float * x, void * vy, const int kx, const int ky, const int kx_padded, + bool reorder_q8_tensor, queue_ptr stream) { + if (reorder_q8_tensor) { + auto local_range = std::size_t(WARP_SIZE); + auto num_quant_blocks = ky * (kx / QK8_1); + auto global_range = num_quant_blocks * local_range; + stream->parallel_for(sycl::nd_range<1>({ global_range }, { local_range }), + [=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + quantize_and_reorder_q8_1(x, vy, kx, kx_padded, it); + }); + } else { + const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE; + const sycl::range<3> num_blocks(1, ky, block_num_x); + int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE; + static_assert(QK8_1 % WARP_SIZE == 0); + const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE); + { + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); - stream->parallel_for( - sycl::nd_range<3>(num_blocks * block_size, block_size), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - quantize_q8_1(x, vy, kx, kx_padded, item_ct1); - }); + stream->parallel_for(sycl::nd_range<3>(num_blocks * block_size, block_size), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + quantize_q8_1(x, vy, kx, kx_padded, item_ct1); + }); + } } } @@ -1826,13 +1885,12 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, const size_t shared_mem = ncols_pad * sizeof(int); if (order == GGML_SORT_ORDER_ASC) { - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor dpct_local_acc_ct1( sycl::range<1>(shared_mem), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { k_argsort_f32_i32( x, dst, ncols, ncols_pad, item_ct1, dpct_local_acc_ct1.get_multi_ptr() @@ -1840,13 +1898,12 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, }); }); } else if (order == GGML_SORT_ORDER_DESC) { - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor dpct_local_acc_ct1( sycl::range<1>(shared_mem), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums 
* block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { k_argsort_f32_i32( x, dst, ncols, ncols_pad, item_ct1, dpct_local_acc_ct1.get_multi_ptr() @@ -1864,50 +1921,47 @@ static void argmax_f32_i32_sycl(const float *x, int *dst, const int ncols, const sycl::range<3> block_nums(1, nrows, 1); const size_t shared_mem = 256 * sizeof(float); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor shared_data( sycl::range<1>(shared_mem/sizeof(float)), cgh); sycl::local_accessor shared_indices( sycl::range<1>(shared_mem/sizeof(float)), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - const int tid = item_ct1.get_local_id(2); - const int row = item_ct1.get_global_id(1); - - float max_val = -INFINITY; - int max_idx = -1; - - for (int col = tid; col < ncols; col += 256) { - float val = x[row * ncols + col]; - if (val > max_val) { - max_val = val; - max_idx = col; - } - } + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + const int tid = item_ct1.get_local_id(2); + const int row = item_ct1.get_global_id(1); - shared_data[tid] = max_val; - shared_indices[tid] = max_idx; - item_ct1.barrier(sycl::access::fence_space::local_space); + float max_val = -INFINITY; + int max_idx = -1; - for (int stride = 256/2; stride > 0; stride >>= 1) { - if (tid < stride) { - float val1 = shared_data[tid]; - float val2 = shared_data[tid + stride]; - if (val2 > val1) { - shared_data[tid] = val2; - shared_indices[tid] = shared_indices[tid + stride]; - } - } - item_ct1.barrier(sycl::access::fence_space::local_space); + for (int col = tid; col < ncols; col += 256) { + float val = x[row * ncols + col]; + if (val > max_val) { + max_val = val; + max_idx = col; } + } + shared_data[tid] = max_val; + shared_indices[tid] = max_idx; + item_ct1.barrier(sycl::access::fence_space::local_space); - if (tid == 0) { - dst[row] = shared_indices[0]; + for (int stride = 256 / 2; stride > 0; stride >>= 1) { + if (tid < stride) { + float val1 = shared_data[tid]; + float val2 = shared_data[tid + stride]; + if (val2 > val1) { + shared_data[tid] = val2; + shared_indices[tid] = shared_indices[tid + stride]; + } } - }); + item_ct1.barrier(sycl::access::fence_space::local_space); + } + + if (tid == 0) { + dst[row] = shared_indices[0]; + } + }); }); } static void diag_mask_inf_f32_sycl(const float *x, float *dst, @@ -2066,21 +2120,18 @@ inline void ggml_sycl_op_mul_mat_sycl( const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16 ? 
(const sycl::half *)src1->data + src1_padded_row_size : src1_as_f16.get(); - ggml_sycl_pool_alloc dst_f16(ctx.pool(), row_diff * src1_ncols); #if GGML_SYCL_DNNL if (!g_ggml_sycl_disable_dnn) { DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr, DnnlGemmWrapper::to_dt(), src0_ptr, DnnlGemmWrapper::to_dt(), - dst_f16.get(), DnnlGemmWrapper::to_dt(), stream); - scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2, - " : converting dst to fp32"); - const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst); - to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream); + dst_dd_i, DnnlGemmWrapper::to_dt(), stream); } else #endif { + ggml_sycl_pool_alloc dst_f16(ctx.pool(), row_diff * src1_ncols); + const sycl::half alpha_f16 = 1.0f; const sycl::half beta_f16 = 0.0f; SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm( @@ -2446,9 +2497,10 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs); if (src1_on_device && src1_is_contiguous) { + bool reorder_q8_tensor = src0->extra && ((ggml_tensor_extra_gpu *)src0->extra)->optimized_feature.reorder; scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst, /*num_src=*/2, " : converting src1 to Q8_1"); - quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream); + quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, reorder_q8_tensor, stream); /* DPCT1010:90: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to @@ -2554,7 +2606,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten if (convert_src1_to_q8_1 && !src1_is_contiguous) { scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst, /*num_src=*/2, " : converting src1 to Q8_1"); - quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); + quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, false, stream); /* DPCT1010:92: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You @@ -2893,7 +2945,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons void ** ptrs_dst_get = ptrs_dst.get(); size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : s12 * sizeof(sycl::half); size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? 
nb13 : s13 * sizeof(sycl::half); - cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { k_compute_batched_ptrs(src0_f16, src1_f16, dst_ddf, ptrs_src_get, ptrs_dst_get, ne12, ne13, ne23, nb02, nb03, nb12_scaled, nb13_scaled, nbd2, nbd3, r2, r3, item_ct1); }); @@ -2928,6 +2980,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) { case GGML_TYPE_Q4_0: return true; case GGML_TYPE_Q4_K: + case GGML_TYPE_Q6_K: return !g_ggml_sycl_prioritize_dmmv; default: return false; @@ -2947,6 +3000,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_K: + case GGML_TYPE_Q6_K: return true; default: return false; @@ -3031,6 +3085,50 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d sycl::free(tmp_buf, *stream); } +static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) { + GGML_ASSERT(size % sizeof(block_q6_K) == 0); + GGML_ASSERT(offset % sizeof(block_q6_K) == 0); + + const int nblocks = size / sizeof(block_q6_K); + + auto * tmp_buf = sycl::malloc_shared(size, *stream); + SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait())); + + auto * ql_ptr = data_device; + auto * qh_ptr = ql_ptr + (QK_K / 2) * nblocks; + auto * scales_ptr = qh_ptr + (QK_K / 4) * nblocks; + sycl::half * dm_ptr = (sycl::half *) (scales_ptr + (QK_K / 16) * nblocks); + + stream + ->parallel_for(nblocks, + [=](auto i) { + const block_q6_K * x = (const block_q6_K *) tmp_buf; + const int ib = i; + + const uint8_t * ql = x[ib].ql; + const uint8_t * qh = x[ib].qh; + uint8_t * base_ql_ptr = ql_ptr + (QK_K / 2) * ib; + uint8_t * base_qh_ptr = qh_ptr + (QK_K / 4) * ib; + uint8_t * base_scales_ptr = scales_ptr + (QK_K / 16) * ib; + + for (int j = 0; j < QK_K / 2; ++j) { + base_ql_ptr[j] = ql[j]; + } + for (int j = 0; j < QK_K / 4; ++j) { + base_qh_ptr[j] = qh[j]; + } + + for (int j = 0; j < QK_K / 16; ++j) { + base_scales_ptr[j] = x[ib].scales[j]; + } + + dm_ptr[ib] = x[ib].d; + }) + .wait_and_throw(); + + sycl::free(tmp_buf, *stream); +} + static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) { uint8_t * data_device = (uint8_t *) src0->data; size_t ncols = src0->ne[0]; @@ -3044,6 +3142,9 @@ static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) { case GGML_TYPE_Q4_K: reorder_qw_q4_k(data_device, size, 0, stream); break; + case GGML_TYPE_Q6_K: + reorder_qw_q6_k(data_device, size, 0, stream); + break; default: GGML_ABORT("reorder_qw() called with unsupported type"); break; @@ -3348,7 +3449,7 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, { sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, 768u)); sycl::range<3> grid_dims(1, n_ids, ids->ne[1]); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor src1_row_acc(cgh); char *__restrict src1_contiguous_get = @@ -3360,9 +3461,8 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, size_t ids_nb_ct6 = ids->nb[1]; size_t ids_nb_ct7 = ids->nb[0]; - cgh.parallel_for( - sycl::nd_range<3>(grid_dims * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { k_copy_src1_to_contiguous( src1_original, 
src1_contiguous_get, dev_cur_src1_row_get, @@ -3393,15 +3493,14 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, { sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, 768u)); sycl::range<3> grid_dims(1, 1, num_src1_rows); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { const char *__restrict dst_contiguous_get = dst_contiguous.get(); const mmid_row_mapping *__restrict dev_row_mapping_get = dev_row_mapping.get(); - cgh.parallel_for( - sycl::nd_range<3>(grid_dims * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { k_copy_dst_from_contiguous(dst_original, dst_contiguous_get, dev_row_mapping_get, @@ -3755,7 +3854,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend, const void *data, size_t offset, size_t size) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; @@ -3776,7 +3875,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend, void *data, size_t offset, size_t size) try { GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; ggml_backend_buffer_t buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; @@ -3799,8 +3898,8 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend, bool is_cpy_supported = dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && ggml_backend_buffer_is_sycl(src->buffer); GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - debug_print_tensor(": dst=", dst); - debug_print_tensor(" src=", src); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str()); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str()); GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported); if (is_cpy_supported) { /* @@ -4165,6 +4264,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g { ggml_type src0_type = op->src[0]->type; ggml_type src1_type = op->src[1]->type; + if (src0_type == src1_type && (ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) && src0_type != GGML_TYPE_BF16) { + return true; + } if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { return true; } @@ -4210,6 +4312,21 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) { return true; } + if(src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_Q8_0) { + return true; + } + if(src0_type == GGML_TYPE_Q5_0 && src1_type == GGML_TYPE_Q5_0) { + return true; + } + if(src0_type == GGML_TYPE_Q5_1 && src1_type == GGML_TYPE_Q5_1) { + return true; + } + if(src0_type == GGML_TYPE_Q4_0 && src1_type == GGML_TYPE_Q4_0) { + return true; + } + if(src0_type == GGML_TYPE_Q4_1 && src1_type == GGML_TYPE_Q4_1) { + return true; + } return false; } case GGML_OP_CONCAT: diff --git a/ggml/src/ggml-sycl/gla.cpp b/ggml/src/ggml-sycl/gla.cpp index 879184fdd3111..b40cbf1f14fb2 100644 --- a/ggml/src/ggml-sycl/gla.cpp +++ b/ggml/src/ggml-sycl/gla.cpp @@ -11,13 +11,13 @@ static void gated_linear_attn_f32_kernel(const dpct::queue_ptr stream, u_int B, const u_int n_seq_tokens = T / B; sycl::range<1> block_dims((C / H)); sycl::range<1> grid_dims((B * H)); - stream->submit([&](sycl::handler & cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { /* local memory accessors*/ auto _k = sycl::local_accessor(sycl::range<1>(head_size), cgh); auto _r = sycl::local_accessor(sycl::range<1>(head_size), cgh); auto _td = sycl::local_accessor(sycl::range<1>(head_size), cgh); - cgh.parallel_for(sycl::nd_range<1>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<1> item) { + sycl_parallel_for<1>(cgh, sycl::nd_range<1>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<1> item) { u_int tid = item.get_local_id(0); u_int bid = item.get_group(0); diff --git a/ggml/src/ggml-sycl/im2col.cpp b/ggml/src/ggml-sycl/im2col.cpp index aa19c2527dc41..52737cc746dfa 100644 --- a/ggml/src/ggml-sycl/im2col.cpp +++ b/ggml/src/ggml-sycl/im2col.cpp @@ -70,7 +70,7 @@ static void im2col_sycl_internal(const float * x, T * dst, int64_t IW, int64_t I const int64_t CHW = IC * KH * KW; - stream->parallel_for(sycl::nd_range<3>(block_nums * local_range, local_range), [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * local_range, local_range), [=](sycl::nd_item<3> item_ct1) { im2col_kernel(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, CHW, s0, s1, p0, p1, d0, d1, item_ct1); }); diff --git a/ggml/src/ggml-sycl/mmq.cpp b/ggml/src/ggml-sycl/mmq.cpp index ffb272aa28378..c72fcd38ebeff 100644 --- a/ggml/src/ggml-sycl/mmq.cpp +++ b/ggml/src/ggml-sycl/mmq.cpp @@ -1818,7 +1818,7 
@@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_qs_q4_0_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_d_q4_0_acc_ct1( @@ -1829,9 +1829,8 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q4_0( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -1853,7 +1852,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_qs_q4_0_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_d_q4_0_acc_ct1( @@ -1864,9 +1863,8 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q4_0( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -1933,7 +1931,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_qs_q4_1_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh); sycl::local_accessor tile_x_dm_q4_1_acc_ct1( @@ -1944,9 +1942,8 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q4_1( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -1968,7 +1965,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_qs_q4_1_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh); sycl::local_accessor tile_x_dm_q4_1_acc_ct1( @@ -1979,9 +1976,8 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q4_1( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, 
nrows_dst, item_ct1, @@ -2048,7 +2044,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q5_0_acc_ct1( sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_d_q5_0_acc_ct1( @@ -2059,9 +2055,8 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q5_0( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2083,7 +2078,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q5_0_acc_ct1( sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_d_q5_0_acc_ct1( @@ -2094,9 +2089,8 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q5_0( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2163,7 +2157,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q5_1_acc_ct1( sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_q5_1_acc_ct1( @@ -2174,9 +2168,8 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q5_1( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2198,7 +2191,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q5_1_acc_ct1( sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_q5_1_acc_ct1( @@ -2209,9 +2202,8 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { 
mul_mat_q5_1( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2278,7 +2270,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_qs_q8_0_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_d_q8_0_acc_ct1( @@ -2289,9 +2281,8 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q8_0( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2313,7 +2304,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_qs_q8_0_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_d_q8_0_acc_ct1( @@ -2324,9 +2315,8 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q8_0( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2393,7 +2383,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q2_K_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_q2_K_acc_ct1( @@ -2406,9 +2396,8 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q2_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2431,7 +2420,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q2_K_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_q2_K_acc_ct1( @@ -2444,9 +2433,8 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, 
block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q2_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2516,7 +2504,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q3_K_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_q3_K_acc_ct1( @@ -2531,9 +2519,8 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q3_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2557,7 +2544,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q3_K_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_q3_K_acc_ct1( @@ -2572,9 +2559,8 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q3_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2644,7 +2630,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q4_K_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_q4_K_acc_ct1( @@ -2657,9 +2643,8 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q4_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2682,7 +2667,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q4_K_acc_ct1( sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_q4_K_acc_ct1( @@ -2695,9 +2680,8 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, 
sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q4_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2765,7 +2749,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q5_K_acc_ct1( sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_q5_K_acc_ct1( @@ -2778,9 +2762,8 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q5_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2803,7 +2786,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_q5_K_acc_ct1( sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_q5_K_acc_ct1( @@ -2816,9 +2799,8 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q5_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2886,7 +2868,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_acc_ct1( sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_acc_ct1( @@ -2899,9 +2881,8 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q6_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, @@ -2924,7 +2905,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor tile_x_ql_acc_ct1( sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); sycl::local_accessor tile_x_dm_acc_ct1( @@ -2937,9 +2918,8 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy, sycl::local_accessor tile_y_ds_acc_ct1( sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> 
item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { mul_mat_q6_K( vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index cb70f83a4f9a6..c21929d51e94c 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -29,24 +29,23 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r static_assert(blocks_per_subgroup > 0); static_assert(block_elements_per_subgroup > 0); - const block_q8_1 * y = (const block_q8_1 *) vy; - float partial_sum = 0.0f; for (int i = sg.get_local_linear_id() / block_elements_per_subgroup; i < blocks_per_row; i += blocks_per_subgroup) { - const int ibx = row * blocks_per_row + i; // x block index - // TODO: Generalize offsets, right now only works for quantizations that don't split high and low bits - const int bx_offset = block_type::get_block_offset(ibx); - const int d_offset = block_type::get_d_offset(nrows, ncols, ibx); + const int ibx = row * blocks_per_row + i; // x block index + const auto bx_offset = block_type::get_block_offset(ibx, nblocks); + const auto d_offset = block_type::get_d_offset(nrows, ncols, ibx); // Y block index that aligns with ibx const int iby = i * block_type::block_to_q8_1_ratio(); + const int8_t* q8_1_quant_ptr = (const int8_t*)vy + iby * QK8_1; + const sycl::half2* q8_1_ds_ptr = (const sycl::half2*)((const char*)vy + ncols + iby * sizeof(sycl::half2)); #pragma unroll for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) { // x block quant index when casting the quants to int const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup); - partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs, nblocks); + partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, q8_1_quant_ptr, q8_1_ds_ptr, iqs); } } @@ -545,12 +544,12 @@ static void reorder_mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE)); const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); - stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), - [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, - nd_item); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(global_size, workgroup_size), + [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, + nd_item); + }); }); } @@ -562,12 +561,12 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); }); } } @@ -581,17 +580,12 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const 
void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); }); } } @@ -605,17 +599,12 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); }); } } @@ -629,17 +618,12 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); }); } } @@ -653,17 +637,12 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); }); } } @@ -677,17 +656,12 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, 
block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); }); } } @@ -701,17 +675,12 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); }); } } @@ -725,17 +694,12 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); }); } } @@ -751,12 +715,12 @@ static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy, const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); - stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), - [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_reorder>(vx, vy, dst, ncols, - nrows, nd_item); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(global_size, workgroup_size), + [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, + nd_item); + }); }); } @@ -770,21 +734,34 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); }); } } +static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, + const int nrows, dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); + constexpr size_t num_subgroups = 16; + GGML_ASSERT(block_num_y % num_subgroups == 
0); + + const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); + const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(global_size, workgroup_size), + [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, + nd_item); + }); + }); +} static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, const int nrows, @@ -794,17 +771,12 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); }); } } @@ -819,14 +791,12 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_iq2_xxs_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq2_xxs_q8_1(vx, vy, dst, ncols, + nrows, item_ct1); + }); }); } } @@ -840,14 +810,12 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_iq2_xs_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq2_xs_q8_1(vx, vy, dst, ncols, + nrows, item_ct1); + }); }); } } @@ -861,15 +829,12 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_iq2_s_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) 
[[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq2_s_q8_1(vx, vy, dst, ncols, nrows, + item_ct1); + }); }); } } @@ -883,15 +848,12 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_iq3_xxs_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq3_xxs_q8_1(vx, vy, dst, ncols, + nrows, item_ct1); + }); }); } } @@ -905,15 +867,12 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_iq3_s_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq3_s_q8_1(vx, vy, dst, ncols, nrows, + item_ct1); + }); }); } } @@ -927,15 +886,12 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_iq1_s_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq1_s_q8_1(vx, vy, dst, ncols, nrows, + item_ct1); + }); }); } } @@ -949,14 +905,12 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_iq1_m_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq1_m_q8_1(vx, vy, dst, ncols, nrows, + item_ct1); + }); }); } } @@ -970,15 +924,12 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, 
block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_iq4_nl_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq4_nl_q8_1(vx, vy, dst, ncols, nrows, + item_ct1); + }); }); } } @@ -992,15 +943,12 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_q_iq4_xs_q8_1( - vx, vy, dst, ncols, nrows, item_ct1); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_iq4_xs_q8_1(vx, vy, dst, ncols, + nrows, item_ct1); + }); }); } } @@ -1070,7 +1018,14 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); break; case GGML_TYPE_Q6_K: - mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && + ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q6_k_q8_1_sycl\n"); + reorder_mul_mat_vec_q6_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } else { + GGML_SYCL_DEBUG("Calling mul_mat_vec_q6_k_q8_1_sycl\n"); + mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } break; case GGML_TYPE_IQ1_S: mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 4ec1416849c7e..79d846b41a15d 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -254,14 +254,13 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i GGML_ASSERT(ncols % WARP_SIZE == 0); if (ncols < 1024) { const sycl::range<3> block_dims(1, 1, WARP_SIZE); - stream->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl::nd_range<3>(global_dims * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE); - }); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, + nullptr, WARP_SIZE); + }); + }); } else { const int work_group_size = ggml_sycl_info().max_work_group_sizes[device]; @@ -272,16 +271,15 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. 
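For the Q6_K reorder path selected above (reorder_mul_mat_vec_q6_k_q8_1_sycl when optimized_feature.reorder is set), the layout is produced by reorder_qw_q6_k earlier in this diff: the kernel copies the tensor to a temporary buffer and rewrites it in place as four planar arrays (all ql, then all qh, then all scales, then all d). A hedged CPU sketch of the same AoS-to-SoA regrouping, writing to a separate destination for clarity and treating the fp16 d as raw uint16_t bits:

```cpp
#include <cstdint>
#include <cstring>

constexpr int QK_K = 256;

struct block_q6_K {
    uint8_t  ql[QK_K / 2];       // low 4 bits of the quants
    uint8_t  qh[QK_K / 4];       // high 2 bits of the quants
    int8_t   scales[QK_K / 16];  // per-sub-block scales
    uint16_t d;                  // super-block scale, fp16 bits
};

// dst must hold nblocks * (QK_K/2 + QK_K/4 + QK_K/16 + sizeof(uint16_t)) bytes.
static void reorder_q6_k(const block_q6_K * src, uint8_t * dst, int nblocks) {
    uint8_t *  ql     = dst;
    uint8_t *  qh     = ql + (QK_K / 2) * nblocks;
    uint8_t *  scales = qh + (QK_K / 4) * nblocks;
    uint16_t * d      = (uint16_t *) (scales + (QK_K / 16) * nblocks);

    for (int ib = 0; ib < nblocks; ++ib) {
        std::memcpy(ql     + (QK_K / 2)  * ib, src[ib].ql,     QK_K / 2);
        std::memcpy(qh     + (QK_K / 4)  * ib, src[ib].qh,     QK_K / 4);
        std::memcpy(scales + (QK_K / 16) * ib, src[ib].scales, QK_K / 16);
        d[ib] = src[ib].d;  // the halves are packed last, after all scales
    }
}
```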
*/ - stream->submit([&](sycl::handler& cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor s_sum_acc_ct1( sycl::range<1>(work_group_size / WARP_SIZE), cgh); - cgh.parallel_for( - sycl::nd_range<3>(global_dims * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size); - }); - }); + sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, + get_pointer(s_sum_acc_ct1), work_group_size); + }); + }); } } @@ -290,18 +288,14 @@ static void group_norm_f32_sycl(const float* x, float* dst, const int ne_elements, queue_ptr stream, int device) { if (group_size < 1024) { const sycl::range<3> block_dims(1, 1, WARP_SIZE); - stream->submit([&](sycl::handler& cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { const float eps_ct4 = eps; - cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, - block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - group_norm_f32( - x, dst, group_size, ne_elements, eps_ct4, item_ct1, - nullptr, WARP_SIZE); - }); - }); + sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + group_norm_f32(x, dst, group_size, ne_elements, eps_ct4, item_ct1, nullptr, + WARP_SIZE); + }); + }); } else { const int work_group_size = ggml_sycl_info().max_work_group_sizes[device]; @@ -313,22 +307,18 @@ static void group_norm_f32_sycl(const float* x, float* dst, info::device::max_work_group_size. Adjust the work-group size if needed. 
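        (For reference, group_norm_f32 reduces each group twice: a plain sum
        first, giving mean = sum / group_size, then a sum of squared
        deviations, giving var; the output is written roughly as

            dst[i] = (x[i] - mean) * sycl::rsqrt(var + eps);

        using the same one-slot-per-subgroup scratch scheme as norm_f32.)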
*/ - stream->submit([&](sycl::handler& cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE), cgh); const float eps_ct4 = eps; - cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, - block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - group_norm_f32(x, dst, group_size, ne_elements, - eps_ct4, item_ct1, - get_pointer(s_sum_acc_ct1), work_group_size); - }); - }); + sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + group_norm_f32(x, dst, group_size, ne_elements, eps_ct4, item_ct1, + get_pointer(s_sum_acc_ct1), work_group_size); + }); + }); } } @@ -340,14 +330,13 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const const sycl::range<3> global_dims(nsamples, nchannels, nrows); if (ncols < 1024) { const sycl::range<3> block_dims(1, 1, WARP_SIZE); - stream->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl::nd_range<3>(global_dims * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE); - }); - }); + sycl_launch(stream, [&](sycl::handler & cgh) { + sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, + nullptr, WARP_SIZE); + }); + }); } else { const int work_group_size = ggml_sycl_info().max_work_group_sizes[device]; @@ -358,16 +347,15 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. 
        */
-        stream->submit([&](sycl::handler& cgh) {
+        sycl_launch(stream, [&](sycl::handler & cgh) {
             sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE), cgh);
-            cgh.parallel_for(
-                sycl::nd_range<3>(global_dims * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                        rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
-                    });
-            });
+            sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
+                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                                  rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
+                                               get_pointer(s_sum_acc_ct1), work_group_size);
+                              });
+        });
     }
 }

@@ -378,16 +366,12 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
     // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
     if (ncols < 1024) {
         const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        stream->submit([&](sycl::handler& cgh) {
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                        l2_norm_f32(x, dst, ncols, eps, item_ct1,
-                                    nullptr, WARP_SIZE);
-                    });
-        });
+        sycl_launch(stream, [&](sycl::handler & cgh) {
+            sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
+                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                                  l2_norm_f32(x, dst, ncols, eps, item_ct1, nullptr, WARP_SIZE);
+                              });
+        });
     }
     else {
         const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
@@ -398,18 +382,15 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
        the limit. To get the device limit, query
        info::device::max_work_group_size. Adjust the work-group size if needed.
        */
-        stream->submit([&](sycl::handler& cgh) {
+        sycl_launch(stream, [&](sycl::handler & cgh) {
             sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE), cgh);
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                        l2_norm_f32(x, dst, ncols, eps, item_ct1,
-                                    get_pointer(s_sum_acc_ct1), work_group_size);
-                    });
-        });
+            sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
+                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                                  l2_norm_f32(x, dst, ncols, eps, item_ct1, get_pointer(s_sum_acc_ct1),
+                                              work_group_size);
+                              });
+        });
     }
 }
diff --git a/ggml/src/ggml-sycl/quants.hpp b/ggml/src/ggml-sycl/quants.hpp
index 88ec13ea26999..8b952db43bfe2 100644
--- a/ggml/src/ggml-sycl/quants.hpp
+++ b/ggml/src/ggml-sycl/quants.hpp
@@ -14,12 +14,13 @@
 #ifndef GGML_SYCL_QUANTS_HPP
 #define GGML_SYCL_QUANTS_HPP

+#include <utility>
+
 #include "ggml-common.h"
 #include "ggml.h"

 namespace ggml_sycl_reordered {

-
 // The reordered block moves quants (qs) and scales (d) to two
 // uniform regions of memory that are contiguous in the same tensor.
 // What this means is that instead of having:
@@ -32,7 +33,6 @@ namespace ggml_sycl_reordered {

 template <ggml_type type> struct block_q_t;

-
 // qk number of weights / quants in a block
 // qr number of weights in a byte (described as 'before dequantization')
 // for quantization types that have low and high bits split, qr is calculated with
@@ -47,10 +47,12 @@ template <> struct block_q_t<GGML_TYPE_Q4_0> {
         static constexpr uint32_t vdr_mmvq = 2;
     };

-    static constexpr int get_block_offset(const int block_index) { return block_index * (traits::qk / traits::qr); }
+    static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
+        return { block_index * (traits::qk / traits::qr), 0 };
+    }

-    static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
-        return (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half);
+    static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
+        return { (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half), 0 };
     }

     static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
@@ -64,20 +66,46 @@ template <> struct block_q_t<GGML_TYPE_Q4_K> {
         static constexpr uint32_t vdr_mmvq = 2;
     };

-    static constexpr int get_block_offset(const int block_index) { return block_index * (traits::qk / traits::qr); }
+    static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
+        return { block_index * (traits::qk / traits::qr), 0 };
+    }

-    static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
+    static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
         auto nblocks = (nrows * (ncols / traits::qk));
-        return (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2));
+        return { nblocks * (QK_K / 2),
+                 (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2)) };
     }

     static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }

     constexpr size_t get_total_qs_bytes(int nblocks) { return nblocks * QK_K / 2; }
-
-    constexpr size_t get_dm_offset(int nblocks) { return get_total_qs_bytes(nblocks) + nblocks * K_SCALE_SIZE; }
 };

+template <> struct block_q_t<GGML_TYPE_Q6_K> {
+    struct traits {
+        static constexpr uint32_t qk = QK_K;
+        static constexpr uint32_t qi = QI6_K;
+        static constexpr uint32_t qr = QR6_K;
+        static constexpr uint32_t vdr_mmvq = 1;
+    };
+
+    static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
+        auto low_bits_index = block_index * (traits::qk / traits::qr);
+        // the high bits are stored after all the low bits
+        auto high_bits_index = n_blocks * (QK_K / 2) + (block_index * (QK_K / 4));
+        return { low_bits_index, high_bits_index };
+    }
+
+    static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
+        auto nblocks = (nrows * (ncols / traits::qk));
+        auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 4);
+        auto block_scales = total_qs_bytes + block_index * (QK_K / 16);
+        auto sb_scale = total_qs_bytes + nblocks * (QK_K / 16);
+        return { block_scales, sb_scale };
+    }
+
+    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
+};
 } // namespace ggml_sycl_reordered

 #endif // GGML_SYCL_QUANTS_HPP
diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp
index 44473e1e5580c..e44c6b6ef8f42 100644
--- a/ggml/src/ggml-sycl/rope.cpp
+++ b/ggml/src/ggml-sycl/rope.cpp
@@ -235,20 +235,22 @@ static void rope_norm_sycl(const T * x, T * dst, const int ne0, const int ne1, c
        the limit.
To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { - rope_norm(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, - theta_scale, freq_factors, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_norm(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors, item_ct1); + }); } else { /* DPCT1049:41: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { - rope_norm(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, - theta_scale, freq_factors, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_norm(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors, item_ct1); + }); } } @@ -267,15 +269,17 @@ static void rope_neox_sycl(const T * x, T * dst, const int ne0, const int ne1, c dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); if (freq_factors == nullptr) { - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { - rope_neox(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, - theta_scale, freq_factors, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_neox(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors, item_ct1); + }); } else { - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { - rope_neox(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, - theta_scale, freq_factors, item_ct1); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_neox(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors, item_ct1); + }); } } @@ -298,12 +302,12 @@ static void rope_multi_sycl(const T * x, T * dst, const int ne0, const int ne1, } // launch kernel if (freq_factors == nullptr) { - stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) { rope_multi(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections, item_ct1); }); } else { - stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) { rope_multi(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections, item_ct1); }); @@ -333,12 +337,12 @@ static void rope_vision_sycl(const T * x, T * dst, const int ne0, const int ne1, } // launch kernel if (freq_factors == nullptr) { - stream->parallel_for(nd_range, 
[=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) { rope_vision(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections, item_ct1); }); } else { - stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) { rope_vision(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections, item_ct1); }); diff --git a/ggml/src/ggml-sycl/softmax.cpp b/ggml/src/ggml-sycl/softmax.cpp index 52fcf4b3dbd24..7b60c292e0c92 100644 --- a/ggml/src/ggml-sycl/softmax.cpp +++ b/ggml/src/ggml-sycl/softmax.cpp @@ -127,11 +127,11 @@ static void soft_max_f32_submitter(const float * x, const T * mask, float * dst, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims, const size_t n_local_scratch, queue_ptr stream) { - stream->submit([&](sycl::handler &cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor local_buf_acc(n_local_scratch, cgh); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), + sycl_parallel_for( + cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { soft_max_f32(x, mask, dst, ncols_par, nrows_y, scale, max_bias, m0, diff --git a/ggml/src/ggml-sycl/sycl_hw.cpp b/ggml/src/ggml-sycl/sycl_hw.cpp index da121ffc261e8..7041140034b45 100644 --- a/ggml/src/ggml-sycl/sycl_hw.cpp +++ b/ggml/src/ggml-sycl/sycl_hw.cpp @@ -1,6 +1,7 @@ #include "sycl_hw.hpp" - +// TODO: currently not used +/* sycl_hw_info get_device_hw_info(sycl::device *device_ptr) { sycl_hw_info res; int32_t id = device_ptr->get_info(); @@ -11,3 +12,4 @@ sycl_hw_info get_device_hw_info(sycl::device *device_ptr) { return res; } +*/ diff --git a/ggml/src/ggml-sycl/sycl_hw.hpp b/ggml/src/ggml-sycl/sycl_hw.hpp index bf689450ce61f..36b140bf03737 100644 --- a/ggml/src/ggml-sycl/sycl_hw.hpp +++ b/ggml/src/ggml-sycl/sycl_hw.hpp @@ -10,6 +10,8 @@ namespace syclex = sycl::ext::oneapi::experimental; +// TODO: currently not used +/* struct sycl_hw_info { syclex::architecture arch; int32_t device_id; @@ -18,6 +20,7 @@ struct sycl_hw_info { bool is_in_vector(std::vector &vec, int item); sycl_hw_info get_device_hw_info(sycl::device *device_ptr); +*/ #endif // SYCL_HW_HPP diff --git a/ggml/src/ggml-sycl/tsembd.cpp b/ggml/src/ggml-sycl/tsembd.cpp index f6ca626ea7a53..721c8fa6fa27e 100644 --- a/ggml/src/ggml-sycl/tsembd.cpp +++ b/ggml/src/ggml-sycl/tsembd.cpp @@ -45,14 +45,9 @@ static void timestep_embedding_f32_sycl( int num_blocks = (half_ceil + SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE; sycl::range<3> block_dims(1, 1, SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE); sycl::range<3> gridDim(1, ne00, num_blocks); - stream->parallel_for( - sycl::nd_range<3>( - gridDim * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - timestep_embedding_f32( - x, dst, nb1, dim, max_period, item_ct1 - ); - }); + sycl_parallel_for(stream, sycl::nd_range<3>(gridDim * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + timestep_embedding_f32(x, dst, nb1, dim, max_period, item_ct1); + }); } void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { diff --git a/ggml/src/ggml-sycl/vecdotq.hpp 
b/ggml/src/ggml-sycl/vecdotq.hpp
index ed3699313466b..0a5d4999419c9 100644
--- a/ggml/src/ggml-sycl/vecdotq.hpp
+++ b/ggml/src/ggml-sycl/vecdotq.hpp
@@ -284,22 +284,23 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
         return d4 * (sumi * ds8f.x() - (8 * q4_0_traits::vdr_mmvq / q4_0_traits::qi) * ds8f.y());
     }

-    __dpct_inline__ float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
-                                     const block_q8_1 * __restrict__ bq8_1, const int & iqs, int /* nblocks */) {
-        const uint8_t * bq4_0 = static_cast<const uint8_t *>(vbq) + ibx_offset;
-        const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset));
+    __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
+                                     const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
+                                     const sycl::half2 * q8_1_ds, const int & iqs) {
+        const uint8_t * bq4_0 = static_cast<const uint8_t *>(vbq) + ibx_offset.first;
+        const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset.first));
         int v[q4_0_traits::vdr_mmvq];
         int u[2 * q4_0_traits::vdr_mmvq];

-#pragma unroll
+#pragma unroll
         for (size_t i = 0; i < q4_0_traits::vdr_mmvq; ++i) {
             v[i] = get_int_from_uint8(bq4_0, iqs + i);
-            u[2 * i + 0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-            u[2 * i + 1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + q4_0_traits::qi);
+            u[2 * i + 0] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i);
+            u[2 * i + 1] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i + q4_0_traits::qi);
         }
-        return vec_dot_q4_0_q8_1_impl<q4_0_traits::vdr_mmvq>(v, u, d, bq8_1->ds);
+        return vec_dot_q4_0_q8_1_impl<q4_0_traits::vdr_mmvq>(v, u, d, *q8_1_ds);
     };
 };

@@ -346,24 +347,115 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
     using q4_k_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q4_K>;
     using q4_k_traits = typename q4_k_block::traits;

-    float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
-                     const block_q8_1 * __restrict__ bq8_1, const int & iqs, int nblocks) {
-        const int ib = ibx_offset / (QK_K / 2);
+    __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
+                                     const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
+                                     const sycl::half2 * q8_1_ds, const int & iqs) {
+        const int ib = ibx_offset.first / (QK_K / 2);

         const uint8_t * base = static_cast<const uint8_t *>(vbq);
-        const uint8_t * qs = base + ibx_offset;
-        const int total_qs_bytes = nblocks * (QK_K / 2);
-        const uint8_t * scs = base + total_qs_bytes + ib * K_SCALE_SIZE;
-        const ggml_half2 * dms = reinterpret_cast<const ggml_half2 *>(base + d_offset);
+        const uint8_t * qs = base + ibx_offset.first;
+        const uint8_t * scs = base + d_offset.first + ib * K_SCALE_SIZE;
+        const ggml_half2 * dms = reinterpret_cast<const ggml_half2 *>(base + d_offset.second);

         const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
         const int * q4 = (const int *) (qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
         const uint16_t * scales = (const uint16_t *) scs;

-        return vec_dot_q4_K_q8_1_common(q4, scales, *dms, bq8_1, iqs);
+        int   v[2];
+        int   u[2 * QR4_K];
+        float d8[QR4_K];
+
+        v[0] = q4[0];
+        v[1] = q4[4];
+
+        uint16_t aux[2];
+        const int j = (QR4_K * ((iqs / 2) / (QI8_1 / 2))) / 2;
+        if (j < 2) {
+            aux[0] = scales[j + 0] & 0x3f3f;
+            aux[1] = scales[j + 2] & 0x3f3f;
+        } else {
+            aux[0] = ((scales[j + 2] >> 0) & 0x0f0f) | ((scales[j - 2] & 0xc0c0) >> 2);
+            aux[1] = ((scales[j + 2] >> 4) & 0x0f0f) | ((scales[j - 0] & 0xc0c0) >> 2);
+        }
+
+        const uint8_t * sc = (const uint8_t *) aux;
+        const uint8_t * m = sc + 2;
+
+        for (int i = 0; i < QR4_K; ++i) {
+            const int8_t * quant_base_ptr = q8_1_quant_ptr + (bq8_offset + i) * QK8_1;
+            sycl::half2 ds_values = *(q8_1_ds +
bq8_offset + i); + + d8[i] = ds_values[0]; + + const int * q8 = (const int *) quant_base_ptr + ((iqs / 2) % 4); + u[2 * i + 0] = q8[0]; + u[2 * i + 1] = q8[4]; + } + + return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, *dms, d8); } }; +template <> struct reorder_vec_dot_q_sycl { + static constexpr ggml_type gtype = GGML_TYPE_Q6_K; + + using q6_k_block = ggml_sycl_reordered::block_q_t; + using q6_k_traits = typename q6_k_block::traits; + + __dpct_inline__ float vec_dot_q6_K_q8_1_impl_mmvq(const int vl, const int vh, const int * __restrict__ u, + const int8_t * __restrict__ scales, const float d, + const float * __restrict__ d8) { + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + const int sc = scales[4 * i]; + + const int vil = (vl >> (4 * i)) & 0x0F0F0F0F; + + const int vih = ((vh >> (4 * i)) << 4) & 0x30303030; + + const int vi = dpct::vectorized_binary((vil | vih), 0x20202020, + dpct::sub_sat()); // vi = (vil | vih) - 32 + + sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d * sumf; + } + + __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair ibx_offset, + const std::pair d_offset, const int8_t * q8_1_quant_ptr, const sycl::half2 * q8_1_ds, + const int iqs) { + const int ib = ibx_offset.first / (QK_K / 2); + + const uint8_t * base = static_cast(vbq); + const uint8_t * ql = base + ibx_offset.first; + const uint8_t * qh = base + ibx_offset.second; + const int8_t * scales = reinterpret_cast(base + d_offset.first); + const ggml_half * d = (const ggml_half *) (base + d_offset.second) + ib; + + const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 4); + const int scale_offset = (QI6_K / 4) * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 8); + const int vh_shift = 2 * ((iqs % (QI6_K / 2)) / (QI6_K / 4)); + + const int vl = get_int_from_uint8(ql, iqs); + const int vh = get_int_from_uint8(qh, (QI6_K / 4) * (iqs / (QI6_K / 2)) + iqs % (QI6_K / 4)) >> vh_shift; + + const int8_t * scs = scales + scale_offset; + + int u[QR6_K]; + float d8[QR6_K]; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + u[i] = get_int_from_int8_aligned(q8_1_quant_ptr + (bq8_offset + 2 * i) * QK8_1, iqs % QI8_1); + const sycl::half2 ds_values = *(q8_1_ds + bq8_offset + 2 * i); + d8[i] = ds_values[0]; + } + return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scs, *d, d8); + } +}; #define VDR_Q4_0_Q8_1_MMVQ 2 #define VDR_Q4_0_Q8_1_MMQ 4 diff --git a/ggml/src/ggml-sycl/wkv.cpp b/ggml/src/ggml-sycl/wkv.cpp index c10e2f7645e89..3ed5bbf355ad9 100644 --- a/ggml/src/ggml-sycl/wkv.cpp +++ b/ggml/src/ggml-sycl/wkv.cpp @@ -207,12 +207,11 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { // Submit kernel if (C / H == WKV_BLOCK_SIZE) { - stream->submit([&](sycl::handler& cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor shared_mem_acc(shared_mem_size, cgh); - cgh.parallel_for( - sycl::nd_range<3>(grid_dims * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { rwkv_wkv6_f32_kernel( B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d, item_ct1, (float*)shared_mem_acc.get_multi_ptr().get() @@ -220,12 +219,11 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { }); }); } else { - stream->submit([&](sycl::handler& cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor 
shared_mem_acc(shared_mem_size, cgh); - cgh.parallel_for( - sycl::nd_range<3>(grid_dims * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { rwkv_wkv6_f32_kernel( B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d, item_ct1, (float*)shared_mem_acc.get_multi_ptr().get() @@ -264,12 +262,11 @@ void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { // Submit kernel if (C / H == WKV_BLOCK_SIZE) { - stream->submit([&](sycl::handler& cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor shared_mem_acc(shared_mem_size, cgh); - cgh.parallel_for( - sycl::nd_range<3>(grid_dims * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { rwkv_wkv7_f32_kernel( B, T, C, H, r_d, w_d, k_d, v_d, a_d, b_d, s_d, dst_d, item_ct1, (float*)shared_mem_acc.get_multi_ptr().get() @@ -277,12 +274,11 @@ void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { }); }); } else { - stream->submit([&](sycl::handler& cgh) { + sycl_launch(stream, [&](sycl::handler & cgh) { sycl::local_accessor shared_mem_acc(shared_mem_size, cgh); - cgh.parallel_for( - sycl::nd_range<3>(grid_dims * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + sycl_parallel_for( + cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { rwkv_wkv7_f32_kernel( B, T, C, H, r_d, w_d, k_d, v_d, a_d, b_d, s_d, dst_d, item_ct1, (float*)shared_mem_acc.get_multi_ptr().get() diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index 4a88415f96eae..0bf4cb14f88c7 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -49,15 +49,7 @@ if (Vulkan_FOUND) ../../include/ggml-vulkan.h ) - set(VULKAN_SHADER_GEN_CMAKE_ARGS - -DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR} - -DCMAKE_RUNTIME_OUTPUT_DIRECTORY=${CMAKE_RUNTIME_OUTPUT_DIRECTORY} - ) - - set(VULKAN_SHADER_GEN_CMAKE_BUILD_ARGS "") - if (CMAKE_BUILD_TYPE AND CMAKE_BUILD_TYPE MATCHES "Debug|Release|MinSizeRel|RelWithDebInfo") - list(APPEND VULKAN_SHADER_GEN_CMAKE_BUILD_ARGS --config=${CMAKE_BUILD_TYPE}) - endif() + set(VULKAN_SHADER_GEN_CMAKE_ARGS "") # Test all shader extensions test_shader_extension_support( @@ -136,42 +128,54 @@ if (Vulkan_FOUND) set(HOST_CMAKE_TOOLCHAIN_FILE "") endif() - # Always use ExternalProject_Add approach include(ExternalProject) - # Add toolchain file if cross-compiling if (CMAKE_CROSSCOMPILING) list(APPEND VULKAN_SHADER_GEN_CMAKE_ARGS -DCMAKE_TOOLCHAIN_FILE=${HOST_CMAKE_TOOLCHAIN_FILE}) message(STATUS "vulkan-shaders-gen toolchain file: ${HOST_CMAKE_TOOLCHAIN_FILE}") endif() - # Native build through ExternalProject_Add ExternalProject_Add( vulkan-shaders-gen SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders - CMAKE_ARGS ${VULKAN_SHADER_GEN_CMAKE_ARGS} - BUILD_COMMAND ${CMAKE_COMMAND} --build . ${VULKAN_SHADER_GEN_CMAKE_BUILD_ARGS} - INSTALL_COMMAND ${CMAKE_COMMAND} --install . - INSTALL_DIR ${CMAKE_BINARY_DIR} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}/$ + -DCMAKE_INSTALL_BINDIR=. + -DCMAKE_BUILD_TYPE=$ + ${VULKAN_SHADER_GEN_CMAKE_ARGS} + + BUILD_COMMAND ${CMAKE_COMMAND} --build . 
--config $ + BUILD_ALWAYS TRUE + + # NOTE: When DESTDIR is set using Makefile generators and + # "make install" triggers the build step, vulkan-shaders-gen + # would be installed into the DESTDIR prefix, so it is unset + # to ensure that does not happen. + + INSTALL_COMMAND ${CMAKE_COMMAND} -E env --unset=DESTDIR + ${CMAKE_COMMAND} --install . --config $ ) - ExternalProject_Add_StepTargets(vulkan-shaders-gen build install) set (_ggml_vk_host_suffix $,.exe,>) - set (_ggml_vk_genshaders_cmd ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/vulkan-shaders-gen${_ggml_vk_host_suffix}) - set (_ggml_vk_header ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp) - set (_ggml_vk_source ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp) - set (_ggml_vk_input_dir ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders) - set (_ggml_vk_output_dir ${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv) + set (_ggml_vk_genshaders_dir "${CMAKE_BINARY_DIR}/$") + set (_ggml_vk_genshaders_cmd "${_ggml_vk_genshaders_dir}/vulkan-shaders-gen${_ggml_vk_host_suffix}") + set (_ggml_vk_header "${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp") + set (_ggml_vk_source "${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp") + set (_ggml_vk_input_dir "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders") + set (_ggml_vk_output_dir "${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv") - file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp") - set (_ggml_vk_shader_deps ${_ggml_vk_shader_deps} vulkan-shaders-gen) + file(GLOB _ggml_vk_shader_files CONFIGURE_DEPENDS "${_ggml_vk_input_dir}/*.comp") - # Add build and install dependencies for all builds - set(_ggml_vk_shader_deps ${_ggml_vk_shader_deps} vulkan-shaders-gen-build vulkan-shaders-gen-install) + # Because external projects do not provide source-level tracking, + # the vulkan-shaders-gen sources need to be explicitly added to + # ensure that changes will cascade into shader re-generation. 
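+    # (CONFIGURE_DEPENDS additionally makes the build re-check these globs
+    # and re-run CMake automatically when the set of matched files changes.)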
+ + file(GLOB _ggml_vk_shaders_gen_sources + CONFIGURE_DEPENDS "${_ggml_vk_input_dir}/*.cpp" + "${_ggml_vk_input_dir}/*.h") add_custom_command( OUTPUT ${_ggml_vk_header} - ${_ggml_vk_source} + ${_ggml_vk_source} COMMAND ${_ggml_vk_genshaders_cmd} --glslc ${Vulkan_GLSLC_EXECUTABLE} @@ -181,7 +185,10 @@ if (Vulkan_FOUND) --target-cpp ${_ggml_vk_source} --no-clean - DEPENDS ${_ggml_vk_shader_deps} + DEPENDS ${_ggml_vk_shader_files} + ${_ggml_vk_shaders_gen_sources} + vulkan-shaders-gen + COMMENT "Generate vulkan shaders" ) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index ab0303646f505..99be5e45b2af7 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -78,7 +78,7 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; } #define VK_VENDOR_ID_INTEL 0x8086 #define VK_VENDOR_ID_NVIDIA 0x10de -#define VK_DEVICE_DESCRIPTOR_POOL_SIZE 32 +#define VK_DEVICE_DESCRIPTOR_POOL_SIZE 256 #define GGML_VK_MAX_NODES 8192 @@ -102,25 +102,11 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; } struct ggml_backend_vk_context; -struct vk_queue { - uint32_t queue_family_index; - vk::Queue queue; - vk::CommandPool pool; - uint32_t cmd_buffer_idx; - std::vector cmd_buffers; - - vk::PipelineStageFlags stage_flags; - - bool transfer_only; -}; +#define MAX_PARAMETER_COUNT 8 struct vk_pipeline_struct { std::string name; vk::ShaderModule shader_module; - vk::DescriptorSetLayout dsl; - std::vector descriptor_pools; - std::vector descriptor_sets; - uint32_t descriptor_set_idx; vk::PipelineLayout layout; vk::Pipeline pipeline; uint32_t push_constant_size; @@ -167,6 +153,45 @@ struct ggml_backend_vk_buffer_type_context { vk_device device; }; +struct vk_queue; + +// Stores command pool/buffers. There's an instance of this +// for each (context,queue) pair and for each (device,queue) pair. +struct vk_command_pool { + void init(vk_device& device, vk_queue *q_); + void destroy(vk::Device& device); + + vk::CommandPool pool; + uint32_t cmd_buffer_idx; + std::vector cmd_buffers; + + vk_queue *q; +}; + +// Prevent simultaneous submissions to the same queue. +// This could be per vk_queue if we stopped having two vk_queue structures +// sharing the same vk::Queue. 
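+// The Vulkan spec requires external synchronization for vkQueueSubmit:
+// two threads must never submit to the same VkQueue concurrently, hence a
+// single global lock guarding every queue.submit() call.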
+static std::mutex queue_mutex; + +struct vk_queue { + uint32_t queue_family_index; + vk::Queue queue; + + vk_command_pool cmd_pool; + + vk::PipelineStageFlags stage_flags; + + bool transfer_only; + + // copy everything except the cmd_pool + void copyFrom(vk_queue &other) { + queue_family_index = other.queue_family_index; + queue = other.queue; + stage_flags = other.stage_flags; + transfer_only = other.transfer_only; + } +}; + static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft); static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size); static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft); @@ -196,6 +221,7 @@ enum vk_device_architecture { AMD_RDNA1, AMD_RDNA2, AMD_RDNA3, + INTEL_XE2, }; static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) { @@ -246,6 +272,34 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& } return vk_device_architecture::AMD_RDNA2; } + } else if (props.vendorID == VK_VENDOR_ID_INTEL) { + const std::vector ext_props = device.enumerateDeviceExtensionProperties(); + + bool subgroup_size_control = false; + + for (const auto& properties : ext_props) { + if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) { + subgroup_size_control = true; + } + } + + if (!subgroup_size_control) { + return vk_device_architecture::OTHER; + } + + vk::PhysicalDeviceProperties2 props2; + vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props; + + props2.pNext = &subgroup_size_control_props; + device.getProperties2(&props2); + + if (subgroup_size_control_props.minSubgroupSize == 16) { + // Xe2 architecture uses SIMD16 while previous Xe and Gen architecture uses SIMD8. + // Minimum subgroup size matches the SIMD width so we distinguish architecture by checking this value. 
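+            // (Earlier Xe/Gen parts report a minimum subgroup size of 8 here,
+            // so this check does not misfire on older Intel iGPUs.)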
+ // https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html + // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html + return vk_device_architecture::INTEL_XE2; + } } return vk_device_architecture::OTHER; } @@ -312,6 +366,8 @@ struct vk_device_struct { // set to true to indicate that some shaders need to be compiled after the dryrun bool need_compiles {}; + vk::DescriptorSetLayout dsl; + vk_matmul_pipeline pipeline_matmul_f32 {}; vk_matmul_pipeline pipeline_matmul_f32_f16 {}; vk_matmul_pipeline pipeline_matmul_bf16 {}; @@ -396,6 +452,7 @@ struct vk_device_struct { vk_pipeline pipeline_count_equal_i32; vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16; vk_pipeline pipeline_timestep_embedding_f32; + vk_pipeline pipeline_conv_transpose_1d_f32; vk_pipeline pipeline_pool2d_f32; vk_pipeline pipeline_rwkv_wkv6_f32; vk_pipeline pipeline_rwkv_wkv7_f32; @@ -428,7 +485,6 @@ struct vk_device_struct { vk_pipeline pipeline_flash_attn_split_k_reduce; std::unordered_map pipelines; - std::unordered_map pipeline_descriptor_set_requirements; std::vector> pinned_memory; @@ -444,7 +500,7 @@ struct vk_device_struct { // for GGML_VK_PERF_LOGGER std::unique_ptr perf_logger; vk::QueryPool query_pool; - uint32_t num_queries; + int32_t num_queries; ~vk_device_struct() { VK_LOG_DEBUG("destroy device " << name); @@ -453,10 +509,8 @@ struct vk_device_struct { ggml_vk_destroy_buffer(sync_staging); - device.destroyCommandPool(compute_queue.pool); - if (!single_queue) { - device.destroyCommandPool(transfer_queue.pool); - } + compute_queue.cmd_pool.destroy(device); + transfer_queue.cmd_pool.destroy(device); for (auto& pipeline : pipelines) { if (pipeline.second.expired()) { @@ -468,10 +522,26 @@ struct vk_device_struct { } pipelines.clear(); + device.destroyDescriptorSetLayout(dsl); + device.destroy(); } }; +void vk_command_pool::init(vk_device& device, vk_queue *q_) { + cmd_buffer_idx = 0; + q = q_; + + vk::CommandPoolCreateInfo command_pool_create_info(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), q->queue_family_index); + pool = device->device.createCommandPool(command_pool_create_info); +} + +void vk_command_pool::destroy(vk::Device& device) { + device.destroyCommandPool(pool); + pool = nullptr; + cmd_buffers.clear(); +} + struct vk_buffer_struct { vk::Buffer buffer = VK_NULL_HANDLE; vk::DeviceMemory device_memory = VK_NULL_HANDLE; @@ -706,6 +776,21 @@ struct vk_op_timestep_embedding_push_constants { uint32_t max_period; }; +struct vk_op_conv_transpose_1d_push_constants { + uint32_t Cout; + uint32_t Cin; + uint32_t K; + uint32_t L; + uint32_t KL; + + uint32_t nb01; + uint32_t nb02; + uint32_t nb11; + uint32_t nb1; + + int32_t s0; +}; + struct vk_op_pool2d_push_constants { uint32_t IW; uint32_t IH; uint32_t OW; uint32_t OH; @@ -774,7 +859,7 @@ struct vk_context_struct { std::vector in_memcpys; std::vector out_memcpys; - vk_queue * q; + vk_command_pool * p {}; }; typedef std::shared_ptr vk_context; typedef std::weak_ptr vk_context_ref; @@ -885,6 +970,14 @@ struct ggml_backend_vk_context { vk_context_ref transfer_ctx; std::vector tensor_ctxs; + + std::vector descriptor_pools; + std::vector descriptor_sets; + uint32_t descriptor_set_idx {}; + uint32_t pipeline_descriptor_set_requirements {}; + + vk_command_pool compute_cmd_pool; + vk_command_pool transfer_cmd_pool; }; static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT @@ -948,6 +1041,14 @@ void 
vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) { struct vk_instance_t { vk::Instance instance; + bool debug_utils_support = false; // VK_EXT_debug_utils enabled + PFN_vkSetDebugUtilsObjectNameEXT pfn_vkSetDebugUtilsObjectNameEXT = {}; + PFN_vkQueueBeginDebugUtilsLabelEXT pfn_vkQueueBeginDebugUtilsLabelEXT = {}; + PFN_vkQueueEndDebugUtilsLabelEXT pfn_vkQueueEndDebugUtilsLabelEXT = {}; + PFN_vkCmdBeginDebugUtilsLabelEXT pfn_vkCmdBeginDebugUtilsLabelEXT = {}; + PFN_vkCmdEndDebugUtilsLabelEXT pfn_vkCmdEndDebugUtilsLabelEXT = {}; + PFN_vkCmdInsertDebugUtilsLabelEXT pfn_vkCmdInsertDebugUtilsLabelEXT = {}; + std::vector device_indices; vk_device devices[GGML_VK_MAX_DEVICES]; }; @@ -1015,39 +1116,19 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << disable_robustness << ", " << require_full_subgroups << ", " << required_subgroup_size << ")"); GGML_ASSERT(parameter_count > 0); + GGML_ASSERT(parameter_count <= MAX_PARAMETER_COUNT); GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast(spv_data)); pipeline->shader_module = device->device.createShaderModule(shader_module_create_info); - std::vector dsl_binding; - std::vector dsl_binding_flags; - for (uint32_t i = 0; i < parameter_count; i++) { - dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute}); - dsl_binding_flags.push_back({}); - } - - vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags }; - vk::PushConstantRange pcr( vk::ShaderStageFlagBits::eCompute, 0, pipeline->push_constant_size ); - vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info( - {}, - dsl_binding); - descriptor_set_layout_create_info.setPNext(&dslbfci); - pipeline->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info); - - vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE); - vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size); - pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info)); - - pipeline->descriptor_set_idx = 0; - - vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline->dsl, pcr); + vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), device->dsl, pcr); pipeline->layout = device->device.createPipelineLayout(pipeline_layout_create_info); std::vector specialization_entries(specialization_constants.size()); @@ -1107,6 +1188,14 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin } pipeline->compiled = true; + if (vk_instance.debug_utils_support) { + vk::DebugUtilsObjectNameInfoEXT duoni; + duoni.objectType = vk::ObjectType::ePipeline; + duoni.pObjectName = pipeline->name.c_str(); + duoni.objectHandle = reinterpret_cast(static_cast(pipeline->pipeline)); + vk_instance.pfn_vkSetDebugUtilsObjectNameEXT(device->device, &static_cast(duoni)); + } + { std::lock_guard guard(device->mutex); device->pipelines.insert({ pipeline->name, pipeline }); @@ -1122,15 +1211,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin static void ggml_vk_destroy_pipeline(vk::Device& device, 
vk_pipeline& pipeline) { VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")"); - for (auto& pool : pipeline->descriptor_pools) { - device.destroyDescriptorPool(pool); - } - pipeline->descriptor_pools.clear(); - pipeline->descriptor_sets.clear(); - pipeline->descriptor_set_idx = 0; - - device.destroyDescriptorSetLayout(pipeline->dsl); - device.destroyPipelineLayout(pipeline->layout); device.destroyShaderModule(pipeline->shader_module); @@ -1138,97 +1218,77 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) device.destroyPipeline(pipeline->pipeline); } -static void ggml_pipeline_request_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) { +static void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx, vk_pipeline& pipeline, uint32_t n) { VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")"); - device->pipeline_descriptor_set_requirements[pipeline->name] += n; + ctx->pipeline_descriptor_set_requirements += n; if (!pipeline->compiled) { pipeline->needed = true; - device->need_compiles = true; + ctx->device->need_compiles = true; } } -static void ggml_pipeline_allocate_descriptor_sets(vk_device& device) { - std::lock_guard guard(device->mutex); +static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx) { - for (auto& pair : device->pipeline_descriptor_set_requirements) { - vk_pipeline pipeline = device->pipelines.at(pair.first).lock(); - const uint64_t n = pair.second; - - VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")"); - - if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) { - // Enough descriptors are available - continue; - } + if (ctx->descriptor_sets.size() >= ctx->pipeline_descriptor_set_requirements) { + // Enough descriptors are available + return; + } - uint32_t to_alloc = pipeline->descriptor_set_idx + n - pipeline->descriptor_sets.size(); - uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - pipeline->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE; - uint32_t pool_idx = pipeline->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE; + vk_device& device = ctx->device; - while (to_alloc > 0) { - const uint32_t alloc_count = std::min(pool_remaining, to_alloc); - to_alloc -= alloc_count; - pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE; + uint32_t to_alloc = ctx->pipeline_descriptor_set_requirements - ctx->descriptor_sets.size(); + uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - ctx->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE; + uint32_t pool_idx = ctx->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE; - if (pool_idx >= pipeline->descriptor_pools.size()) { - vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE); - vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size); - pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info)); - } + while (to_alloc > 0) { + const uint32_t alloc_count = std::min(pool_remaining, to_alloc); + to_alloc -= alloc_count; + pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE; - std::vector layouts(alloc_count); - for (uint32_t i = 0; i < alloc_count; i++) { - layouts[i] = pipeline->dsl; - } - vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[pool_idx], alloc_count, 
layouts.data()); - std::vector sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info); - pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end()); + if (pool_idx >= ctx->descriptor_pools.size()) { + vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, MAX_PARAMETER_COUNT * VK_DEVICE_DESCRIPTOR_POOL_SIZE); + vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size); + ctx->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info)); + } - pool_idx++; + std::vector layouts(alloc_count); + for (uint32_t i = 0; i < alloc_count; i++) { + layouts[i] = device->dsl; } - } -} + vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(ctx->descriptor_pools[pool_idx], alloc_count, layouts.data()); + std::vector sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info); + ctx->descriptor_sets.insert(ctx->descriptor_sets.end(), sets.begin(), sets.end()); -static void ggml_pipeline_cleanup(vk_pipeline& pipeline) { - VK_LOG_DEBUG("ggml_pipeline_cleanup(" << pipeline->name << ")"); - pipeline->descriptor_set_idx = 0; + pool_idx++; + } } -static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_queue& q) { +static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_command_pool& p) { VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()"); - std::lock_guard guard(device->mutex); - if (q.cmd_buffers.size() > q.cmd_buffer_idx) { + if (p.cmd_buffers.size() > p.cmd_buffer_idx) { // Reuse command buffer - return q.cmd_buffers[q.cmd_buffer_idx++]; + return p.cmd_buffers[p.cmd_buffer_idx++]; } vk::CommandBufferAllocateInfo command_buffer_alloc_info( - q.pool, + p.pool, vk::CommandBufferLevel::ePrimary, 1); const std::vector cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info); auto buf = cmd_buffers.front(); - q.cmd_buffers.push_back(buf); - q.cmd_buffer_idx++; + p.cmd_buffers.push_back(buf); + p.cmd_buffer_idx++; return buf; } -static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, std::vector wait_semaphores, std::vector signal_semaphores) { - VK_LOG_DEBUG("ggml_vk_create_submission()"); - vk_submission s; - s.buffer = ggml_vk_create_cmd_buffer(device, q); - s.wait_semaphores = std::move(wait_semaphores); - s.signal_semaphores = std::move(signal_semaphores); - return s; -} - static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) { if (ctx->seqs.empty()) { if (fence) { - ctx->q->queue.submit({}, fence); + std::lock_guard guard(queue_mutex); + ctx->p->q->queue.submit({}, fence); } return; } @@ -1267,7 +1327,7 @@ static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) { tl_signal_vals.push_back({}); tl_signal_semaphores.push_back({}); for (size_t i = 0; i < submission.wait_semaphores.size(); i++) { - stage_flags[idx].push_back(ctx->q->stage_flags); + stage_flags[idx].push_back(ctx->p->q->stage_flags); tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value); tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s); } @@ -1297,7 +1357,8 @@ static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) { } } - ctx->q->queue.submit(submit_infos, fence); + std::lock_guard guard(queue_mutex); + ctx->p->q->queue.submit(submit_infos, fence); ctx->seqs.clear(); } @@ -1355,28 +1416,25 @@ static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_ q.queue_family_index = queue_family_index; q.transfer_only = transfer_only; - 
vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index); - q.pool = device->device.createCommandPool(command_pool_create_info_compute); - - q.cmd_buffer_idx = 0; + q.cmd_pool.init(device, &q); q.queue = device->device.getQueue(queue_family_index, queue_index); q.stage_flags = stage_flags; } -static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) { +static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_command_pool& p) { vk_context result = std::make_shared(); VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")"); ctx->gc.contexts.emplace_back(result); - result->q = &q; + result->p = &p; return result; } -static vk_context ggml_vk_create_temporary_context(vk_queue& q) { +static vk_context ggml_vk_create_temporary_context(vk_command_pool& p) { vk_context result = std::make_shared(); VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")"); - result->q = &q; + result->p = &p; return result; } @@ -1409,15 +1467,29 @@ static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) { return ctx->gc.events[ctx->event_idx++]; } -static void ggml_vk_queue_cleanup(vk_device& device, vk_queue& q) { - VK_LOG_DEBUG("ggml_vk_queue_cleanup()"); - std::lock_guard guard(device->mutex); +static void ggml_vk_command_pool_cleanup(vk_device& device, vk_command_pool& p) { + VK_LOG_DEBUG("ggml_vk_command_pool_cleanup()"); // Requires command buffers to be done - device->device.resetCommandPool(q.pool); - q.cmd_buffer_idx = 0; + device->device.resetCommandPool(p.pool); + p.cmd_buffer_idx = 0; } +static void ggml_vk_queue_command_pools_cleanup(vk_device& device) { + VK_LOG_DEBUG("ggml_vk_queue_command_pools_cleanup()"); + + // Arbitrary frequency to cleanup/reuse command buffers + static constexpr uint32_t cleanup_frequency = 10; + + if (device->compute_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) { + ggml_vk_command_pool_cleanup(device, device->compute_queue.cmd_pool); + } + if (device->transfer_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) { + ggml_vk_command_pool_cleanup(device, device->transfer_queue.cmd_pool); + } +} + + static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) { for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) { vk::MemoryType memory_type = mem_props->memoryTypes[i]; @@ -1436,8 +1508,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device memory allocation limit"); } - std::lock_guard guard(device->mutex); - vk_buffer buf = std::make_shared(); if (size == 0) { @@ -1566,11 +1636,11 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) { static void ggml_vk_sync_buffers(vk_context& ctx) { VK_LOG_DEBUG("ggml_vk_sync_buffers()"); - const bool transfer_queue = ctx->q->transfer_only; + const bool transfer_queue = ctx->p->q->transfer_only; ctx->s->buffer.pipelineBarrier( - ctx->q->stage_flags, - ctx->q->stage_flags, + ctx->p->q->stage_flags, + ctx->p->q->stage_flags, {}, { { { !transfer_queue ? 
(vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) }, @@ -1589,8 +1659,8 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector&& events ctx->s->buffer.waitEvents( events, - ctx->q->stage_flags, - ctx->q->stage_flags, + ctx->p->q->stage_flags, + ctx->p->q->stage_flags, {}, {}, {} @@ -1652,7 +1722,7 @@ static std::array fa_rows_cols(FaCodePath path, uint32_t D, uint32_ return {64, 32}; } return {64, 64}; -}; +} static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vector& warptile, bool mul_mat_id, ggml_type src0_type) { @@ -2726,6 +2796,8 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); @@ -3322,6 +3394,22 @@ static vk_device ggml_vk_get_device(size_t idx) { } } + + std::vector dsl_binding; + std::vector dsl_binding_flags; + for (uint32_t i = 0; i < MAX_PARAMETER_COUNT; i++) { + dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute}); + dsl_binding_flags.push_back({}); + } + + vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags }; + + vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info( + {}, + dsl_binding); + descriptor_set_layout_create_info.setPNext(&dslbfci); + device->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info); + ggml_vk_load_shaders(device); if (!device->single_queue) { @@ -3329,7 +3417,8 @@ static vk_device ggml_vk_get_device(size_t idx) { ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true); } else { // TODO: Use pointer or reference to avoid copy - device->transfer_queue = device->compute_queue; + device->transfer_queue.copyFrom(device->compute_queue); + device->transfer_queue.cmd_pool.init(device, &device->transfer_queue); } device->buffer_type = { @@ -3488,6 +3577,8 @@ static void ggml_vk_print_gpu_info(size_t idx) { static bool ggml_vk_instance_validation_ext_available(const std::vector& instance_extensions); static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector& instance_extensions); +static bool ggml_vk_instance_debug_utils_ext_available(const std::vector & instance_extensions); + static void ggml_vk_instance_init() { if (vk_instance_initialized) { return; @@ -3508,7 +3599,7 @@ static void ggml_vk_instance_init() { #ifdef __APPLE__ const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions); #endif - + const bool 
debug_utils_ext = ggml_vk_instance_debug_utils_ext_available(instance_extensions) && getenv("GGML_VK_DEBUG_MARKERS") != nullptr; std::vector layers; if (validation_ext) { @@ -3523,6 +3614,9 @@ static void ggml_vk_instance_init() { extensions.push_back("VK_KHR_portability_enumeration"); } #endif + if (debug_utils_ext) { + extensions.push_back("VK_EXT_debug_utils"); + } vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions); #ifdef __APPLE__ if (portability_enumeration_ext) { @@ -3546,13 +3640,25 @@ static void ggml_vk_instance_init() { vk_instance.instance = vk::createInstance(instance_create_info); vk_instance_initialized = true; - vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr; + if (debug_utils_ext) { + vk_instance.debug_utils_support = true; + vk_instance.pfn_vkSetDebugUtilsObjectNameEXT = (PFN_vkSetDebugUtilsObjectNameEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkSetDebugUtilsObjectNameEXT"); + vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT = (PFN_vkQueueBeginDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkQueueBeginDebugUtilsLabelEXT"); + vk_instance.pfn_vkQueueEndDebugUtilsLabelEXT = (PFN_vkQueueEndDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkQueueEndDebugUtilsLabelEXT"); + vk_instance.pfn_vkCmdBeginDebugUtilsLabelEXT = (PFN_vkCmdBeginDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdBeginDebugUtilsLabelEXT"); + vk_instance.pfn_vkCmdEndDebugUtilsLabelEXT = (PFN_vkCmdEndDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdEndDebugUtilsLabelEXT"); + vk_instance.pfn_vkCmdInsertDebugUtilsLabelEXT = (PFN_vkCmdInsertDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdInsertDebugUtilsLabelEXT"); + + } size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size(); + vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr; // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan char * devices_env = getenv("GGML_VK_VISIBLE_DEVICES"); if (devices_env != nullptr) { + size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size(); + std::string devices(devices_env); std::replace(devices.begin(), devices.end(), ',', ' '); @@ -3568,9 +3674,9 @@ static void ggml_vk_instance_init() { } else { std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); - // Make sure at least one device exists + // If no vulkan devices are found, return early if (devices.empty()) { - std::cerr << "ggml_vulkan: Error: No devices found." << std::endl; + GGML_LOG_INFO("ggml_vulkan: No devices found.\n"); return; } @@ -3653,9 +3759,20 @@ static void ggml_vk_instance_init() { } } - // If no dedicated GPUs found, fall back to GPU 0 + // If no dedicated GPUs found, fall back to the first non-CPU device. + // If only CPU devices are available, return without devices. 
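+        // (Software implementations such as llvmpipe report
+        // vk::PhysicalDeviceType::eCpu and are intentionally skipped.)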
if (vk_instance.device_indices.empty()) { - vk_instance.device_indices.push_back(0); + for (size_t i = 0; i < devices.size(); i++) { + if (devices[i].getProperties().deviceType != vk::PhysicalDeviceType::eCpu) { + vk_instance.device_indices.push_back(i); + break; + } + } + } + + if (vk_instance.device_indices.empty()) { + GGML_LOG_INFO("ggml_vulkan: No devices found.\n"); + return; } } GGML_LOG_DEBUG("ggml_vulkan: Found %zu Vulkan devices:\n", vk_instance.device_indices.size()); @@ -3684,6 +3801,9 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { ctx->fence = ctx->device->device.createFence({}); ctx->almost_ready_fence = ctx->device->device.createFence({}); + ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue); + ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue); + #ifdef GGML_VULKAN_CHECK_RESULTS const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS"); vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks)); @@ -4049,9 +4169,9 @@ static void ggml_vk_host_get(vk_device& device, const void * ptr, vk_buffer& buf } } -static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bool one_time = true) { +static vk_submission ggml_vk_begin_submission(vk_device& device, vk_command_pool& p, bool one_time = true) { vk_submission s; - s.buffer = ggml_vk_create_cmd_buffer(device, q); + s.buffer = ggml_vk_create_cmd_buffer(device, p); if (one_time) { s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit }); } else { @@ -4061,7 +4181,33 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo return s; } -static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) { +template <typename T> size_t push_constant_size(const T &t) { + static_assert(std::is_class<T>::value, "T must be a struct/class"); + GGML_UNUSED(t); + return sizeof(T); +} +template <typename T> size_t push_constant_size(const std::vector<T> &t) { + GGML_UNUSED(t); + return sizeof(T) * t.size(); +} +template <typename T, size_t N> size_t push_constant_size(const std::array<T, N> &t) { + GGML_UNUSED(t); + return sizeof(T) * N; +} + +template <typename T> const T *push_constant_data(const T &t) { + static_assert(std::is_class<T>::value, "T must be a struct/class"); + return &t; +} +template <typename T> const T *push_constant_data(const std::vector<T> &t) { + return t.data(); +} +template <typename T, size_t N> const T *push_constant_data(const std::array<T, N> &t) { + return t.data(); +} + +template <typename T> +static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, const T &push_constants, std::array<uint32_t, 3> elements) { const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]); const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]); const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]); @@ -4070,14 +4216,14 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), "; } std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))"); - GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size()); - GGML_ASSERT(descriptor_buffer_infos.size() == pipeline->parameter_count); + GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size()); + GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT);
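// The typed push-constant plumbing above can be exercised on its own; the
// following standalone sketch (pc_size mirrors push_constant_size, and all
// names are illustrative, not part of the patch) shows how overload
// resolution picks the struct, vector, or array variant and why the
// reported byte sizes differ:
//
//   #include <array>
//   #include <cstdint>
//   #include <cstdio>
//   #include <vector>
//
//   template <typename T> size_t pc_size(const T &) { return sizeof(T); }
//   template <typename T> size_t pc_size(const std::vector<T> &v) { return sizeof(T) * v.size(); }
//   template <typename T, size_t N> size_t pc_size(const std::array<T, N> &) { return sizeof(T) * N; }
//
//   struct push { uint32_t m, n, k; };
//
//   int main() {
//       printf("%zu %zu %zu\n",
//              pc_size(push{1, 2, 3}),              // 12: whole struct
//              pc_size(std::vector<float>(4)),      // 16: element size * runtime length
//              pc_size(std::array<uint32_t, 2>{})); // 8:  element size * static length
//       return 0;
//   }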
- vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++]; + vk::DescriptorSet& descriptor_set = ctx->descriptor_sets[ctx->descriptor_set_idx++]; vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() }; ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {}); - subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants); + subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size(push_constants), push_constant_data(push_constants)); subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline); subctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipeline->layout, @@ -4110,7 +4256,7 @@ static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) { ggml_vk_ctx_end(subctx); } - subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->q) }); + subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->p) }); subctx->s = subctx->seqs[subctx->seqs.size() - 1].data(); } @@ -4311,7 +4457,9 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width); } } else { - vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue); + std::lock_guard<std::mutex> guard(dst->device->mutex); + + vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool); ggml_vk_ctx_begin(dst->device, subctx); ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, true); ggml_vk_ctx_end(subctx); @@ -4323,6 +4471,7 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * ggml_vk_submit(subctx, dst->device->fence); VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences"); dst->device->device.resetFences({ dst->device->fence }); + ggml_vk_queue_command_pools_cleanup(dst->device); } } @@ -4399,7 +4548,9 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_ memcpy(dst, (uint8_t *) src->ptr + offset, size); } else { - vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue); + std::lock_guard<std::mutex> guard(src->device->mutex); + + vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool); ggml_vk_ctx_begin(src->device, subctx); ggml_vk_buffer_read_async(subctx, src, offset, dst, size, true); ggml_vk_ctx_end(subctx); @@ -4407,6 +4558,7 @@ ggml_vk_submit(subctx, src->device->fence); VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences"); src->device->device.resetFences({ src->device->fence }); + ggml_vk_queue_command_pools_cleanup(src->device); for (auto& cpy : subctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); @@ -4426,15 +4578,17 @@ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t ds static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { if (src->device == dst->device) { + std::lock_guard<std::mutex> guard(src->device->mutex); VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")"); // Copy within the device -
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue); + vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool); ggml_vk_ctx_begin(src->device, subctx); ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size); ggml_vk_ctx_end(subctx); ggml_vk_submit(subctx, src->device->fence); VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences"); src->device->device.resetFences({ src->device->fence }); + ggml_vk_queue_command_pools_cleanup(src->device); } else { VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")"); // Copy device to device @@ -4459,7 +4613,8 @@ static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) { VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")"); - vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue); + std::lock_guard guard(dst->device->mutex); + vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool); ggml_vk_ctx_begin(dst->device, subctx); subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c); ggml_vk_ctx_end(subctx); @@ -4467,6 +4622,7 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz ggml_vk_submit(subctx, dst->device->fence); VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences"); dst->device->device.resetFences({ dst->device->fence }); + ggml_vk_queue_command_pools_cleanup(dst->device); } static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int n, int k, const vk_pipeline& pipeline) { @@ -4540,7 +4696,7 @@ static void ggml_vk_matmul( ggml_vk_sync_buffers(subctx); if (split_k == 1) { const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3, padded_n }; - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, sizeof(vk_mat_mat_push_constants), &pc, { m, n, batch }); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc, { m, n, batch }); return; } @@ -4548,10 +4704,10 @@ static void ggml_vk_matmul( const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, padded_n }; // Make sure enough workgroups get assigned for split k to work - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, sizeof(vk_mat_mat_push_constants), &pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch }); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch }); ggml_vk_sync_buffers(subctx); const std::array pc2 = { (uint32_t)(m * n * batch), split_k }; - ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 }); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2, { m * n * batch, 1, 1 }); } static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, uint32_t m, uint32_t n, bool 
aligned, ggml_type src0_type) { @@ -4599,7 +4755,7 @@ static void ggml_vk_matmul_id( ggml_vk_sync_buffers(subctx); const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, nei0, nei1, nbi1, ne11, padded_n }; - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, sizeof(vk_mat_mat_id_push_constants), &pc, { m, nei1, n_as }); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, pc, { m, nei1, n_as }); } static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) { @@ -4720,7 +4876,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& }; init_pushconst_fastdiv(pc); ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements); } static vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type) { @@ -4739,7 +4895,7 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& sub vk_pipeline pipeline = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(uint32_t), &ne, { ne, 1, 1 }); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, std::array{ne}, { ne, 1, 1 }); } static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { @@ -4880,18 +5036,18 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub } // Request descriptor sets - ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); if (qx_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1); + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1); } if (qy_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1); + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1); } if (quantize_y) { - ggml_pipeline_request_descriptor_sets(ctx->device, to_q8_1, 1); + ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1); } if (split_k > 1) { - ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, 1); + ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1); } return; } @@ -4939,7 +5095,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub } else if (qx_needs_dequant) { const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); + ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); } if (y_non_contig) { ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); @@ -5073,12 +5229,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, 
vk_context& // Request descriptor sets if (qx_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1); + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1); } if (qy_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1); + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1); } - ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1); + ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1); return; } @@ -5155,7 +5311,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} }, - sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z }); + pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z }); } static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { @@ -5211,7 +5367,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c if (dryrun) { // Request descriptor sets - ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1); + ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1); return; } @@ -5243,7 +5399,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c } ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, workgroups_z }); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, workgroups_z }); } static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { @@ -5300,7 +5456,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con if (dryrun) { // Request descriptor sets - ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1); + ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1); return; } @@ -5326,7 +5482,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, - { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + 
d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); + { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); } static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { @@ -5487,12 +5643,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& } // Request descriptor sets - ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); if (qx_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1); + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1); } if (qy_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1); + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1); } return; } @@ -5542,7 +5698,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, - { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); + { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); } if (y_non_contig) { ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); @@ -5681,12 +5837,12 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte // Request descriptor sets if (qx_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1); + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1); } if (qy_needs_dequant) { - ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1); + ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1); } - ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1); + ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1); return; } @@ -5762,7 +5918,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } }, - sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z }); + pc, { groups_x, (uint32_t)nei0, groups_z }); } static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) { @@ -6006,9 +6162,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx if (dryrun) { // Request descriptor sets - ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); if (split_k > 1) { - ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_flash_attn_split_k_reduce, 1); + 
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_flash_attn_split_k_reduce, 1); } return; } @@ -6112,7 +6268,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx // there's no more than one tile of rows (i.e. workgroups_x would have been // one). We reuse workgroups_x to mean the number of splits, so we need to // cancel out the divide by wg_denoms[0]. - sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z }); + pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z }); ggml_vk_sync_buffers(subctx); const std::array pc2 = { D, (uint32_t)ne1, split_k }; @@ -6121,7 +6277,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE}, vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, }, - pc2.size() * uint32_t{sizeof(uint32_t)}, pc2.data(), { (uint32_t)ne1, 1, 1 }); + pc2, { (uint32_t)ne1, 1, 1 }); } else { ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { @@ -6131,7 +6287,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE}, vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, }, - sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x, workgroups_y, workgroups_z }); + pc, { workgroups_x, workgroups_y, workgroups_z }); } } @@ -6392,6 +6548,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_timestep_embedding_f32; } return nullptr; + case GGML_OP_CONV_TRANSPOSE_1D: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_conv_transpose_1d_f32; + } + return nullptr; case GGML_OP_POOL_2D: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_pool2d_f32; @@ -6566,7 +6727,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co } if (dryrun) { - ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); return; } @@ -6726,6 +6887,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co uint32_t half_ceil = (dim + 1) / 2; elements = { half_ceil, (uint32_t)src0->ne[0], 1 }; } break; + case GGML_OP_CONV_TRANSPOSE_1D: + { + elements = {uint32_t(src0->ne[1]), 1, 1}; // parallelize in {Cout, 1, 1} + } break; case GGML_OP_POOL_2D: { const uint32_t N = dst->ne[3]; @@ -6800,7 +6965,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co } ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else if (op == GGML_OP_ROPE || op == GGML_OP_ROPE_BACK) { // Empty src2 is possible in rope, but the shader needs a buffer vk_subbuffer subbuf_z; @@ -6811,26 +6976,26 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co } ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, 
{ vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else if (op == GGML_OP_IM2COL) { // im2col uses only src1 and dst buffers ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else if (op == GGML_OP_COUNT_EQUAL) { ggml_vk_sync_buffers(subctx); // count_equal assumes that destination buffer is initialized with zeroes ggml_vk_buffer_memset_async(subctx, d_D, d_buf_offset, 0, d_sz); ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else if (use_src2) { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else if (use_src1) { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } } @@ -6943,7 +7108,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx GGML_ASSERT(pipeline != nullptr); if (dryrun) { - ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); return; } @@ -6999,7 +7164,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx vk_subbuffer{ d_srcs[4], src_offsets[4], src_sizes[4] }, vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] }, vk_subbuffer{ d_D, dst_offset, dst_size } - }, sizeof(vk_op_rwkv_wkv6_push_constants), &pc, elements); + }, pc, elements); } else if (version == 7) { ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_srcs[0], src_offsets[0], src_sizes[0] }, @@ -7010,7 +7175,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] }, vk_subbuffer{ d_srcs[6], src_offsets[6], src_sizes[6] }, vk_subbuffer{ 
d_D, dst_offset, dst_size } - }, sizeof(vk_op_rwkv_wkv7_push_constants), &pc, elements); + }, pc, elements); } else { // shouldn't happen GGML_ASSERT(false); @@ -7082,7 +7247,7 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_cont GGML_ASSERT(pipeline != nullptr); if (dryrun) { - ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); return; } @@ -7147,7 +7312,7 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_cont vk_subbuffer{ d_GM, gm_offset, gm_size }, vk_subbuffer{ d_GV, gv_offset, gv_size }, vk_subbuffer{ d_P, p_offset, p_size }, - }, sizeof(vk_op_push_constants), &pc, elements); + }, pc, elements); } static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) { @@ -7529,6 +7694,37 @@ static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context }, dryrun); } +static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { + // src0: (K, Cout, Cin, 1) -- kernel + // src1: (L, Cin, 1, 1) -- input + // dst: (*, Cout, 1, 1) + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + const int32_t s0 = dst->op_params[0]; + + vk_op_conv_transpose_1d_push_constants p{}; + p.Cout = static_cast<uint32_t>(ne01); + p.Cin = static_cast<uint32_t>(ne02); + p.K = static_cast<uint32_t>(ne00); + p.L = static_cast<uint32_t>(ne10); + p.KL = static_cast<uint32_t>(ne0); + p.nb01 = static_cast<uint32_t>(nb01 / nb00); + p.nb02 = static_cast<uint32_t>(nb02 / nb00); + p.nb11 = static_cast<uint32_t>(nb11 / nb10); + p.nb1 = static_cast<uint32_t>(nb1 / nb0); + p.s0 = static_cast<int32_t>(s0); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p), dryrun); +} + static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { uint32_t op = static_cast<uint32_t>(dst->op_params[0]); const int32_t k1 = dst->op_params[1]; @@ -7729,9 +7925,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t } } - ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it); + ggml_pipeline_request_descriptor_sets(ctx, p, num_it); if (split_k > 1) { - ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it); + ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it); if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) { // Resize buffer @@ -7746,7 +7942,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t ggml_vk_load_shaders(ctx->device); } - ggml_pipeline_allocate_descriptor_sets(ctx->device); + ggml_pipeline_allocate_descriptor_sets(ctx); vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); @@ -7788,7 +7984,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch); ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n *
batch); - vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); ggml_vk_ctx_begin(ctx->device, subctx); for (size_t i = 0; i < num_it; i++) { ggml_vk_matmul( @@ -7804,6 +8000,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t ggml_vk_submit(subctx, ctx->fence); VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences"); ctx->device->device.resetFences({ ctx->fence }); + ggml_vk_queue_command_pools_cleanup(ctx->device); auto end = std::chrono::high_resolution_clock::now(); double time = std::chrono::duration_cast(end-begin).count() / 1000.0; @@ -7905,16 +8102,13 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t free(d_chk); - ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue); - ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue); + ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool); + ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool); ggml_vk_destroy_buffer(d_X); ggml_vk_destroy_buffer(d_Y); ggml_vk_destroy_buffer(d_D); - ggml_pipeline_cleanup(p); - ggml_pipeline_cleanup(ctx->device->pipeline_matmul_split_k_reduce); - free(x); free(y); free(d); @@ -7992,20 +8186,20 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_ ggml_vk_quantize_data(x, qx, ne, quant); ggml_vk_dequantize_data(qx, x_ref, ne, quant); - ggml_pipeline_request_descriptor_sets(ctx->device, p, 1); + ggml_pipeline_request_descriptor_sets(ctx, p, 1); if (ctx->device->need_compiles) { ggml_vk_load_shaders(ctx->device); } - ggml_pipeline_allocate_descriptor_sets(ctx->device); + ggml_pipeline_allocate_descriptor_sets(ctx); ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz); - vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); ggml_vk_ctx_begin(ctx->device, subctx); const std::vector pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne }; - ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1}); + ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc, { (uint32_t)ne, 1, 1}); ggml_vk_ctx_end(subctx); auto begin = std::chrono::high_resolution_clock::now(); @@ -8013,6 +8207,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_ ggml_vk_submit(subctx, ctx->fence); VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences"); ctx->device->device.resetFences({ ctx->fence }); + ggml_vk_queue_command_pools_cleanup(ctx->device); auto end = std::chrono::high_resolution_clock::now(); @@ -8092,17 +8287,17 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_ // // vk_pipeline p = ggml_vk_get_quantize_pipeline(ctx, quant); // -// ggml_pipeline_request_descriptor_sets(ctx->device, p, 1); +// ggml_pipeline_request_descriptor_sets(ctx, p, 1); // // if (ctx->device->need_compiles) { // ggml_vk_load_shaders(ctx->device); // } // -// ggml_pipeline_allocate_descriptor_sets(ctx->device); +// ggml_pipeline_allocate_descriptor_sets(ctx); // // ggml_vk_buffer_write(x_buf, 0, x, x_sz); // -// vk_context subctx = ggml_vk_create_context(ctx, 
ctx->device->compute_queue); +// vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); // ggml_vk_ctx_begin(ctx->device, subctx); // ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(x_buf), ggml_vk_subbuffer(qx_buf), ne); // ggml_vk_ctx_end(subctx); @@ -8112,6 +8307,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_ // ggml_vk_submit(subctx, ctx->fence); // VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_quantize waitForFences"); // ctx->device->device.resetFences({ ctx->fence }); +// ggml_vk_queue_command_pools_cleanup(ctx->device); // // auto end = std::chrono::high_resolution_clock::now(); // @@ -8251,9 +8447,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, // y[i] = i % k; } - ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it); + ggml_pipeline_request_descriptor_sets(ctx, p, num_it); if (split_k > 1) { - ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it); + ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it); if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) { // Resize buffer @@ -8264,19 +8460,19 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, } } if (mmq) { - ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_quantize_q8_1, num_it); + ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_quantize_q8_1, num_it); } if (ctx->device->need_compiles) { ggml_vk_load_shaders(ctx->device); } - ggml_pipeline_allocate_descriptor_sets(ctx->device); + ggml_pipeline_allocate_descriptor_sets(ctx); ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz); ggml_vk_buffer_write(y_buf, 0, y, y_sz); - vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); ggml_vk_ctx_begin(ctx->device, subctx); if (mmq) { for (size_t i = 0; i < num_it; i++) { @@ -8305,6 +8501,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, ggml_vk_submit(subctx, ctx->fence); VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences"); ctx->device->device.resetFences({ ctx->fence }); + ggml_vk_queue_command_pools_cleanup(ctx->device); auto end = std::chrono::high_resolution_clock::now(); @@ -8600,6 +8797,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_COUNT_EQUAL: case GGML_OP_IM2COL: case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_POOL_2D: case GGML_OP_CONV_2D_DW: case GGML_OP_RWKV_WKV6: @@ -8618,7 +8816,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod if (!dryrun) { if (ctx->compute_ctx.expired()) { - compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); ctx->compute_ctx = compute_ctx; ggml_vk_ctx_begin(ctx->device, compute_ctx); } else { @@ -8664,6 +8862,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_COUNT_EQUAL: case GGML_OP_IM2COL: case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_POOL_2D: case GGML_OP_CONV_2D_DW: case GGML_OP_LEAKY_RELU: @@ -8671,7 +8870,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * 
ctx, ggml_tensor * nod // These operations all go through ggml_vk_op_f32, so short-circuit and // do the only thing needed for the dryrun. vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op); - ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); return false; } default: @@ -8835,6 +9034,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_TIMESTEP_EMBEDDING: ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun); + break; + case GGML_OP_CONV_TRANSPOSE_1D: + ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node, dryrun); + break; case GGML_OP_POOL_2D: ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun); @@ -8963,6 +9166,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * case GGML_OP_COUNT_EQUAL: case GGML_OP_IM2COL: case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_POOL_2D: case GGML_OP_CONV_2D_DW: case GGML_OP_RWKV_WKV6: @@ -9058,19 +9262,8 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) { } ctx->gc.temp_buffers.clear(); - for (auto& dsr : ctx->device->pipeline_descriptor_set_requirements) { - vk_pipeline_ref plr = ctx->device->pipelines[dsr.first]; - - if (plr.expired()) { - continue; - } - - vk_pipeline pl = plr.lock(); - ggml_pipeline_cleanup(pl); - } - - ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue); - ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue); + ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool); + ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool); for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) { ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s }); @@ -9091,7 +9284,8 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) { ctx->tensor_ctxs.clear(); ctx->gc.contexts.clear(); - ctx->device->pipeline_descriptor_set_requirements.clear(); + ctx->pipeline_descriptor_set_requirements = 0; + ctx->descriptor_set_idx = 0; } // Clean up on backend free @@ -9118,6 +9312,15 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { ctx->device->device.destroyFence(ctx->fence); ctx->device->device.destroyFence(ctx->almost_ready_fence); + + for (auto& pool : ctx->descriptor_pools) { + ctx->device->device.destroyDescriptorPool(pool); + } + ctx->descriptor_pools.clear(); + ctx->descriptor_sets.clear(); + + ctx->compute_cmd_pool.destroy(ctx->device->device); + ctx->transfer_cmd_pool.destroy(ctx->device->device); } static int ggml_vk_get_device_count() { @@ -9325,6 +9528,12 @@ static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer UNUSED(buft); } +static size_t ggml_backend_vk_host_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + return vk_instance.devices[0]->suballocation_block_size; + + UNUSED(buft); +} + // Should be changed to return device-specific host buffer type // but that probably requires changes in llama.cpp ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { @@ -9333,7 +9542,7 @@ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { /* .get_name = */ ggml_backend_vk_host_buffer_type_name, /* .alloc_buffer = */ ggml_backend_vk_host_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_max_size = */ ggml_backend_vk_host_buffer_type_get_max_size, /* .get_alloc_size = 
*/ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, @@ -9384,7 +9593,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor if (ctx->transfer_ctx.expired()) { // Initialize new transfer context - transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); + transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool); ctx->transfer_ctx = transfer_ctx; ggml_vk_ctx_begin(ctx->device, transfer_ctx); } else { @@ -9407,7 +9616,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_ if (ctx->transfer_ctx.expired()) { // Initialize new transfer context - transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); + transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool); ctx->transfer_ctx = transfer_ctx; ggml_vk_ctx_begin(ctx->device, transfer_ctx); } else { @@ -9430,7 +9639,7 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_ if (ctx->transfer_ctx.expired()) { // Initialize new transfer context - transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); + transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool); ctx->transfer_ctx = transfer_ctx; ggml_vk_ctx_begin(ctx->device, transfer_ctx); } else { @@ -9480,6 +9689,13 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + if (vk_instance.debug_utils_support) { + vk::DebugUtilsLabelEXT dul = {}; + dul.pLabelName = "ggml_backend_vk_graph_compute"; + dul.color = std::array{1.0f, 1.0f, 1.0f, 1.0f}; + vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT(ctx->device->compute_queue.queue, reinterpret_cast(&dul)); + } + uint64_t total_mat_mul_bytes = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false, false); @@ -9491,7 +9707,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ggml_vk_load_shaders(ctx->device); } ggml_vk_preallocate_buffers(ctx); - ggml_pipeline_allocate_descriptor_sets(ctx->device); + ggml_pipeline_allocate_descriptor_sets(ctx); int last_node = cgraph->n_nodes - 1; @@ -9513,8 +9729,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg if (ctx->device->query_pool) { ctx->device->device.destroyQueryPool(ctx->device->query_pool); } - VkQueryPoolCreateInfo query_create_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO }; - query_create_info.queryType = VK_QUERY_TYPE_TIMESTAMP; + vk::QueryPoolCreateInfo query_create_info; + query_create_info.queryType = vk::QueryType::eTimestamp; query_create_info.queryCount = cgraph->n_nodes + 100; ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info); ctx->device->num_queries = query_create_info.queryCount; @@ -9523,7 +9739,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1); GGML_ASSERT(ctx->compute_ctx.expired()); - compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); ctx->compute_ctx = compute_ctx; ggml_vk_ctx_begin(ctx->device, compute_ctx); compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, 
ctx->device->query_pool, 0); @@ -9558,7 +9774,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg if (vk_perf_logger_enabled) { if (ctx->compute_ctx.expired()) { - compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); ctx->compute_ctx = compute_ctx; ggml_vk_ctx_begin(ctx->device, compute_ctx); } else { @@ -9600,7 +9816,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg // Get the results and pass them to the logger std::vector<uint64_t> timestamps(cgraph->n_nodes + 1); - ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait); + VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results"); for (int i = 0; i < cgraph->n_nodes; i++) { if (!ggml_vk_is_empty(cgraph->nodes[i])) { ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod)); @@ -10024,6 +10240,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_LEAKY_RELU: case GGML_OP_OPT_STEP_ADAMW: return true; + case GGML_OP_CONV_TRANSPOSE_1D: + return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; default: return false; } @@ -10167,11 +10385,28 @@ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::ve UNUSED(instance_extensions); } +// Extension availability +static bool ggml_vk_instance_debug_utils_ext_available( + const std::vector<vk::ExtensionProperties> & instance_extensions) { + // Check for the debug utils extension (VK_EXT_debug_utils) + for (const auto & properties : instance_extensions) { + if (strcmp("VK_EXT_debug_utils", properties.extensionName) == 0) { + return true; + } + } + + std::cerr << "ggml_vulkan: WARNING: Instance extension VK_EXT_debug_utils not found." << std::endl; + return false; + + UNUSED(instance_extensions); +} + static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) { switch (props.vendorID) { case VK_VENDOR_ID_INTEL: - // Intel drivers don't support coopmat properly yet - return false; + // Only allowing Xe2 GPU at the moment since Xe2 GPU can gain significant performance boost, + // while some older hardware (ex.
Arc A770) has performance regressions + return arch == vk_device_architecture::INTEL_XE2; case VK_VENDOR_ID_AMD: if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) { // Workaround for AMD proprietary driver reporting support on all GPUs @@ -10515,6 +10750,11 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { const int32_t dim = tensor->op_params[0]; const int32_t max_period = tensor->op_params[1]; tensor_clone = ggml_timestep_embedding(ggml_ctx, src_clone[0], dim, max_period); + } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_1D){ + const int32_t s0 = tensor->op_params[0]; + const int32_t p0 = tensor->op_params[1]; + const int32_t d0 = tensor->op_params[2]; + tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0); } else if (tensor->op == GGML_OP_POOL_2D) { enum ggml_op_pool op = static_cast(tensor->op_params[0]); const int32_t k0 = tensor->op_params[1]; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt index e60e9d1e5b5c5..14e9daaa01a25 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt @@ -25,15 +25,3 @@ add_executable(${TARGET} vulkan-shaders-gen.cpp) install(TARGETS ${TARGET} RUNTIME) target_compile_features(${TARGET} PRIVATE cxx_std_17) target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads) - -# Configure output directories for MSVC builds -if(MSVC) - # Get the main project's runtime output directory if possible - if(DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY) - foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES}) - string(TOUPPER ${CONFIG} CONFIG) - set_target_properties(${TARGET} PROPERTIES - RUNTIME_OUTPUT_DIRECTORY_${CONFIG} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) - endforeach() - endif() -endif() diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp b/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp new file mode 100644 index 0000000000000..b17b4e83eec4b --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp @@ -0,0 +1,98 @@ +#version 450 + +#include "types.comp" + +layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; // src0 - kernel: [K, Cout, Cin] +layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; // src1 - input: [L, Cin] +layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; // dst - result [KL, Cout] + +layout(local_size_x = 128 , local_size_y = 1, local_size_z = 1) in; + +layout (push_constant) uniform parameter { + uint32_t Cout; + uint32_t Cin; + uint32_t K; + uint32_t L; + uint32_t KL; + + uint32_t nb01; + uint32_t nb02; + uint32_t nb11; + uint32_t nb1; + + int32_t s0; +} p; + + +uint32_t Cout_idx = gl_WorkGroupID.x; +const uint32_t bs = gl_WorkGroupSize.x; +uint32_t tid = gl_LocalInvocationID.x; +// Code is more straightforward if we assume it is bs*s0+K instead of (bs-1)*s0+K. 
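// A worked note on the sizes this shader relies on, assuming ggml's
// conv_transpose_1d semantics (p0 = 0, d0 = 1, stride s0): input position l
// scatters K kernel taps into the output starting at offset l*s0, so
//     KL = (L - 1) * s0 + K.
// A block of bs inputs therefore touches a window of (bs-1)*s0 + K outputs;
// rounding that window up to bs*s0 + K wastes at most s0 shared floats but
// keeps the shift logic below uniform across blocks. The fixed tmp[4096]
// backing store must satisfy bs*s0 + K <= 4096 for the dispatched s0 and K.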
+uint32_t tmp_len = bs*p.s0+p.K; +shared D_TYPE tmp[4096]; + +uint splitWork(uint workSize){ + return (bs + workSize -1) / bs; +} + +void main(){ + for(uint32_t i = 0; i < splitWork(tmp_len); i++){ + uint32_t idx = i*bs+tid; + if(idx < tmp_len){ + tmp[idx] = 0.0; + } + } + + uint32_t L_blocks = splitWork(p.L); + for(uint32_t L_block_id = 0; L_block_id < L_blocks; L_block_id++){ + if(L_block_id > 0){ + barrier(); + // Shift values in tmp to the current processing window + for(int i = 0; i < splitWork(tmp_len); i++){ + uint32_t idx = i*bs+tid; + if(idx >= bs*p.s0 && idx < tmp_len){ + tmp[idx-bs*p.s0] = tmp[idx]; + tmp[idx] = 0.0; + }else if(idx >= p.K && idx < bs*p.s0){ + tmp[idx] = 0.0; + } + } + } + barrier(); + + // Save contributions of the block to tmp + uint32_t L_idx = L_block_id*bs + tid; + for(uint32_t K_idx = 0; K_idx < p.K; K_idx++){ + D_TYPE dp = 0.0; + for(uint32_t Cin_idx = 0; Cin_idx < p.Cin; Cin_idx++){ + A_TYPE elemKrn = data_a[K_idx + Cout_idx * p.nb01 + Cin_idx * p.nb02]; + if(L_idx < p.L){ + B_TYPE elemInp = data_b[L_idx + Cin_idx*p.nb11]; + dp = fma(elemKrn, elemInp, dp); + } + } + tmp[tid*p.s0 + K_idx] += dp; + barrier(); + } + + // Save the computed values except the last block that can have different size + uint32_t KLb_idx = L_block_id*bs*p.s0; + if(L_block_id < L_blocks-1){ + for(uint32_t s0_idx = 0; s0_idx < p.s0; s0_idx++){ + uint32_t sh_idx = p.s0*tid+s0_idx; + uint32_t KL_idx = KLb_idx+sh_idx; + if(KL_idx < p.KL){ + data_d[KL_idx + Cout_idx*p.nb1] = tmp[sh_idx]; + } + } + } + } + + for(uint32_t i = 0; i < splitWork(tmp_len); i++){ + uint32_t idx = i*bs+tid; + uint32_t KL_idx = (L_blocks-1)*bs*p.s0+idx; + if(KL_idx < p.KL){ + data_d[KL_idx + Cout_idx*p.nb1] = tmp[idx]; + } + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 9361e2ac83b0f..c63345ec8b4b6 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -622,6 +622,8 @@ void process_shaders() { string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("conv_transpose_1d_f32", "conv_transpose_1d.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}})); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index fb0d379dc8d68..ee605977f3a2c 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -61,9 +61,6 @@ #define m512i(p) (__m512i)(p) #endif -// precomputed f32 table for f16 (256 KB) (ggml-impl.h) -float ggml_table_f32_f16[1 << 16]; - #if defined(__linux__) || \ defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ (defined(__APPLE__) && !TARGET_OS_TV && !TARGET_OS_WATCH) @@ -133,7 +130,7 @@ static void ggml_print_backtrace_symbols(void) { } #endif -static void ggml_print_backtrace(void) { +void ggml_print_backtrace(void) { const char * GGML_NO_BACKTRACE = getenv("GGML_NO_BACKTRACE"); if (GGML_NO_BACKTRACE) { return; @@ -160,6 +157,10 @@ static void ggml_print_backtrace(void) { const int parent_pid = getpid(); const int child_pid = fork(); if (child_pid < 0) { // error +#if defined(__linux__) + close(lock[1]); + close(lock[0]); +#endif return; } else if (child_pid == 0) { // child char attach[32]; @@ -167,6 +168,7 
@@ static void ggml_print_backtrace(void) { #if defined(__linux__) close(lock[1]); (void) !read(lock[0], lock, 1); + close(lock[0]); #endif // try gdb execlp("gdb", "gdb", "--batch", @@ -195,7 +197,7 @@ static void ggml_print_backtrace(void) { } } #else -static void ggml_print_backtrace(void) { +void ggml_print_backtrace(void) { // platform not supported } #endif @@ -216,6 +218,8 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) { abort(); } +// ggml_print_backtrace is registered with std::set_terminate by ggml.cpp + // // logging // @@ -881,12 +885,6 @@ struct ggml_context { struct ggml_object * objects_end; }; -struct ggml_context_container { - bool used; - - struct ggml_context context; -}; - // // data types // @@ -954,6 +952,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "UPSCALE", "PAD", "PAD_REFLECT_1D", + "ROLL", "ARANGE", "TIMESTEP_EMBEDDING", "ARGSORT", @@ -984,7 +983,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "OPT_STEP_ADAMW", }; -static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); +static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1049,6 +1048,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "upscale(x)", "pad(x)", "pad_reflect_1d(x)", + "roll(x)", "arange(start, stop, step)", "timestep_embedding(timesteps, dim, max_period)", "argsort(x)", @@ -1079,7 +1079,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "adamw(x)", }; -static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); +static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -1419,14 +1419,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { // initialize time system (required on Windows) ggml_time_init(); - for (int i = 0; i < (1 << 16); ++i) { - union { - uint16_t u16; - ggml_fp16_t fp16; - } u = {i}; - ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16); - } - is_first_call = false; } @@ -4340,6 +4332,34 @@ struct ggml_tensor * ggml_pad_reflect_1d( return result; } +// ggml_roll + +struct ggml_tensor * ggml_roll( + struct ggml_context * ctx, + struct ggml_tensor * a, + int shift0, + int shift1, + int shift2, + int shift3) { + GGML_ASSERT(a->nb[0] == ggml_type_size(a->type)); + GGML_ASSERT(abs(shift0) < a->ne[0]); + GGML_ASSERT(abs(shift1) < a->ne[1]); + GGML_ASSERT(abs(shift2) < a->ne[2]); + GGML_ASSERT(abs(shift3) < a->ne[3]); + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + ggml_set_op_params_i32(result, 0, shift0); + ggml_set_op_params_i32(result, 1, shift1); + ggml_set_op_params_i32(result, 2, shift2); + ggml_set_op_params_i32(result, 3, shift3); + + result->op = GGML_OP_ROLL; + result->src[0] = a; + + return result; +} + // ggml_arange struct ggml_tensor * ggml_arange( diff --git a/ggml/src/ggml.cpp b/ggml/src/ggml.cpp new file mode 100644 index 0000000000000..0d388d45536d1 --- /dev/null +++ b/ggml/src/ggml.cpp @@ -0,0 +1,26 @@ +#include "ggml-impl.h" + +#include <cstdlib> +#include <exception> + +static std::terminate_handler previous_terminate_handler; + +GGML_NORETURN static void ggml_uncaught_exception() { + ggml_print_backtrace(); + if (previous_terminate_handler) { + previous_terminate_handler(); + } + abort(); // unreachable unless previous_terminate_handler was nullptr +} + +static bool ggml_uncaught_exception_init = []{ + const char * GGML_NO_BACKTRACE = getenv("GGML_NO_BACKTRACE"); + if (GGML_NO_BACKTRACE) { + return false; + } + const auto prev{std::get_terminate()};
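// At this point prev holds whatever terminate handler was installed before
// this initializer ran (the C++ runtime default, or one set by the embedding
// application), so ggml_uncaught_exception can chain to it rather than
// silently replacing it. A standalone sketch of the same chaining pattern,
// with illustrative names only:
//
//   static std::terminate_handler prev_handler;
//   static void on_terminate() {
//       // ... print a backtrace here ...
//       if (prev_handler) {
//           prev_handler();
//       }
//       std::abort();
//   }
//   static bool installed = [] {
//       prev_handler = std::set_terminate(on_terminate); // returns the old handler
//       return true;
//   }();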
+    GGML_ASSERT(prev != ggml_uncaught_exception);
+    previous_terminate_handler = prev;
+    std::set_terminate(ggml_uncaught_exception);
+    return true;
+}();
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index 8667a80bd0685..5ffd12b8b2795 100644
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -335,7 +335,11 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
     for (uint32_t i = 0; i < magic.size(); i++) {
         if (magic[i] != GGUF_MAGIC[i]) {
-            GGML_LOG_ERROR("%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
+            char c0 = isprint(magic[0]) ? magic[0] : '?';
+            char c1 = isprint(magic[1]) ? magic[1] : '?';
+            char c2 = isprint(magic[2]) ? magic[2] : '?';
+            char c3 = isprint(magic[3]) ? magic[3] : '?';
+            GGML_LOG_ERROR("%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, c0, c1, c2, c3);
             gguf_free(ctx);
             return nullptr;
         }
@@ -347,11 +351,28 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
     int64_t n_tensors = 0;
 
     if (ok && gr.read(ctx->version)) {
-        if (ctx->version == 1) {
+        if (ok && ctx->version == 0) {
+            GGML_LOG_ERROR("%s: bad GGUF version: %" PRIu32 "\n", __func__, ctx->version);
+            ok = false;
+        }
+
+        /*
+         * bit layout is different when reading non-native endian models.
+         * assuming that the GGUF version is 3, the non-native endian model
+         * would read it as 0x03000000. we can use the AND operation against
+         * the last 4 hexadecimal digits to check if the model is the same
+         * endianness as the host system.
+         */
+        if (ok && (ctx->version & 0x0000FFFF) == 0x00000000) {
+            GGML_LOG_ERROR("%s: failed to load model: this GGUF file version %" PRIu32 " is extremely large, is there a mismatch between the host and model endianness?\n", __func__, ctx->version);
+            ok = false;
+        }
+
+        if (ok && ctx->version == 1) {
             GGML_LOG_ERROR("%s: GGUFv1 is no longer supported, please use a more up-to-date version\n", __func__);
             ok = false;
         }
-        if (ctx->version > GGUF_VERSION) {
+        if (ok && ctx->version > GGUF_VERSION) {
             GGML_LOG_ERROR("%s: this GGUF file is version %" PRIu32 " but this software only supports up to version %d\n",
                 __func__, ctx->version, GGUF_VERSION);
             ok = false;
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 3ee2b2064e1b4..fb75143b0b545 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -118,6 +118,10 @@ class LLM:
         EMBEDDING_SCALE           = "{arch}.embedding_scale"
         TOKEN_SHIFT_COUNT         = "{arch}.token_shift_count"
         INTERLEAVE_MOE_LAYER_STEP = "{arch}.interleave_moe_layer_step"
+        ACTIVATION_SPARSITY_SCALE = "{arch}.activation_sparsity_scale"
+        ALTUP_ACTIVE_IDX          = "{arch}.altup.active_idx"
+        ALTUP_NUM_INPUTS          = "{arch}.altup.num_inputs"
+        EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input"
 
     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"
@@ -142,6 +146,8 @@ class Attention:
         SCALE                  = "{arch}.attention.scale"
         KEY_LENGTH_MLA         = "{arch}.attention.key_length_mla"
         VALUE_LENGTH_MLA       = "{arch}.attention.value_length_mla"
+        SHARED_KV_LAYERS       = "{arch}.attention.shared_kv_layers"
+        SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
 
     class Rope:
         DIMENSION_COUNT = "{arch}.rope.dimension_count"
@@ -198,6 +204,7 @@ class Tokenizer:
         MASK_ID    = "tokenizer.ggml.mask_token_id"
         ADD_BOS    = "tokenizer.ggml.add_bos_token"
         ADD_EOS    = "tokenizer.ggml.add_eos_token"
+        ADD_SEP    = "tokenizer.ggml.add_sep_token"
         ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
         REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces"
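The GGML_OP_ROLL operator added above is a circular shift applied independently per dimension, in the spirit of np.roll: elements pushed past the end of a dimension wrap around to its start, so rolling [0, 1, 2, 3] by +1 along dim 0 yields [3, 0, 1, 2]. A minimal usage sketch, assuming the usual ggml context/graph workflow; the helper name is illustrative, not part of the patch:

    #include "ggml.h"

    // Roll a tensor by +1 along dim 0, leaving dims 1..3 untouched.
    static struct ggml_tensor * roll_rows_by_one(struct ggml_context * ctx,
                                                 struct ggml_tensor  * x) {
        return ggml_roll(ctx, x, /*shift0=*/1, /*shift1=*/0, /*shift2=*/0, /*shift3=*/0);
    }

Note the guards in ggml_roll: dim 0 must be contiguous (nb[0] equals the type size) and each |shift| must be strictly smaller than the corresponding dimension.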
PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap" @@ -291,6 +298,7 @@ class MODEL_ARCH(IntEnum): BERT = auto() NOMIC_BERT = auto() NOMIC_BERT_MOE = auto() + NEO_BERT = auto() JINA_BERT_V2 = auto() BLOOM = auto() STABLELM = auto() @@ -312,6 +320,7 @@ class MODEL_ARCH(IntEnum): GEMMA = auto() GEMMA2 = auto() GEMMA3 = auto() + GEMMA3N = auto() STARCODER2 = auto() RWKV6 = auto() RWKV6QWEN2 = auto() @@ -343,6 +352,8 @@ class MODEL_ARCH(IntEnum): WAVTOKENIZER_DEC = auto() PLM = auto() BAILINGMOE = auto() + DOTS1 = auto() + ARCEE = auto() class VISION_PROJECTOR_TYPE(IntEnum): @@ -395,6 +406,22 @@ class MODEL_TENSOR(IntEnum): ATTN_Q_NORM = auto() ATTN_K_NORM = auto() LAYER_OUT_NORM = auto() + PER_LAYER_TOKEN_EMBD = auto() # gemma3n + PER_LAYER_MODEL_PROJ = auto() # gemma3n + PER_LAYER_INP_GATE = auto() # gemma3n + PER_LAYER_PROJ = auto() # gemma3n + PER_LAYER_PROJ_NORM = auto() # gemma3n + PER_LAYER_POST_NORM = auto() # gemma3n + ALTUP_PROJ = auto() # gemma3n + ALTUP_UNEMBD_PROJ = auto() # gemma3n + ALTUP_CORRECT_COEF = auto() # gemma3n + ALTUP_CORRECT_SCALE = auto() # gemma3n + ALTUP_PREDICT_COEF = auto() # gemma3n + ALTUP_ROUTER = auto() # gemma3n + ALTUP_ROUTER_NORM = auto() # gemma3n + LAUREL_L = auto() # gemma3n + LAUREL_R = auto() # gemma3n + LAUREL_POST_NORM = auto() # gemma3n SSM_IN = auto() SSM_CONV1D = auto() SSM_X = auto() @@ -571,6 +598,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.BERT: "bert", MODEL_ARCH.NOMIC_BERT: "nomic-bert", MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe", + MODEL_ARCH.NEO_BERT: "neo-bert", MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.STABLELM: "stablelm", @@ -592,6 +620,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.GEMMA2: "gemma2", MODEL_ARCH.GEMMA3: "gemma3", + MODEL_ARCH.GEMMA3N: "gemma3n", MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.RWKV6: "rwkv6", MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2", @@ -623,6 +652,8 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec", MODEL_ARCH.PLM: "plm", MODEL_ARCH.BAILINGMOE: "bailingmoe", + MODEL_ARCH.DOTS1: "dots1", + MODEL_ARCH.ARCEE: "arcee", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -675,6 +706,22 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b", MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", + MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n + MODEL_TENSOR.PER_LAYER_MODEL_PROJ: "per_layer_model_proj", # gemma3n + MODEL_TENSOR.PER_LAYER_PROJ_NORM: "per_layer_proj_norm", # gemma3n + MODEL_TENSOR.ALTUP_UNEMBD_PROJ: "altup_unembd_proj", # gemma3n + MODEL_TENSOR.ALTUP_PROJ: "altup_proj", # gemma3n + MODEL_TENSOR.PER_LAYER_INP_GATE: "blk.{bid}.inp_gate", # gemma3n + MODEL_TENSOR.PER_LAYER_PROJ: "blk.{bid}.proj", # gemma3n + MODEL_TENSOR.PER_LAYER_POST_NORM: "blk.{bid}.post_norm", # gemma3n + MODEL_TENSOR.ALTUP_CORRECT_COEF: "blk.{bid}.altup_correct_coef", # gemma3n + MODEL_TENSOR.ALTUP_CORRECT_SCALE: "blk.{bid}.altup_correct_scale", # gemma3n + MODEL_TENSOR.ALTUP_PREDICT_COEF: "blk.{bid}.altup_predict_coef", # gemma3n + MODEL_TENSOR.ALTUP_ROUTER: "blk.{bid}.altup_router", # gemma3n + MODEL_TENSOR.ALTUP_ROUTER_NORM: "blk.{bid}.altup_router_norm", # gemma3n + MODEL_TENSOR.LAUREL_L: "blk.{bid}.laurel_l", # gemma3n + MODEL_TENSOR.LAUREL_R: "blk.{bid}.laurel_r", # gemma3n + MODEL_TENSOR.LAUREL_POST_NORM: "blk.{bid}.laurel_post_norm", # gemma3n MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", 
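The gemma3n entries above define the base tensor names that end up in the .gguf file (the writer appends the usual .weight/.bias suffix). A hedged sketch of probing for one of them from C via the gguf API's gguf_find_tensor; the helper below is hypothetical:

    #include "gguf.h"
    #include <stdio.h>

    // Returns 1 if block `bid` of a gemma3n GGUF carries an AltUp router tensor.
    static int has_altup_router(const struct gguf_context * gctx, int bid) {
        char name[128];
        snprintf(name, sizeof(name), "blk.%d.altup_router.weight", bid);
        return gguf_find_tensor(gctx, name) >= 0; // returns -1 when absent
    }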
MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", @@ -1077,6 +1124,18 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_UP_EXP, MODEL_TENSOR.LAYER_OUT_NORM, ], + MODEL_ARCH.NEO_BERT: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ENC_OUTPUT_NORM, + MODEL_TENSOR.CLS, + MODEL_TENSOR.CLS_OUT, + ], MODEL_ARCH.JINA_BERT_V2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD_NORM, @@ -1467,6 +1526,41 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_PRE_NORM, MODEL_TENSOR.FFN_POST_NORM, ], + MODEL_ARCH.GEMMA3N: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_PRE_NORM, + MODEL_TENSOR.FFN_POST_NORM, + # altup / laurel + MODEL_TENSOR.PER_LAYER_TOKEN_EMBD, + MODEL_TENSOR.PER_LAYER_MODEL_PROJ, + MODEL_TENSOR.PER_LAYER_INP_GATE, + MODEL_TENSOR.PER_LAYER_PROJ, + MODEL_TENSOR.PER_LAYER_PROJ_NORM, + MODEL_TENSOR.PER_LAYER_POST_NORM, + MODEL_TENSOR.ALTUP_PROJ, + MODEL_TENSOR.ALTUP_UNEMBD_PROJ, + MODEL_TENSOR.ALTUP_CORRECT_COEF, + MODEL_TENSOR.ALTUP_CORRECT_SCALE, + MODEL_TENSOR.ALTUP_PREDICT_COEF, + MODEL_TENSOR.ALTUP_ROUTER, + MODEL_TENSOR.ALTUP_ROUTER_NORM, + MODEL_TENSOR.LAUREL_L, + MODEL_TENSOR.LAUREL_R, + MODEL_TENSOR.LAUREL_POST_NORM, + ], MODEL_ARCH.STARCODER2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -2044,6 +2138,45 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_SHEXP, MODEL_TENSOR.FFN_UP_SHEXP, ], + MODEL_ARCH.DOTS1: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_EXP_PROBS_B, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_UP_SHEXP, + ], + MODEL_ARCH.ARCEE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], # TODO } diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index de6e45ae827b9..d32cd479adb17 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -271,7 +271,7 @@ def write_ti_data_to_file(self) -> None: def add_key_value(self, key: str, val: Any, vtype: GGUFValueType, sub_type: GGUFValueType | None = None) -> None: if any(key in kv_data for kv_data in self.kv_data): - raise ValueError(f'Duplicated key name {key!r}') + logger.warning(f'Duplicated key name {key!r}, overwriting it with new value {val!r} of type {vtype.name}') self.kv_data[0][key] = GGUFValue(value=val, type=vtype, sub_type=sub_type) @@ -672,6 +672,18 @@ def add_parallel_residual(self, use: bool) -> None: def 
add_decoder_start_token_id(self, id: int) -> None: self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id) + def add_embedding_length_per_layer_input(self, value: int) -> None: + self.add_uint32(Keys.LLM.EMBD_LENGTH_PER_LAYER_INP.format(arch=self.arch), value) + + def add_altup_active_idx(self, val: int) -> None: + self.add_uint32(Keys.LLM.ALTUP_ACTIVE_IDX.format(arch=self.arch), val) + + def add_altup_num_inputs(self, val: int) -> None: + self.add_uint32(Keys.LLM.ALTUP_NUM_INPUTS.format(arch=self.arch), val) + + def add_activation_sparsity_scale(self, values: Sequence[float]) -> None: + self.add_array(Keys.LLM.ACTIVATION_SPARSITY_SCALE.format(arch=self.arch), values) + def add_head_count(self, count: int | Sequence[int]) -> None: if isinstance(count, int): self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count) @@ -702,6 +714,12 @@ def add_max_alibi_bias(self, bias: float) -> None: def add_clamp_kqv(self, value: float) -> None: self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value) + def add_shared_kv_layers(self, value: float) -> None: + self.add_float32(Keys.Attention.SHARED_KV_LAYERS.format(arch=self.arch), value) + + def add_sliding_window_pattern(self, value: Sequence[bool]) -> None: + self.add_array(Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch), value) + def add_logit_scale(self, value: float) -> None: self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value) @@ -891,6 +909,9 @@ def add_add_bos_token(self, value: bool) -> None: def add_add_eos_token(self, value: bool) -> None: self.add_bool(Keys.Tokenizer.ADD_EOS, value) + def add_add_sep_token(self, value: bool) -> None: + self.add_bool(Keys.Tokenizer.ADD_SEP, value) + def add_add_space_prefix(self, value: bool) -> None: self.add_bool(Keys.Tokenizer.ADD_PREFIX, value) @@ -935,6 +956,9 @@ def add_eot_token_id(self, id: int) -> None: def add_eom_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.EOM_ID, id) + def add_classifier_output_labels(self, labels: Sequence[str]) -> None: + self.add_array(Keys.Classifier.OUTPUT_LABELS.format(arch=self.arch), labels) + # for vision models def add_clip_has_vision_encoder(self, value: bool) -> None: diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 93dd1d8028f3d..b30f77dbe3be7 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -31,6 +31,7 @@ class TensorNameMap: "model.embeddings", # rwkv7 "model.word_embeddings", # bailingmoe "language_model.model.embed_tokens", # llama4 + "encoder", # neobert ), # Token type embeddings @@ -134,6 +135,7 @@ class TensorNameMap: "rwkv.blocks.{bid}.ln1", # rwkv6 "model.layers.{bid}.ln1", # rwkv7 "model.layers.{bid}.input_layernorm", # llama4 + "transformer_encoder.{bid}.attention_norm", # neobert ), # Attention norm 2 @@ -161,6 +163,7 @@ class TensorNameMap: "model.layers.{bid}.self_attn.qkv_proj", # phi3 "encoder.layers.{bid}.self_attention.query_key_value", # chatglm "transformer.layers.{bid}.attn.qkv_proj", # openelm + "transformer_encoder.{bid}.qkv", # neobert ), # Attention query @@ -236,6 +239,7 @@ class TensorNameMap: "transformer.layers.{bid}.attn.out_proj", # openelm "transformer.h.{bid}.attn.attention.out_proj", # exaone "model.layers.{bid}.self_attn.o_proj", # llama4 + "transformer_encoder.{bid}.wo", # neobert ), # Attention output norm @@ -276,6 +280,7 @@ class TensorNameMap: "encoder.layers.{bid}.post_attention_layernorm", # chatglm "transformer.layers.{bid}.ffn_norm", # openelm 
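The writer methods added above serialize to flat KV pairs keyed by architecture, so add_altup_num_inputs() on a gemma3n model lands as "gemma3n.altup.num_inputs". A sketch of reading such a key back through the gguf C API; the helper name is hypothetical:

    #include "gguf.h"
    #include <stdint.h>

    // Fetch gemma3n's AltUp input count, or 0 when the key is missing.
    static uint32_t altup_num_inputs(const struct gguf_context * gctx) {
        int64_t kid = gguf_find_key(gctx, "gemma3n.altup.num_inputs");
        return kid >= 0 ? gguf_get_val_u32(gctx, kid) : 0;
    }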
"model.layers.{bid}.post_attention_layernorm", # llama4 + "transformer_encoder.{bid}.ffn_norm", # neobert ), # Post feed-forward norm @@ -305,7 +310,7 @@ class TensorNameMap: ), MODEL_TENSOR.FFN_EXP_PROBS_B: ( - "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 + "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 dots1 ), # Feed-forward up @@ -333,11 +338,14 @@ class TensorNameMap: "encoder.layers.{bid}.mlp.fc11", # nomic-bert "encoder.layers.{bid}.mlp.fc1", # nomic-bert-moe "model.layers.{bid}.mlp.c_fc", # starcoder2 - "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 + "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 (split up/gate, no longer used) + "encoder.layer.{bid}.mlp.gated_layers", # jina-bert-v2 (GEGLU) + "encoder.layer.{bid}.mlp.up_gated_layer", # jina-v2-code (GEGLU) "model.layers.{bid}.residual_mlp.w3", # arctic "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm "transformer.h.{bid}.mlp.c_fc_1", # exaone "model.layers.{bid}.feed_forward.up_proj", # llama4 + "transformer_encoder.{bid}.ffn.w12", # neobert ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -370,7 +378,7 @@ class TensorNameMap: "model.layers.layers.{bid}.mlp.gate_proj", # plamo "model.layers.{bid}.feed_forward.w1", # internlm2 "encoder.layers.{bid}.mlp.fc12", # nomic-bert - "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 + "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 (split up/gate, no longer used) "transformer.h.{bid}.mlp.linear_1", # refact "model.layers.{bid}.residual_mlp.w1", # arctic "transformer.h.{bid}.mlp.c_fc_0", # exaone @@ -420,6 +428,7 @@ class TensorNameMap: "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm "model.layers.h.{bid}.mlp.c_proj", # exaone "model.layers.{bid}.feed_forward.down_proj", # llama4 + "transformer_encoder.{bid}.ffn.w3", # neobert ), MODEL_TENSOR.FFN_DOWN_EXP: ( @@ -471,6 +480,70 @@ class TensorNameMap: "encoder.layer.{bid}.layer_norm_2" # jina-v2-code ), + MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: ( + "model.embed_tokens_per_layer", # gemma3n + ), + + MODEL_TENSOR.PER_LAYER_MODEL_PROJ: ( + "model.per_layer_model_projection", # gemma3n + ), + + MODEL_TENSOR.PER_LAYER_PROJ_NORM: ( + "model.per_layer_projection_norm", # gemma3n + ), + + MODEL_TENSOR.ALTUP_PROJ: ( + "model.altup_projections", # gemma3n + ), + + MODEL_TENSOR.ALTUP_UNEMBD_PROJ: ( + "model.altup_unembed_projections", # gemma3n + ), + + MODEL_TENSOR.PER_LAYER_INP_GATE: ( + "model.layers.{bid}.per_layer_input_gate", # gemma3n + ), + + MODEL_TENSOR.PER_LAYER_PROJ: ( + "model.layers.{bid}.per_layer_projection", # gemma3n + ), + + MODEL_TENSOR.PER_LAYER_POST_NORM: ( + "model.layers.{bid}.post_per_layer_input_norm", # gemma3n + ), + + MODEL_TENSOR.ALTUP_CORRECT_COEF: ( + "model.layers.{bid}.altup.correction_coefs", # gemma3n + ), + + MODEL_TENSOR.ALTUP_CORRECT_SCALE: ( + "model.layers.{bid}.altup.correct_output_scale", # gemma3n + ), + + MODEL_TENSOR.ALTUP_PREDICT_COEF: ( + "model.layers.{bid}.altup.prediction_coefs", # gemma3n + ), + + MODEL_TENSOR.ALTUP_ROUTER: ( + "model.layers.{bid}.altup.modality_router", # gemma3n + ), + + MODEL_TENSOR.ALTUP_ROUTER_NORM: ( + "model.layers.{bid}.altup.router_norm", # gemma3n + ), + + MODEL_TENSOR.LAUREL_L: ( + "model.layers.{bid}.laurel.linear_left", # gemma3n + ), + + MODEL_TENSOR.LAUREL_R: ( + "model.layers.{bid}.laurel.linear_right", # gemma3n + ), + + MODEL_TENSOR.LAUREL_POST_NORM: ( + "model.layers.{bid}.laurel.post_laurel_norm", # gemma3n + ), + MODEL_TENSOR.SSM_IN: ( "model.layers.{bid}.in_proj", "backbone.layers.{bid}.mixer.in_proj", @@ 
-830,12 +903,14 @@ class TensorNameMap: # TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg MODEL_TENSOR.ENC_OUTPUT_NORM: ( "encoder.final_layer_norm", # t5 + "layer_norm", # neobert ), MODEL_TENSOR.CLS: ( "classifier", # jina "classifier.dense", # roberta "pre_classifier", # distillbert + "dense", # neobert ), MODEL_TENSOR.CLS_OUT: ( diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index cca0979862a71..3f541b0c02e52 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -7,7 +7,10 @@ from pathlib import Path from typing import Any, Callable, Sequence, Mapping, Iterable, Protocol, ClassVar, runtime_checkable -from sentencepiece import SentencePieceProcessor +try: + from sentencepiece import SentencePieceProcessor +except ImportError: + SentencePieceProcessor = None import gguf @@ -116,6 +119,7 @@ def _set_special_token(self, typ: str, tid: Any) -> None: logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping') def _try_load_from_tokenizer_json(self, path: Path) -> bool: + tokenizer = None tokenizer_file = path / 'tokenizer.json' if tokenizer_file.is_file(): with open(tokenizer_file, encoding = 'utf-8') as f: @@ -149,11 +153,97 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: added_tokens = tokenizer.get('added_tokens', {}) else: added_tokens = {} + tokenizer_config = None tokenizer_config_file = path / 'tokenizer_config.json' - if not tokenizer_config_file.is_file(): + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, encoding = 'utf-8') as f: + tokenizer_config = json.load(f) + if tokenizer: + special_bos = (tokenizer_config or {}).get('bos_token') + special_cls = (tokenizer_config or {}).get('cls_token') + special_eos = (tokenizer_config or {}).get('eos_token') + special_sep = (tokenizer_config or {}).get('sep_token') + if not special_bos and special_cls and tokenizer_config: + tokenizer_config['bos_token'] = special_bos = special_cls + if not special_eos and special_sep and tokenizer_config: + tokenizer_config['eos_token'] = special_eos = special_sep + if post_processor := tokenizer.get('post_processor'): + for processor in post_processor.get('processors', [post_processor]): + if processor.get('type') == 'RobertaProcessing': + self.add_special_token['bos'] = True + self.add_special_token['eos'] = True + self.add_special_token['sep'] = True + if not special_cls and tokenizer_config: + special_cls = processor.get('cls', [special_bos])[0] + tokenizer_config['cls_token'] = special_cls + if not special_sep and tokenizer_config: + special_sep = processor.get('sep', [special_eos])[0] + tokenizer_config['sep_token'] = special_sep + continue + # Crude parsing of TemplateProcessing to determine if BOS/SEP/EOS should be added + # Only works with simple templates, **will** get it wrong on unusual sequences + if processor.get('type') == 'TemplateProcessing': + tmpl_single = processor.get('single', []) + tmpl_pair = processor.get('pair', []) + special_first = None + special_last = None + if len(tmpl_single) > 1: + if special_first := tmpl_single[0].get('SpecialToken', {}).get('id'): + if not tokenizer_config: + special_bos = special_first + self.add_special_token['bos'] = True if special_first in (special_bos, special_cls) else False + if special_first not in (special_bos, special_cls): + logger.warning(f'Unknown leading special token {special_first!r} in TemplateProcessing') + if special_last := tmpl_single[-1].get('SpecialToken', {}).get('id'): + if not 
tokenizer_config: + special_eos = special_last + elif special_last != special_eos: + if 'eot' not in self.special_token_types: + self.special_token_types = tuple(self.special_token_types) + ('eot', ) + tokenizer_config['eot_token'] = special_eos + elif 'eom' not in self.special_token_types: + self.special_token_types = tuple(self.special_token_types) + ('eom', ) + tokenizer_config['eom_token'] = special_eos + else: + logger.warning(f'Overriding EOS token {special_eos!r} with {special_last!r} without EOT/EOM fallback!') + tokenizer_config['eos_token'] = special_eos = special_last + self.add_special_token['eos'] = True if special_last == special_eos else False + if special_last != special_eos: + logger.warning(f'Unknown trailing special token {special_last!r} in TemplateProcessing') + if tmpl_pair: + seq_start = 1 if special_first and tmpl_pair[0].get('SpecialToken', {}).get('id') == special_first else 0 + seq_stop = -1 if special_last and tmpl_pair[-1].get('SpecialToken', {}).get('id') == special_last else None + if (special_first and seq_start == 0) or (special_last and seq_stop is None): + logger.warning('TemplateProcessing leading/trailing special tokens do not match TemplateProcessing') + if tmpl_pair := tmpl_pair[slice(seq_start, seq_stop)]: + tmpl_a = tmpl_pair[0].get('Sequence', {}).get('id') + tmpl_b = tmpl_pair[-1].get('Sequence', {}).get('id') + if tmpl_a != 'A' or tmpl_b != 'B': + logger.warning(f'Unknown sequence {tmpl_a}...{tmpl_b} in TemplateProcessing') + # A [sep] [eos] B + if tmpl_a == 'A' and tmpl_b == 'B' and (tmpl_pair := tmpl_pair[1:-1]): + add_sep = False + if special_entry := tmpl_pair[0].get('SpecialToken', {}).get('id'): + if special_entry in (special_sep, special_eos) and not special_last: + add_sep = True + if special_entry not in (special_sep, special_eos): + logger.warning(f'Unknown separator token {special_entry!r} in TemplateProcessing') + else: + logger.warning(f'Unknown middle sequence {tmpl_pair[0]!r} in TemplateProcessing') + if len(tmpl_pair) == 2: + if special_entry := tmpl_pair[1].get('SpecialToken', {}).get('id'): + if special_entry in (special_sep, special_eos): + add_sep = True + if special_entry not in (special_sep, special_eos): + logger.warning(f'Unknown second separator token {special_entry!r} in TemplateProcessing') + else: + logger.warning(f'Unknown second middle sequence {tmpl_pair[1]!r} in TemplateProcessing') + self.add_special_token['sep'] = add_sep + if add_sep and not special_sep and tokenizer_config: + tokenizer_config['sep_token'] = special_eos + continue + if not tokenizer_config: return True - with open(tokenizer_config_file, encoding = 'utf-8') as f: - tokenizer_config = json.load(f) chat_template_alt = None chat_template_file = path / 'chat_template.json' if chat_template_file.is_file(): @@ -302,6 +392,9 @@ class SentencePieceVocab(Vocab): name = "spm" def __init__(self, base_path: Path): + if SentencePieceProcessor is None: + raise RuntimeError("sentencepiece is not installed") + added_tokens: dict[str, int] = {} if (fname_tokenizer := base_path / 'tokenizer.model').exists(): # normal location diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index f11351cba1767..0f3a1eeee8304 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.17.0" +version = "0.17.1" description = "Read and write ML models in GGUF for GGML" authors = ["GGML "] packages = [ @@ -22,7 +22,7 @@ python = ">=3.8" numpy = ">=1.17" tqdm = ">=4.27" pyyaml = ">=5.1" -sentencepiece = 
">=0.1.98,<=0.2.0" +sentencepiece = { version = ">=0.1.98,<=0.2.0", optional = true } PySide6 = { version = "^6.9", python = ">=3.9,<3.14", optional = true } [tool.poetry.dev-dependencies] diff --git a/include/llama.h b/include/llama.h index da0f652cfd63a..3eda9bc68608c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -61,7 +61,10 @@ extern "C" { struct llama_model; struct llama_context; struct llama_sampler; - struct llama_kv_cache; + + typedef struct llama_memory_i * llama_memory_t; + + struct llama_kv_cache; // DEPRECATED (use llama_memory instead) typedef int32_t llama_pos; typedef int32_t llama_token; @@ -240,18 +243,21 @@ extern "C" { typedef bool (*llama_progress_callback)(float progress, void * user_data); - // Input data for llama_decode + // Input data for llama_encode/llama_decode // A llama_batch object can contain input about one or many sequences // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens // // - token : the token ids of the input (used when embd is NULL) // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL) // - pos : the positions of the respective token in the sequence - // (if set to NULL, the token position will be tracked automatically by llama_decode) + // (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode) // - seq_id : the sequence to which the respective token belongs // (if set to NULL, the sequence ID will be assumed to be 0) // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output - // (if set to NULL, only the logits for last token will be returned) + // (if set to NULL: + // - if embeddings: all tokens are output + // - if not: only the last token is output + // ) // typedef struct llama_batch { int32_t n_tokens; @@ -259,8 +265,8 @@ extern "C" { llama_token * token; float * embd; llama_pos * pos; - int32_t * n_seq_id; // TODO: remove, should belong to only 1 sequence - llama_seq_id ** seq_id; // TODO: become llama_seq_id * seq_id; + int32_t * n_seq_id; + llama_seq_id ** seq_id; int8_t * logits; // TODO: rename this to "output" } llama_batch; @@ -384,6 +390,7 @@ extern "C" { void * imatrix; // pointer to importance matrix data void * kv_overrides; // pointer to vector containing overrides void * tensor_types; // pointer to vector containing tensor types + void * prune_layers; // pointer to vector containing layer indices to prune } llama_model_quantize_params; typedef struct llama_logit_bias { @@ -493,9 +500,11 @@ extern "C" { DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead"); LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx); - LLAMA_API struct llama_kv_cache * llama_get_kv_self ( struct llama_context * ctx); + LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx); LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type + DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead"); + LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model); @@ -509,6 +518,13 @@ extern "C" { // Get the model's RoPE frequency scaling factor LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model); + // 
Returns the number of classifier outputs (only valid for classifier models)
+    // Undefined behavior for non-classifier models
+    LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model);
+
+    // Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided
+    LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i);
+
+    //
+    // Memory
+    //
+
+    // Clear the memory contents
+    LLAMA_API void llama_memory_clear(llama_memory_t mem);
+
+    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
+    // seq_id < 0 : match any sequence
+    // p0 < 0     : [0,  p1]
+    // p1 < 0     : [p0, inf)
+    LLAMA_API bool llama_memory_seq_rm(
+            llama_memory_t mem,
+              llama_seq_id seq_id,
+                 llama_pos p0,
+                 llama_pos p1);
+
+    // Copy all tokens that belong to the specified sequence to another sequence
+    // p0 < 0 : [0,  p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_memory_seq_cp(
+            llama_memory_t mem,
+              llama_seq_id seq_id_src,
+              llama_seq_id seq_id_dst,
+                 llama_pos p0,
+                 llama_pos p1);
+
+    // Removes all tokens that do not belong to the specified sequence
+    LLAMA_API void llama_memory_seq_keep(
+            llama_memory_t mem,
+              llama_seq_id seq_id);
+
+    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // p0 < 0 : [0,  p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_memory_seq_add(
+            llama_memory_t mem,
+              llama_seq_id seq_id,
+                 llama_pos p0,
+                 llama_pos p1,
+                 llama_pos delta);
+
+    // Integer division of the positions by factor of `d > 1`
+    // p0 < 0 : [0,  p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_memory_seq_div(
+            llama_memory_t mem,
+              llama_seq_id seq_id,
+                 llama_pos p0,
+                 llama_pos p1,
+                       int d);
+
+    // Returns the smallest position present in the memory for the specified sequence
+    // This is typically non-zero only for SWA caches
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
+    // Return -1 if the sequence is empty
+    LLAMA_API llama_pos llama_memory_seq_pos_min(
+            llama_memory_t mem,
+              llama_seq_id seq_id);
+
+    // Returns the largest position present in the memory for the specified sequence
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
+    // Return -1 if the sequence is empty
+    LLAMA_API llama_pos llama_memory_seq_pos_max(
+            llama_memory_t mem,
+              llama_seq_id seq_id);
+
+    // Check if the memory supports shifting
+    LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
+
+    //
+    // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
+    //
 
     // Returns the number of tokens in the KV cache (slow, use only for debug)
@@ -622,86 +712,95 @@ extern "C" {
            "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
 
     // Clear the KV cache - both cell info is erased and KV data is zeroed
-    LLAMA_API void llama_kv_self_clear(
-            struct llama_context * ctx);
+    DEPRECATED(LLAMA_API void llama_kv_self_clear(
+            struct llama_context * ctx),
+        "Use llama_memory_clear() instead");
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
     // Returns false if a partial sequence cannot be removed.
Removing a whole sequence never fails // seq_id < 0 : match any sequence // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API bool llama_kv_self_seq_rm( + DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm( struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, - llama_pos p1); + llama_pos p1), + "Use llama_memory_seq_rm() instead"); // Copy all tokens that belong to the specified sequence to another sequence // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_self_seq_cp( + DEPRECATED(LLAMA_API void llama_kv_self_seq_cp( struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, - llama_pos p1); + llama_pos p1), + "Use llama_memory_seq_cp() instead"); // Removes all tokens that do not belong to the specified sequence - LLAMA_API void llama_kv_self_seq_keep( + DEPRECATED(LLAMA_API void llama_kv_self_seq_keep( struct llama_context * ctx, - llama_seq_id seq_id); + llama_seq_id seq_id), + "Use llama_memory_seq_keep() instead"); // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) // If the KV cache is RoPEd, the KV data is updated accordingly: // - lazily on next llama_decode() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_self_seq_add( + DEPRECATED(LLAMA_API void llama_kv_self_seq_add( struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, - llama_pos delta); + llama_pos delta), + "Use llama_memory_seq_add() instead"); // Integer division of the positions by factor of `d > 1` // If the KV cache is RoPEd, the KV data is updated accordingly: // - lazily on next llama_decode() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_self_seq_div( + DEPRECATED(void llama_kv_self_seq_div( struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, - int d); + int d), + "Use llama_memory_seq_div() instead"); // Returns the smallest position present in the KV cache for the specified sequence // This is typically non-zero only for SWA caches // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache // Return -1 if the sequence is empty - LLAMA_API llama_pos llama_kv_self_seq_pos_min( + DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min( struct llama_context * ctx, - llama_seq_id seq_id); + llama_seq_id seq_id), + "Use llama_memory_seq_pos_min() instead"); // Returns the largest position present in the KV cache for the specified sequence // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache // Return -1 if the sequence is empty - LLAMA_API llama_pos llama_kv_self_seq_pos_max( + DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max( struct llama_context * ctx, - llama_seq_id seq_id); + llama_seq_id seq_id), + "Use llama_memory_seq_pos_max() instead"); // Defragment the KV cache // This will be applied: // - lazily on next llama_decode() - LLAMA_API DEPRECATED(void llama_kv_self_defrag(struct llama_context * ctx), + DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx), "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'"); // Check if the context supports KV cache shifting - LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx); + DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct 
llama_context * ctx),
+        "use llama_memory_can_shift() instead");
 
     // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    LLAMA_API DEPRECATED(void llama_kv_self_update(struct llama_context * ctx),
+    DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
         "simply remove this call, updates are applied lazily on the next llama_decode()");
 
     //
@@ -709,7 +808,7 @@ extern "C" {
     //
 
     // Returns the *actual* size in bytes of the state
-    // (logits, embedding and kv_cache)
+    // (logits, embedding and memory)
     // Only use when saving the state, not when restoring it, otherwise the size may be too small.
     LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
     LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
@@ -765,12 +864,12 @@ extern "C" {
             size_t   n_token_count),
         "use llama_state_save_file instead");
 
-    // Get the exact size needed to copy the KV cache of a single sequence
+    // Get the exact size needed to copy the state of a single sequence
     LLAMA_API size_t llama_state_seq_get_size(
             struct llama_context * ctx,
                     llama_seq_id   seq_id);
 
-    // Copy the KV cache of a single sequence into the specified buffer
+    // Copy the state of a single sequence into the specified buffer
     LLAMA_API size_t llama_state_seq_get_data(
             struct llama_context * ctx,
                          uint8_t * dst,
@@ -836,21 +935,23 @@ extern "C" {
     // For encoder-decoder contexts, processes the batch using the encoder.
     // Can store the encoder output internally for later use by the decoder's cross-attention layers.
     //    0 - success
-    //  < 0 - error. the KV cache state is restored to the state before this call
+    //  < 0 - error. the memory state is restored to the state before this call
     LLAMA_API int32_t llama_encode(
            struct llama_context * ctx,
               struct llama_batch   batch);
 
     // Process a batch of tokens.
-    // Requires KV cache.
+    // Requires the context to have a memory.
     // For encoder-decoder contexts, processes the batch using the decoder.
     // A positive return value does not mean a fatal error, but rather a warning.
-    // Upon non-zero return values, the KV cache state is restored to the state before this call
+    // Upon a fatal error or abort, the ubatches that managed to be processed will remain in the memory state of the context
+    // To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
+    // Upon other return values, the memory state is restored to the state before this call
     //    0 - success
     //    1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-    //    2 - aborted
+    //    2 - aborted (processed ubatches will remain in the context's memory)
     //   -1 - invalid input batch
-    // < -1 - error
+    // < -1 - fatal error (processed ubatches will remain in the context's memory)
     LLAMA_API int32_t llama_decode(
            struct llama_context * ctx,
               struct llama_batch   batch);
@@ -866,8 +967,8 @@ extern "C" {
 
     // Get the number of threads used for prompt and batch processing (multiple tokens).
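The deprecations above map one-to-one onto the llama_memory_* family, with the context argument replaced by the handle from llama_get_memory(). A minimal migration sketch, relying only on signatures shown in this header; the wrapper function itself is illustrative:

    #include "llama.h"

    // Drop one sequence from the context's memory and query what remains.
    static void forget_sequence(struct llama_context * ctx, llama_seq_id seq) {
        llama_memory_t mem = llama_get_memory(ctx);

        // was: llama_kv_self_seq_rm(ctx, seq, -1, -1);
        llama_memory_seq_rm(mem, seq, /*p0=*/-1, /*p1=*/-1); // whole sequence

        // was: llama_kv_self_seq_pos_max(ctx, seq);
        llama_pos last = llama_memory_seq_pos_max(mem, seq); // -1 when empty
        (void) last;
    }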
LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx); - // Set whether the model is in embeddings mode or not - // If true, embeddings will be returned but logits will not + // Set whether the context outputs embeddings or not + // TODO: rename to avoid confusion with llama_get_embeddings() LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings); // Set whether to use causal attention or not @@ -916,7 +1017,7 @@ extern "C" { // Get the embeddings for a sequence id // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE - // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence + // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence // otherwise: float[n_embd] (1-dimensional) LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id); @@ -946,6 +1047,7 @@ extern "C" { LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab); LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab); + LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab); LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab); LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab); @@ -989,6 +1091,7 @@ extern "C" { /// @param tokens The tokens pointer must be large enough to hold the resulting tokens. /// @return Returns the number of tokens on success, no more than n_tokens_max /// @return Returns a negative number on failure - the number of tokens that would have been returned + /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit) /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so. /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated /// as plaintext. Does not insert a leading space. diff --git a/models/t5-very-small-random-F32.gguf b/models/t5-very-small-random-F32.gguf new file mode 100644 index 0000000000000..fd386d88562d2 Binary files /dev/null and b/models/t5-very-small-random-F32.gguf differ diff --git a/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja b/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja new file mode 100644 index 0000000000000..19a3eaee49be6 --- /dev/null +++ b/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja @@ -0,0 +1,124 @@ +{%- set today = strftime_now("%Y-%m-%d") %} +{%- set default_system_message = "You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\nYour knowledge base was last updated on 2023-10-01. The current date is " + today + ".\n\nWhen you're not sure about some information or when the user's request requires up-to-date or specific data, you must use the available tools to fetch the information. Do not hesitate to use tools whenever they can provide a more accurate or complete response. If no relevant tools are available, then clearly state that you don't have the information and avoid making up anything. + +If the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. \"What are some good restaurants around me?\" => \"Where are you?\" or \"When is the next flight to Tokyo\" => \"Where do you travel from?\"). 
+You are always very attentive to dates, and when asked about information at specific dates, you discard information that is at another date. +You follow these instructions in all languages, and always respond to the user in the language they use or request. +Next sections describe the capabilities that you have. + +# WEB BROWSING INSTRUCTIONS + +You cannot perform any web search or access internet to open URLs, links etc. If it seems like the user is expecting you to do so, you clarify the situation and ask the user to copy paste the text directly in the chat. + +# MULTI-MODAL INSTRUCTIONS + +You have the ability to read images, but you cannot generate images. You also cannot transcribe audio files or videos. +You cannot read nor transcribe audio files or videos. + +# TOOL CALLING INSTRUCTIONS + +You may have access to tools that you can use to fetch information or perform actions. You must use these tools in the following situations: + +1. When the request requires up-to-date information. +2. When the request requires specific data that you do not have in your knowledge base. +3. When the request involves actions that you cannot perform without tools. + +Always prioritize using tools to provide the most accurate and helpful response. If tools are not available, inform the user that you cannot perform the requested action at the moment." %} + +{{- bos_token }} + +{%- set system_prompt = default_system_message %} +{%- set loop_messages = messages %} + +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{%- if messages|length > 0 and messages[0]['role'] == 'system' %} + {%- if messages[0]['content'] is string %} + {%- set system_prompt = messages[0]['content'] %} + {%- else %} + {%- set system_prompt = messages[0]['content'][0]['text'] %} + {%- endif %} + {%- set loop_messages = messages[1:] %} +{%- endif %} + +{%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %} + +{%- set ns = namespace(index=0) %} +{%- for message in loop_messages %} + {%- if not (message.role == "tool" or (message.get('tool_calls'))) %} + {%- if (message["role"] == "user") != (ns.index % 2 == 0) %} + {{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }} + {%- endif %} + {%- set ns.index = ns.index + 1 %} + {%- endif %} +{%- endfor %} + +{{- '[SYSTEM_PROMPT]' + system_prompt + '[/SYSTEM_PROMPT]' }} + +{%- for message in loop_messages %} + {%- if message['role'] == 'system' %} + {%- if message['content'] is string %} + {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }} + {%- else %} + {{- '[SYSTEM_PROMPT]' + message['content'][0]['text'] + '[/SYSTEM_PROMPT]' }} + {%- endif %} + {%- elif message['role'] == 'user' %} + {%- if tools is not none and (message == user_messages[-1]) %} + {{- '[AVAILABLE_TOOLS]' + tools|tojson + '[/AVAILABLE_TOOLS]' }} + {%- endif %} + {{- '[INST]' }} + {%- if message['content'] is string %} + {{- message['content'] }} + {%- else %} + {%- for block in message['content'] %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- elif block['type'] in ['image', 'image_url'] %} + {{- '[IMG]' }} + {%- else %} + {{- raise_exception('Only text and image blocks are supported in message content!') }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- '[/INST]' }} + {%- elif message['role'] == 'assistant' %} + {%- if message.get('tool_calls') %} + {%- for tool_call in message.tool_calls %} + {{- '[TOOL_CALLS]' + tool_call.function.name }} + {%- if 
not tool_call.id is defined or tool_call.id is not string or tool_call.id|length != 9 %} + {{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }} + {%- endif %} + {{- '[CALL_ID]' + tool_call.id }} + {{- '[ARGS]' + tool_call['function']['arguments']|tojson }} + {%- endfor %} + {{- eos_token }} + {%- elif message['content'] is string %} + {{- message['content'] + eos_token }} + {%- else %} + {%- for block in message['content'] %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- elif block['type'] in ['image', 'image_url'] %} + {{- '[IMG]' }} + {%- else %} + {{- raise_exception('Only text and image blocks are supported in assistant content!') }} + {%- endif %} + {%- endfor %} + {{- eos_token }} + {%- endif %} + {%- elif message['role'] == 'tool_results' or message['role'] == 'tool' %} + {%- if message.content is defined and message.content.content is defined %} + {%- set content = message.content.content %} + {%- else %} + {%- set content = message.content %} + {%- endif %} + {%- if not message.tool_call_id is defined or message.tool_call_id is not string or message['tool_call_id']|length != 9 %} + {{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }} + {%- endif %} + {{- '[TOOL_RESULTS]' + message.tool_call_id + '[TOOL_CONTENT]' + content|string + '[/TOOL_RESULTS]' }} + {%- else %} + {{- raise_exception('Only system, user, assistant, and tool roles are supported!') }} + {%- endif %} +{%- endfor %} diff --git a/prebuilts/Hexagon_SDK/.lock b/prebuilts/Hexagon_SDK/.lock new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_compute_res.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_compute_res.h new file mode 100755 index 0000000000000..c8e297a6a4474 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_compute_res.h @@ -0,0 +1,1412 @@ +/*----------------------------------------------------------------------------- + Copyright (c) 2019-2020-2022,2024 QUALCOMM Technologies, Incorporated. + All Rights Reserved. + QUALCOMM Proprietary. +-----------------------------------------------------------------------------*/ + +#ifndef HAP_COMPUTE_RES_H_ +#define HAP_COMPUTE_RES_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @defgroup types Macros and structures + * @{ + */ + +/** Error code for unsupported features. */ +#define HAP_COMPUTE_RES_NOT_SUPPORTED 0x80000404 +/** Maximum thread identifiers supported */ +#define HAP_COMPUTE_RES_MAX_NUM_THREADS 16 + +/** + * @file HAP_compute_res.h + * @brief Header file with APIs to allocate compute resources. + */ + +/** + * Structure containing attributes for compute resources. + */ +typedef struct { + unsigned long long attributes[8]; /**< Attribute array. */ +} compute_res_attr_t; + +/** + * Structure containing a VTCM page size and the number of pages with that size. + */ +typedef struct { + unsigned int page_size; /**< Page size in bytes. */ + unsigned int num_pages; /**< Number of pages of size page_size. */ +} compute_res_vtcm_page_def_t; + +/** + * Structure describing the VTCM memory pages. + */ +typedef struct { + unsigned int block_size; /**< Block size in bytes */ + unsigned int page_list_len; /**< Number of valid elements in page_list array */ + compute_res_vtcm_page_def_t page_list[8]; /**< Array of pages. 
*/
+} compute_res_vtcm_page_t;
+
+/**
+ * enum of HMX lock types
+ */
+typedef enum {
+    HAP_COMPUTE_RES_HMX_NON_SHARED = 0,    /**< No sharing of HMX across threads */
+    HAP_COMPUTE_RES_HMX_SHARED     = 1,    /**< To share HMX across threads */
+} compute_res_hmx_type_t;
+
+/**
+ * enum of capabilities supported by capability query API
+ */
+typedef enum {
+    HAP_COMPUTE_RES_PREEMPTION_CAPABILITY = 1,    /**< Preemption capability */
+} compute_res_capability_id;
+
+/**
+ * Masks returned by preemption capability query
+ */
+#define HAP_COMPUTE_RES_COOPERATIVE_PREEMPTION 1
+/**< Mask indicating support for cooperative preemption framework using
+ * capabilities query. The cooperative preemption framework involves applications
+ * registering a release callback for accepting yield requests from a high priority
+ * allocator.
+ */
+#define HAP_COMPUTE_RES_AUTONOMOUS_PREEMPTION 2
+/**< Mask indicating support for autonomous/optimized preemption framework using
+ * capabilities query. HMX resource management is moved out of #HAP_compute_res_acquire()/
+ * #HAP_compute_res_acquire_cached(), instead applications use #HAP_compute_res_hmx_lock3()/
+ * #HAP_compute_res_hmx_unlock3() to lock/unlock HMX resource directly from the threads
+ * using HMX. Applications shall implement HMX critical section using hmx_mutex object
+ * returned by #HAP_compute_res_hmx_lock3() around non-preemptable HMX sections.
+ */
+#define HAP_COMPUTE_RES_THREADS_FOR_AUTONOMOUS_PREEMPTION 4
+/**< Mask indicating support for thread identifiers based autonomous/optimized
+ * preemption framework. This feature is a subset of HAP_COMPUTE_RES_AUTONOMOUS_PREEMPTION.
+ * In this feature, applications register the threads that
+ * will be working on the allocated compute resources with the resource manager.
+ * The compute resource manager, as part of autonomous preemption, suspends the
+ * threads associated with the low priority context when a high priority thread
+ * requests for these resources.
+ */ + + +/** + * enum of commands for providing thread ids to the resource manager + */ +typedef enum { + HAP_COMPUTE_RES_THREADS_OVERRIDE = 1, + /**< Command ID to override the thread list registered with a context */ + HAP_COMPUTE_RES_THREADS_APPEND = 2, + /**< Command ID to append to an existing thread list associated with + * the context + */ + HAP_COMPUTE_RES_THREADS_REMOVE = 3, + /**< Command ID to remove a thread from an existing thread list associated + * with the context + */ +} compute_res_threads_cmd_id; + +/** + * Structure holding HMX critical section parameters + */ +typedef struct { + void *mutex; + /**< Mutex to be used for entering/exiting HMX critical section + * via lock and unlock functions + */ + void (*lock)(void *mutex); + /**< Lock function to be called for entering HMX critical section using + * mutex as argument + */ + void (*unlock)(void *mutex); + /**< Unlock function to be called for exiting HMX critical section using + * mutex as argument + */ +} compute_res_hmx_mutex_t; + +/** + * Structure for querying preemption data + */ +typedef struct { + unsigned int num_preemptions; + /**< Number of preemptions on the acquired context */ + unsigned long long preempted_duration; + /**< Total duration the context remained preempted in terms of 19.2MHz ticks */ + unsigned long long preemption_overhead; + /**< Total preemption overhead in terms of 19.2MHz ticks */ +} compute_res_preempt_data_t; + +/** + * @} + */ + +/** + * @cond DEV + */ +int __attribute__((weak)) compute_resource_attr_init( + compute_res_attr_t* attr); + +int __attribute__((weak)) compute_resource_attr_set_serialize( + compute_res_attr_t* attr, + unsigned char b_enable); + +int __attribute__((weak)) compute_resource_attr_set_hmx_param( + compute_res_attr_t* attr, + unsigned char b_enable); + +int __attribute__((weak)) compute_resource_attr_set_vtcm_param( + compute_res_attr_t* attr, + unsigned int vtcm_size, + unsigned char b_single_page); + +int __attribute__((weak)) compute_resource_attr_set_vtcm_param_v2( + compute_res_attr_t* attr, + unsigned int vtcm_size, + unsigned int min_page_size, + unsigned int min_vtcm_size); + +int __attribute__((weak)) compute_resource_attr_set_app_type( + compute_res_attr_t* attr, + unsigned int application_id); + +int __attribute__((weak)) compute_resource_attr_set_cache_mode( + compute_res_attr_t* attr, + unsigned char b_enable); + +int __attribute__((weak)) compute_resource_attr_set_release_callback( + compute_res_attr_t* attr, + int (*release_callback)( + unsigned int context_id, + void* client_context), + void* client_context); + +void* __attribute__((weak)) compute_resource_attr_get_vtcm_ptr( + compute_res_attr_t* attr); + +int __attribute__((weak)) compute_resource_attr_get_vtcm_ptr_v2( + compute_res_attr_t* attr, + void** vtcm_ptr, + unsigned int* vtcm_size); + +int __attribute__((weak)) compute_resource_query_VTCM( + unsigned int application_id, + unsigned int* total_block_size, + compute_res_vtcm_page_t* total_block_layout, + unsigned int* avail_block_size, + compute_res_vtcm_page_t* avail_block_layout); + +unsigned int __attribute__((weak)) compute_resource_acquire( + compute_res_attr_t* attr, + unsigned int timeout_us); + +int __attribute__((weak)) compute_resource_release( + unsigned int context_id); + +int __attribute__((weak)) compute_resource_acquire_cached( + unsigned int context_id, + unsigned int timeout_us); + +int __attribute__((weak)) compute_resource_release_cached( + unsigned int context_id); + +int __attribute__((weak)) compute_resource_hmx_lock( 
+ unsigned int context_id); + +int __attribute__((weak)) compute_resource_hmx_unlock( + unsigned int context_id); + +int __attribute__((weak)) compute_resource_check_release_request( + unsigned int context_id); + +int __attribute__((weak)) compute_resource_hmx_lock2( + unsigned int context_id, + compute_res_hmx_type_t type); + +int __attribute__((weak)) compute_resource_hmx_unlock2( + unsigned int context_id, + compute_res_hmx_type_t type); + +int __attribute__((weak)) compute_resource_update_priority( + unsigned int context_id, + unsigned short priority); + +int __attribute__((weak)) crm_hmx_lock3(unsigned int context_id, + compute_res_hmx_type_t type, + compute_res_hmx_mutex_t *mutex, + unsigned int timeout_us); + +int __attribute__((weak)) crm_hmx_unlock3(unsigned int context_id, + compute_res_hmx_type_t type, + compute_res_hmx_mutex_t *mutex); + +int __attribute__ ((weak)) crm_attr_set_vtcm_backup( + compute_res_attr_t* attr, + void *buffer, + unsigned int buffer_size); + +int __attribute__ ((weak)) crm_attr_set_threads( + compute_res_attr_t* attr, + unsigned int *threads, + unsigned int num_threads); + +int __attribute__ ((weak)) crm_attr_set_vtcm_clear_on_release( + compute_res_attr_t* attr, + unsigned char enable); + +int __attribute__ ((weak)) crm_cached_set_threads(compute_res_threads_cmd_id command, + unsigned int context_id, + unsigned int *threads, + unsigned int num_threads); + +int __attribute__((weak)) crm_query_capability(compute_res_capability_id capability_id, + unsigned int* data); + +int __attribute__((weak)) crm_get_preempt_data(unsigned int context_id, + compute_res_preempt_data_t *data); + +int __attribute__((weak)) crm_tid_preemption_lock(void); + +int __attribute__((weak)) crm_tid_preemption_unlock(void); +/** + * @endcond + */ + +/** + * @defgroup attributes Manage attributes + * Manage parameters affecting the requested shared resources + * @{ + */ + +/** + * Initializes the attribute structure for a resource request. + * + * The user must call this function before setting any specific resource property + * via other helper functions. + * + * @param[in] attr Pointer to compute resource attribute structure, + * #compute_res_attr_t. + * + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * HAP_COMPUTE_RES_NOT_SUPPORTED if unsupported. + */ +static inline int HAP_compute_res_attr_init(compute_res_attr_t* attr) +{ + if (compute_resource_attr_init) + return compute_resource_attr_init(attr); + + return HAP_COMPUTE_RES_NOT_SUPPORTED; +} + +/** + * Sets or clears the serialization option in the request resource structure. + * + * Serialization allows participating use cases to run with mutually exclusive + * access to the entire cDSP which helps, for example, in avoiding cache + * thrashing while trying to run simultaneously on different hardware threads. + * Participating use cases issue blocking acquires on the serialization + * resource when ready to run, and each runs in turn when it is granted that + * resource. + * + * Acquiring the serialization resource only ensures + * mutual exclusion from other cooperating use cases that also block on + * acquisition of that resource, it does not guarantee exclusion from + * concurrent use cases that do not request the serialization + * resource. + * + * @param[in] attr Pointer to the compute resource attribute structure, + * #compute_res_attr_t. + * @param[in] b_serialize 1 (TRUE) to participate in serialization. \n + * 0 (FALSE) otherwise. + * + * @return + * 0 upon success \n + * Nonzero upon failure. 
+ */ +static inline int HAP_compute_res_attr_set_serialize( + compute_res_attr_t* attr, + unsigned char b_serialize) +{ + if (compute_resource_attr_set_serialize) + { + return compute_resource_attr_set_serialize(attr, + b_serialize); + } + + return HAP_COMPUTE_RES_NOT_SUPPORTED; +} + +/** + * Sets VTCM request parameters in the provided resource attribute structure. + * + * The user calls this function to request the specified VTCM size in the acquire call. + * These VTCM request attributes are reset to 0 (no VTCM request) in the + * resource attribute structure by HAP_compute_res_attr_init(). + * + * @param[in] attr Pointer to compute resource attribute structure, + * #compute_res_attr_t. + * @param[in] vtcm_size Size of the VTCM request in bytes; + 0 if VTCM allocation is not required. + * @param[in] b_single_page 1 - Requested VTCM size to be allocated in a + * single page. \n + * 0 - No page requirement (allocation can spread + * across multiple pages. VTCM manager + * always attempts the best fit). + * + * @return + * 0 upon success. \n + * Non-zero upon failure. + */ +static inline int HAP_compute_res_attr_set_vtcm_param( + compute_res_attr_t* attr, + unsigned int vtcm_size, + unsigned char b_single_page) +{ + if (compute_resource_attr_set_vtcm_param) + { + return compute_resource_attr_set_vtcm_param(attr, + vtcm_size, + b_single_page); + } + + return HAP_COMPUTE_RES_NOT_SUPPORTED; +} + +/** + * Reads the VTCM memory pointer from the given attribute structure. + * + * On a successful VTCM resource request placed via #HAP_compute_res_acquire() + * using HAP_compute_res_attr_set_vtcm_param(), a user can invoke this helper + * function to retrieve the allocated VTCM address by passing the same attribute + * structure used in the respective HAP_compute_res_acquire() call. + * + * @param[in] attr Pointer to compute the resource attribute structure + * #compute_res_attr_t. + * + * @return + * Void pointer to the allocated VTCM section. \n + * 0 signifies no allocation. + */ +static inline void* HAP_compute_res_attr_get_vtcm_ptr(compute_res_attr_t* attr) +{ + if (compute_resource_attr_get_vtcm_ptr) + { + return compute_resource_attr_get_vtcm_ptr(attr); + } + + return 0; +} + +/** + * Sets an extended set of VTCM request parameters in the attribute structure, + * specifically VTCM Size, the minimum required page size, and the minimum + * required VTCM size. + * + * This function cannot be used with HAP_compute_res_attr_set_vtcm_param(). + * Call this function after HAP_compute_res_attr_init(). + * + * Supported starting with Lahaina. + * + * @param[in] attr Pointer to compute the resource attribute structure, + * #compute_res_attr_t. + * @param[in] vtcm_size Size of the VTCM request in bytes. 0 if VTCM allocation + * is NOT required. + * @param[in] min_page_size Minimum page size required in bytes. Valid pages include + * 4 KB, 16 KB, 64 KB, 256 KB, 1 MB, 4 MB, 16 MB. Setting 0 + * will select best possible fit (least page mappings) + * @param[in] min_vtcm_size Minimum VTCM size in bytes, if the specified size + * (vtcm_size) is not available. 0 means the + * size is an absolute requirement. + * + * @return + * 0 for success. \n + * Non-zero for failure. \n + * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported. 
+ */
+static inline int HAP_compute_res_attr_set_vtcm_param_v2(
+    compute_res_attr_t* attr,
+    unsigned int vtcm_size,
+    unsigned int min_page_size,
+    unsigned int min_vtcm_size)
+{
+    if (compute_resource_attr_set_vtcm_param_v2)
+    {
+        return compute_resource_attr_set_vtcm_param_v2(attr,
+                                                       vtcm_size,
+                                                       min_page_size,
+                                                       min_vtcm_size);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * Sets the VTCM backup buffer in the provided attribute structure.
+ *
+ * The compute resource manager uses the provided buffer to back up the VTCM
+ * allocated to the user during preemption of the associated request/context.
+ * The backup buffer provided should be able to accommodate all of the
+ * requested VTCM size. A VTCM backup buffer is essential for preemption to
+ * work on architectures supporting
+ * HAP_COMPUTE_RES_THREADS_FOR_AUTONOMOUS_PREEMPTION (use
+ * HAP_compute_res_query_capability() to query the supported preemption model).
+ *
+ * Call this function after HAP_compute_res_attr_init().
+ *
+ * @param[in] attr Pointer to the compute resource attribute structure,
+ *                 #compute_res_attr_t.
+ * @param[in] buffer Pointer to the backup buffer in main memory (DDR). To be
+ *                   used by the compute resource manager for saving/restoring
+ *                   the user-allocated VTCM region during preemption.
+ * @param[in] buffer_size Size of the backup buffer in main memory (DDR) pointed
+ *                        to by the #buffer argument. The provided buffer should
+ *                        be sufficiently sized to accommodate the user-requested
+ *                        VTCM size. Align the buffer to 128B for better performance.
+ *
+ * @return
+ * 0 for success. \n
+ * Non-zero for failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_attr_set_vtcm_backup(
+    compute_res_attr_t* attr,
+    void *buffer,
+    unsigned int buffer_size)
+{
+    if (crm_attr_set_vtcm_backup)
+    {
+        return crm_attr_set_vtcm_backup(attr, buffer, buffer_size);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * Updates the provided attribute structure with a user-provided thread ID array.
+ *
+ * On architectures supporting HAP_COMPUTE_RES_THREADS_FOR_AUTONOMOUS_PREEMPTION,
+ * the compute resource manager requires users to register the threads that will
+ * be using the compute resources requested via #HAP_compute_res_acquire().
+ *
+ * Call this function after HAP_compute_res_attr_init().
+ *
+ * @param[in] attr Pointer to the compute resource attribute structure,
+ *                 #compute_res_attr_t.
+ * @param[in] threads Pointer to an array of QuRT thread identifiers associated
+ *                    with the resource request. This array should be valid
+ *                    until #HAP_compute_res_acquire() is called on the prepared
+ *                    attribute.
+ * @param[in] num_threads Number of QuRT thread identifiers in the provided
+ *                        threads array #threads. A maximum of
+ *                        HAP_COMPUTE_RES_MAX_NUM_THREADS
+ *                        threads can be provided.
+ *
+ * @return
+ * 0 for success. \n
+ * Non-zero for failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_attr_set_threads(
+    compute_res_attr_t* attr,
+    unsigned int *threads,
+    unsigned int num_threads)
+{
+    if (crm_attr_set_threads)
+    {
+        return crm_attr_set_threads(attr, threads, num_threads);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * Updates the thread ID array for the associated cached context.
+ *
+ * The compute resource manager uses the QuRT thread identifiers provided by
+ * the user during preemption of the associated context. For cached
+ * allocations, the thread identifiers can either be provided at the time
+ * of the HAP_compute_res_acquire() call using #HAP_compute_res_attr_set_threads(),
+ * or using this API with the context_id returned by a successful
+ * #HAP_compute_res_acquire() call when the cached attribute is set via
+ * #HAP_compute_res_attr_set_cache_mode().
+ * This API has to be called before the HAP_compute_res_acquire_cached() call.
+ *
+ * @param[in] command Specifies a command from compute_res_threads_cmd_id:
+ *                    HAP_COMPUTE_RES_THREADS_OVERRIDE : to provide a new
+ *                    set of threads.
+ *                    HAP_COMPUTE_RES_THREADS_APPEND : to append to the
+ *                    previously provided list of threads.
+ *                    HAP_COMPUTE_RES_THREADS_REMOVE : to remove given threads
+ *                    from the previously provided
+ *                    list of threads.
+ * @param[in] context_id Context ID returned by HAP_compute_res_acquire().
+ * @param[in] threads Pointer to an array of QuRT thread identifiers associated
+ *                    with the resource request.
+ * @param[in] num_threads Number of QuRT thread identifiers in the provided
+ *                        threads array #threads. A maximum of
+ *                        HAP_COMPUTE_RES_MAX_NUM_THREADS
+ *                        threads can be provided.
+ *
+ * @return
+ * 0 for success. \n
+ * Non-zero for failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_cached_set_threads(compute_res_threads_cmd_id command,
+                                                     unsigned int context_id,
+                                                     unsigned int *threads,
+                                                     unsigned int num_threads)
+{
+    if (crm_cached_set_threads)
+    {
+        return crm_cached_set_threads(command, context_id, threads, num_threads);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * Sets the VTCM clear-on-release option in the provided attribute structure.
+ *
+ * By default, the compute resource manager initializes the VTCM memory to 0
+ * when VTCM is released by the caller, either at the time of release or when
+ * it is allocated to another process. For performance reasons (after weighing
+ * any security implications), a client can instruct the compute resource
+ * manager not to clear out (zero-initialize) the allocated VTCM on release.
+ *
+ * Call this function after HAP_compute_res_attr_init().
+ *
+ * @param[in] attr Pointer to the compute resource attribute structure,
+ *                 #compute_res_attr_t.
+ * @param[in] enable 1 - zero-initialize VTCM memory after release (default). \n
+ *                   0 - Do not zero-initialize VTCM memory after release.
+ * @return
+ * 0 for success. \n
+ * Non-zero for failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_attr_set_vtcm_clear_on_release(
+    compute_res_attr_t* attr,
+    unsigned char enable)
+{
+    if (crm_attr_set_vtcm_clear_on_release)
+    {
+        return crm_attr_set_vtcm_clear_on_release(attr, enable);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * On a successful VTCM resource request placed via
+ * HAP_compute_res_acquire() or HAP_compute_res_acquire_cached() using
+ * HAP_compute_res_attr_set_vtcm_param_v2(), users invoke this helper function
+ * to retrieve the allocated VTCM address and size by passing the same
+ * attribute structure used in the respective acquire call.
+ *
+ * Supported starting with Lahaina.
+ *
+ * @param[in] attr Pointer to the compute resource attribute structure
+ *                 #compute_res_attr_t.
+ * @param[out] vtcm_ptr Assigned VTCM address; NULL for no allocation.
+ * @param[out] vtcm_size Size of the allocated VTCM memory from the assigned pointer.
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure.
\n + * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported. + */ +static inline int HAP_compute_res_attr_get_vtcm_ptr_v2( + compute_res_attr_t* attr, + void** vtcm_ptr, + unsigned int* vtcm_size) +{ + if (compute_resource_attr_get_vtcm_ptr_v2) + { + return compute_resource_attr_get_vtcm_ptr_v2(attr, + vtcm_ptr, + vtcm_size); + } + + return HAP_COMPUTE_RES_NOT_SUPPORTED; +} + +/** + * On chipsets with HMX, sets/resets the HMX request parameter in the attribute + * structure for acquiring the HMX resource. + * + * Call this function after HAP_compute_res_attr_init(). + * + * Supported starting with Lahaina. + * + * @param[in] attr Pointer to compute the resource attribute structure, + * #compute_res_attr_t. + * @param[in] b_enable 0 - do not request HMX resource (resets option). \n + * 1 - request HMX resource (sets option). + * @return + * 0 upon success. \n + * Nonzero upon failure. + */ +static inline int HAP_compute_res_attr_set_hmx_param( + compute_res_attr_t* attr, + unsigned char b_enable) +{ + if (compute_resource_attr_set_hmx_param) + { + return compute_resource_attr_set_hmx_param(attr, + b_enable); + } + + return HAP_COMPUTE_RES_NOT_SUPPORTED; +} + +/** + * Sets or resets cacheable mode in the attribute structure. + * + * A cacheable request allows users to allocate and release based on the + * context ID of the request. On a successful cacheable request via + * HAP_compute_res_acquire(), users get the same VTCM address and + * size across calls of HAP_compute_res_acquire_cached() and + * HAP_compute_res_release_cached() until the context is explicitly + * released via HAP_compute_res_release(). + * + * After a successful cacheable request via HAP_compute_res_acquire(), + * users can get the assigned VTCM pointer (if requested) by passing + * the attribute structure to HAP_compute_res_attr_get_vtcm_ptr() + * for v1 and HAP_compute_res_attr_get_vtcm_ptr_v2() for v2, + * and they must call HAP_compute_res_acquire_cached() before using the + * assigned resources. + * + * Supported starting with Lahaina. + * + * @param[in] attr Pointer to compute resource attribute structure, + * #compute_res_attr_t. + * @param[in] b_enable 0 - Do not request cacheable mode (resets option). \n + * 1 - Request cacheable mode (sets option). + * + * @return + * 0 upon success. \n + * Nonzero upon failure.\n + * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported. + */ +static inline int HAP_compute_res_attr_set_cache_mode( + compute_res_attr_t* attr, + unsigned char b_enable) +{ + if (compute_resource_attr_set_cache_mode) + { + return compute_resource_attr_set_cache_mode(attr, + b_enable); + } + + return HAP_COMPUTE_RES_NOT_SUPPORTED; +} + +/** + * Sets the application ID parameter in the resource structure used to + * select the appropriate VTCM partition. + * + * If this application ID parameter is not explicitly set, the default partition is selected. + * The default application ID (0) is set when the attribute structure is initialized. + * Application IDs are defined in the kernel device tree configuration. + * If the given ID is not specified in the tree, the primary VTCM partition is selected. + * + * Call this function after HAP_compute_res_attr_init(). + * + * Supported starting with Lahaina. + * + * @param[in] attr Pointer to compute the resource attribute structure + * #compute_res_attr_t. + * @param[in] application_id Application ID used to specify the VTCM partition. + * + * @return + * 0 upon success. \n + * Nonzero upon failure. 
+ */
+static inline int HAP_compute_res_attr_set_app_type(
+    compute_res_attr_t* attr,
+    unsigned int application_id)
+{
+    if (compute_resource_attr_set_app_type)
+    {
+        return compute_resource_attr_set_app_type(attr,
+                                                  application_id);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * @}
+ */
+
+
+/**
+* @defgroup query VTCM query API
+* @{
+*/
+
+/**
+ * Returns the total and available VTCM sizes and page layouts
+ * for the given application type.
+ *
+ * Supported starting with Lahaina.
+ *
+ * @param[in] application_id Application ID used to specify the VTCM partition.
+ * @param[out] total_block_size Total VTCM size assigned for this application type.
+ * @param[out] total_block_layout Total VTCM size (total_block_size)
+ *                                represented in pages.
+ * @param[out] avail_block_size Largest contiguous memory chunk available in
+ *                              VTCM for this application type.
+ * @param[out] avail_block_layout Available block size (avail_block_size)
+ *                                represented in pages.
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_query_VTCM(
+    unsigned int application_id,
+    unsigned int* total_block_size,
+    compute_res_vtcm_page_t* total_block_layout,
+    unsigned int* avail_block_size,
+    compute_res_vtcm_page_t* avail_block_layout)
+{
+    if (compute_resource_query_VTCM)
+    {
+        return compute_resource_query_VTCM(application_id,
+                                           total_block_size,
+                                           total_block_layout,
+                                           avail_block_size,
+                                           avail_block_layout);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup acquire_release Acquire and release
+ * Manage the process of resource acquisition and release
+ * @{
+ */
+
+/**
+ * Checks the release request status for the provided context.
+ * When a context is acquired by providing a release callback, the callback
+ * can be invoked by the compute resource manager when a high-priority client
+ * is waiting for the resource(s). If a client defers a release request waiting
+ * for an outstanding work item, this API can be used to check if a release is
+ * still required before releasing the context once the work is done.
+ *
+ * Note: It is not mandatory to call this API once a release request via
+ * the registered callback is received. The context can be released and reacquired
+ * if necessary. This API can be useful to avoid a release and reacquire in cases
+ * where the high-priority client times out and is no longer waiting for the
+ * resource(s).
+ *
+ * Supported starting with Lahaina.
+ *
+ * @param[in] context_id Context ID returned by a HAP_compute_res_acquire() call.
+ *
+ * @return
+ * 0 if the provided context need not be released. \n
+ * Nonzero upon failure or if the context needs to be released. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported. \n
+ */
+static inline int HAP_compute_res_check_release_request(
+    unsigned int context_id)
+{
+    if (compute_resource_check_release_request)
+    {
+        return compute_resource_check_release_request(context_id);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * Accepts a prepared attribute structure (attr) and returns a context ID
+ * for a successful request within the provided timeout (microseconds).
+ *
+ * @param[in] attr Pointer to the compute resource attribute structure
+ *                 #compute_res_attr_t.
+ * @param[in] timeout_us Timeout in microseconds; 0 specifies no timeout
+ *                       i.e., requests with unavailable resources
+ *                       immediately return failure. If nonzero, should
+ *                       be at least 200.
+ *
+ * @return
+ * Nonzero context ID upon success. \n
+ * 0 upon failure (i.e., unable to acquire the requested resource
+ * in the given timeout duration).
+ */
+static inline unsigned int HAP_compute_res_acquire(
+    compute_res_attr_t* attr,
+    unsigned int timeout_us)
+{
+    if (compute_resource_acquire)
+    {
+        return compute_resource_acquire(attr, timeout_us);
+    }
+
+    return 0;
+}
+
+/**
+ * Releases all the resources linked to the given context ID.
+ *
+ * Call this function with the context_id returned by a successful
+ * HAP_compute_res_acquire() call.
+ *
+ * @param[in] context_id Context ID returned by a HAP_compute_res_acquire() call.
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_release(unsigned int context_id)
+{
+    if (compute_resource_release)
+    {
+        return compute_resource_release(context_id);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * Acquires or reacquires the resources pointed to by the context_id returned by
+ * a successful HAP_compute_res_acquire() call. If a VTCM resource was requested,
+ * the VTCM address, size, and page configuration remain the same.
+ *
+ * Supported starting with Lahaina.
+ *
+ * @param[in] context_id Context ID returned by HAP_compute_res_acquire().
+ * @param[in] timeout_us Timeout in microseconds; 0 specifies no timeout
+ *                       i.e., requests with unavailable resources
+ *                       immediately return failure. If nonzero, should
+ *                       be at least 200.
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_acquire_cached(
+    unsigned int context_id,
+    unsigned int timeout_us)
+{
+    if (compute_resource_acquire_cached)
+    {
+        return compute_resource_acquire_cached(context_id, timeout_us);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * Releases all the resources pointed to by the context_id acquired
+ * by a successful HAP_compute_res_acquire_cached() call, while allowing the
+ * user to reacquire the same resources via HAP_compute_res_acquire_cached()
+ * in the future until the context is released via HAP_compute_res_release().
+ *
+ * Supported starting with Lahaina.
+ *
+ * @param[in] context_id Context ID returned by
+ *                       #HAP_compute_res_acquire().
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_release_cached(unsigned int context_id)
+{
+    if (compute_resource_release_cached)
+    {
+        return compute_resource_release_cached(context_id);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * Sets the release callback function in the attribute structure.
+ *
+ * The compute resource manager calls the release_callback function when any of the
+ * resources reserved by the specified context are required by a higher-priority
+ * client. Clients act on the release request by explicitly calling the release
+ * function HAP_compute_res_release() or HAP_compute_res_release_cached()
+ * to release all acquired resources of the given context_id.
+ *
+ * The client-provided context (client_context) is passed to the release callback. On
+ * receiving a release request via the provided callback, clients should call the
+ * release function within 5 milliseconds. The release_callback function
+ * should not have any blocking wait.
+ *
+ * Call this function after HAP_compute_res_attr_init().
+ *
+ * Supported starting with Lahaina.
+ *
+ * @param[in] attr Pointer to the compute resource attribute structure,
+ *                 #compute_res_attr_t.
+ * @param[in] release_callback Function pointer to the registered callback to
+ *                             receive the release request.
+ * @param[in] client_context User-provided client context.
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_attr_set_release_callback(
+    compute_res_attr_t* attr,
+    int (*release_callback)(
+        unsigned int context_id,
+        void* client_context),
+    void* client_context)
+{
+    if (compute_resource_attr_set_release_callback)
+    {
+        return compute_resource_attr_set_release_callback(attr,
+                                                          release_callback,
+                                                          client_context);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * Updates the priority of an allocated context to reflect the caller's
+ * thread priority.
+ * The compute resource manager uses the caller's thread priority as the resource
+ * priority when acquired (HAP_compute_res_acquire() /
+ * HAP_compute_res_acquire_cached()). If the thread priority of the caller is
+ * changed after acquiring the resource, the caller should notify the compute
+ * resource manager of the priority change by invoking this API. Failing to do
+ * so will result in the resource manager assuming an incorrect priority for
+ * the allocated resource, which may result in unwanted release requests.
+ * For a cached allocation, this API should be called after a successful
+ * HAP_compute_res_acquire_cached() call.
+ *
+ * Supported on latest chipsets (released after Palima).
+ *
+ * @param[in] context_id Context ID returned by HAP_compute_res_acquire().
+ * @param[in] priority 0 - The compute resource manager uses the caller's
+ *                     thread priority.
+ *                     1..255 - priority value in terms of QuRT thread priority.
+ *                     A priority ceiling will be applied for unprivileged
+ *                     processes.
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_update_priority(unsigned int context_id,
+                                                  unsigned short priority)
+{
+    if (compute_resource_update_priority)
+    {
+        return compute_resource_update_priority(context_id, priority);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup Critical section for autonomous thread ID preemption
+ *
+ * APIs to enter and exit a critical section that prevents autonomous
+ * thread-identifier based preemption (HAP_COMPUTE_RES_THREADS_FOR_AUTONOMOUS_PREEMPTION)
+ * by the resource manager while acquiring global mutexes (used
+ * in I/O, standard library functions like printf, user-implemented
+ * serialization, etc.)
+ *
+ * @{
+ */
+
+/**
+ * API to enter a critical section that prevents autonomous thread-identifier
+ * based preemption (HAP_COMPUTE_RES_THREADS_FOR_AUTONOMOUS_PREEMPTION) by the
+ * resource manager while the caller is acquiring global mutexes (used
+ * in I/O, standard library functions like printf, user-implemented
+ * serialization, etc.)
+ *
+ * On architectures supporting HAP_COMPUTE_RES_THREADS_FOR_AUTONOMOUS_PREEMPTION,
+ * holding global mutexes can lead to deadlocks within the preempted task's
+ * user process. The critical section exposed by this API should be implemented
+ * by users around I/O, logging, or any standard libraries/user implementations
+ * that acquire global mutexes.
+ *
+ * The implementation uses a per-process global mutex; callers of this API will
+ * be serialized across threads within the caller's user process on the NSP.
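+ *
+ * As an illustrative sketch (assuming printf is one of the global-mutex
+ * users being guarded), the critical section could be used as:
+ * @code
+ * if (0 == HAP_compute_res_tid_preemption_lock())
+ * {
+ *     printf("progress log\n");  // I/O guarded against thread preemption
+ *     HAP_compute_res_tid_preemption_unlock();
+ * }
+ * @endcode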
+ * + * NOTE: The critical section implementation should only be done when, + * - HAP_COMPUTE_RES_THREADS_FOR_AUTONOMOUS_PREEMPTION is supported + * - Applications with different priorities co-exist in a single user process + * exposing the risk of deadlock between a running and preempted + * application. + * + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported. + * + */ + +static inline int HAP_compute_res_tid_preemption_lock(void) +{ + if (crm_tid_preemption_lock) + { + return crm_tid_preemption_lock(); + } + + return HAP_COMPUTE_RES_NOT_SUPPORTED; +} + +/** + * Releases the critical section acquired by #HAP_compute_res_tid_preemption_lock(). + * + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported. + */ + +static inline int HAP_compute_res_tid_preemption_unlock(void) +{ + if (crm_tid_preemption_unlock) + { + return crm_tid_preemption_unlock(); + } + + return HAP_COMPUTE_RES_NOT_SUPPORTED; +} + +/** + * @} + */ + +/** + * @defgroup Capability query and profiling data + * API to query capabilities of the compute resource manager and to get + * profiling data associated with a context. + * + * @{ + */ + +/** + * Queries compute resource manager capabilities listed under + * compute_res_capability_id enum. + * + * @param[in] capability_id Identifier from compute_res_capability_id corresponding + * to the compute resource manager capability. + * @param[out] data Pointer to an unsigned int data. On success, the memory + * is updated with the data associated with the queried capability. + * + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported. + */ +static inline int HAP_compute_res_query_capability(compute_res_capability_id capability_id, + unsigned int* data) +{ + if (crm_query_capability) + { + return crm_query_capability(capability_id, data); + } + + return HAP_COMPUTE_RES_NOT_SUPPORTED; +} + +/** + * On implementations supporting HAP_COMPUTE_RES_AUTONOMOUS_PREEMPTION, + * this API returns preemption statistics associated with the context_id + * acquired via HAP_compute_res_acquire(). + * + * This API needs to be called before the associated context is released via + * HAP_compute_res_release() call, data returned is invalid otherwise. + * + * @param[in] context_id Context ID returned by HAP_compute_res_acquire(). + * @param[out] Pointer to compute_res_preempt_data_t. + * On success, the preemption-related statistics are updated in + * the provided structure. + * + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported. + */ +static inline int HAP_compute_res_get_preempt_data(unsigned int context_id, + compute_res_preempt_data_t* data) +{ + if (crm_get_preempt_data) + { + return crm_get_preempt_data(context_id, data); + } + + return HAP_COMPUTE_RES_NOT_SUPPORTED; +} + +/** + * @} + */ + + +/** + * @defgroup HMX HMX lock and unlock + * Manage HMX lock once HMX has been acquired + * + * @{ + */ + +/** + * Locks the HMX unit to the current thread and prepares the thread to + * execute HMX instructions. The client must have already acquired the + * HMX resource with HAP_compute_res_acquire() or HAP_compute_res_acquire_cached(), + * and context_id must refer to the corresponding resource manager context. + * + * Before executing HMX instructions, a client must call this function from + * the same software thread used for HMX processing. 
Only the calling thread
+ * with a valid HMX lock may execute HMX instructions.
+ *
+ * Supported starting with Lahaina.
+ *
+ * @param[in] context_id Context ID returned by
+ *                       #HAP_compute_res_acquire().
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_hmx_lock(unsigned int context_id)
+{
+    if (compute_resource_hmx_lock)
+    {
+        return compute_resource_hmx_lock(context_id);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * Unlocks the HMX unit from the calling thread. The HMX unit can then be
+ * locked to another thread or released with HAP_compute_res_release().
+ *
+ * This function must be called from the same thread as the previous
+ * HAP_compute_res_hmx_lock() call.
+ *
+ * Supported starting with Lahaina.
+ *
+ * @param[in] context_id Context ID returned by
+ *                       #HAP_compute_res_acquire().
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_hmx_unlock(unsigned int context_id)
+{
+    if (compute_resource_hmx_unlock)
+    {
+        return compute_resource_hmx_unlock(context_id);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * This function is an extension to HAP_compute_res_hmx_lock() with an additional
+ * option to lock HMX across multiple participating threads within a user process
+ * and timeshare the HMX resource (only one thread should be using HMX at a time).
+ *
+ * Supported on latest chipsets (released after Palima).
+ *
+ * @param[in] context_id Context ID returned by
+ *                       #HAP_compute_res_acquire().
+ * @param[in] type HAP_COMPUTE_RES_HMX_NON_SHARED
+ *                 Analogous to #HAP_compute_res_hmx_lock()
+ *                 HAP_COMPUTE_RES_HMX_SHARED
+ *                 Threads within a process can lock and timeshare the same HMX
+ *                 resource. When using this option, it is the caller's responsibility
+ *                 to timeshare HMX (only one thread should use HMX at a time)
+ *                 among the participating threads using the HAP_COMPUTE_RES_HMX_SHARED
+ *                 option from the same process.
+ *                 Note that sharing of HMX is allowed only between threads of
+ *                 the same user process. A single context ID (context_id) should be
+ *                 used across the participating threads in a user process.
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_hmx_lock2(unsigned int context_id,
+                                            compute_res_hmx_type_t type)
+{
+    if (compute_resource_hmx_lock2)
+    {
+        return compute_resource_hmx_lock2(context_id, type);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * To be used in conjunction with HAP_compute_res_hmx_lock2() to release a successfully
+ * locked HMX unit.
+ * The 'type' provided should match the type passed to the successful
+ * HAP_compute_res_hmx_lock2() call from this thread.
+ *
+ * Supported on latest chipsets (released after Palima).
+ *
+ * @param[in] context_id Context ID returned by
+ *                       #HAP_compute_res_acquire().
+ *
+ * @param[in] type Should be the same parameter used to lock HMX
+ *                 via #HAP_compute_res_hmx_lock2()
+ *
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_hmx_unlock2(unsigned int context_id,
+                                              compute_res_hmx_type_t type)
+{
+    if (compute_resource_hmx_unlock2)
+    {
+        return compute_resource_hmx_unlock2(context_id, type);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup HMX3 HMX lock and unlock
+ * Manage HMX on architectures supporting HAP_COMPUTE_RES_AUTONOMOUS_PREEMPTION
+ *
+ * @{
+ */
+
+/**
+ * On architectures supporting HAP_COMPUTE_RES_AUTONOMOUS_PREEMPTION, this
+ * function locks the HMX unit to the current thread and prepares the thread to
+ * execute HMX instructions. The client must have already acquired the
+ * VTCM using HAP_compute_res_acquire() or HAP_compute_res_acquire_cached(),
+ * and context_id must refer to the corresponding resource manager context.
+ *
+ * Before executing HMX instructions, a client must call this function from
+ * the same software thread used for HMX processing. Only the calling thread
+ * with a valid HMX lock may execute HMX instructions.
+ *
+ * The calling thread shall acquire the lock on the HMX mutex before executing
+ * HMX instructions and release the lock when the program reaches a point where
+ * the acquired HMX unit can be reassigned to a higher-priority waiter (in case
+ * of multiple clients contending for the HMX resource) without affecting
+ * functionality. To enter the HMX critical section, the user shall call
+ * hmx_mutex->lock(hmx_mutex->mutex). To exit the HMX critical section, the user
+ * shall call hmx_mutex->unlock(hmx_mutex->mutex).
+ * Autonomous preemption waits for applications to release the HMX critical section
+ * before preempting HMX from the allocator.
+ *
+ * @param[in] context_id Context ID returned by
+ *                       #HAP_compute_res_acquire().
+ * @param[in] type HAP_COMPUTE_RES_HMX_NON_SHARED
+ *                 Analogous to #HAP_compute_res_hmx_lock()
+ *                 HAP_COMPUTE_RES_HMX_SHARED
+ *                 Threads within a process can lock and timeshare the same HMX
+ *                 resource. When using this option, it is the caller's responsibility
+ *                 to timeshare HMX (only one thread should use HMX at a time)
+ *                 among the participating threads using the HAP_COMPUTE_RES_HMX_SHARED
+ *                 option from the same process.
+ *                 Note that sharing of HMX is allowed only between threads of
+ *                 the same user process. A single context ID (context_id) should be
+ *                 used across the participating threads in a user process.
+ * @param[out] hmx_mutex Pointer to a structure of type compute_res_hmx_mutex_t.
+ *                       On success, the structure is updated with mutex, lock
+ *                       and unlock parameters.
+ * @param[in] timeout_us Timeout in microseconds; 0 specifies no timeout
+ *                       i.e., requests with unavailable resources
+ *                       immediately return failure. If nonzero, should
+ *                       be at least 200.
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_hmx_lock3(unsigned int context_id,
+                                            compute_res_hmx_type_t type,
+                                            compute_res_hmx_mutex_t *hmx_mutex,
+                                            unsigned int timeout_us)
+{
+    if (crm_hmx_lock3)
+    {
+        return crm_hmx_lock3(context_id, type, hmx_mutex, timeout_us);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * To be used in conjunction with HAP_compute_res_hmx_lock3() to release a
+ * successfully locked HMX unit.
+ * The 'type' provided should match the type passed to the successful
+ * #HAP_compute_res_hmx_lock3() call from this thread.
+ *
+ * @param[in] context_id Context ID returned for a successful VTCM acquisition by
+ *                       #HAP_compute_res_acquire().
+ *
+ * @param[in] type Should be the same parameter used to lock HMX
+ *                 via #HAP_compute_res_hmx_lock3()
+ *
+ * @param[in] hmx_mutex Should be the same parameter used to lock HMX via
+ *                      #HAP_compute_res_hmx_lock3()
+ * @return
+ * 0 upon success. \n
+ * Nonzero upon failure. \n
+ * HAP_COMPUTE_RES_NOT_SUPPORTED when not supported.
+ */
+static inline int HAP_compute_res_hmx_unlock3(unsigned int context_id,
+                                              compute_res_hmx_type_t type,
+                                              compute_res_hmx_mutex_t *hmx_mutex)
+{
+    if (crm_hmx_unlock3)
+    {
+        return crm_hmx_unlock3(context_id, type, hmx_mutex);
+    }
+
+    return HAP_COMPUTE_RES_NOT_SUPPORTED;
+}
+
+/**
+ * @}
+ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //HAP_COMPUTE_RES_H_
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_compute_res.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_compute_res.md
new file mode 100755
index 0000000000000..3b550bf9ed20c
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_compute_res.md
@@ -0,0 +1,635 @@
+# Compute resource manager framework
+
+The cDSP has several shared resources such as L2 cache, HVX, HMX (where available), VTCM, hardware threads, and memory
+buses. The compute resource manager framework exposes in @b HAP_compute_res.h a set of APIs for managing, requesting,
+and releasing some of these resources.
+
+## Legacy HAP_vtcm_mgr API
+
+VTCM allocation APIs exposed under [VTCM Manager](../../doxygen/HAP_vtcm_mgr/index.html) are being deprecated; we
+recommend using the compute resource APIs for VTCM management and allocation. The compute resource manager provides
+options to:
+
+* Query the VTCM defined on an architecture and its usage.
+* Cached mode: Release and reacquire the same VTCM virtual address, size and page configuration.
+* Cooperative preemption: Register release callbacks, which might be invoked when a high-priority client needs a resource
+used by a lower-priority client.
+* ThreadID based autonomous preemption: Register the threads that work on the compute resources with the resource manager;
+these threads will be suspended by the resource manager when a high-priority client requests the resources. Clients also
+provide a backup buffer for VTCM, used by the resource manager to save and restore VTCM context during preemption.
+* Query the supported preemption model (cooperative, ThreadID based autonomous preemption, etc.).
+
+## Serialization
+
+The resource manager also offers a virtualized serialization resource to aid concurrencies in which constituent use cases
+are to run with mutually exclusive access to the entire cDSP, for example, to avoid cache thrashing with each other.
+Participating use cases issue blocking acquires on the serialization resource when ready to run, and each use case runs
+in turn when it is granted that resource. Acquiring the serialization resource only ensures mutual exclusion from other
+cooperating use cases that also block on acquisition of that resource; it does not guarantee exclusion from concurrent
+use cases that do not block on the serialization resource.
+
+## Cached mode
+
+Clients requesting VTCM are provided with a pointer (virtual address) to VTCM on success. The pointer to VTCM can
+change once it's released (HAP_compute_res_release()) and re-acquired (HAP_compute_res_acquire()). Clients requiring a
+constant VTCM pointer throughout a session can use the cached mode. Cached mode can be enabled by setting the cached
+attribute using HAP_compute_res_attr_set_cache_mode() when acquiring (HAP_compute_res_acquire()) the resource. When the
+cached attribute is set while acquiring the resource, clients are expected to call HAP_compute_res_acquire_cached() with
+the context ID returned by HAP_compute_res_acquire() before accessing the resource.
+
+This mode is useful for periodic applications where the VTCM pointer needs to remain the same across executions while
+allocating and releasing the resource periodically:
+* HAP_compute_res_acquire() with the cached attribute set is called to allocate VTCM during initialization.
+* HAP_compute_res_acquire_cached() and HAP_compute_res_release_cached() are called before and after every execution.
+* HAP_compute_res_release() is called during de-initialization.
+
+Starting with the v73 architecture, cached mode also provides clients with an option to have an overlapping mapping from within a process.
+
+### Overmapping / overlapping page mapping
+
+Applications working on HMX may require all of the requested VTCM to be in a single page mapping in the MMU. When the
+overmapping/overlapping page mapping feature is supported, HMX applications requesting a page size covering the entire
+VTCM with a smaller VTCM size can allow other applications running from the same user process to allocate the remaining
+VTCM when cached mode is used.
+
+For example, on an architecture supporting 8MB of VTCM, an HMX application (APP1) requesting 6MB of VTCM with a minimum
+8MB page in cached mode can allow another application (APP2) to acquire the remaining 2MB of VTCM with a maximum page
+size of 1MB.
+
+![screenshot](../../images/CRM_VTCM_overmapping_example.png)
+
+Note:
+* Only cached allocations requesting a VTCM page size covering the entire VTCM defined for that architecture, but with a
+smaller VTCM size request, will result in an overmapping condition. For example, on an architecture with 8MB VTCM, a
+cached/non-cached request for 3MB VTCM with a 4MB page size will get a 4MB allocation (3MB wrapped to the page size).
+* Multiple cached/non-cached allocations from within the same process (as the overmapping client) can use the leftover
+space in VTCM as long as their requests can be accommodated in that space. For example, a 2MB size request with a single
+page/4MB page size cannot coexist concurrently with a cached 6MB request with an 8MB page size.
+* Multiple overmapping clients cannot coexist concurrently. For example, a 4MB request with an 8MB page cannot coexist
+concurrently with another 4MB request with an 8MB page.
+
+## VTCM window feature
+Starting with v79, the NSP has a VTCM window hardware feature, which can be used to prevent a thread from accessing a specific VTCM region. The compute resource manager utilizes this feature as an additional access control layer on top of the page mappings per process.
+### Use case:
+* Defined VTCM memory: 8MB
+* Process 1: VTCM memory request 6MB, single page mapping
+* Process 2: VTCM memory request 2MB
+
+In this use case, mapping 6MB of VTCM in a single page requires all 8MB of VTCM to be allocated. The difference between architectures with or without the VTCM window feature is in how the remaining 2 MB of VTCM allocated but unused by process 1 may be used by another process.
+
+### Q6 architecture < V79 (where the VTCM window feature is not available)
+As the entire 8MB of defined VTCM is mapped to the user process, the allocating user process, process 1, has access to the 2MB free space as well. The 2MB is marked free for other allocations from the same user process and is not available for requests from other user processes.
+
+![screenshot](../../images/hap_compute_res_mgr_no_vtcm_window.png)
+
+Process 2 cannot use the 2MB of free space while process 1 still holds its allocation.
+
+### Q6 architecture >= V79 (where the VTCM window feature is available)
+The compute resource manager restricts the allocating user process, process 1, to access only the 6MB of allocated space using the `VTCM window` hardware feature. This allows other user processes to access the remaining 2MB region. In this use case, process 1 has neither read nor write access to the remaining 2 MB of VTCM.
+
+![screenshot](../../images/hap_compute_res_mgr_vtcm_window.png)
+
+### VTCM window - restrictions
+The `VTCM window` can be useful to restrict threads within a process to desired VTCM regions. The `VTCM window` must be a single contiguous memory region within the VTCM space: gaps in between allocations cannot be made available for allocation by other user processes. For example, in the scenario below, the `VTCM window` cannot be used to allow other processes to allocate the 1MB of unallocated free space: it is only available for allocation from process 1.
+
+![screenshot](../../images/hap_compute_res_mgr_vtcm_window_restrictions.png)
+
+## Cooperative preemption framework
+
+The resource manager offers a cooperative preemption framework wherein clients can register a release callback when
+requesting compute resources using HAP_compute_res_attr_set_release_callback(). When a higher-priority client requests
+a resource already in use by a lower-priority client, the lower-priority client will be notified by the callback to suspend
+its work and release the resource.
+
+## Autonomous preemption framework (threadID based)
+
+On supported architectures (which can be queried using HAP_compute_res_query_capability()), the resource manager implements
+a threadID-based autonomous preemption framework where clients register the thread IDs associated with a resource request
+and provide a VTCM backup buffer when VTCM is requested. As part of preempting a context, the resource manager waits for
+the HMX critical section when HMX is in use, suspends the registered threads, and saves VTCM in the provided backup buffer.
+When the resource becomes available, the resource manager resumes the suspended threads after restoring VTCM and
+reattaching HMX (if previously assigned).
+
+HMX under this preemption scheme is handled differently in comparison to the cooperative preemption framework.
+In the cooperative preemption framework, HMX as a resource is acquired first and then locked using
+HAP_compute_res_hmx_lock()/lock2(), while in the autonomous preemption framework, HMX is directly locked via
+HAP_compute_res_hmx_lock3() using the context returned by a successful VTCM allocation done via a
+HAP_compute_res_acquire() call. As the resource manager can preempt a low-priority client, HMX applications need to
+implement an HMX critical section using the mutex structure returned by a successful HAP_compute_res_hmx_lock3() call.
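+
+As a minimal sketch (assuming `context` came from a successful VTCM acquisition via HAP_compute_res_acquire(), and that
+compute_res_hmx_mutex_t carries the `mutex` pointer and the `lock`/`unlock` function pointers filled in by the resource
+manager, as described above), the HMX critical section could look like:
+
+@code
+compute_res_hmx_mutex_t hmx_mutex;
+
+/* Lock HMX against the VTCM context; wait for up to 10 ms. */
+if (0 == HAP_compute_res_hmx_lock3(context, HAP_COMPUTE_RES_HMX_NON_SHARED, &hmx_mutex, 10000))
+{
+    /* Enter the HMX critical section before issuing HMX instructions. */
+    hmx_mutex.lock(hmx_mutex.mutex);
+    //Execute the HMX workload
+    /* Exit the critical section at a point where HMX can safely be
+     * reassigned to a higher-priority waiter. */
+    hmx_mutex.unlock(hmx_mutex.mutex);
+    HAP_compute_res_hmx_unlock3(context, HAP_COMPUTE_RES_HMX_NON_SHARED, &hmx_mutex);
+}
+@endcode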
+
+## Usage examples
+
+### Cached VTCM request - cooperative preemption
+
+@code
+application_state_t *state;  /* Global application state, assumed to be allocated elsewhere */
+
+int release_callback(unsigned int context, void *state)
+{
+    if (!context || !state) return FAILURE;
+    /*
+     * Got release request, set release required in state variable
+     */
+    application_state_t* local_state = (application_state_t *)state;
+    if (local_state->context != context) return FAILURE;
+    local_state->release_request = TRUE;
+    return 0;
+}
+
+void initialization_routine()
+{
+    compute_res_attr_t attr;
+    unsigned int context;
+    unsigned int vtcm_size = 8 * 1024 * 1024; //8MB of VTCM
+    void *p_vtcm = NULL;
+    unsigned int result_vtcm_size = 0;
+    /*
+     * Initialize the attribute structure
+     */
+    if (0 != HAP_compute_res_attr_init(&attr))
+        return;
+    /*
+     * Query the VTCM defined in the architecture and set our requested VTCM
+     * size to the defined one (request the entire VTCM size)
+     */
+    if (0 != HAP_compute_res_query_VTCM(0, &vtcm_size, NULL, NULL, NULL))
+        return;
+    /*
+     * Set VTCM params:
+     * Request the entire VTCM size, minimum page size set to the VTCM size,
+     * minimum required VTCM size set to the same as the VTCM size
+     */
+    if (0 != HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, vtcm_size, vtcm_size))
+        return;
+    /*
+     * Set cached mode
+     */
+    if (0 != HAP_compute_res_attr_set_cache_mode(&attr, 1))
+        return;
+    /*
+     * Set release callback
+     */
+    if (0 != HAP_compute_res_attr_set_release_callback(&attr, &release_callback, (void *)state))
+        return;
+    /*
+     * Acquire a context with the prepared attribute structure
+     */
+    if (0 == (context = HAP_compute_res_acquire(&attr, 0)))
+        return;
+    /*
+     * Get VTCM pointer
+     */
+    if (0 != HAP_compute_res_attr_get_vtcm_ptr_v2(&attr, &p_vtcm, &result_vtcm_size))
+    {
+        HAP_compute_res_release(context);
+        return;
+    }
+    state->context = context;
+    /*
+     * Setup algorithm using p_vtcm and result_vtcm_size
+     */
+    return;
+}
+
+int yield(unsigned int context)
+{
+    /*
+     * Synchronize with workers to make sure all accesses to VTCM are complete
+     * Backup VTCM if required
+     * Release context and reacquire
+     */
+    if (0 == HAP_compute_res_check_release_request(context))
+        return 0; //Release no longer required, continue without yielding
+    if (0 != HAP_compute_res_release_cached(context))
+        return FAILURE;
+    if (0 != HAP_compute_res_acquire_cached(context, 10000)) //Wait up to 10 ms
+        return FAILURE;
+    /*
+     * Restore VTCM and continue remaining work
+     */
+    return 0;
+}
+
+void execution_loop()
+{
+    unsigned int context = state->context;
+    /*
+     * Acquire the cached resource
+     */
+    if (0 != HAP_compute_res_acquire_cached(context, 10000)) //Wait up to 10 ms
+        return;
+    /*
+     * Work items
+     */
+    for (int i = 0; i < WORK_ITEMS; i++)
+    {
+        /*
+         * Check if cooperative preemption requested a release
+         * (release_request is set in release_callback)
+         */
+        if (state->release_request)
+        {
+            if (0 != yield(context))
+                return;
+        }
+        //Execute work item
+    }
+    /*
+     * Release the cached resource
+     */
+    if (0 != HAP_compute_res_release_cached(context))
+        return;
+}
+@endcode
+
+### Cached VTCM request - autonomous threadID based preemption
+
+@code
+application_state_t *state;  /* Global application state, assumed to be allocated elsewhere */
+
+int release_callback(unsigned int context, void *state)
+{
+    if (!context || !state) return FAILURE;
+    /*
+     * Got release request, set release required in state variable
+     */
+    application_state_t* local_state = (application_state_t *)state;
+    if (local_state->context != context) return FAILURE;
+    local_state->release_request = TRUE;
+    return 0;
+}
+
+int check_autonomous_threads_compute_res_capability()
+{
+    unsigned int capability = 0;
+
+    if (0 != HAP_compute_res_query_capability(HAP_COMPUTE_RES_PREEMPTION_CAPABILITY, &capability))
+        return FAILURE;
+    if (capability & HAP_COMPUTE_RES_THREADS_FOR_AUTONOMOUS_PREEMPTION)
+        return 0;
+    else
+        return FAILURE;
+}
+
+void initialization_routine()
+{
+    compute_res_attr_t attr;
+    unsigned int context;
+    unsigned int vtcm_size = 8 * 1024 * 1024; //8MB of VTCM
+    void *p_vtcm = NULL, *p_vtcm_backup = NULL;
+    unsigned int result_vtcm_size = 0;
+    unsigned int thread_id = 0;
+    /*
+     * Initialize the attribute structure
+     */
+    if (0 != HAP_compute_res_attr_init(&attr))
+        return;
+    /*
+     * Query the VTCM defined in the architecture and set our requested VTCM
+     * size to the defined one (request the entire VTCM size)
+     */
+    if (0 != HAP_compute_res_query_VTCM(0, &vtcm_size, NULL, NULL, NULL))
+        return;
+    /*
+     * Set VTCM params:
+     * Request the entire VTCM size, minimum page size set to the VTCM size,
+     * minimum required VTCM size set to the same as the VTCM size
+     */
+    if (0 != HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, vtcm_size, vtcm_size))
+        return;
+    /*
+     * Set cached mode
+     */
+    if (0 != HAP_compute_res_attr_set_cache_mode(&attr, 1))
+        return;
+    /*
+     * Check threads-based autonomous preemption support and register threads
+     */
+    if (0 == check_autonomous_threads_compute_res_capability())
+    {
+        /*
+         * Allocate the backup buffer for VTCM to be registered with the
+         * resource manager
+         */
+        p_vtcm_backup = malloc(vtcm_size);
+        if (NULL == p_vtcm_backup)
+            return;
+        /*
+         * Register VTCM backup buffer
+         */
+        if (0 != HAP_compute_res_attr_set_vtcm_backup(&attr, p_vtcm_backup, vtcm_size))
+        {
+            free(p_vtcm_backup);
+            return;
+        }
+        /*
+         * Register threads that will be working on the requested VTCM buffer
+         */
+        thread_id = qurt_thread_get_id();
+        if (0 != HAP_compute_res_attr_set_threads(&attr, &thread_id, 1))
+        {
+            free(p_vtcm_backup);
+            return;
+        }
+    } else {
+        /*
+         * Fall back to cooperative preemption when autonomous preemption
+         * is not supported
+         */
+        if (0 != HAP_compute_res_attr_set_release_callback(&attr, &release_callback, (void *)state))
+            return;
+    }
+    /*
+     * Acquire a context with the prepared attribute structure
+     */
+    if (0 == (context = HAP_compute_res_acquire(&attr, 0)))
+    {
+        if (p_vtcm_backup)
+            free(p_vtcm_backup);
+        return;
+    }
+    /*
+     * Get VTCM pointer
+     */
+    if (0 != HAP_compute_res_attr_get_vtcm_ptr_v2(&attr, &p_vtcm, &result_vtcm_size))
+    {
+        HAP_compute_res_release(context);
+        if (p_vtcm_backup)
+            free(p_vtcm_backup);
+        return;
+    }
+    state->context = context;
+    /*
+     * Setup algorithm using p_vtcm and result_vtcm_size
+     */
+    return;
+}
+
+int yield(unsigned int context)
+{
+    /*
+     * Synchronize with workers to make sure all accesses to VTCM are complete
+     * Backup VTCM if required
+     * Release context and reacquire
+     */
+    if (0 == HAP_compute_res_check_release_request(context))
+        return 0; //Release no longer required, continue without yielding
+    if (0 != HAP_compute_res_release_cached(context))
+        return FAILURE;
+    if (0 != HAP_compute_res_acquire_cached(context, 10000)) //Wait up to 10 ms
+        return FAILURE;
+    /*
+     * Restore VTCM and continue remaining work
+     */
+    return 0;
+}
+
+void execution_loop()
+{
+    unsigned int context = state->context;
+    /*
+     * Acquire the cached resource
+     */
+    if (0 != HAP_compute_res_acquire_cached(context, 10000)) //Wait up to 10 ms
+        return;
+    /*
+     * Work items
+     */
+    for (int i = 0; i < WORK_ITEMS; i++)
+    {
+        /*
+         * Check if cooperative preemption requested a release
+         * (release_request is set in release_callback)
+         */
+        if (state->release_request)
+        {
+            if (0 != yield(context))
+                return;
+        }
+        //Execute work item
+    }
+    /*
+     * Release the cached resource
+     */
+    if (0 != HAP_compute_res_release_cached(context))
+        return;
+}
+@endcode
+
+### Serialized VTCM acquisition
+
+This example shows two threads requesting VTCM and both participating in serialization by invoking HAP_compute_res_attr_set_serialize().
+ +@code + /* + * PROCESS/THREAD 1 + */ + compute_res_attr_t res_info; + unsigned int context_id = 0; + void *p_vtcm = NULL; + /* + * Initialize the attribute structure + */ + if (0 != HAP_compute_res_attr_init(&res_info) ) + return; + /* + * Set serialization option + */ + if (0 != HAP_compute_res_attr_set_serialize(&res_info, 1) ) + return; + /* + * Set VTCM request parameters - 256KB single page + */ + if (0 != HAP_compute_res_attr_set_vtcm_param(&res_info, + (256 * 1024), + 1) ) + return; + /* + * Call acquire with a timeout of 10 milliseconds. + */ + if (0 != (context_id = HAP_compute_res_acquire(&res_info, 10000) ) ) + { + /* + * Successfully requested for serialization and acquired VTCM. + * The serialization request from PROCESS/THREAD 2 waits + * until the resource is released here. + */ + p_vtcm = HAP_compute_res_attr_get_vtcm_ptr(&res_info); + if (0 == p_vtcm) + { + /* + * VTCM allocation failed, should not reach here as the acquire + * returned with valid context ID. + */ + HAP_compute_res_release(context_id); + return; + } + //Do my work in process/thread 1 + /* + * Done. Release the resource now using the acquired context ID. + * This releases both the serialization request and VTCM allocation. + */ + HAP_compute_res_release(context_id); + p_vtcm = NULL; + } else { + /* + * Unsuccessful allocation. Timeout would have triggered. + * Implement a fallback or fail gracefully. + */ + } + + ... + + /* + * PROCESS/THREAD 2 + */ + compute_res_attr_t res_info; + unsigned int context_id = 0; + /* + * Initialize the attribute structure. + */ + if (0 != HAP_compute_res_attr_init(&res_info) ) + return; + /* + * Set serialization option. + */ + if (0 != HAP_compute_res_attr_set_serialize(&res_info, 1) ) + return; + /* + * Call acquire with a timeout of 10 milliseconds. + */ + if (0 != (context_id = HAP_compute_res_acquire(&res_info, 10000) ) ) + { + /* + * Successfully requested for serialization. + * The serialization request from PROCESS/THREAD 1 waits + * until the resource is released here even when the PROCESS/THREAD 1s + * request for VTCM can be served. + */ + //Do my work in process/thread 2 + /* + * Done. Release the resource now using the acquired context ID. + */ + HAP_compute_res_release(context_id); + } else { + /* + * Unsuccessful allocation. Timeout would have triggered. + * Implement a fallback or fail gracefully. + */ + } +@endcode + +### Non-serialized VTCM acquisition + +This example shows two threads requesting VTCM alone without a serialization option. + +If the total size requested by both threads exceeds the size of VTCM that is available, only one thread gets +access to VTCM while the other thread waits. In this case, the threads are serializing their workload +implicitly. + +If enough VTCM memory is available to meet the requests of both threads, both threads acquire VTCM upon request +and can end up executing in parallel. + +@code + /* + * PROCESS/THREAD 1 + */ + compute_res_attr_t res_info; + unsigned int context_id = 0; + void *p_vtcm = NULL; + /* + * Initialize the attribute structure. + */ + if (0 != HAP_compute_res_attr_init(&res_info) ) + return; + + /* By not calling HAP_compute_res_attr_set_serialize, we enable thread 1 to acquire VTCM + * as long as enough memory is available + */ + + /* + * Set VTCM request parameters - 256 KB single page + */ + if (0 != HAP_compute_res_attr_set_vtcm_param(&res_info, + (256 * 1024), + 1) ) + return; + /* + * Call acquire with a timeout of 10 milliseconds. 
+ */ + if (0 != (context_id = HAP_compute_res_acquire(&res_info, 10000) ) ) + { + /* + * Successfully acquired VTCM. + * The VTCM request from PROCESS/THREAD 2 waits if enough + * VTCM is not left to serve the request until the resource is released + * here. + */ + p_vtcm = HAP_compute_res_attr_get_vtcm_ptr(&res_info); + if (0 == p_vtcm) + { + /* + * VTCM allocation failed, should not reach this point as the acquire + * returned with valid context ID. + */ + HAP_compute_res_release(context_id); + return; + } + //Do my work in process/thread 1 + /* + * Done. Release the resource now using the acquired context ID. + * This releases the VTCM allocation. + */ + HAP_compute_res_release(context_id); + p_vtcm = NULL; + } else { + /* + * Unsuccessful allocation. Timeout would have triggered. + * Implement a fallback or fail gracefully. + */ + } + + ... + + /* + * PROCESS/THREAD 2 + */ + compute_res_attr_t res_info; + unsigned int context_id = 0; + void *p_vtcm = NULL; + /* + * Initialize the attribute structure + */ + if (0 != HAP_compute_res_attr_init(&res_info) ) + return; + + /* By not calling HAP_compute_res_attr_set_serialize, we enable thread 2 to acquire VTCM + * as long as enough memory is available + */ + + /* + * Set VTCM request parameters - 256 KB single page. + */ + if (0 != HAP_compute_res_attr_set_vtcm_param(&res_info, + (256 * 1024), + 1) ) + return; + /* + * Call acquire with a timeout of 10 milliseconds. + */ + if (0 != (context_id = HAP_compute_res_acquire(&res_info, 10000) ) ) + { + /* + * Successfully acquired VTCM. + * The VTCM request from PROCESS/THREAD 1 waits if enough + * VTCM is not left to serve the request until the resource is released + * here. + */ + p_vtcm = HAP_compute_res_attr_get_vtcm_ptr(&res_info); + if (0 == p_vtcm) + { + /* + * VTCM allocation failed, should not reach this point as the acquire + * returned with valid context ID. + */ + HAP_compute_res_release(context_id); + return; + } + //Do work in PROCESS/THREAD 2 + /* + * Done. Release the resource now using the acquired context ID. + * This releases the VTCM allocation. + */ + HAP_compute_res_release(context_id); + p_vtcm = NULL; + } else { + /* + * Unsuccessful allocation. Timeout would have triggered. + * Implement a fallback or fail gracefully. + */ + } +@endcode + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_dcvs.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_dcvs.h new file mode 100755 index 0000000000000..34159800c5227 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_dcvs.h @@ -0,0 +1,332 @@ +/*----------------------------------------------------------------------------- + Copyright (c) 2021, 2022 QUALCOMM Technologies, Incorporated. + All Rights Reserved. + QUALCOMM Proprietary. +-----------------------------------------------------------------------------*/ + +#ifndef HAP_DCVS_H_ +#define HAP_DCVS_H_ + +/** + * @file HAP_dcvs.h + * @brief Header file for DCVS APIs. + */ + +#include "AEEStdErr.h" +#include "HAP_power.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Perf modes to specify core/bus clock frequency level within + * target voltage corner for HAP DCVS V3 interface. + */ +typedef enum { + HAP_DCVS_CLK_PERF_HIGH, /**< To select max frequency at target voltage corner. */ + HAP_DCVS_CLK_PERF_LOW, /**< To select min frequency at target voltage corner. 
*/ +} HAP_dcvs_clk_perf_mode_t; + +/** + * @cond DEV + */ +int __attribute__((weak)) sysmon_set_dcvs_v3_duty_cycle( + void* context, + uint32 max_active_time, + uint32 periodicity); + +int __attribute__((weak)) sysmon_set_dcvs_v3_duty_cycle_params( + HAP_power_request_t* request, + uint32 max_active_time, + uint32 periodicity); + +int __attribute__((weak)) sysmon_set_dcvs_v3_core_perf_mode( + HAP_power_request_t* request, + HAP_dcvs_clk_perf_mode_t perf_mode); + +int __attribute__((weak)) sysmon_set_dcvs_v3_bus_perf_mode( + HAP_power_request_t* request, + HAP_dcvs_clk_perf_mode_t perf_mode); + +int __attribute__((weak)) sysmon_set_dcvs_v3_protected_bus_corners( + HAP_power_request_t* request, + unsigned char enable_protected_corners); + +int __attribute__((weak)) sysmon_set_ddr_perf_mode( + HAP_power_request_t *request, + unsigned int perf_mode); +/** + * @endcond + */ + +/** + * @defgroup helperapi Helper APIs for DCVS Duty Cycle + * @{ + */ + +/** + * Method to enable DCVS Duty Cycle. + * + * Calls HAP_power_set API with the provided context and selects + * DCVS duty cycle mode via HAP_power_set_DCVS_v3 request type. + * + * @param[in] context User context - power client identifier to be used in + * HAP_power_set call. + * + * @param[in] max_active_time Max active time allowed per frame in ms + * (optional, can pass 0 if don’t want to specify). + * DCVS selects appropriate operating levels to + * keep the activity time within the provided + * maximum allowed time. + * + * @param[in] periodicity Frame time in ms (optional, can pass 0 if + * don’t want to specify periodicity). For example, + * periodicity = 100 (milli-seconds) for a + * 10 FPS activity. DCVS uses this as a hint while + * predicting activity. + * + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * AEE_EVERSIONNOTSUPPORT if unsupported. + */ +static inline int HAP_set_dcvs_v3_duty_cycle( + void* context, + uint32 max_active_time, + uint32 periodicity) +{ + if (sysmon_set_dcvs_v3_duty_cycle) + return sysmon_set_dcvs_v3_duty_cycle( + context, + max_active_time, + periodicity); + + return AEE_EVERSIONNOTSUPPORT; +} + +/** + * Method to set duty cycle threshold params (periodicity and activity time hints) + * in the request structure intended for HAP_power_set for request type set to + * HAP_power_set_DCVS_v3. + * + * Sets the max_active_time and periodicity fields under dcvs_v3 payload of given + * request structure. + * + * Note: Request type should be set to HAP_power_set_DCVS_v3. + * + * @param[in] request Pointer to request structure. + * + * @param[in] max_active_time Max active time allowed per frame in ms. + * DCVS selects appropriate operating levels to + * keep the activity time within the provided + * maximum allowed time. + * + * @param[in] periodicity Frame time in ms (optional, can pass 0 if + * don’t want to specify periodicity). For example, + * periodicity = 100 (milli-seconds) for a + * 10 FPS activity. DCVS uses this as a hint while + * predicting activity. + * + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * AEE_EVERSIONNOTSUPPORT if unsupported. 
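+ *
+ * A minimal, illustrative call sequence (assuming a HAP_power_request_t
+ * prepared by the caller with its request type set to HAP_power_set_DCVS_v3,
+ * and power_client_context being the client identifier otherwise used with
+ * HAP_power_set()) could be:
+ * @code
+ * HAP_power_request_t request = {0};
+ * request.type = HAP_power_set_DCVS_v3;
+ * // Hint a 33 ms frame with at most 10 ms of activity per frame.
+ * if (0 == HAP_set_dcvs_v3_duty_cycle_params(&request, 10, 33))
+ *     HAP_power_set(power_client_context, &request);
+ * @endcode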
+ */ +static inline int HAP_set_dcvs_v3_duty_cycle_params( + HAP_power_request_t* request, + uint32 max_active_time, + uint32 periodicity) +{ + if (sysmon_set_dcvs_v3_duty_cycle_params) + { + return sysmon_set_dcvs_v3_duty_cycle_params( + request, + max_active_time, + periodicity); + } + + return AEE_EVERSIONNOTSUPPORT; +} + +/** + * @} + */ + +/** + * @defgroup enable_protected_corner_api Helper API for protected bus corners + * + * @{ + */ +/** + * On chipsets supporting bus corners above HAP_DCVS_VCORNER_TURBO_PLUS, to optimize residency at these corners, + * target corner requests for bus are capped to HAP_DCVS_VCORNER_TURBO_PLUS by default. + * Any request beyond HAP_DCVS_VCORNER_TURBO_PLUS (including HAP_DCVS_VCORNER_MAX) will be wrapped to HAP_DCVS_VCORNER_TURBO_PLUS. + * + * This API enables clients of HAP_power_set to override this protection when voting explicitly for bus corners + * above HAP_DCVS_VCORNER_TURBO_PLUS in necessary use cases. + * + * Note: + * API is supported starting with V79 QDSP6 architecture, AEE_EVERSIONNOTSUPPORT error (can be safely ignored) is returned by the API when not supported. + * + * Request type should be set to HAP_power_set_DCVS_v3. + * + * @param[in] request Pointer to HAP_power_request_t structure with request type set to HAP_power_set_DCVS_v3. + * @param[in] enable_protected_corners 1 - to consider bus corner requests above HAP_DCVS_VCORNER_TURBO_PLUS + * 0 (default) - to cap bus corner requests to HAP_DCVS_VCORNER_TURBO_PLUS + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * AEE_EVERSIONNOTSUPPORT if unsupported. + */ + +static inline int HAP_set_dcvs_v3_protected_bus_corners( + HAP_power_request_t* request, + unsigned char enable_protected_corners) +{ + if (sysmon_set_dcvs_v3_protected_bus_corners) + { + return sysmon_set_dcvs_v3_protected_bus_corners(request, + enable_protected_corners); + } + + return AEE_EVERSIONNOTSUPPORT; +} + +/** + * @} + */ +/** + * @defgroup enable_ddr_perf_mode_api Helper API to enable DDR perf mode + * + * @{ + */ +/** + * This API enables clients of HAP_power_set to vote for DDR performance mode. + * + * Note: + * API is supported starting with V79 QDSP6 architecture, AEE_EVERSIONNOTSUPPORT error (can be safely ignored) is returned by the API when not supported. + * + * Note: Request type should be set to HAP_power_set_DCVS_v3. + * + * @param[in] request Pointer to HAP_power_request_t structure with request type set to HAP_power_set_DCVS_v3 + * + * @param[in] perf_mode 1 - to enable DDR performance mode + * 0 - to disable the DDR performance mode + * + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * AEE_EVERSIONNOTSUPPORT if unsupported. + */ +static inline int HAP_set_ddr_perf_mode( + HAP_power_request_t *request, + unsigned int perf_mode) +{ + if (sysmon_set_ddr_perf_mode) + { + return sysmon_set_ddr_perf_mode(request, perf_mode); + } + + return AEE_EVERSIONNOTSUPPORT; +} + +/** + * @} + */ + +/** + * @defgroup clk_perfmode_api APIs to specify core/bus clock frequency level within target voltage corner + * + * @{ + */ + +/** + * Method to specify core clock frequency level corresponding to the + * target corner request in the request structure intended for + * HAP_power_set for request type set to HAP_power_set_DCVS_v3. + * + * By default, the highest core clock frequency available at the requested + * target_corner is selected. 
Using this API, user can select either the + * highest (HAP_DCVS_CLK_PERF_HIGH) or the lowest (HAP_DCVS_CLK_PERF_LOW) + * core clock frequency at any given target_corner. If there is only one + * core clock frequency available at the requested target_corner, both the + * high and low settings will select the same. + * + * Note: Request type should be set to HAP_power_set_DCVS_v3. + * + * Supported on latest chipsets(released after Palima). + * + * @param[in] request Pointer to request structure. + * + * @param[in] perf_mode Perf mode to specify core clock frequency level + * within target voltage corner. + * + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * AEE_EVERSIONNOTSUPPORT if unsupported. + */ +static inline int HAP_set_dcvs_v3_core_perf_mode( + HAP_power_request_t* request, + HAP_dcvs_clk_perf_mode_t perf_mode) +{ + if (sysmon_set_dcvs_v3_core_perf_mode) + { + return sysmon_set_dcvs_v3_core_perf_mode( + request, + perf_mode); + } + + return AEE_EVERSIONNOTSUPPORT; +} + +/** + * Method to specify bus clock frequency level corresponding to the + * target corner request in the request structure intended for + * HAP_power_set for request type set to HAP_power_set_DCVS_v3. + * + * By default, the highest bus clock frequency available at the requested + * target_corner is selected. Using this API, user can select either the + * highest (HAP_DCVS_CLK_PERF_HIGH) or the lowest (HAP_DCVS_CLK_PERF_LOW) + * bus clock frequency at any given target_corner. If there is only one + * bus clock frequency available at the requested target_corner, both the + * high and low settings will select the same. + * + * Note: Request type should be set to HAP_power_set_DCVS_v3. + * + * Supported on latest chipsets(released after Palima). + * + * @param[in] request Pointer to request structure. + * + * @param[in] perf_mode Perf mode to specify bus clock frequency level + * within target voltage corner. + * + * @return + * 0 upon success. \n + * Nonzero upon failure. \n + * AEE_EVERSIONNOTSUPPORT if unsupported. + */ +static inline int HAP_set_dcvs_v3_bus_perf_mode( + HAP_power_request_t* request, + HAP_dcvs_clk_perf_mode_t perf_mode) +{ + if (sysmon_set_dcvs_v3_bus_perf_mode) + { + return sysmon_set_dcvs_v3_bus_perf_mode( + request, + perf_mode); + } + + return AEE_EVERSIONNOTSUPPORT; +} + +/** + * @} + */ + +#ifdef __cplusplus +} +#endif + +#endif //HAP_DCVS_H_ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_dcvs.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_dcvs.md new file mode 100755 index 0000000000000..e957258e4e664 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_dcvs.md @@ -0,0 +1,146 @@ +# DCVS Helper APIs + +DCVS Duty Cycle Helper APIs with usage examples. + +## DCVS Duty Cycle Helper APIs + +Header file: @b HAP_dcvs.h + +## Usage examples + +### HAP_set_dcvs_v3_duty_cycle + +This is the most straightforward and therefore the recommended simplified API to enable DCVS Duty Cycle. + +The user has to pass the context (power client identifier to be used in a HAP_power_set call) to this API. +The function calls into HAP_power_set API with the provided context and selects DCVS duty cycle mode via `HAP_power_set_DCVS_v3` request type. +The user optionally can provide max_active_time and periodicity. The DCVS algorithm selects appropriate operating levels to keep the activity time within the provided +maximum allowed time and uses periodicity as a hint while predicting activity. + +The user does not need to specify any clock corners. 
Instead, the DCVS algorithm selects the clock corner with the best performance-power tradeoff that keeps the active time under the maximum value provided by the user for a given period.
+
+The example below demonstrates the usage of the HAP_set_dcvs_v3_duty_cycle API.
+
+@code
+    /*
+     * Enabling DCVS duty cycle with 10 ms max_active_time and 33 ms periodicity
+     */
+    HAP_set_dcvs_v3_duty_cycle(context, 10, 33);
+@endcode
+
+Here, DCVS duty cycle starts with NOM as the active corner and LOW SVS (SVS2) as the idle-cycle corner. Then, if the DCVS algorithm observes an active time longer than 10 ms (the user-defined max active time), it increases the clock to the next level, NOM PLUS, to try to bring the active time under 10 ms.
+
+![screenshot](../../images/HAP_set_dcvs_v3_duty_cycle.png)
+
+### HAP_set_dcvs_v3_duty_cycle_params
+
+This API is useful for setting max_active_time and periodicity in an existing DCVS request structure.
+
+The user can set the DCVS params as per application requirements in a DCVS request structure with the request type set to `HAP_power_set_DCVS_v3` and pass it as an argument to this function.
+
+This API allows the user to set the maximum active time and period values used by the DCVS algorithm. After invoking this function, the user has to call HAP_power_set() using the same request structure.
+
+@code
+    HAP_power_request_t request;
+    request.type = HAP_power_set_DCVS_v3;
+    /*
+     * Selecting duty cycle mode with DCVS enabled
+     */
+    request.dcvs_v3.set_dcvs_enable = TRUE;
+    request.dcvs_v3.dcvs_enable = TRUE;
+    request.dcvs_v3.dcvs_option = HAP_DCVS_V2_DUTY_CYCLE_MODE;
+    /*
+     * Setting TURBO PLUS as Max corner, NOM PLUS as Target corner
+     * and LOW SVS as Min corner
+     */
+    request.dcvs_v3.set_core_params = TRUE;
+    request.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_SVS2;
+    request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_TURBO_PLUS;
+    request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_NOM_PLUS;
+    request.dcvs_v3.set_bus_params = TRUE;
+    request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_SVS2;
+    request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_TURBO_PLUS;
+    request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_NOM_PLUS;
+    /*
+     * Setting 20 ms max_active_time and 33 ms periodicity
+     */
+    HAP_set_dcvs_v3_duty_cycle_params(&request, 20, 33);
+    HAP_power_set(context, &request);
+@endcode
+
+Here, DCVS duty cycle applies LOW SVS (SVS2) for the idle cycle and an active-cycle corner in the range of Max to Target (TURBO PLUS to NOM PLUS) to maintain the user-given max_active_time (20 ms).
+
+![screenshot](../../images/HAP_set_dcvs_v3_duty_cycle_params.png)
+
+### HAP_set_dcvs_v3_core_perf_mode
+
+This API helps select the core clock frequency level within the target voltage corner.
+
+By default, the highest core clock frequency available at the requested target corner is selected. Using this API, the user can select either the highest (`HAP_DCVS_CLK_PERF_HIGH`) or the lowest (`HAP_DCVS_CLK_PERF_LOW`) core clock frequency at any given target corner. If there is only one core clock frequency available at the requested target corner, both the high and low settings select the same frequency.
+
+The user can set the DCVS params as per application requirements in a DCVS request structure with the request type set to `HAP_power_set_DCVS_v3` and pass it as an argument to this function, along with the perf_mode argument, which specifies the core clock frequency level (`HAP_DCVS_CLK_PERF_HIGH`/`HAP_DCVS_CLK_PERF_LOW`).
+
+This API sets the user-provided perf_mode for the core clock in the given request structure. After invoking this function, the user has to call HAP_power_set() using the same request structure.
+
+@code
+    HAP_power_request_t request;
+    request.type = HAP_power_set_DCVS_v3;
+    /*
+     * Setting TURBO as Max corner, NOM as Target corner
+     * and LOW SVS as Min corner for core clock
+     */
+    request.dcvs_v3.set_core_params = TRUE;
+    request.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_SVS2;
+    request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_TURBO;
+    request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_NOM;
+    /*
+     * Setting perf_mode as HAP_DCVS_CLK_PERF_LOW
+     */
+    HAP_set_dcvs_v3_core_perf_mode(&request, HAP_DCVS_CLK_PERF_LOW);
+    HAP_power_set(context, &request);
+@endcode
+
+Here, DCVS votes for the minimum available core clock frequency at the NOM target corner.
+
+### HAP_set_dcvs_v3_bus_perf_mode
+
+This API helps select the bus clock frequency level within the target voltage corner.
+
+By default, the highest bus clock frequency available at the requested target corner is selected. Using this API, the user can select either the highest (`HAP_DCVS_CLK_PERF_HIGH`) or the lowest (`HAP_DCVS_CLK_PERF_LOW`) bus clock frequency at any given target corner. If there is only one bus clock frequency available at the requested target corner, both the high and low settings select the same frequency.
+
+The user can set the DCVS params as per application requirements in a DCVS request structure with the request type set to `HAP_power_set_DCVS_v3` and pass it as an argument to this function, along with the perf_mode argument, which specifies the bus clock frequency level (`HAP_DCVS_CLK_PERF_HIGH`/`HAP_DCVS_CLK_PERF_LOW`).
+
+This API sets the user-provided perf_mode for the bus clock in the given request structure. After invoking this function, the user has to call HAP_power_set() using the same request structure.
+
+@code
+    HAP_power_request_t request;
+    request.type = HAP_power_set_DCVS_v3;
+    /*
+     * Setting TURBO PLUS as Max corner, TURBO as Target corner
+     * and LOW SVS as Min corner for bus clock
+     */
+    request.dcvs_v3.set_bus_params = TRUE;
+    request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_SVS2;
+    request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_TURBO_PLUS;
+    request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_TURBO;
+    /*
+     * Setting perf_mode as HAP_DCVS_CLK_PERF_LOW
+     */
+    HAP_set_dcvs_v3_bus_perf_mode(&request, HAP_DCVS_CLK_PERF_LOW);
+    HAP_power_set(context, &request);
+@endcode
+
+Here, DCVS votes for the minimum available bus clock frequency at the TURBO target corner.
+
+### HAP_set_dcvs_v3_protected_bus_corners
+
+On chipsets supporting bus corners above `HAP_DCVS_VCORNER_TURBO_PLUS`, to optimize residency at these corners, target corner requests for the bus are capped to `HAP_DCVS_VCORNER_TURBO_PLUS` by default.
+Any request beyond `HAP_DCVS_VCORNER_TURBO_PLUS` (including `HAP_DCVS_VCORNER_MAX`) will be set to `HAP_DCVS_VCORNER_TURBO_PLUS`.
+
+This API enables clients of HAP_power_set to override this protection when voting explicitly for bus corners above `HAP_DCVS_VCORNER_TURBO_PLUS` in necessary use cases.
+
+Note:
+This API is supported starting with the V79 QDSP6 architecture; an `AEE_EVERSIONNOTSUPPORT` error (which can be safely ignored) is returned by the API when it is not supported.
+The request type should be set to `HAP_power_set_DCVS_v3`.
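+
+The sketch below follows the same pattern as the examples above; it is illustrative only and assumes a chipset that exposes bus corners above TURBO PLUS:
+
+@code
+    HAP_power_request_t request;
+    request.type = HAP_power_set_DCVS_v3;
+    /*
+     * Voting the bus for a corner above TURBO PLUS
+     * (HAP_DCVS_VCORNER_MAX), which would otherwise be capped
+     * to TURBO PLUS
+     */
+    request.dcvs_v3.set_bus_params = TRUE;
+    request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_SVS2;
+    request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_MAX;
+    request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_MAX;
+    /*
+     * Allow the request above TURBO PLUS to be honored
+     */
+    HAP_set_dcvs_v3_protected_bus_corners(&request, 1);
+    HAP_power_set(context, &request);
+@endcode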
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_debug.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_debug.h
new file mode 100755
index 0000000000000..aeed83ffcac27
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_debug.h
@@ -0,0 +1,81 @@
+#ifndef HAP_DEBUG_H
+#define HAP_DEBUG_H
+/*==============================================================================
+  Copyright (c) 2012-2013 Qualcomm Technologies, Inc.
+  All rights reserved. Qualcomm Proprietary and Confidential.
+==============================================================================*/
+
+#include "AEEStdDef.h"
+#include <stdarg.h>
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define HAP_LEVEL_LOW     0
+#define HAP_LEVEL_MEDIUM  1
+#define HAP_LEVEL_HIGH    2
+#define HAP_LEVEL_ERROR   3
+#define HAP_LEVEL_FATAL   4
+
+#define HAP_LEVEL_RUNTIME (1 << 5)
+
+//Add a weak reference so shared objects work with older images
+#pragma weak HAP_debug_v2
+
+//Add a weak reference for enabling FARF in autogen stub files
+#pragma weak HAP_debug
+
+//Add a weak reference so runtime FARFs are ignored on older images
+#pragma weak HAP_debug_runtime
+
+/**************************************************************************
+  These HAP_debug* functions are not meant to be called directly.
+  Please use the FARF() macros to call them instead
+**************************************************************************/
+void HAP_debug_v2(int level, const char* file, int line, const char* format, ...);
+void HAP_debug_runtime(int level, const char* file, int line, const char* format, ...);
+int HAP_setFARFRuntimeLoggingParams(unsigned int mask, const char* files[],
+                                    unsigned short numberOfFiles);
+
+// Keep these around to support older shared objects and older images
+void HAP_debug(const char *msg, int level, const char *filename, int line);
+
+static __inline void _HAP_debug_v2(int level, const char* file, int line,
+                                   const char* format, ...){
+   char buf[256];
+   va_list args;
+   va_start(args, format);
+   vsnprintf(buf, sizeof(buf), format, args);
+   va_end(args);
+   HAP_debug(buf, level, file, line);
+}
+
+/*!
+This function is called to log an accumulated log entry. If logging is
+enabled for the entry by the external device, then the entry is copied
+into the diag allocation manager and committed.
+
+   [in] log_code_type  ID of the event to be reported
+   [in] dataLen        The length of the data to be logged
+   [in] *data          Points to the log data to be submitted
+
+Returns
+   TRUE if the log is submitted successfully into diag buffers
+   FALSE if there is no space left in the buffers.
+
+*/
+boolean HAP_log_data_packet(unsigned short log_code_type, unsigned int dataLen,
+                            byte* data);
+
+#define HAP_DEBUG_TRACEME 0
+
+long HAP_debug_ptrace(int req, unsigned int pid, void* addr, void* data);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // HAP_DEBUG_H
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_etm_config.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_etm_config.h
new file mode 100755
index 0000000000000..4039d7822b331
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_etm_config.h
@@ -0,0 +1,98 @@
+/*-----------------------------------------------------------------------
+ Copyright (c) 2022 QUALCOMM Technologies, Incorporated.
+ All Rights Reserved.
+ QUALCOMM Proprietary.
+-----------------------------------------------------------------------*/ + +/** + * @file HAP_etm_config.h + * @brief Header file with APIs to enable/disable etm tracing + */ + +#include "AEEStdErr.h" + +#ifdef __cplusplus +extern "C" { +#endif + + /** + * @cond DEV + */ + +int __attribute__((weak)) __HAP_user_etm_enable(void); +int __attribute__((weak)) __HAP_user_etm_disable(void); + +/** + * @endcond + */ + + +/** @defgroup helperapi Helper APIs to enable/disable etm trace. + * API for users to enable or disable ETM tracing. + * The HAP user ETM API provides user capability to start/stop + * ETM tracing in a user module to cover a desired portion of + * execution. This API is disabled by default and will return + * an error when in that mode. To enable it, use + * --hap_etm_enable option of sysMonApp etmTrace service as + * mentioned in the sample command for default subsystem CDSP below: + * ``` + * adb shell /data/local/tmp/sysMonApp etmTrace --command etm --hap_etm_enable 1 + * ``` + * ETM enablement requires setting up coresight driver on HLOS + * and configuring appropriate ETM trace type on Q6 subsystem. + * ETM configurations set via sysMonApp etmTrace option + * like etm tracing mode (cycle accurate PC tracing etc., + * sample command on CDSP below) + * ``` + * adb shell /data/local/tmp/sysMonApp etmTrace --command etm --etmType ca_pc + * ``` + * are preserved across HAP user etm enable and disable calls. + * The API is only for debug purpose and shouldn't be used in + * production environments. + * @{ + */ + +/** + * Requests ETM tracing to be enabled + * + * Call this function from the DSP user process to start ETM + * tracing. To stop the tracing, call @ref HAP_user_etm_disable(). + * Supported on latest chipsets(released after Palima). + * @param None + * @return 0 upon success, other values upon failure. + */ +static inline int HAP_user_etm_enable(void) { + if(__HAP_user_etm_enable) + return __HAP_user_etm_enable(); + return AEE_EVERSIONNOTSUPPORT; +} + +/** + * Requests ETM tracing to be disabled + * + * Call this function from the DSP user process to stop any active + * ETM tracing. API returns error if there is no active ETM trace + * enable call, e.g., if @ref HAP_user_etm_disable() is called + * first without any active @ref HAP_user_etm_enable() being + * present. The enable and disable requests are reference counted + * in the driver. Nested calls are supported, e.g. + * if @ref HAP_user_etm_enable() is called twice, two calls + * to the disable API @ref HAP_user_etm_disable() will be needed + * to disable the tracing. + * Supported on latest chipsets(released after Palima). + * @param None + * @return 0 upon success, other values upon failure. + */ +static inline int HAP_user_etm_disable(void) { + if(__HAP_user_etm_disable) + return __HAP_user_etm_disable(); + return AEE_EVERSIONNOTSUPPORT; +} + +/** + * @} + */ + +#ifdef __cplusplus +} +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_etm_config.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_etm_config.md new file mode 100755 index 0000000000000..debf8e3ccc999 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_etm_config.md @@ -0,0 +1,38 @@ +# ETM Trace enable/disable APIs + +The HAP ETM framework exposes a set of APIs to enable/disable +ETM tracing in a user module to trace a region of interest +provided ETM tracing is configured. + +For configuring ETM tracing, refer to "profile on device" section +of Profiling example in base SDK. 
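+
+Once tracing is configured and the runtime flag described below is set, the APIs simply bracket the region of interest. A minimal, illustrative sketch (the function name and the workload inside the bracket are placeholders):
+```
+#include "HAP_etm_config.h"
+
+int traced_workload(void)
+{
+    /* Start ETM trace collection for the region of interest.
+       Returns an error (which can be safely ignored) when tracing
+       is not configured or the hap_etm_enable flag is not set. */
+    int err = HAP_user_etm_enable();
+
+    /* ... region of interest: the DSP workload to be traced ... */
+
+    /* Enable/disable calls are reference counted; each successful
+       enable needs a matching disable. */
+    if (err == 0)
+        HAP_user_etm_disable();
+
+    return 0;
+}
+```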
+ +After ETM tracing is configured, the API requires setting the +'--hap_etm_enable' flag via sysMonApp etmTrace option as below: +``` +adb shell /data/local/tmp/sysMonApp etmTrace --command etm --hap_etm_enable 1 +``` + +After ETM trace collection, this flag should be reset with the +command: +``` +adb shell /data/local/tmp/sysMonApp etmTrace --command etm --hap_etm_enable 0 +``` + +Call to the APIs are ignored in the following cases: +* ETM tracing is not configured. +* The '--hap_etm_enable' flag is set to 0. + +***NOTE:*** The APIs work only on debug enabled device. +A test device or debug device, (Mobile Test Platform) MTP +or (Qualcomm Reference Design) QRD, is a device on which +the debug fuse is present. This fuse is not present on +production devices. + +## Supported chipsets + +Beyond Palima + +## Framework APIs + +Header file: @b HAP_etm_config.h \ No newline at end of file diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_farf.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_farf.h new file mode 100755 index 0000000000000..8f5d3ba9aa38c --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_farf.h @@ -0,0 +1,264 @@ +/*============================================================================== + Copyright (c) 2012-2013, 2020 Qualcomm Technologies, Inc. + All rights reserved. Qualcomm Proprietary and Confidential. +==============================================================================*/ + +#ifndef HAP_FARF_H +#define HAP_FARF_H + +/** + * @file HAP_farf.h + * @brief FARF API + */ + +#include "AEEStdDef.h" +#include "HAP_debug.h" + +/** + *\def FARF() + * FARF is used to log debug messages from DSP + * + * `Compile time logging options:` + * + * Logging is controlled via conditional compilation. + * The FARF level allows the user to selectively enable or disable certain types + * of messages according to their priority level. + * The following levels are supported and listed in increasing priority: + * + * LOW + * + * MEDIUM + * + * HIGH + * + * ERROR + * + * FATAL + * + * ALWAYS + * + * A FARF level should be defined to 1 for FARF macros to be compiled + * in. For example: + * + * @code + * #define FARF_LOW 1 + * #include "HAP_farf.h" + * + * FARF(LOW, "something happened: %s", (const char*)string); + * + * @endcode + * + * FARF_LOW, FARF_MEDIM, FARF_HIGH are defined to 0 and FARF_ERROR, + * FARF_FATAL, FARF_ALWAYS are defined to 1 by default. + * + * If FARF_LOW is defined to 0, as it is by default, the above + * FARF string will not be compiled in, if it is defined to 1 it + * will be compiled in. + * + * If both HIGH and LOW messages are used but only FARF_LOW is defined + * as shown in below example then only LOW message will be compiled in and sent to DIAG. + * + * @code + * #define FARF_LOW 1 + * #include "HAP_farf.h" + * + * FARF(LOW, "LOW message"); + * FARF(HIGH, "HIGH message"); // This message will not be compiled in + * + * @endcode + * + * Messages logged with ALWAYS level are always compiled in and logged. + * + * When building the Debug variant or builds defining _DEBUG the + * following FARF levels will be enabled: + * + * HIGH + * + * ERROR + * + * FATAL + * + * `Run time logging options:` + * + * In order to enable run-time logging (logging that can be enabled / disabled + * at run-time), the FARF_RUNTIME_* macros should be used. + * + * Log messages sent with these macros are compiled in by default. However by + * these messages WILL NOT be logged by default. 
In order to enable logging, + * the FASTRPC process will need to either call the + * HAP_SetFARFRuntimeLoggingParams() API, or by adding a ``.farf + * file to the HLOS file system with the appropriate contents. + * + * @code + * + * #include "HAP_farf.h" + * FARF(RUNTIME_HIGH, "something happened: %s", (const char*)string); + * + * @endcode + * + * @param[in] x the FARF level defined to either 0 to disable compilation or 1 to enable. + * @param[in] ... the format string and arguments. + */ +#define FARF(x, ...) _FARF_PASTE(_FARF_,_FARF_VAL(FARF_##x))(x, ##__VA_ARGS__) + + +/** +* @defgroup static_FARF Compile-time macros +* +* Set these compile time macros to 1 to enable logging at that +* level. Setting them to 0 will cause them to be COMPILED out. +* +* Usage Example: +* @code +* +* #define FARF_HIGH 1 +* FARF(HIGH,"Log message"); +* +* @endcode + +* The ALWAYS macro will cause log messages to be ALWAYS compiled in. +* @code +* +* FARF(ALWAYS,"Log message") +* +* @endcode +* +* Defining _DEBUG macro turns on ALWAYS, HIGH, ERROR, FATAL +*/ +/* @{ */ + +#ifdef _DEBUG +#ifndef FARF_HIGH +#define FARF_HIGH 1 +#endif +#endif + +/** + * The FARF_ALWAYS macro causes log messages to be ALWAYS compiled in + */ +#ifndef FARF_ALWAYS +#define FARF_ALWAYS 1 +#endif + +/** + * The FARF_LOW macro causes log messages to be compiled in when FARF_LOW is defined to 1 +*/ +#ifndef FARF_LOW +#define FARF_LOW 0 +#endif + +/** +* The FARF_MEDIUM macro causes log messages to be compiled in when FARF_MEDIUM is defined to 1 +*/ +#ifndef FARF_MEDIUM +#define FARF_MEDIUM 0 +#endif + +/** +* The FARF_HIGH macro causes log messages to be compiled in when FARF_HIGH is defined to 1 +*/ +#ifndef FARF_HIGH +#define FARF_HIGH 0 +#endif + +/** +* The FARF_ERROR macro causes log messages to be compiled in when FARF_ERROR is defined to 1 +*/ +#ifndef FARF_ERROR +#define FARF_ERROR 1 +#endif + +/** +* The FARF_FATAL macro causes log messages to be compiled in when FARF_FATAL is defined to 1 +*/ +#ifndef FARF_FATAL +#define FARF_FATAL 1 +#endif + +//! @cond Doxygen_Suppress +#define FARF_ALWAYS_LEVEL HAP_LEVEL_HIGH +#define FARF_LOW_LEVEL HAP_LEVEL_LOW +#define FARF_MEDIUM_LEVEL HAP_LEVEL_MEDIUM +#define FARF_HIGH_LEVEL HAP_LEVEL_HIGH +#define FARF_ERROR_LEVEL HAP_LEVEL_ERROR +#define FARF_FATAL_LEVEL HAP_LEVEL_FATAL +//! @endcond + +/* @} */ + + +/** +* @defgroup Runtime_FARF Runtime macros +* +* Runtime FARF macros can be enabled at runtime. +* They are turned OFF by default. +* +* Usage Example: +* @code +* +* FARF(RUNTIME_HIGH,"Log message"); +* +* @endcode +*/ +/* @{ */ +//! @cond Doxygen_Suppress +#ifndef FARF_RUNTIME_LOW +#define FARF_RUNTIME_LOW 1 +#endif +#define FARF_RUNTIME_LOW_LEVEL (HAP_LEVEL_RUNTIME | HAP_LEVEL_LOW) + +#ifndef FARF_RUNTIME_MEDIUM +#define FARF_RUNTIME_MEDIUM 1 +#endif +#define FARF_RUNTIME_MEDIUM_LEVEL (HAP_LEVEL_RUNTIME | HAP_LEVEL_MEDIUM) + +#ifndef FARF_RUNTIME_HIGH +#define FARF_RUNTIME_HIGH 1 +#endif +#define FARF_RUNTIME_HIGH_LEVEL (HAP_LEVEL_RUNTIME | HAP_LEVEL_HIGH) + +#ifndef FARF_RUNTIME_ERROR +#define FARF_RUNTIME_ERROR 1 +#endif +#define FARF_RUNTIME_ERROR_LEVEL (HAP_LEVEL_RUNTIME | HAP_LEVEL_ERROR) + +#ifndef FARF_RUNTIME_FATAL +#define FARF_RUNTIME_FATAL 1 +#endif +#define FARF_RUNTIME_FATAL_LEVEL (HAP_LEVEL_RUNTIME | HAP_LEVEL_FATAL) +//! @endcond +/* @} */ + + +//! @cond Doxygen_Suppress + +#define _FARF_PASTE(a,b) _FARF_PASTE_(a,b) +#define _FARF_PASTE_(a,b) a##b +#define _FARF_VAL(a) a + + +#define _FARF_0(x, ...) 
+ +#ifndef __FILENAME__ +#define __FILENAME__ __FILE__ +#endif + +#define _FARF_1(x, ...) \ + do { \ + if(0 == (HAP_debug_v2)) { \ + _HAP_debug_v2(FARF_##x##_LEVEL, __FILENAME__, __LINE__, ##__VA_ARGS__); \ + } else { \ + if (FARF_##x##_LEVEL & HAP_LEVEL_RUNTIME) { \ + if (0 != HAP_debug_runtime) { \ + HAP_debug_runtime(FARF_##x##_LEVEL ^ HAP_LEVEL_RUNTIME , __FILENAME__, __LINE__, ##__VA_ARGS__); \ + } else { \ + break; \ + } \ + } else { \ + HAP_debug_v2(FARF_##x##_LEVEL, __FILENAME__, __LINE__, ##__VA_ARGS__); \ + } \ + } \ + } while (0) + +#endif /* #ifndef HAP_FARF_H */ +//! @endcond diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_farf.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_farf.md new file mode 100755 index 0000000000000..35111fa2dfb37 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_farf.md @@ -0,0 +1,7 @@ +# HAP_farf + +## Overview + +The FARF API on DSP is used to generate diagnostic messages. These messages are sent to a diagnostic (or DIAG) framework on the DSP, from which they can be collected via USB using a tool called mini-dm running on the host computer. Parallelly, the DSP FARF messages can be routed to the application processor, allowing the user to collect DSP messages with logcat. These tools and the process for collecting messages is explained in the Messaging resources page from the SDK documentation. + +FARF messages can be enabled at compile-time and runtime. The Messaging resources page from the SDK documentation explains in detail the differences between compile-time and runtime FARF messages, how to enable them, and how to display them. diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_mem.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_mem.h new file mode 100755 index 0000000000000..0b6bae4d8336b --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_mem.h @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2012-2020 Qualcomm Technologies, Inc. + * All Rights Reserved. + * Confidential and Proprietary - Qualcomm Technologies, Inc + */ + +#ifndef HAP_MEM_H +#define HAP_MEM_H +#include +#include "AEEStdDef.h" +#include "AEEStdErr.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file HAP_mem.h + * @brief HAP Memory APIs + */ + + +/* + * Protections are chosen from these bits, or-ed together + */ + + + /*! @name HAP_PROT + \brief These macros define the permissions on memory block described by the file descriptor. + + It is passed as input parameter 'prot' to HAP_mmap(). These can be ORed to set the required permissions. + + +*/ + +///@{ + +/*! \def HAP_PROT_NONE + \brief Passing HAP_PROT_NONE as input results in setting 'NO' permissions on the buffer. +*/ +#define HAP_PROT_NONE 0x00 /* no permissions */ + +/*! \def HAP_PROT_READ + \brief Passing HAP_PROT_READ as input results in setting 'Read' permissions on the buffer. +*/ +#define HAP_PROT_READ 0x01 /* pages can be read */ +/*! \def HAP_PROT_WRITE + \brief Passing HAP_PROT_WRITE as input results in setting 'Write' permissions on the buffer. +*/ + +#define HAP_PROT_WRITE 0x02 /* pages can be written */ + +/*! + \def HAP_PROT_EXEC + \brief Passing HAP_PROT_EXEC as input results in setting 'Execute' permissions on the buffer. Currently not supported. +*/ +#define HAP_PROT_EXEC 0x04 /* pages can be executed */ + + +///@} + +/* + * Cache policy or-ed with protections parameter + */ + + /*! @name HAP_MEM_CACHE + \brief These macros define the cache policies for mapping memory pages to DSP MMU. Default cache policy is cache writeback. 
+ + It is passed as input parameter 'prot', or-ed with page protections to HAP_mmap(). + +*/ + +///@{ + +/*! \def HAP_MEM_CACHE_WRITEBACK + \brief Passing HAP_MEM_CACHE_WRITEBACK as input results in mapping memory as cache writeback +*/ +#define HAP_MEM_CACHE_WRITEBACK (0x10) /* cache write back */ + +/*! \def HAP_MEM_CACHE_NON_SHARED + \brief Passing HAP_MEM_CACHE_NON_SHARED as input results in mapping memory as uncached +*/ +#define HAP_MEM_CACHE_NON_SHARED (0x20) /* normal uncached memory */ + +/*! \def HAP_MEM_CACHE_WRITETHROUGH + \brief Passing HAP_MEM_CACHE_WRITETHROUGH as input results in mapping memory as cache write through +*/ + +#define HAP_MEM_CACHE_WRITETHROUGH (0x40) /* write through memory */ + +///@} + +/*! @name HAP_MEM_FLAGS + \brief These macros define the buffer attribute flags for allocating APPS memory from the DSP. + + It is passed as input parameter 'flags' to HAP_apps_mem_request(). + +*/ + +///@{ + +/*! \def HAP_MEM_FLAGS_SKIP_DSP_MAP + \brief Allocate memory on HLOS but skip DSP mapping +*/ + +#define HAP_MEM_FLAGS_SKIP_DSP_MAP 0 + +/*! \def HAP_MEM_FLAGS_DSP_MAP + \brief Allocate memory on HLOS and map on DSP +*/ + +#define HAP_MEM_FLAGS_DSP_MAP 1 + +/*! \def HAP_MEM_FLAGS_EXTENDED_MAP + \brief Allocate memory on HLOS and map beyond 4GB virtual address range on DSP. + + Unsupported currently. Reserved for future use. +*/ + +#define HAP_MEM_FLAGS_EXTENDED_MAP 2 + +/*! \def HAP_MEM_FLAGS_MAX + \brief Max number of flags supported by HAP_apps_mem_request +*/ + +#define HAP_MEM_FLAGS_MAX (HAP_MEM_FLAGS_EXTENDED_MAP + 1) + +///@} + +/** + * Allocate a block of memory. + * @param[in] bytes size of memory block in bytes. + * @param[out] pptr pointer to the memory block + * @return int AEE_SUCCESS for success and AEE_ENOMEMORY for failure. + */ + +static inline int HAP_malloc(uint32 bytes, void** pptr) +{ + *pptr = malloc(bytes); + if (*pptr) { + return AEE_SUCCESS; + } + return AEE_ENOMEMORY; +} + +/** + * Free the memory block allocated through HAP_malloc(). + * @param[in] ptr pointer to the memory block + * @return int AEE_EBADCLASS if ptr is NULL + AEE_SUCCESS if ptr is not NULL + + */ + +static inline int HAP_free(void* ptr) +{ + if(ptr == NULL) + return AEE_EBADCLASS; + free(ptr); + return AEE_SUCCESS; +} + +/** Statistics of user heap memory */ +struct HAP_mem_stats { + uint64 bytes_free; /**< number of bytes free from all the segments, + * may not be available for a single alloc + */ + uint64 bytes_used; /**< number of bytes used */ + uint64 seg_free; /**< number of segments free */ + uint64 seg_used; /**< number of segments used */ + uint64 min_grow_bytes; /**< minimum number of bytes to grow the heap by when creating a new segment */ +}; + +/** + * @brief Enum for reqID for HAP_mem_get_heap_info() + */ +enum HAP_mem_stats_request { + USAGE_STATS = 1, + MAX_USED +}; + +/** + * @brief RequestID/Response for HAP_mem_get_heap_info + */ +typedef struct { + enum HAP_mem_stats_request req_id; + union { + struct HAP_mem_stats usage_stats; + unsigned long max_used; /* Peak heap usage */ + }; +} HAP_mem_heap_info_t; + +/** + * Get the current statistics from the heap. + * + * @param[in,out] stats pointer to stats structure + * @retval AEE_SUCCESS + */ +int HAP_mem_get_stats(struct HAP_mem_stats *stats); + +/** + * Get the heap info. 
+ * + * @param payload, pointer to store the request/response + * @retval, 0 on success + */ +int HAP_mem_get_heap_info(HAP_mem_heap_info_t *payload); + +/** + * Enum to hold the START and END marker values + * + */ +typedef enum +{ + START = 0, + END +} marker_t; + +/** + * Request types: + * HAP_MEM_LOG_BLOCKS - to log all the blocks to csv + * file named: hprt_block_info_.csv + * + * HAP_MEM_SET_MARKER - to set markers for different instances. + * (2^16 instances are possible per application) + * + * HAP_MEM_MAP - to map buffer at random VA or reserved VA + * + * HAP_MEM_UNMAP - to unmap buffer + * + * HAP_RESERVE_VA - to reserve VA space on DSP without mapping + * + * HAP_UNRESERVE_VA - to unreserve VA space on DSP + */ +typedef enum +{ + HAP_MEM_LOG_BLOCKS = 1, + HAP_MEM_SET_MARKER = 2, + HAP_MEM_MAP = 3, + HAP_MEM_UNMAP = 4, + HAP_RESERVE_VA = 5, + HAP_UNRESERVE_VA = 6 +} HAP_mem_req_t; + +/** + * Payload structure for HAP_MEM_SET_MARKER request + * marker_type, START or END marker + * instance, incase of START - NOOP; if request is success, instance number. + * incase of END - instance number to find leaks + * + */ +typedef struct +{ + marker_t marker_type; + uint16_t instance; +} HAP_mem_marker_payload_t; + +/* Payload structure for HAP_MEM_MAP request */ +typedef struct { + uint64_t addr; // [in] reserved va (optional). If 0, buffer mapped at random VA + uint64_t len; // [in] length of buffer to be mapped + int prot; // [in] permissions and cache-mode of mapping + int flags; // [in] buffer flags + int fd; // [in] file descriptor of buffer + uint64_t dsp_pa; // [in] Offset + uint64_t dsp_va; // [out] Mapped DSP virtual address +} HAP_mem_map_t; + +/* Payload structure for HAP_MEM_UNMAP request */ +typedef struct { + uint64_t dsp_va; // [in] DSP VA to be unmapped + uint64_t len; // [in] length of mapping +} HAP_mem_unmap_t; + +/* Payload structure for HAP_RESERVE_VA request */ +typedef struct { + uint64_t len; // [in] Length of VA space to be reserved + int prot; // [in] Permissions of the VA space + int flags; // [in] flags (unused for now) + uint64_t dsp_va; // [out] Reserved DSP virtual address +} HAP_mem_reserve_t; + +/* Payload structure for HAP_UNRESERVE_VA request */ +typedef struct { + uint64_t dsp_va; // [in] DSP VA to be unreserved + uint64_t len; // [in] Length of buffer to be unreserved +} HAP_mem_unreserve_t; + +/** + * Payload for different requests + * New request payload structures should be + * added to the union. + */ +typedef struct +{ + HAP_mem_req_t request_id; + union { + HAP_mem_marker_payload_t mem_marker_payload; + HAP_mem_map_t mmap; + HAP_mem_unmap_t munmap; + HAP_mem_reserve_t reserve; + HAP_mem_unreserve_t unreserve; + }; +} HAP_mem_req_payload_t; + +/** + * Generic request API, which will decode request type + * and use the payload to parse the input and output + * for the request + * @param mem_payload- input and output payload for the request + * @retval 0 on success. + */ +int HAP_mem_request(HAP_mem_req_payload_t *mem_payload); + +/** + * Set the minimum and maximum grow size. + * + * This API allows to configure the minimum and maximum size that should + * be added to the DSP user heap when an allocation fails and more memory + * needs to be obtained from the HLOS. Using this API is optional. If not + * used, the runtime will try to choose reasonable growth sizes based on + * allocation history. 
+ * + + * @param[in] min minimum bytes to grow the heap by when requesting a new segment + * @param[in] max maximum bytes to grow the heap by when requesting a new segment + * @retval AEE_SUCCESS + * + */ +int HAP_mem_set_grow_size(uint64 min, uint64 max); + +/** + * Set low and high memory thresholds for heap + * + * Thresholds must be tuned according to the memory requirements + * + * Improper thresholds might led to heap failure + * + * @param[in] low_largest_block_size (in bytes) - the heap will grow if size of the largest free block is less than this threshold. + * Currently, setting this parameter will have no impact on the heap. + * @param[in] high_largest_block_size (in bytes) - the heap manager will release all unused sections if size of the largest free block is greater than this threshold. + * The recommended value for this, is the size of largest single allocation possible in your application. + * @return AEE_SUCCESS on success + * AEE_EBADPARM on failure + */ +int HAP_mem_set_heap_thresholds(unsigned int low_largest_block_size, unsigned int high_largest_block_size); + + +/** + * Map buffer associated with the file descriptor to DSP memory. The reference + * count gets incremented if the file descriptor is already mapped. This API is + * limited to buffer size less then 2 GB. Recommendation is to use HAP_mmap2 for + * buffer of size > 2 power(8*sizeof(size_t)) + * + * @param[in] addr mapping at fixed address, not supported currently. This has to be set to NULL + * @param[in] len size of the buffer to be mapped + * @param[in] prot protection flags - supported are only HAP_PROT_READ and HAP_PROT_WRITE. HAP_PROT_EXEC is not supported + * @param[in] flags HAP_MAP_NO_MAP - Increment reference count with no mapping + * 0 - map the buffer and increment the reference count + * @param[in] fd file descriptor for the buffer + * @param[in] offset offset into the buffer + * @retval mapped address + * -1 on failure + */ +void* HAP_mmap(void *addr, int len, int prot, int flags, int fd, long offset); + +/** + * Map buffer associated with the file descriptor to DSP memory. The reference + * count gets incremented if the file descriptor is already mapped. + * + * @param[in] addr mapping at fixed address, not supported currently. This has to be set to NULL + * @param[in] len size of the buffer to be mapped + * @param[in] prot protection flags - supported are only HAP_PROT_READ and HAP_PROT_WRITE. HAP_PROT_EXEC is not supported + * @param[in] flags HAP_MAP_NO_MAP - Increment reference count with no mapping + * 0 - map the buffer and increment the reference count + * @param[in] fd file descriptor for the buffer + * @param[in] offset offset into the buffer + * @retval mapped address + * -1 on failure + */ +void* HAP_mmap2(void *addr, size_t len, int prot, int flags, int fd, long offset); + +/** + * Decrements the reference count and unmaps the buffer from memory if the reference count goes to 0. + * This API is used for buffer size less then 2 GB. Recommendation is to use HAP_munmap2 for buffer of + * size > 2 power(8*sizeof(size_t)). + * + * @param[in] addr mapped address + * @param[in] len size of the mapped buffer + * @return 0 on success + * AEE_NOSUCHMAP in input addr is invalid + */ +int HAP_munmap(void *addr, int len); + +/** + * Decrements the reference count and unmaps the buffer from memory if the reference count goes to 0. 
+ *
+ * @param[in] addr mapped address
+ * @param[in] len size of the mapped buffer
+ * @return 0 on success
+ * AEE_NOSUCHMAP if input addr is invalid
+ */
+int HAP_munmap2(void *addr, size_t len);
+
+/**
+ * Gets the virtual and physical addresses associated with the buffer and
+ * increments the reference count.
+ *
+ * @param[in] fd file descriptor for the buffer
+ * @param[out] vaddr virtual address associated with the buffer
+ * @param[out] paddr physical address associated with the buffer
+ * @retval 0 on success
+ * AEE_ENOSUCHMAP if fd is invalid
+ */
+int HAP_mmap_get(int fd, void **vaddr, uint64 *paddr);
+
+/**
+ * Decrements the reference count of the file descriptor.
+ *
+ * @param[in] fd file descriptor of the buffer
+ * @retval 0 on success
+ * AEE_ENOSUCHMAP if fd is invalid
+ * AEE_EBADMAPREFCNT if map refcount is <=0
+ */
+int HAP_mmap_put(int fd);
+
+/**
+ * Get the stack size (in bytes) available for the current thread.
+ * Supported only on Lahaina and Cedros.
+ * @return available stack for the current thread, on success
+ * AEE_EINVALIDTHREAD if unable to get the current thread id
+ * AEE_ERESOURCENOTFOUND if unable to get the stack for the current thread
+ */
+uint64 HAP_mem_available_stack(void);
+
+/**
+ * Allocate and map APPS memory from the DSP.
+ *
+ * Usage of this API over malloc() is recommended when the client wants greater control over the DSP virtual address space,
+ * as free() does not necessarily free the allocated memory depending on heap thresholds.
+ * HAP_apps_mem_request and HAP_apps_mem_release guarantee freeing of the allocated memory.
+ *
+ * @param[in] len size of memory to be allocated
+ * @param[in] flags Buffer attribute flags HAP_MEM_FLAGS_SKIP_DSP_MAP, HAP_MEM_FLAGS_DSP_MAP or HAP_MEM_FLAGS_EXTENDED_MAP
+ * @param[out] fd file descriptor of buffer
+ * @param[out] dsp_va DSP mapped virtual address
+ * @return 0 on success
+ */
+int HAP_apps_mem_request(size_t len, uint32_t flags, int *fd, uint64_t *dsp_va);
+
+/**
+ * Release previously allocated APPS memory from the DSP.
+ * Releases memory from the HLOS. Also unmaps memory from the DSP
+ * if HAP_MEM_FLAGS_DSP_MAP was previously passed while
+ * requesting memory.
+ *
+ * @param[in] fd previously returned file descriptor of buffer
+ * @return 0 on success
+ */
+int HAP_apps_mem_release(int fd);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // HAP_MEM_H
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_mem.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_mem.md
new file mode 100755
index 0000000000000..42be2ab6cba12
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_mem.md
@@ -0,0 +1,75 @@
+# HAP_mem APIs
+
+## Overview
+The HAP_mem APIs provide functionality available from the DSP to
+
+* allocate and free memory - HAP_malloc() and HAP_free()
+* map and unmap ION buffers allocated on the application processor and passed to the DSP using file descriptors - HAP_mmap() and HAP_munmap()
+* get heap statistics and set properties - HAP_mem_get_stats(), HAP_mem_set_grow_size(), HAP_mmap_get(), HAP_mem_set_heap_thresholds() and HAP_mmap_put()
+* allocate and free APPS memory - HAP_apps_mem_request() and HAP_apps_mem_release()
+
+## Memory mapping
+
+A common usage scenario for the mapping functionality consists of the application processor allocating ION memory and passing the file descriptor to the DSP.
+The DSP then uses the HAP_mem APIs to map the buffer onto the DSP and obtain a memory pointer. The mapping remains valid until the buffer is unmapped.
+This approach makes it possible to maintain a mapping across multiple FastRPC calls.
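+
+A minimal sketch of this flow on the DSP side (assuming the file descriptor `fd` and buffer size `len` arrive as parameters of a FastRPC call):
+
+    #include "HAP_mem.h"
+
+    int process_shared_buffer(int fd, int len)
+    {
+        // Map the HLOS-allocated ION buffer and obtain a DSP pointer.
+        void *ptr = HAP_mmap(NULL, len, HAP_PROT_READ | HAP_PROT_WRITE, 0, fd, 0);
+        if ((void *)-1 == ptr)
+            return AEE_ENOMEMORY;
+
+        // ... use ptr, possibly across multiple FastRPC calls ...
+
+        // Drop the reference; the buffer is unmapped once the
+        // reference count reaches 0.
+        return HAP_munmap(ptr, len);
+    }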
+ +## Memory allocation + +HAP_malloc and HAP_free are simple wrappers around the DSP malloc and free functions. +If a user memory allocation request cannot be fulfilled with the existing DSP heap, the FastRPC +runtime will attempt to grow the DSP heap by reserving additional memory from the HLOS. + +The HAP_set_grow_size API can be called to configure the minimum and maximum size that should be added to the DSP heap when one of these growth events occurs. +If many growth events are anticipated, it may be appropriate to set a larger growth rate to reduce the number of growth events. However, increasing +the heap more than necessary will impact HLOS performance. Therefore, care must be taken in finding the appropriate growth rate for a given application. + +Here is how the min and max values set by the HAP_set_grow_size control the growth of the heap: + + min_grow_bytes = MIN(max,MAX(min,min_grow_bytes)); + + // The value will be aligned to the next 1MB boundary. + + actual_grow_bytes = min_grow_bytes + request_size + actual_grow_bytes = ALIGN(actual_grow_bytes,0x100000) + +`HAP_apps_mem_request()` and `HAP_apps_mem_release()` APIs can be called from the DSP to allocate APPS memory and map the same memory on the DSP if required. + +These HAP request and release APIs are recommended when the user wants greater control over the DSP virtual address space: unlike `malloc` and `free`, these APIs guarantee that the memory will be mapped when allocated and unmapped when freed. + +The mapping on the DSP can be controlled using the `flags` parameter in `HAP_apps_mem_request()`: + + * `HAP_MEM_FLAGS_SKIP_DSP_MAP` results in skipping the mapping on the DSP. In that case, the user needs to map the DSP memory by calling `HAP_mmap()`. + + * `HAP_MEM_FLAGS_DSP_MAP` results in mapping the buffer on the DSP upon calling `HAP_apps_mem_request()`. + +`HAP_apps_mem_release()` will always free the allocated HLOS memory but will only unmap the buffer on the DSP if the flag `HAP_MEM_FLAGS_DSP_MAP` was used when calling `HAP_apps_mem_request()`. + +***NOTE*** +If HAP_MEM_FLAGS_SKIP_DSP_MAP flag was used when calling `HAP_apps_mem_request()`, and the memory was mapped later using `HAP_mmap()`, then the user needs to unmap DSP memory by calling `HAP_munmap()`. + +## Memory statistics + +HAP_mem_get_stats is useful when called at the beginning and end of an application to check for any memory leaks. + +## Memory request API +HAP_mem_request is the request API, which support different request types. Requests supported are: + +* `HAP_MEM_LOG_BLOCKS`: This request will log all the heap blocks to the csv file named - hprt_block_info_.csv, for parsing use QMemCheck tool. + If block info logging is successful - 0 will be returned back by the HAP_mem_request. This request doesn't need any payload union. +* `HAP_MEM_SET_MARKER`: This request is to mark instances for leak detection, the markers can be START or END markers. + When START marker is called, a marker instance number will be returned back to caller of the API (if the request is SUCCESS(0)) in the payload member: + mem_marker_payload. + When END marker is called, the caller should fill the instance number for which marker needs to be ended. If the request is success, + all the leaks from the START to END of that instance will be logged to hprt_leak_block_info__.csv +* `HAP_MEM_MAP`: This request is to create a DSP mapping for a shared buffer. + The payload structure for this request can be referred to in `HAP_mem_map_t`. 
To create the mapping at a reserved va, the start address needs to be specified in the `addr` field of the payload. + If the request is SUCCESS(0), the `dsp_va` member of payload will hold the mapped virtual address (VA). +* `HAP_MEM_UNMAP`: This request is to unmap the memory region on the DSP. + The payload structure for this request can be referred to in `HAP_mem_unmap_t`. The starting virtual address and length of buffer needs to passed as payload members `dsp_va` and `len`. + If the request is SUCCESS(0), the virtual address (VA) mapping is removed from the DSP. +* `HAP_RESERVE_VA`: This request is to reserve virtual address (VA) space on the DSP without creating any mappings. + The payload structure for this request can be referred to in `HAP_mem_reserve_t`. + If the request is SUCCESS(0), the `dsp_va` member of payload will hold the reserved virtual address (VA). +* `HAP_UNRESERVE_VA`: This request is to unreserve the virtual address (VA) space on the DSP. + The payload structure for this request can be referred to in `HAP_mem_unreserve_t`. If the request is SUCCESS(0), the virtual address space is successfully unreserved. diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_pd_dtor.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_pd_dtor.h new file mode 100755 index 0000000000000..e612ed135a908 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_pd_dtor.h @@ -0,0 +1,51 @@ +#ifndef HAP_PD_DTOR_H +#define HAP_PD_DTOR_H +/*============================================================================== + Copyright (c) 2015 Qualcomm Technologies Incorporated. + All Rights Reserved Qualcomm Technologies Proprietary + + Export of this technology or software is regulated by the U.S. + Government. Diversion contrary to U.S. law prohibited. +==============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * This type is used to provide the qdi driver the register address, and the + * bits of the register to clear. + * + * @param register_addr, The register address whose value needs to modified on process exit + * @param register_mask, A mask that indicates which bits of the register need to be set. + * @param register_val, The value that needs to be applied to the unmasked bits. + */ +typedef struct { + uintptr_t register_addr; + uint32 register_mask; + uint32 register_value; +}HAP_register_t; + +/** + * A fastrpc process can call this method and provide a list of register addresses + * and their desired values. When the fastrpc process exits, a previous call to this + * method will ensure that the bits (defined in the bitmask) for the provided + * registers are cleared. + + * @param num_registers, Number of registers that need to be modified on process exit + * @param registers, The register address and masks. 
+ */ +int HAP_clear_registers(unsigned int num_registers, HAP_register_t* registers); + +/** + * This method is used by the kernel to free any memory that + * a fastrpc client might have line-locked +*/ +int HAP_free_linelocked_memory(unsigned int asid); + +#ifdef __cplusplus +} +#endif + +#endif /*HAP_PD_DTOR_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_perf.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_perf.h new file mode 100755 index 0000000000000..1f9da76d8c785 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_perf.h @@ -0,0 +1,120 @@ +/*============================================================================== +@file + HAP_perf.h + +@brief + Header file for DSP Perf APIs + +Copyright (c) 2012-2017, 2020 QUALCOMM Technologies, Incorporated. +All Rights Reserved. +QUALCOMM Proprietary. +==============================================================================*/ +#ifndef HAP_PERF_H +#define HAP_PERF_H + +#include "AEEStdDef.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** @defgroup timer_functionality Timer functionality. + * @{ + */ + +/** + * Gets the current value of 56-bit, 19.2MHz hardware counter converted + * to micro-seconds. This value should be treated as relative and + * not absolute. The value wraps around to zero once it exceeds the + * maxiumum value. This function performs an integer division in order + * to convert ticks to time, which adds some overhead. Consider using + * HAP_perf_get_qtimer_count for a lower overhead. +*/ +#ifdef __hexagon__ +#include "hexagon_sim_timer.h" +static inline uint64 HAP_perf_get_time_us(void) +{ + /* Converts ticks into microseconds + 1 tick = 1/19.2MHz seconds + Micro Seconds = Ticks * 10ULL/192ULL */ + unsigned long long count; + asm volatile (" %0 = c31:30 " : "=r"(count)); + return (uint64)(count) * 10ull / 192ull; +} +#else +uint64 HAP_perf_get_time_us(void) +{ + static long long start = 0; + // TODO + // assume 500 MHz on simulator + //return HAP_perf_get_pcycles() / 500; + return start++; +} +#endif + +/** + * Gets the current 56 bit, 19.2MHz global hardware clock count. + * This value should be treated as relative and not absolute. + * The value wraps around to zero once it exceeds the maxiumum value. +*/ +static inline uint64 HAP_perf_get_qtimer_count(void) { + unsigned long long cur_count; + asm volatile(" %0 = c31:30 " : "=r"(cur_count)); + return (uint64)cur_count; +} + +/** + * Converts the 19.2 MHz global hardware count to micro-seconds. + * @param[in] count - 19.2 MHz global hardware count + * @returns - Time in micro-seconds. +*/ +uint64 HAP_perf_qtimer_count_to_us(uint64 count); + +/** + * Gets the current 64-bit Hexagon Processor cycle count. + * The processor cycle count is the current number of processor + * cycles executed since the processor was last reset. Note + * that this counter stops incrementing whenever the DSP enters + * a low-power state (such as clock gating), as opposed to the + * qtimer, which increments regardless of the DSP power state. +*/ +#ifdef __hexagon__ +#include "hexagon_sim_timer.h" +static inline uint64 HAP_perf_get_pcycles(void) +{ + uint64_t pcycle; + asm volatile ("%[pcycle] = C15:14" : [pcycle] "=r"(pcycle)); + return pcycle; +} +#else +uint64 HAP_perf_get_pcycles(void) +{ + return (uint64)0; +} +#endif + +/** + * @} + */ + +/** @defgroup sleep_functionality Sleep functionality. + * @{ + */ + +/** + * Suspends the calling thread from execution until the + * specified duration has elapsed. + * @param[in] sleep_duration: - Sleep duration in micro-seconds. 
+ * @returns - returns 0 on success, non zero in error case. +*/ +int HAP_timer_sleep(unsigned long long sleep_duration); + +/** + * @} // sleep_functionality + */ + +#ifdef __cplusplus +} +#endif + +#endif // HAP_PERF_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_perf.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_perf.md new file mode 100755 index 0000000000000..39895f35a5699 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_perf.md @@ -0,0 +1,23 @@ +# Introduction {#intro} + +The Hexagon SDK provides APIs to measure the elapsed time in both microseconds +and processor cycles(pcycles). + + +# API Overview {#api-overview} + +The HAP_perf APIs are used by clients for profiling their code when running on the DSP. The profiling +can be done in both microseconds and pcycles based on the needs. Morevover, the HAP_perf library +also provides sleep APIs to the clients. + +The HAP_perf APIs include the following functions: + +::HAP_perf_get_time_us + +::HAP_perf_get_qtimer_count + +::HAP_perf_qtimer_count_to_us + +::HAP_perf_get_pcycles + +::HAP_timer_sleep diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_power.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_power.h new file mode 100755 index 0000000000000..55138edb1871f --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_power.h @@ -0,0 +1,825 @@ +/*============================================================================== +@file + HAP_power.h + +@brief + Header file of DSP power APIs. + +Copyright (c) 2015,2019,2022 Qualcomm Technologies, Inc. +All rights reserved. Qualcomm Proprietary and Confidential. +==============================================================================*/ + +#ifndef _HAP_POWER_H +#define _HAP_POWER_H + +#include "AEEStdErr.h" +#include "AEEStdDef.h" +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +//Add a weak reference so shared objects do not throw link error +#pragma weak HAP_power_destroy_client + +/** + * Low-power modes for HAP DCVS V3 interface, used in 'sleep_disable' param in DCVS_v3. + * + * In general, applications are expected to vote for their latency tolerance via the + * 'latency' parameter in DCVS_v3/DCVS_v2 options. The aggregated latency vote across + * clients is used in selecting appropriate low-power mode (LPM) of the DSP subsystem. + * LPM will save power when the DSP subsystem is idle by reducing leakage current. + * Deeper LPMs typically have higher wake up latencies, which will increase interrupt + * service delays and add to inter-processor communication latencies. Though the + * latency vote controls the selection of low-power modes, the vote required for + * disabling/allowing certain LPMs is difficult to calculate as the wakeup latency + * associated with these LPMs could change from chipset to chipset and between runs + * within the same chipset. + * + * This 'sleep_disable' parameter in DCVS_v3 allows user to directly prevent certain LPM + * levels of the DSP subsystem. By default, there is no restriction placed on LPMs i.e. + * all the LPMs are enabled and the aggregated latency vote (along with other system + * parameters) is used in LPM selection. The 'sleep_disable' parameter in DCVS_v3 is for + * the advanced developers who would like to disable certain low-power modes explicitly + * irrespective of the latency vote. Developers need to consider their power-performance + * tradeoff requirements and if necessary profile the results before voting using this + * parameter. Regular users are suggested to choose the default i.e. 
+ * 'HAP_DCVS_LPM_ENABLE_ALL'.
+ *
+ * If a particular LPM level is not supported on the DSP subsystem then the nearest
+ * shallower LPM level is enabled. For example, in the absence of 'HAP_DCVS_LPM_LEVEL3' it will select
+ * 'HAP_DCVS_LPM_LEVEL2', which is the nearest shallower LPM level to 'HAP_DCVS_LPM_LEVEL3'.
+ */
+#define HAP_DCVS_LPM_LEVEL1      1   /**< To disable all low-power modes */
+#define HAP_DCVS_LPM_LEVEL2      2   /**< To enable only standalone APCR */
+#define HAP_DCVS_LPM_LEVEL3      3   /**< To enable RPM assisted APCR */
+#define HAP_DCVS_LPM_ENABLE_ALL  0   /**< To enable all low-power modes (enables full power collapse) */
+
+#define HAP_DCVS_VOLT_CORNER_TURBO_L2_L3_DEFINED  /**< To indicate presence of L2 and L3 corners in HAP_dcvs_voltage_corner_t */
+#define HAP_POWER_SET_HMX_V2_DEFINED              /**< To indicate support for HAP_power_set_HMX_v2 request type in HAP_power_set */
+#define HAP_POWER_SET_CENG_BUS_VOTING_DEFINED     /**< To indicate support for HAP_power_set_CENG_bus request type in HAP_power_set */
+
+/**
+* Possible error codes returned
+*/
+typedef enum {
+    HAP_POWER_ERR_UNKNOWN = -1,
+    HAP_POWER_ERR_INVALID_PARAM = -2,
+    HAP_POWER_ERR_UNSUPPORTED_API = -3
+} HAP_power_error_codes;
+
+/** Payload for HAP_power_set_mips_bw */
+typedef struct {
+    boolean set_mips;                    /**< Set to TRUE to request MIPS */
+    unsigned int mipsPerThread;          /**< MIPS requested per thread, to establish a minimal clock frequency per HW thread */
+    unsigned int mipsTotal;              /**< Total MIPS requested, to establish the total number of MIPS required across all HW threads */
+    boolean set_bus_bw;                  /**< Set to TRUE to request bus_bw */
+    uint64 bwBytePerSec;                 /**< Max bus BW requested (bytes per second) */
+    unsigned short busbwUsagePercentage; /**< Percentage of time during which bwBytePerSec BW is required from the bus (0..100) */
+    boolean set_latency;                 /**< Set to TRUE to set latency */
+    int latency;                         /**< Maximum hardware wakeup latency in microseconds. The higher the value,
+                                          * the deeper the state of sleep that can be entered but the longer it may take
+                                          * to awaken. Only values > 0 are supported (1 microsecond is the smallest valid value) */
+} HAP_power_mips_bw_payload;
+
+/** @defgroup HAP_power_enums HAP POWER enums
+ * @{
+ */
+ /** Clock frequency match type */
+typedef enum {
+    HAP_FREQ_AT_LEAST,   /**< Matches at least the specified frequency. */
+    HAP_FREQ_AT_MOST,    /**< Matches at most the specified frequency. */
+    HAP_FREQ_CLOSEST,    /**< Closest match to the specified frequency. */
+    HAP_FREQ_EXACT,      /**< Exact match with the specified frequency. */
+    HAP_FREQ_MAX_COUNT   /**< Maximum count.
+                          */
+} HAP_freq_match_type;
+/**
+ * @} // HAP_power_enums
+ */
+
+/** Configuration for bus bandwidth */
+typedef struct {
+    boolean set_bus_bw;                  /**< Set to TRUE to request bus_bw */
+    uint64 bwBytePerSec;                 /**< Max bus BW requested (bytes per second) */
+    unsigned short busbwUsagePercentage; /**< Percentage of time during which bwBytePerSec BW is required from the bus (0..100) */
+} HAP_power_bus_bw;
+
+/**
+* @brief Payload for vapss power request
+* The VAPSS core is used for video post processing
+*/
+typedef struct {
+    boolean set_clk;               /**< Set to TRUE to request clock frequency */
+    unsigned int clkFreqHz;        /**< Clock frequency in Hz */
+    HAP_freq_match_type freqMatch; /**< Clock frequency match */
+    HAP_power_bus_bw dma_ext;      /**< DMA external bus bandwidth */
+    HAP_power_bus_bw hcp_ext;      /**< HCP external bus bandwidth */
+    HAP_power_bus_bw dma_int;      /**< DMA internal bus bandwidth */
+    HAP_power_bus_bw hcp_int;      /**< HCP internal bus bandwidth */
+} HAP_power_vapss_payload;
+
+/**
+* @brief Payload for vapss_v2 power request
+* Supported on targets which have a split VAPSS core (DMA and HCP), from Hana onwards
+*/
+typedef struct {
+    boolean set_dma_clk;           /**< Set to TRUE to request DMA clock frequency */
+    boolean set_hcp_clk;           /**< Set to TRUE to request HCP clock frequency */
+    unsigned int dmaClkFreqHz;     /**< DMA clock frequency in Hz */
+    unsigned int hcpClkFreqHz;     /**< HCP clock frequency in Hz */
+    HAP_freq_match_type freqMatch; /**< Clock frequency match type */
+    HAP_power_bus_bw dma_ext;      /**< DMA external bus bandwidth */
+    HAP_power_bus_bw hcp_ext;      /**< HCP external bus bandwidth */
+    HAP_power_bus_bw dma_int;      /**< DMA internal bus bandwidth */
+    HAP_power_bus_bw hcp_int;      /**< HCP internal bus bandwidth */
+} HAP_power_vapss_payload_v2;
+
+/** Payload for HAP_power_set_HVX */
+typedef struct {
+    boolean power_up;  /**< Set to TRUE to turn on HVX, and FALSE to turn off. */
+} HAP_power_hvx_payload;
+
+/**
+* Payload for HAP_power_set_HMX
+* Supported from Lahaina onwards */
+typedef struct {
+    boolean power_up;  /**< Set to TRUE to turn on HMX, and FALSE to turn off.
+                        * When TRUE, on chipsets with a separate HMX clock, a default
+                        * HMX clock will be selected based on the voted
+                        * Q6 core clock level from the same HAP_power_set context.
+                        */
+} HAP_power_hmx_payload;
+
+/** @defgroup HAP_power_enums HAP POWER enums
+ * @{
+ */
+/** Payload for HAP power client classes */
+typedef enum {
+    HAP_POWER_UNKNOWN_CLIENT_CLASS        = 0x00,  /**< Unknown client class */
+    HAP_POWER_AUDIO_CLIENT_CLASS          = 0x01,  /**< Audio client class */
+    HAP_POWER_VOICE_CLIENT_CLASS          = 0x02,  /**< Voice client class */
+    HAP_POWER_COMPUTE_CLIENT_CLASS        = 0x04,  /**< Compute client class */
+    HAP_POWER_STREAMING_1HVX_CLIENT_CLASS = 0x08,  /**< Camera streaming with 1 HVX client class */
+    HAP_POWER_STREAMING_2HVX_CLIENT_CLASS = 0x10,  /**< Camera streaming with 2 HVX client class */
+} HAP_power_app_type_payload;
+/**
+ * @} // HAP_power_enums
+ */
+
+/** Payload for HAP_power_set_linelock */
+typedef struct {
+    void* startAddress;       /**< Start address of the memory region to be locked. */
+    uint32 size;              /**< Size (bytes) of the memory region to be locked. Set size
+                               * to 0 to unlock memory. */
+    uint32 throttleBlockSize; /**< Block size for throttling, in bytes;
+                               * 0 for no throttling. The region to be locked will be divided into
+                               * blocks of this size for throttling purposes.
+                               * Use for locking larger cache blocks.
+                               * Applicable only when enabling line locking. Only ONE throttled linelock call is supported at this time.
+                               * You can linelock additional regions (without throttling) using HAP_power_set_linelock_nothrottle */
+    uint32 throttlePauseUs;   /**< Pause to be applied between locking each block, in microseconds. Applicable only when enabling line locking */
+} HAP_power_linelock_payload;
+
+/** Payload for HAP_power_set_linelock_nothrottle */
+typedef struct {
+    void* startAddress;  /**< Start address of the memory region to be locked. */
+    uint32 size;         /**< Size (bytes) of the memory region to be locked. Set size to 0
+                          * to unlock memory */
+} HAP_power_linelock_nothrottle_payload;
+
+/** @defgroup HAP_power_enums HAP POWER enums
+ * @{
+ */
+/** Option for dcvs payload */
+typedef enum {
+    HAP_DCVS_ADJUST_UP_DOWN = 0x1,  /**< Increase and decrease core/bus clock speed. */
+    HAP_DCVS_ADJUST_ONLY_UP = 0x2,  /**< Restricts DCVS from lowering the clock speed below the requested value. */
+} HAP_power_dcvs_payload_option;
+/**
+ * @} // HAP_power_enums
+ */
+
+/** Payload for HAP_power_set_DCVS */
+typedef struct {
+    boolean dcvs_enable;  /**< Set to TRUE to participate in DCVS, and FALSE otherwise. */
+    HAP_power_dcvs_payload_option dcvs_option;  /**< Set to one of
+                                                 * HAP_DCVS_ADJUST_UP_DOWN - Allows for DCVS to adjust up and down.
+                                                 * HAP_DCVS_ADJUST_ONLY_UP - Allows for DCVS to adjust up only. */
+} HAP_power_dcvs_payload;
+
+/** @defgroup HAP_power_enums HAP POWER enums
+ * @{
+ */
+/** Voltage corners for HAP DCVS V2 interface */
+typedef enum {
+    HAP_DCVS_VCORNER_DISABLE,
+    HAP_DCVS_VCORNER_SVS2,
+    HAP_DCVS_VCORNER_SVS,
+    HAP_DCVS_VCORNER_SVS_PLUS,
+    HAP_DCVS_VCORNER_NOM,
+    HAP_DCVS_VCORNER_NOM_PLUS,
+    HAP_DCVS_VCORNER_TURBO,
+    HAP_DCVS_VCORNER_TURBO_PLUS,
+    HAP_DCVS_VCORNER_TURBO_L2,  /**< On targets released up to Kailua, the HAP_DCVS_VCORNER_TURBO_L2 level will be treated as HAP_DCVS_VCORNER_TURBO_PLUS */
+    HAP_DCVS_VCORNER_TURBO_L3,  /**< On targets released up to Kailua, the HAP_DCVS_VCORNER_TURBO_L3 level will be treated as HAP_DCVS_VCORNER_TURBO_PLUS */
+    HAP_DCVS_VCORNER_MAX = 255,
+} HAP_dcvs_voltage_corner_t;
+
+/**
+* Expanded voltage corners for HAP_power_set corner voting options
+*/
+typedef enum {
+    HAP_DCVS_EXP_VCORNER_DISABLE = 0,
+    HAP_DCVS_EXP_VCORNER_MIN = 0x100,
+    /**< Selects the minimum voltage corner defined for the chipset */
+    HAP_DCVS_EXP_VCORNER_LOW_SVS_D2 = 0x134,
+    HAP_DCVS_EXP_VCORNER_LOW_SVS_D1 = 0x138,
+    HAP_DCVS_EXP_VCORNER_LOW_SVS = 0x140,
+    HAP_DCVS_EXP_VCORNER_SVS = 0x180,
+    HAP_DCVS_EXP_VCORNER_SVS_L1 = 0x1C0,
+    HAP_DCVS_EXP_VCORNER_NOM = 0x200,
+    HAP_DCVS_EXP_VCORNER_NOM_L1 = 0x240,
+    HAP_DCVS_EXP_VCORNER_TUR = 0x280,
+    HAP_DCVS_EXP_VCORNER_TUR_L1 = 0x2A0,
+    HAP_DCVS_EXP_VCORNER_TUR_L2 = 0x2B0,
+    HAP_DCVS_EXP_VCORNER_TUR_L3 = 0x2C0,
+    HAP_DCVS_EXP_VCORNER_MAX = 0xFFFF,
+    /**< Selects the maximum voltage corner defined for the chipset */
+} HAP_dcvs_exp_voltage_corner_t;
+
+/**
+* Perf modes to specify the clock frequency level within the
+* target voltage corner.
+*/
+typedef enum {
+    HAP_CLK_PERF_HIGH = 0,  /**< To select the max frequency at the target voltage corner. */
+    HAP_CLK_PERF_LOW,       /**< To select the min frequency at the target voltage corner.
+                             */
+} HAP_clk_perf_mode_t;
+
+/**
+ * @} // HAP_power_enums
+ */
+
+#define HAP_DCVS_VCORNER_SVSPLUS  HAP_DCVS_VCORNER_SVS_PLUS
+#define HAP_DCVS_VCORNER_NOMPLUS  HAP_DCVS_VCORNER_NOM_PLUS
+#define HAP_DCVS_VCORNER_TURBO_L1 HAP_DCVS_VCORNER_TURBO_PLUS
+
+/** DCVS parameters for HAP_power_dcvs_v2_payload */
+typedef struct {
+    HAP_dcvs_voltage_corner_t target_corner;  /**< target voltage corner */
+    HAP_dcvs_voltage_corner_t min_corner;     /**< minimum voltage corner */
+    HAP_dcvs_voltage_corner_t max_corner;     /**< maximum voltage corner */
+    uint32 param1;  /**< reserved */
+    uint32 param2;  /**< reserved */
+    uint32 param3;  /**< reserved */
+} HAP_dcvs_params_t;
+
+/** Core clock parameters for HAP_power_dcvs_v3_payload */
+typedef struct {
+    HAP_dcvs_voltage_corner_t target_corner;  /**< target voltage corner */
+    HAP_dcvs_voltage_corner_t min_corner;     /**< minimum voltage corner */
+    HAP_dcvs_voltage_corner_t max_corner;     /**< maximum voltage corner */
+    uint32 param1;  /**< reserved */
+    uint32 param2;  /**< reserved */
+    uint32 param3;  /**< reserved */
+} HAP_core_params_t;
+
+/** Bus clock parameters for HAP_power_dcvs_v3_payload */
+typedef struct {
+    HAP_dcvs_voltage_corner_t target_corner;  /**< target voltage corner */
+    HAP_dcvs_voltage_corner_t min_corner;     /**< minimum voltage corner */
+    HAP_dcvs_voltage_corner_t max_corner;     /**< maximum voltage corner */
+    uint32 param1;  /**< reserved */
+    uint32 param2;  /**< reserved */
+    uint32 param3;  /**< reserved */
+} HAP_bus_params_t;
+
+/** DCVS v3 parameters for HAP_power_dcvs_v3_payload */
+typedef struct {
+    uint32 param1;  /**< reserved */
+    uint32 param2;  /**< reserved */
+    uint32 param3;  /**< reserved */
+    uint32 param4;  /**< reserved */
+    uint32 param5;  /**< reserved */
+    uint32 param6;  /**< reserved */
+} HAP_dcvs_v3_params_t;
+
+/** @defgroup HAP_power_enums HAP POWER enums
+ * @{
+ */
+/** Option for dcvs_v2 payload */
+typedef enum {
+    HAP_DCVS_V2_ADJUST_UP_DOWN = 0x1,  /**< Allows for DCVS to adjust up and down. */
+    HAP_DCVS_V2_ADJUST_ONLY_UP = 0x2,  /**< Allows for DCVS to adjust up only. */
+    HAP_DCVS_V2_POWER_SAVER_MODE = 0x4,  /**< HAP_DCVS_POWER_SAVER_MODE - Higher thresholds for power efficiency. */
+    HAP_DCVS_V2_POWER_SAVER_AGGRESSIVE_MODE = 0x8,  /**< HAP_DCVS_POWER_SAVER_AGGRESSIVE_MODE - Higher thresholds for power efficiency with faster ramp down. */
+    HAP_DCVS_V2_PERFORMANCE_MODE = 0x10,  /**< HAP_DCVS_PERFORMANCE_MODE - Lower thresholds for maximum performance */
+    HAP_DCVS_V2_DUTY_CYCLE_MODE = 0x20,  /**< HAP_DCVS_DUTY_CYCLE_MODE - only for HVX based clients.
+                                          * For streaming class clients:
+                                          *   > detects periodicity based on HVX usage
+                                          *   > lowers clocks in the no-HVX-activity region of each period.
+                                          * For compute class clients:
+                                          *   > Lowers clocks when no HVX activity is detected and brings clocks up on detecting HVX activity again.
+                                          *   > Latency involved in bringing up the clock will be at most 1 to 2 ms.
+                                          */
+} HAP_power_dcvs_v2_payload_option;
+/**
+ * @} // HAP_power_enums
+ */
+/** Payload for HAP_power_set_DCVS_v2 */
+typedef struct {
+    boolean dcvs_enable;     /**< Set to TRUE to participate in DCVS, and FALSE otherwise */
+    HAP_power_dcvs_v2_payload_option dcvs_option;  /**< Set to one of HAP_power_dcvs_v2_payload_option */
+    boolean set_latency;     /**< TRUE to set the latency parameter, otherwise FALSE */
+    uint32 latency;          /**< sleep latency */
+    boolean set_dcvs_params; /**< TRUE to set DCVS params, otherwise FALSE */
+    HAP_dcvs_params_t dcvs_params;  /**< DCVS parameters */
+} HAP_power_dcvs_v2_payload;
+
+/** Payload for HAP_power_set_DCVS_v3 */
+typedef struct {
+    boolean set_dcvs_enable;    /**< TRUE to consider the DCVS enable/disable and option parameters, otherwise FALSE */
+    boolean dcvs_enable;        /**< Set to TRUE to participate in DCVS, and FALSE otherwise. */
+    HAP_power_dcvs_v2_payload_option dcvs_option;  /**< Set to one of HAP_power_dcvs_v2_payload_option */
+    boolean set_latency;        /**< TRUE to consider the latency parameter, otherwise FALSE */
+    uint32 latency;             /**< sleep latency */
+    boolean set_core_params;    /**< TRUE to consider core clock params, otherwise FALSE */
+    HAP_core_params_t core_params;  /**< Core clock parameters */
+    boolean set_bus_params;     /**< TRUE to consider bus clock params, otherwise FALSE */
+    HAP_bus_params_t bus_params;    /**< Bus clock parameters */
+    boolean set_dcvs_v3_params; /**< TRUE to consider DCVS v3 params, otherwise FALSE */
+    HAP_dcvs_v3_params_t dcvs_v3_params;  /**< DCVS v3 parameters */
+    boolean set_sleep_disable;  /**< TRUE to consider the sleep disable/enable parameter, otherwise FALSE */
+    unsigned char sleep_disable; /**< See HAP_DCVS_LPM_LEVEL1, HAP_DCVS_LPM_LEVEL2, HAP_DCVS_LPM_LEVEL3 and HAP_DCVS_LPM_ENABLE_ALL above */
+} HAP_power_dcvs_v3_payload;
+
+/** @defgroup HAP_power_enums HAP POWER enums
+ * @{
+ */
+ /** Type for dcvs update request */
+typedef enum {
+    HAP_POWER_UPDATE_DCVS = 1,
+    HAP_POWER_UPDATE_SLEEP_LATENCY,
+    HAP_POWER_UPDATE_DCVS_PARAMS,
+} HAP_power_update_type_t;
+/**
+ * @} // HAP_power_enums
+ */
+/** Payload for DCVS update */
+typedef struct {
+    boolean dcvs_enable;  /**< TRUE for DCVS enable and FALSE for DCVS disable */
+    HAP_power_dcvs_v2_payload_option dcvs_option;  /**< Requested DCVS policy in case DCVS enable is TRUE */
+} HAP_power_update_dcvs_t;
+
+/** Payload for latency update */
+typedef struct {
+    boolean set_latency;   /**< TRUE if the sleep latency request has to be considered */
+    unsigned int latency;  /**< Sleep latency request in microseconds */
+} HAP_power_update_latency_t;
+
+/** Payload for DCVS params update */
+typedef struct {
+    boolean set_dcvs_params;  /**< Flag to mark DCVS params structure validity, TRUE for a valid DCVS
+                               * params request and FALSE otherwise */
+    HAP_dcvs_params_t dcvs_params;  /**< Intended DCVS params if set_dcvs_params is set to TRUE */
+} HAP_power_update_dcvs_params_t;
+
+/** Payload for HAP_power_set_dcvs_v2_update */
+typedef struct {
+    HAP_power_update_type_t update_param;  /**< Type of param to update */
+    union {
+        HAP_power_update_dcvs_t dcvs_payload;
+        HAP_power_update_latency_t latency_payload;
+        HAP_power_update_dcvs_params_t dcvs_params_payload;
+    };  /**< Update payload for DCVS, latency or DCVS params */
+} HAP_power_dcvs_v2_update_payload;
+
+/** Payload for HAP_power_set_streamer */
+typedef struct {
+    boolean set_streamer0_clk;         /**< Set streamer 0 clock */
+    boolean set_streamer1_clk;         /**< Set streamer 1 clock */
+    unsigned int streamer0_clkFreqHz;  /**< Streamer 0 clock
+                                        * frequency */
+    unsigned int streamer1_clkFreqHz;  /**< Streamer 1 clock frequency */
+    HAP_freq_match_type freqMatch;     /**< Clock frequency match */
+    uint32 param1;  /**< Reserved for future streamer parameters */
+    uint32 param2;  /**< Reserved for future streamer parameters */
+    uint32 param3;  /**< Reserved for future streamer parameters */
+} HAP_power_streamer_payload;
+
+/**
+* Payload for HAP_power_set_HMX_v2.
+* Gives the user the flexibility to vote for the HMX clock based on either a voltage
+* corner or a frequency. The user can also provide DCVS limits for the HMX clock when
+* DCVS participation is enabled via the HAP_power_set_DCVS/_v2/_v3 options.
+* On chipsets without a separate HMX clock, requests made for a target corner or
+* frequency will return the AEE_EBADPARM error.
+*/
+
+typedef struct {
+    boolean set_power;  /**< Set to TRUE to consider the HMX power_up parameter to turn ON/OFF HMX, otherwise FALSE. */
+    boolean power_up;   /**< Set to TRUE to turn on HMX, and FALSE to turn off. */
+    boolean set_clock;  /**< TRUE to consider the HMX clock parameters. All the following parameters
+                         * will be ignored if set to FALSE. By default (pick_default, target_corner,
+                         * freq_mhz and floor_freq_mhz all 0), the lowest HMX clock frequency will be selected. */
+    boolean pick_default;  /**< Set to TRUE to select the default HMX clock based on the voted
+                            * Q6 core clock level from the same HAP_power_set context, otherwise FALSE.
+                            * When TRUE, the target_corner, freq_mhz and floor_freq_mhz params
+                            * should be set to 0. */
+    HAP_dcvs_exp_voltage_corner_t target_corner;  /**< Target voltage corner. See HAP_dcvs_exp_voltage_corner_t.
+                                                   * For target_corner > 0, the pick_default, freq_mhz and floor_freq_mhz
+                                                   * params should be set to 0.
+                                                   * The maximum target_corner request among the requesting clients
+                                                   * will be considered as the final vote. */
+    HAP_dcvs_exp_voltage_corner_t min_corner;  /**< minimum voltage corner for DCVS. See HAP_dcvs_exp_voltage_corner_t. */
+    HAP_dcvs_exp_voltage_corner_t max_corner;  /**< maximum voltage corner for DCVS. See HAP_dcvs_exp_voltage_corner_t. */
+    HAP_clk_perf_mode_t perf_mode;  /**< To specify the clock frequency level within the target voltage corner. */
+    uint32 freq_mhz;  /**< Frequency request in MHz. freq_mhz requests across clients
+                       * will be accumulated. For freq_mhz > 0, target_corner and
+                       * pick_default should be set to 0. */
+    uint32 floor_freq_mhz;  /**< Floor frequency request in MHz.
+                             * For floor_freq_mhz > 0, target_corner and pick_default
+                             * should be set to 0. The maximum floor_freq_mhz request among the
+                             * requesting clients will be considered.
+                             * The maximum between the aggregated freq_mhz and floor_freq_mhz
+                             * will be considered as the final frequency request. */
+    uint32 param1;  /**< Reserved */
+    uint32 param2;  /**< Reserved */
+    uint32 param3;  /**< Reserved */
+} HAP_power_hmx_payload_v2;
+
+/** Payload for HAP_power_set_CENG_bus */
+typedef struct {
+    HAP_dcvs_voltage_corner_t target_corner;  /**< Target voltage corner. For target_corner > 0,
+                                               * the bwBytePerSec and busbwUsagePercentage params should be set to 0. */
+    HAP_dcvs_voltage_corner_t min_corner;     /**< Minimum voltage corner for DCVS */
+    HAP_dcvs_voltage_corner_t max_corner;     /**< Maximum voltage corner for DCVS */
+    HAP_clk_perf_mode_t perf_mode;  /**< To specify the clock frequency level within the target voltage corner */
+    uint64 bwBytePerSec;  /**< Clock request in terms of bandwidth (bytes per second).
+                           * For bwBytePerSec > 0, target_corner should be set to 0.
+                           */
+    uint32 busbwUsagePercentage;  /**< Percentage of time during which bwBytePerSec BW is required from the bus (0..100) */
+    uint32 param1;  /**< Reserved */
+    uint32 param2;  /**< Reserved */
+    uint32 param3;  /**< Reserved */
+} HAP_power_ceng_bus_payload;
+
+/** @defgroup HAP_power_enums HAP POWER enums
+ * @{
+ */
+ /** Identifies the HAP power request type */
+typedef enum {
+    HAP_power_set_mips_bw = 1,  /**< Requests for MIPS. Provides
+                                 * fine-grained control to set MIPS values.
+                                 * Payload is set to HAP_power_mips_bw_payload */
+    HAP_power_set_HVX,          /**< Requests to enable / disable HVX.
+                                 * Payload is set to HAP_power_hvx_payload */
+    HAP_power_set_apptype,      /**< Sets the app_type.
+                                 * Payload is set to HAP_power_app_type_payload */
+    HAP_power_set_linelock,     /**< Sets the throttled L2 cache line locking parameters.
+                                 * Only one throttled call is supported at this time. Additional
+                                 * un-throttled line-locks can be performed using HAP_power_set_linelock_nothrottle.
+                                 * Payload is set to HAP_power_linelock_payload */
+    HAP_power_set_DCVS,         /**< Requests to participate / stop participating in DCVS */
+    HAP_power_set_linelock_nothrottle,  /**< Sets the L2 cache line locking parameters (non-throttled).
+                                 * Payload is set to HAP_power_linelock_nothrottle_payload */
+    HAP_power_set_DCVS_v2,      /**< Requests to participate / stop participating in DCVS_v2 */
+    HAP_power_set_vapss,        /**< Sets the VAPSS core clock and DDR/IPNOC bandwidth.
+                                 * Payload is set to HAP_power_vapss_payload */
+    HAP_power_set_vapss_v2,     /**< Sets the VAPSS core DMA/HCP clocks and DDR/IPNOC bandwidths.
+                                 * Payload is set to HAP_power_vapss_payload_v2 */
+    HAP_power_set_dcvs_v2_update,  /**< Updates DCVS params.
+                                 * Payload is set to HAP_power_dcvs_v2_update_payload */
+    HAP_power_set_streamer,     /**< Sets the streamer core clocks.
+                                 * Payload is set to HAP_power_streamer_payload */
+    HAP_power_set_DCVS_v3,      /**< Updates DCVS params.
+                                 * Payload is set to HAP_power_dcvs_v3_payload */
+    HAP_power_set_HMX,          /**< Requests to enable / disable HMX.
+                                 * Payload is set to HAP_power_hmx_payload */
+    HAP_power_set_HMX_v2,       /**< Requests for HMX power management along with the
+                                 * HMX clock requirement. On chipsets without a separate HMX
+                                 * clock, will return the AEE_EBADPARM error
+                                 * if a target corner / frequency is requested.
+                                 * Payload is set to HAP_power_hmx_payload_v2 */
+    HAP_power_set_CENG_bus,     /**< To vote for the CENG bus.
+                                 * Payload is set to HAP_power_ceng_bus_payload */
+} HAP_Power_request_type;
+/**
+ * @} // HAP_power_enums
+ */
+
+/** Data type to change power values on the DSP */
+typedef struct {
+    HAP_Power_request_type type;  /**< Identifies the request type */
+    union{
+        HAP_power_mips_bw_payload mips_bw;    /**< Requests for performance level */
+        HAP_power_vapss_payload vapss;        /**< Sets the VAPSS core clock and DDR/IPNOC bandwidth */
+        HAP_power_vapss_payload_v2 vapss_v2;  /**< Sets the VAPSS core clock and DDR/IPNOC bandwidth */
+        HAP_power_streamer_payload streamer;  /**< Sets the streamer core clocks */
+        HAP_power_hvx_payload hvx;            /**< Requests to enable / disable HVX */
+        HAP_power_app_type_payload apptype;   /**< Sets the app_type */
+        HAP_power_linelock_payload linelock;  /**< Sets the throttled L2 cache linelock parameters. Only one
+                                               * throttled linelock is permitted at this time.
+                                               * Additional un-throttled linelocks can be performed using linelock_nothrottle */
+        HAP_power_dcvs_payload dcvs;          /**< Updates DCVS params */
+        HAP_power_dcvs_v2_payload dcvs_v2;    /**< Updates DCVS_v2 params */
+        HAP_power_dcvs_v2_update_payload dcvs_v2_update;  /**< Updates DCVS_v2_update params */
+        HAP_power_linelock_nothrottle_payload linelock_nothrottle;  /**< Sets the un-throttled L2 cache linelock parameters */
+        HAP_power_dcvs_v3_payload dcvs_v3;    /**< Updates DCVS_v3 params */
+        HAP_power_hmx_payload hmx;            /**< Requests to turn on / off HMX.
+                                               * When the request is to turn on HMX, on chipsets with a separate HMX clock,
+                                               * a default HMX clock will be selected based on the voted
+                                               * Q6 core clock level from the same HAP_power_set context.
+                                               */
+        HAP_power_hmx_payload_v2 hmx_v2;      /**< Requests for HMX power management along with the HMX clock requirement.
+                                               * On chipsets without a separate HMX clock, will return the AEE_EBADPARM error
+                                               * if a target corner / frequency is requested. */
+        HAP_power_ceng_bus_payload ceng_bus;  /**< Votes for the CENG bus */
+    };
+} HAP_power_request_t;
+
+/** @defgroup HAP_power_functions HAP POWER functions
+ * @{
+ */
+/**
+* Method to set power values from the DSP
+* @param[in] context - To identify the power client
+* @param[in] request - Request params.
+* @retval 0 on success, AEE_EMMPMREGISTER on MMPM client register request failure, -1 on unknown error
+*/
+int HAP_power_set(void* context, HAP_power_request_t* request);
+/**
+ * @} // HAP_power_functions
+ */
+
+/** @defgroup HAP_power_enums HAP POWER enums
+ * @{
+ */
+ /** Identifies the HAP power response type */
+typedef enum {
+    HAP_power_get_max_mips = 1,  /**< Returns the max MIPS supported (max_mips) */
+    HAP_power_get_max_bus_bw,    /**< Returns the max bus bandwidth supported (max_bus_bw) */
+    HAP_power_get_client_class,  /**< Returns the client class (client_class) */
+    HAP_power_get_clk_Freq,      /**< Returns the core clock frequency (clkFreqHz) */
+    HAP_power_get_aggregateAVSMpps,  /**< Returns the aggregate Mpps used by audio and voice (aggregateAVSMpps) */
+    HAP_power_get_dcvsEnabled,   /**< Returns the DCVS status (enabled / disabled) */
+    HAP_power_get_vapss_core_clk_Freq,  /**< Returns the VAPSS core clock frequency (clkFreqHz) */
+    HAP_power_get_dma_core_clk_Freq,    /**< Returns the DMA core clock frequency (clkFreqHz) */
+    HAP_power_get_hcp_core_clk_Freq,    /**< Returns the HCP core clock frequency (clkFreqHz) */
+    HAP_power_get_streamer0_core_clk_Freq,  /**< Returns the streamer 0 core clock frequency (clkFreqHz) */
+    HAP_power_get_streamer1_core_clk_Freq,  /**< Returns the streamer 1 core clock frequency (clkFreqHz) */
+} HAP_Power_response_type;
+/**
+ * @} // HAP_power_enums
+ */
+
+/** Data type to retrieve power values from the DSP */
+typedef struct {
+    HAP_Power_response_type type;  /**< Identifies the type to retrieve. */
+    union{
+        unsigned int max_mips;     /**< Max MIPS supported */
+        uint64 max_bus_bw;         /**< Max bus BW supported */
+        unsigned int client_class; /**< Current client class */
+        unsigned int clkFreqHz;    /**< Current core clock frequency */
+        unsigned int aggregateAVSMpps;  /**< Aggregate AVS Mpps used by audio and voice */
+        boolean dcvsEnabled;       /**< Indicates if DCVS is enabled / disabled. */
+    };
+} HAP_power_response_t;
+
+/** @defgroup HAP_power_functions HAP POWER functions
+ * @{
+ */
+
+/**
+* Method to retrieve power values from the DSP
+* @param[in] context - Ignored
+* @param[out] response - Response.
+*/
+int HAP_power_get(void* context, HAP_power_response_t* response);
+
+/**
+* Method to initialize the dcvs v3 structure in the request param. It enables the
+* flags and resets the params for all fields in dcvs v3, so it
+* can also be used to remove applied dcvs v3 params and restore
+* defaults.
+* @param[in] request - Pointer to request params.
+*/
+static inline void HAP_power_set_dcvs_v3_init(HAP_power_request_t* request) {
+    memset(request, 0, sizeof(HAP_power_request_t));
+    request->type = HAP_power_set_DCVS_v3;
+    request->dcvs_v3.set_dcvs_enable = TRUE;
+    request->dcvs_v3.dcvs_enable = TRUE;
+    request->dcvs_v3.dcvs_option = HAP_DCVS_V2_POWER_SAVER_MODE;
+    request->dcvs_v3.set_latency = TRUE;
+    request->dcvs_v3.latency = 65535;
+    request->dcvs_v3.set_core_params = TRUE;
+    request->dcvs_v3.set_bus_params = TRUE;
+    request->dcvs_v3.set_dcvs_v3_params = TRUE;
+    request->dcvs_v3.set_sleep_disable = TRUE;
+    return;
+}
+
+/**
+* Method to enable/disable DCVS and set a particular DCVS policy.
+* @param[in] context - User context.
+* @param[in] dcvs_enable - TRUE to enable DCVS, FALSE to disable DCVS.
+* @param[in] dcvs_option - To set a particular DCVS policy. In case of a DCVS disable
+*                          request, this param will be ignored.
+* @returns - 0 on success
+*/
+static inline int HAP_power_set_dcvs_option(void* context, boolean dcvs_enable,
+                                            HAP_power_dcvs_v2_payload_option dcvs_option) {
+    HAP_power_request_t request;
+    memset(&request, 0, sizeof(HAP_power_request_t));
+    request.type = HAP_power_set_DCVS_v3;
+    request.dcvs_v3.set_dcvs_enable = TRUE;
+    request.dcvs_v3.dcvs_enable = dcvs_enable;
+    if(dcvs_enable)
+        request.dcvs_v3.dcvs_option = dcvs_option;
+    return HAP_power_set(context, &request);
+}
+
+/**
+* Method to set/reset sleep latency.
+* @param[in] context - User context.
+* @param[in] latency - Sleep latency value in microseconds, should be > 1.
+*                      Use the max value 65535 to reset it to the default.
+* @returns - 0 on success
+*/
+static inline int HAP_power_set_sleep_latency(void* context, uint32 latency) {
+    HAP_power_request_t request;
+    memset(&request, 0, sizeof(HAP_power_request_t));
+    request.type = HAP_power_set_DCVS_v3;
+    request.dcvs_v3.set_latency = TRUE;
+    request.dcvs_v3.latency = latency;
+    return HAP_power_set(context, &request);
+}
+
+/**
+* Method to set/reset DSP core clock voltage corners.
+* @param[in] context - User context.
+* @param[in] target_corner - Target voltage corner.
+* @param[in] min_corner - Minimum voltage corner.
+* @param[in] max_corner - Maximum voltage corner.
+* @returns - 0 on success
+*/
+static inline int HAP_power_set_core_corner(void* context, uint32 target_corner,
+                                            uint32 min_corner, uint32 max_corner) {
+    HAP_power_request_t request;
+    memset(&request, 0, sizeof(HAP_power_request_t));
+    request.type = HAP_power_set_DCVS_v3;
+    request.dcvs_v3.set_core_params = TRUE;
+    request.dcvs_v3.core_params.min_corner = (HAP_dcvs_voltage_corner_t)(min_corner);
+    request.dcvs_v3.core_params.max_corner = (HAP_dcvs_voltage_corner_t)(max_corner);
+    request.dcvs_v3.core_params.target_corner = (HAP_dcvs_voltage_corner_t)(target_corner);
+    return HAP_power_set(context, &request);
+}
+
+/**
+* Method to set/reset bus clock voltage corners.
+* @param[in] context - User context.
+* @param[in] target_corner - Target voltage corner.
+* @param[in] min_corner - Minimum voltage corner.
+* @param[in] max_corner - Maximum voltage corner.
+* @returns - 0 on success
+*/
+static inline int HAP_power_set_bus_corner(void* context, uint32 target_corner,
+                                           uint32 min_corner, uint32 max_corner) {
+    HAP_power_request_t request;
+    memset(&request, 0, sizeof(HAP_power_request_t));
+    request.type = HAP_power_set_DCVS_v3;
+    request.dcvs_v3.set_bus_params = TRUE;
+    request.dcvs_v3.bus_params.min_corner = (HAP_dcvs_voltage_corner_t)(min_corner);
+    request.dcvs_v3.bus_params.max_corner = (HAP_dcvs_voltage_corner_t)(max_corner);
+    request.dcvs_v3.bus_params.target_corner = (HAP_dcvs_voltage_corner_t)(target_corner);
+    return HAP_power_set(context, &request);
+}
+
+/**
+* Method to select the low power mode.
+* @param[in] context - User context.
+* @param[in] sleep_disable - See HAP_DCVS_LPM_LEVEL1, HAP_DCVS_LPM_LEVEL2, HAP_DCVS_LPM_LEVEL3 and HAP_DCVS_LPM_ENABLE_ALL above.
+* @returns - 0 on success
+*/
+static inline int HAP_power_set_sleep_mode(void* context, unsigned char sleep_disable) {
+    HAP_power_request_t request;
+    memset(&request, 0, sizeof(HAP_power_request_t));
+    request.type = HAP_power_set_DCVS_v3;
+    request.dcvs_v3.set_sleep_disable = TRUE;
+    request.dcvs_v3.sleep_disable = sleep_disable;
+    return HAP_power_set(context, &request);
+}
+
+
+/**
+* This API is deprecated and might generate undesired results.
+* Please use the HAP_power_get() and HAP_power_set() APIs instead.
+* Requests a performance level by percentage for clock speed
+* and bus speed. Passing 0 for any parameter results in no
+* request being issued for that particular attribute.
+* @param[in] clock - percentage of target's maximum clock speed
+* @param[in] bus - percentage of target's maximum bus speed
+* @param[in] latency - maximum hardware wake up latency in microseconds. The
+*                      higher the value the deeper the state of sleep
+*                      that can be entered but the longer it may
+*                      take to awaken.
+* @retval 0 on success
+* @par Comments : Performance metrics vary from target to target so the
+*                 intent of this API is to allow callers to set a relative
+*                 performance level to achieve the desired balance between
+*                 performance and power saving.
+*/
+int HAP_power_request(int clock, int bus, int latency);
+
+/**
+* This API is deprecated and might generate undesired results.
+* Please use the HAP_power_get() and HAP_power_set() APIs instead.
+* Requests a performance level by absolute values. Passing 0
+* for any parameter results in no request being issued for that
+* particular attribute.
+* @param[in] clock - speed in MHz
+* @param[in] bus - bus speed in MHz
+* @param[in] latency - maximum hardware wakeup latency in microseconds. The
+*                      higher the value the deeper the state of
+*                      sleep that can be entered but the
+*                      longer it may take to awaken.
+* @retval 0 on success
+* @par Comments : This API allows callers who are aware of their target-
+*                 specific capabilities to set them explicitly.
+*/
+int HAP_power_request_abs(int clock, int bus, int latency);
+
+/**
+* This API is deprecated and might generate undesired results.
+* Please use the HAP_power_get() and HAP_power_set() APIs instead.
+* Queries the target for its clock and bus speed capabilities.
+* @param[out] clock_max - maximum clock speed supported in MHz
+* @param[out] bus_max - maximum bus speed supported in MHz
+* @retval 0 on success
+*/
+int HAP_power_get_max_speed(int* clock_max, int* bus_max);
+
+/**
+* This API is deprecated and might generate undesired results.
+* Please use the HAP_power_get() and HAP_power_set() APIs instead.
+* Upvote for HVX power
+* @retval 0 on success
+*/
+int HVX_power_request(void);
+
+/**
+* This API is deprecated and might generate undesired results.
+* Please use the HAP_power_get() and HAP_power_set() APIs instead.
+* Downvote for HVX power
+* @retval 0 on success
+*/
+int HVX_power_release(void);
+
+/**
+* Method to destroy clients created through HAP_power_set
+* @param[in] context - To uniquely identify the client
+* @retval 0 on success, AEE_ENOSUCHCLIENT on invalid context, -1 on unknown error
+* @brief DO NOT call this API directly, use HAP_power_destroy instead.
+*/
+int HAP_power_destroy_client(void *context);
+
+/**
+* @param[in] client - To uniquely identify the client context.
+* @retval 0 on success, AEE_EUNSUPPORTEDAPI if the API is not supported on the DSP image, AEE_ENOSUCHCLIENT on invalid context, -1 on unknown error
+* @brief Method to destroy clients created through HAP_power_set, wrapper to the HAP_power_destroy_client API
+*/
+static inline int HAP_power_destroy(void *client){
+    if(0 != HAP_power_destroy_client)
+        return HAP_power_destroy_client(client);
+    return AEE_EUNSUPPORTEDAPI;
+}
+
+/**
+* Method to create a user client context
+* @retval context for client
+*/
+static inline void* HAP_utils_create_context(void) {
+    /*
+     * Allocate 1 byte of memory for a unique context identifier.
+     * Clients can also allocate memory and use it as a unique context identifier.
+     */
+    return malloc(1);
+}
+
+/**
+* Method to destroy a user client context
+* @param context of client
+*/
+static inline void HAP_utils_destroy_context(void* context) {
+    free(context);
+}
+
+/**
+ * @} // HAP_power_functions
+ */
+#ifdef __cplusplus
+}
+#endif
+#endif //_HAP_POWER_H
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_power.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_power.md
new file mode 100755
index 0000000000000..44ad184ac0eca
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_power.md
@@ -0,0 +1,946 @@
+# Introduction {#intro}
+
+The Hexagon SDK provides APIs to control DSP core and bus clocks based on power and performance needs.
+By default, every compute session votes for the NOMINAL voltage corner and powers on HVX.
+Clients can choose to override this via the HAP power APIs below.
+
+# HAP_power API {#api}
+
+
+## API Overview {#api-overview}
+
+The HAP_power_* APIs are used by clients to override the power settings of the DSPs according to their needs. This API is supported on the ADSP, CDSP and SLPI.
+
+The HAP power API contains a set of interfaces that allow programmers to adjust the DSP power usage as per the application's power requirement, thereby providing a good balance between power consumption and performance.
+
+* HAP_power_set(): This is used to vote for performance levels on the DSP
+* HAP_power_get(): This is used to query the DSP for current performance levels
+* HAP_power_destroy(): This is used to destroy power clients created through the HAP_power_set API
+
+::HAP_power_set can be used to control these parameters on the DSP:
+* DSP MIPS
+* Bus speed / bandwidth
+* Dynamic scaling of bus and DSP clocks and bus speeds (DCVS)
+* Application type (client class); more details on this can be found [here](#app-type)
+* L2 cache line locking
+* Hexagon Vector eXtension (HVX) blocks
+
+::HAP_power_get can be used to query the DSP for these parameters:
+* Max MIPS supported
+* Max bus speed / bandwidth supported
+* Current core clock speed
+* Current application type (client class)
+* Aggregate Mpps used by audio and voice
+
+::HAP_power_destroy can be used to destroy the power clients created through the HAP_power_set API.
+ Destroys any existing HAP_power votes associated with the provided client context, and disassociates that context from HAP_power.
+
+
+## Usage {#usage}
+
+See HAP_power.h for more information on this API.
+
+::HAP_power_set: This accepts two parameters:
+* context - Unique identifier (explained below)
+* request - The power request.
+
+context is a unique identifier (in the scope of the PD) provided by the user to identify an independent voting client of HAP_power. For each unique context passed in a HAP_power_set invocation, HAP_power adds a new client to its state to be associated with that context.
+
+On targets after Lahaina, the helper APIs HAP_utils_create_context and HAP_utils_destroy_context are added to create and destroy unique context identifiers. If these are not available, the recommended alternative is to create a context by allocating a dummy byte and using the pointer value as the context, and freeing that byte later after destroying the context's associated HAP_power client via HAP_power_destroy.
+* HAP_utils_create_context(): This is used to create a unique context identifier
+* HAP_utils_destroy_context(): This is used to destroy a unique context identifier. HAP_utils_destroy_context should only be called on a context after destroying the HAP_power client associated with that context, via HAP_power_destroy(context). Failure to destroy both in the proper order may cause a leak.
+
+Refer to the following table for the voting/unvoting call flow:
+
+| Voting/Unvoting call flow     | Library code |
+|-------------------------------|--------------|
+| Create unique client context  | context = userLibCodeToCreateUniqueContext() (or) context = HAP_utils_create_context() |
+| Create power client and vote  | HAP_power_set(context, request) |
+| Destroy power client          | HAP_power_destroy(context) (or) HAP_power_destroy_client(context) |
+| Destroy unique client context | userLibCodeToDestroyUniqueContext(context) (or) HAP_utils_destroy_context(context) |
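+
+A minimal sketch of this call flow is shown below (the DCVS_v2 payload is just an illustrative choice; any HAP_power_set request type follows the same pattern):
+
+~~~{.c}
+void* context = HAP_utils_create_context();     // create unique client context
+
+HAP_power_request_t request;
+memset(&request, 0, sizeof(HAP_power_request_t));
+request.type = HAP_power_set_DCVS_v2;
+request.dcvs_v2.dcvs_enable = TRUE;
+request.dcvs_v2.dcvs_option = HAP_DCVS_V2_POWER_SAVER_MODE;
+int retVal = HAP_power_set(context, &request);  // create power client and vote
+
+/* ... run the workload ... */
+
+HAP_power_destroy(context);                     // destroy power client
+HAP_utils_destroy_context(context);             // destroy unique client context
+~~~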
+
+NOTE: Using a context set to NULL has specific implications, discussed below in [default voting](#default_voting).
+
+Example: Module1 and Module2 are two different clients running in the same user PD on the DSP. Module1 creates a new, unique client context and votes for its needs. Module2 also creates a new, unique client context and votes for its needs. The figure below shows the different client contexts and their votes to the power manager.
+
+![screenshot](../../images/hap_power.png)
+
+The type in the request is set to one of:
+* HAP_power_set_mips_bw: Used to set MIPS and / or bus speed (bandwidth). The payload in this case should contain HAP_power_mips_bw_payload.
+* HAP_power_set_HVX: Used to enable / disable power for HVX. The payload in this case should contain HAP_power_hvx_payload.
+* HAP_power_set_apptype: Used to set the application type. The payload in this case should contain ::HAP_power_app_type_payload.
+* HAP_power_set_linelock: Used to line lock memory in the L2 cache. The payload in this case should contain HAP_power_linelock_payload.
+* HAP_power_set_DCVS: Used to participate / stop participating in DCVS. The payload in this case should contain HAP_power_dcvs_payload.
+* HAP_power_set_DCVS_v2: Enhanced version of HAP_power_set_DCVS with more options. The payload in this case should contain HAP_power_dcvs_v2_payload.
+* HAP_power_set_DCVS_v3: Enhanced version of HAP_power_set_DCVS_v2 with more options to select core and bus operating corners separately. The payload in this case should contain HAP_power_dcvs_v3_payload.
+
+NOTE:
+* More details on HAP_power_set_DCVS_v2 can be found [here](#DCVS_V2).
+* HAP_power_set_DCVS_v3 is supported from SM8250 onwards. More details can be found [here](#DCVS_V3).
+* On older targets, a maximum of 8 clients can be created per PD (including the default client). This limitation has been removed from SM8250 onwards.
+* HAP_power_hmx_payload_v2 is supported starting with v75. On chipsets (v75 onwards) without a separate HMX clock plan, requests made for a target corner or
+frequency will return the AEE_EBADPARM (invalid parameter) error.
+* HAP_power_set_CENG_bus is supported from v75 onwards. On chipsets (v75 onwards) not supporting independent Q6-CENG bus clock scaling, this request type
+will return the AEE_EBADPARM (invalid parameter) error.
+
+An example is provided below.
+
+~~~{.c}
+    //Vote
+    /* Populate request structure */
+    int retVal;
+    HAP_power_request_t request;
+    memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated.
+    request.type = HAP_power_set_DCVS_v2;
+    request.dcvs_v2.dcvs_enable = TRUE;
+    request.dcvs_v2.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE;
+    request.dcvs_v2.set_latency = TRUE;
+    request.dcvs_v2.latency = 1000;
+    request.dcvs_v2.set_dcvs_params = TRUE;
+    request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_SVS;
+    request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_TURBO;
+    request.dcvs_v2.dcvs_params.target_corner = HAP_DCVS_VCORNER_NOM;
+    /* Call HAP_power_set API with the updated request structure */
+    /* cv is a global variable or an address on the heap to uniquely identify the clients */
+    retVal = HAP_power_set(&cv, &request);
+    ...
+~~~
+
+::HAP_power_get: This accepts two parameters:
+
+* context - This parameter is ignored; the response is at the system level
+* response - The power response
+
+The type in the request is set to one of:
+* HAP_power_get_max_mips: Used to query for the maximum MIPS supported
+* HAP_power_get_max_bus_bw: Used to query for the maximum bus bandwidth supported
+* HAP_power_get_client_class: Used to query for the current application type.
+* HAP_power_get_clk_Freq: Used to query for the current core clock frequency.
+* HAP_power_get_aggregateAVSMpps: Used to query for the aggregate Mpps used by audio and voice.
+
+::HAP_power_destroy: This accepts one parameter:
+
+* context - the unique client context identifying the HAP_power client to destroy.
+
+Example of removing the default vote for a PD and destroying the default client:
+~~~{.c}
+    //Destroy default client
+    int nErr = 0;
+
+    if(0 == (nErr = HAP_power_destroy(NULL))){
+        //Client destroyed successfully
+    }
+~~~
+
+### Default voting {#default_voting}
+On older targets, NULL is a valid HAP_power context that is used by FastRPC to establish a default vote for some reasonable clock settings. In order to override the default vote on these targets, it is necessary for a client to place a canceling vote using the NULL context.
+
+Permitting clients to use the NULL context can lead to conflicts where multiple clients in the same PD may try to independently manage the NULL context.
+To address these conflicts, support for any NULL context voting has been removed starting in targets after Lahaina. For these targets, the behavior of default voting has been changed. A suitable vote is placed automatically at opportune times (such as startup or object loading) on a unique context, and is automatically removed when no longer needed. For example, the FastRPC driver places a high clock vote when a new session is started on the DSP, and removes it as soon as any user in that session places any other HAP_power vote. This ensures clocks are high during startup and object loading, up until the point the user application is able to place its own vote.
+
+The recommended client behavior is as follows:
+
+* Lahaina and older targets: While it is allowed to rely on the default vote to establish reasonable clocks, it is recommended to place a canceling vote (with low/zero clock values as shown below) on the NULL context, plus an active vote for the requirements on a different unique context.
+
+* Targets after Lahaina: Simply place an active vote for the required clock settings on a unique context.
+
+If a single client implementation is required to work correctly on all targets, the recommendation is as follows:
+* Make an attempt to place a canceling NULL-context vote. If the error code AEE_ENULLCONTEXT is returned, it means the target does not support the NULL context. Thus, this error can be ignored.
+* Place a (non-NULL) unique context vote for the clock requirements.
+
+On targets that support NULL context default voting, it can be removed using HAP_power_destroy(NULL) or as follows:
+
+~~~{.c}
+req.type = HAP_power_set_DCVS_v2;
+req.dcvs_v2.dcvs_enable = FALSE;
+req.dcvs_v2.set_latency = FALSE;
+req.dcvs_v2.set_dcvs_params = FALSE;
+VERIFY(AEE_SUCCESS == (nErr = HAP_power_set(NULL, &req)));
+~~~
+
+
+### Application type/Client class {#app-type}
+The HAP_power_set() API allows users to register an application type.
+
+The 'apptype' field in ::HAP_power_request_t, passed as a parameter to a HAP_power_set request, allows the user to register an application as one of the client classes available in ::HAP_power_app_type_payload.
+Setting an appropriate client class can be important, as this information is used by the DSP DCVS, QoS, and DSP power management drivers. HAP_power clients who do not explicitly vote their apptype are treated as general compute applications, which is appropriate for most cases.
+
+####Users of Client class information
+DCVS selects HAP_DCVS_V2_POWER_SAVER_MODE as the default DCVS policy for COMPUTE and STREAMING class clients. A client can always pick its own DCVS policy by issuing a DCVS_v2 request; click [here](#DCVS_V2) for more details on the DCVS_v2 request type of HAP_power_set.
+The QoS driver modifies L2 scoreboard thresholds on detecting STREAMING class clients to allow DSP L2 slave accesses.
+
+
+###DSP DCVS v2 HAP interface {#DCVS_V2}
+Based on user configuration, the DCVS module in the DSP (ADSP/CDSP) can adjust the core and bus clock frequencies based on core and bus usage metrics captured by SysMon. The existing DCVS interface via HAP_power_set() (type: HAP_power_set_DCVS) only allows users to vote for DCVS participation with 2 different options. The DSP DCVS v2 algorithm exposes an enhanced set of DCVS options for diversified clients and a simplified voltage corner based voting scheme. On supported targets (8998 and later), these new DCVS options and voting scheme are exposed to clients via HAP_power_set() (type: HAP_power_set_DCVS_v2).
+
+####HAP API Support
+The HAP_power_set API is enhanced to support the new mode registrations with the DSP DCVS logic. The following table illustrates the new request type and the new dcvs_v2 request structure associated with it.
+
+| Parameter | Field | Description |
+|-----------|-------|-------------|
+| API       |       | HAP_power_set (void* context, HAP_power_request_t* request) |
+| context   |       | Explained [here](#usage). Votes across all contexts will be aggregated accordingly. |
+| request   | type  | HAP_power_set_DCVS_v2. This new request type allows the user to request via the new dcvs_v2 request structure. |
+| dcvs_v2   | dcvs_enable | DCVS participation flag |
+|           | dcvs_option | These options instruct the DCVS algorithm to use a pre-defined set of thresholds and operation logic based on the selected option. |
+|           | set_latency | Latency vote validity flag. If FALSE, the default sleep latency vote of 65535 microseconds will be considered. |
+|           | latency | Sleep latency vote in microseconds. Valid when the set_latency flag is set to TRUE. |
+|           | set_dcvs_params | DCVS params validity flag. If FALSE, all parameters of dcvs_params will be set to default zero. |
+|           | dcvs_params | DCVS params structure with flexibility to set upper and lower DCVS thresholds and also vote for core and bus clocks using a voltage corner. |
+
+###DSP DCVS v3 HAP interface {#DCVS_V3}
+Based on user configuration, the DCVS module in the DSP can adjust the core and bus clock frequencies based on core and bus usage metrics captured by SysMon. The existing DCVS v2 algorithm via HAP_power_set() (type: HAP_power_set_DCVS_v2) exposes multiple DCVS options for diversified clients and a simplified voltage corner based voting scheme. In addition to the existing features, DCVS v3 provides a separate voltage corner voting option for the core and bus clocks, as well as an option to disable all low power modes without needing an explicit sleep latency vote. In scenarios where the user is fine with the same voltage corner vote for both core and bus clocks, DCVS v2 can still be used. Also, in DCVS v3 the user can vote for an individual field or multiple fields based on the requirement. On supported targets (SM8250 and later), these new DCVS options and voting scheme are exposed to clients via HAP_power_set() (type: HAP_power_set_DCVS_v3). Wrapper functions built around the same HAP_power_set() (type: HAP_power_set_DCVS_v3) have also been added to help the user select and vote for individual functionality in DCVS v3 without dealing with the DCVS v3 structure and related details. This document captures information on these new DCVS v3 features and ways to use them.
+
+####HAP API Support
+The HAP_power_set API is enhanced to support the new user options with DCVS v3 via the new request type HAP_power_set_DCVS_v3 with HAP_power_dcvs_v3_payload.
+
+| Parameter | Field | Description |
+|-----------|-------|-------------|
+| API       |       | HAP_power_set (void* context, HAP_power_request_t* request) |
+| context   |       | Explained [here](#usage). Votes across all contexts will be aggregated accordingly. |
+| request   | type  | HAP_power_set_DCVS_v3. This new request type allows the user to request via the new dcvs_v3 request structure. |
+| dcvs_v3   | set_dcvs_enable | DCVS participation validity flag. If FALSE, the dcvs_enable and dcvs_option fields will be ignored. |
+|           | dcvs_enable | DCVS participation flag. Valid when set_dcvs_enable is set to TRUE. |
+|           | dcvs_option | These options instruct the DCVS algorithm to use a pre-defined set of thresholds and operation logic based on the selected option. |
+|           | set_latency | Latency vote validity flag. If FALSE, the latency field will be ignored. |
+|           | latency | Sleep latency vote in microseconds. Valid when the set_latency flag is set to TRUE. |
+|           | set_core_params | Core clock params validity flag. If FALSE, the core_params field will be ignored. |
+|           | core_params | Core clock params structure with flexibility to set upper and lower core clock DCVS thresholds and also vote for the core clock using a voltage corner. Valid when set_core_params is set to TRUE. |
+|           | set_bus_params | Bus clock params validity flag. If FALSE, the bus_params field will be ignored. |
+|           | bus_params | Bus clock params structure with flexibility to set upper and lower bus clock DCVS thresholds and also vote for the bus clock using a voltage corner. Valid when set_bus_params is set to TRUE. |
+|           | set_dcvs_v3_params | Validity flag for reserved DCVS params. If FALSE, the dcvs_v3_params field will be ignored. |
+|           | dcvs_v3_params | Reserved DCVS params |
+|           | set_sleep_disable | Sleep param validity flag. If FALSE, the sleep_disable field will be ignored. |
+|           | sleep_disable | To select the low-power mode (LPM). Valid when set_sleep_disable is set to TRUE. Refer to [Sleep Disable](#sleep_disable) for options. |
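+
+As a reference, a minimal sketch of a dcvs_v3 vote with separate core and bus corner requests (the corner values are chosen purely for illustration; context is assumed to be created as described in [Usage](#usage)):
+
+~~~{.c}
+HAP_power_request_t request;
+memset(&request, 0, sizeof(HAP_power_request_t));
+request.type = HAP_power_set_DCVS_v3;
+request.dcvs_v3.set_dcvs_enable = TRUE;
+request.dcvs_v3.dcvs_enable = TRUE;
+request.dcvs_v3.dcvs_option = HAP_DCVS_V2_POWER_SAVER_MODE;
+request.dcvs_v3.set_core_params = TRUE;                              /* core clock corner vote */
+request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_TURBO;
+request.dcvs_v3.set_bus_params = TRUE;                               /* independent bus clock corner vote */
+request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_NOM;
+int retVal = HAP_power_set(context, &request);
+~~~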
+
+####Wrapper APIs
+Wrapper functions built around the same HAP_power_set() (type: HAP_power_set_DCVS_v3) help the user select and vote for individual functionality in DCVS v3 without dealing with the DCVS v3 structure and related details. The section below lists these APIs.
+
+* HAP_power_set_dcvs_v3_init()
+* HAP_power_set_dcvs_option()
+* HAP_power_set_sleep_latency()
+* HAP_power_set_core_corner()
+* HAP_power_set_bus_corner()
+* HAP_power_set_sleep_mode()
+
+####DCVS Enable
+The 'dcvs_enable' parameter of the dcvs_v2 structure enables the user to vote for DCVS participation.
+
+| Value | Description |
+|-------|-------------|
+| TRUE  | Enable DSP DCVS (if not already enabled). Using dcvs_option, based on the application demand, the user can choose a particular option to guide the DSP DCVS logic. |
+| FALSE | Don't enable DSP DCVS. Valid only when the requesting client is the only one actively voting for clocks or is one among the clients voting for this same option. |
+
+The 'set_dcvs_enable' and 'dcvs_enable' parameters of the dcvs_v3 structure enable the user to vote for DCVS participation.
+
+| Parameter | Value | Description |
+|-----------|-------|-------------|
+| set_dcvs_enable | FALSE | No DCVS request from the client; the dcvs_enable and dcvs_option fields will be ignored. |
+|                 | TRUE  | The client request for DCVS is valid and the desired DCVS participation is provided in the dcvs_enable field. |
+| dcvs_enable     | TRUE  | Enable DSP DCVS (if not already enabled). Using dcvs_option, based on the application demand, the user can choose a particular option to guide the DSP DCVS logic. |
+|                 | FALSE | Don't enable DSP DCVS. Valid only when the requesting client is the only one actively voting for clocks or is one among the clients voting for this same option. |
+
+When a DCVS participating client is active, the DCVS logic will be enabled, but the aggregated vote of clients requesting DCVS disable will be considered as a FLOOR request in the DCVS logic, i.e., DCVS won't lower the clocks below the aggregated value.
+
+DCVS participation and options are considered only for active clients. A client is deemed inactive when there is no MIPS and bandwidth request (made by setting the request type to 'HAP_power_set_mips_bw' in the 'HAP_power_set' [API](#usage)) and when the target_corner for core and bus under dcvs_params is set to HAP_DCVS_VCORNER_DISABLE.
+
+####DCVS Options
+The 'dcvs_option' parameter of the dcvs_v2 structure enables the user to request a particular DCVS mode when the 'dcvs_enable' option is set to TRUE.
+
+The 'dcvs_option' parameter of the dcvs_v3 structure enables the user to request a particular DCVS mode when 'set_dcvs_enable' and 'dcvs_enable' are both set to TRUE.
+
+The following table captures the gist of the available DCVS modes.
+
+| Value | Description |
+|-------|-------------|
+| HAP_DCVS_V2_ADJUST_UP_DOWN | Legacy option: for clients voting via the HAP_power_set_mips_bw request type. This mode allows DCVS to both increase and decrease core/bus clock speeds based on need. DCVS selects thresholds corresponding to a balanced mode (legacy) of operation with respect to power and performance. min_corner and max_corner votes via dcvs_params are used as lower and upper limit guidelines in DCVS. NOTE: If the client votes via target_corner under dcvs_params of this structure, the HAP_DCVS_V2_ADJUST_ONLY_UP and HAP_DCVS_V2_ADJUST_UP_DOWN modes are identical; min_corner and max_corner votes are used as lower and upper limit guidelines in DCVS while using balanced mode (legacy) thresholds. |
+| HAP_DCVS_V2_ADJUST_ONLY_UP | Legacy option: for clients voting via the HAP_power_set_mips_bw request type. This mode restricts DCVS from lowering the clock below the values requested via the HAP_power_set_mips_bw request. DCVS can only increase the clock above the requested levels. DCVS selects thresholds corresponding to a balanced mode (legacy) of operation with respect to power and performance. The max_corner vote via dcvs_params is used as an upper limit guideline in DCVS. NOTE: If the client votes via target_corner under dcvs_params of this structure, the HAP_DCVS_V2_ADJUST_ONLY_UP and HAP_DCVS_V2_ADJUST_UP_DOWN modes are identical; min_corner and max_corner votes are used as lower and upper limit guidelines in DCVS while using balanced mode (legacy) thresholds. |
+| HAP_DCVS_V2_POWER_SAVER_MODE | New option: default for all clients participating in DCVS. DCVS can both increase and decrease the core/bus clock speeds while min_corner and max_corner votes are used as lower and upper limit guidelines. DCVS selects thresholds corresponding to a power saving model. This mode is meant for applications where saving power is of higher priority than achieving the fastest performance. Performance may be slower in this mode than in HAP_DCVS_V2_PERFORMANCE_MODE or the legacy modes, i.e., HAP_DCVS_V2_ADJUST_ONLY_UP and HAP_DCVS_V2_ADJUST_UP_DOWN. |
+| HAP_DCVS_V2_POWER_SAVER_AGGRESSIVE_MODE | New option: DCVS can both increase and decrease the core/bus clock speeds while min_corner and max_corner votes are used as lower and upper limit guidelines. DCVS selects thresholds corresponding to a power saving model. Further, the DCVS monitoring durations for lowering the clocks are decreased for a faster ramp down and hence greater power saving compared to the power saver mode. This mode is meant for applications where saving power is of higher priority than achieving the fastest performance. Performance may be slower in this mode than in HAP_DCVS_V2_PERFORMANCE_MODE, HAP_DCVS_V2_POWER_SAVER_MODE or the legacy modes, i.e., HAP_DCVS_V2_ADJUST_ONLY_UP and HAP_DCVS_V2_ADJUST_UP_DOWN. |
+| HAP_DCVS_V2_PERFORMANCE_MODE | New option: DCVS can both increase and decrease the core/bus clock speeds while min_corner and max_corner votes are used as lower and upper limit guidelines. DCVS selects a set of aggressive thresholds in terms of performance. DCVS can quickly bump up the clocks in this mode, assisting higher performance at the cost of power. |
+| HAP_DCVS_V2_DUTY_CYCLE_MODE | This mode is for periodic use cases. Starting with Lahaina, DCVS in this mode detects the periodicity and sets/removes the core and bus clock votes for the active/idle durations respectively. This mode helps save power significantly by reducing idle leakage current while keeping the performance intact. Compared to applications setting/removing clock votes for each active frame to save power, the DCVS duty cycle mode provides better performance and more power savings, as in this mode the voting is done upfront by DCVS just before the active duration starts, based on periodicity prediction. |
+ +In cases where multiple clients have registered different DCVS options, the following table depicts the DCVS policy aggregation logic. + +
PERFORMANCE (Yes / No) POWER SAVER (Yes / No) POWER SAVER AGGRESSIVE (Yes / No) BALANCED (UP ONLY/UP AND DOWN clients) (Yes / No) Final DCVS thresholds +
Y Y/N Y/N Y/N PERFORMANCE +
N Y Y/N Y/N POWER SAVER +
N N Y Y POWER SAVER +
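+ +For example, if one client registers HAP_DCVS_V2_PERFORMANCE_MODE while another registers HAP_DCVS_V2_POWER_SAVER_MODE, the final DCVS thresholds are PERFORMANCE per the first row of the table. A minimal sketch of this scenario (assuming two hypothetical client contexts, ctx1 and ctx2, each obtained as explained [here](#usage)): +~~~{.c} +HAP_power_request_t request; + +/* Client 1 registers PERFORMANCE mode */ +memset(&request, 0, sizeof(HAP_power_request_t)); +request.type = HAP_power_set_DCVS_v2; +request.dcvs_v2.dcvs_enable = TRUE; +request.dcvs_v2.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; +HAP_power_set(ctx1, &request); + +/* Client 2 registers POWER SAVER mode */ +memset(&request, 0, sizeof(HAP_power_request_t)); +request.type = HAP_power_set_DCVS_v2; +request.dcvs_v2.dcvs_enable = TRUE; +request.dcvs_v2.dcvs_option = HAP_DCVS_V2_POWER_SAVER_MODE; +HAP_power_set(ctx2, &request); + +/* Aggregation: a PERFORMANCE client is present, so PERFORMANCE thresholds apply */ +~~~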
+ +####DCVS Duty Cycle +DCVS duty cycle mode is for periodic use cases. The DCVS algorithm detects periodicity and sets the core and bus clock votes as per the active and idle durations. This helps save power to a great extent by reducing idle leakage current while keeping performance intact. + +The example below illustrates DCVS duty cycle operation for an application with 30 FPS activity and TURBO_PLUS votes for core and bus clocks. + +For this application run, the core and bus clocks and related DSP metrics with and without DCVS duty cycle mode are shown below. In the no duty cycle case, core and bus clocks are at TURBO_PLUS throughout the application run. +In the DCVS duty cycle case, the DCVS algorithm detects periodicity in the use case and sets core and bus clocks to TURBO_PLUS during the active time and to LOW SVS (SVS2) during the idle time of each frame. + +![screenshot](../../images/DCVS_CoreClock_DutyCycle.png) + +![screenshot](../../images/DCVS_BusClock_DutyCycle.png) + +With increasing processing capabilities, the active time for applications will improve, resulting in greater power savings for periodic activities with DCVS duty cycle mode due to the increased idle time. + +The DCVS duty cycle mode is supported starting with Lahaina. On chipsets prior to Lahaina, DCVS falls back to power saver mode when duty cycling is selected. + +####DCVS Duty Cycle Modes +Starting with Waipio, DCVS duty cycle mode is further expanded to cover the following scenarios/sub-modes. + +####Fixed Corners Mode +Fixed active and idle clock corners: +* Client decides the fixed active clock and idle clock +* DCVS only uses those selected corners + +Example: +* Max corner : HAP_DCVS_VCORNER_DISABLE +* Target corner : TURBO +* Min corner : LOW SVS (SVS2) +* Mode : Duty_cycle +* DCVS Enable flag: 0 +* Expectation : Duty cycle between TURBO and LOW SVS (SVS2) only + +![screenshot](../../images/HAP_set_dcvs_v3_duty_cycle_fixed_corners_mode.png) + +####Active Range Mode +Client and the DCVS algorithm decide the active clock corners: +* Client decides the active clock range and idle clock +* The DCVS algorithm decides the active corner within the provided range based on the power vs. performance tradeoff and the user-given max active time (if provided) + +Example: +* Max corner : TURBO +* Target corner : SVS PLUS +* Min corner : LOW SVS (SVS2) +* Mode : Duty_cycle +* DCVS Enable flag: 1 +* Expectation : Active clock is decided by the DCVS algorithm within the provided range (target corner, max corner) +* The DCVS algorithm starts with the client-provided max corner for the active clock, then tunes it based on the performance vs. power tradeoff and the user-given max active time. + +![screenshot](../../images/HAP_set_dcvs_v3_duty_cycle_active_range_mode.png) + +####Full DCVS Control Mode +DCVS decides the active and idle clock corners: +* Client does not provide active and idle clock corners +* The DCVS algorithm can decide any clock corner for the active and idle durations based on the power vs. performance tradeoff and the user-given max active time (if provided) +* Max corner : HAP_DCVS_VCORNER_DISABLE +* Target corner : HAP_DCVS_VCORNER_DISABLE +* Min corner : HAP_DCVS_VCORNER_DISABLE +* Mode : Duty_cycle +* DCVS Enable flag: 1 +* Expectation : Active and idle clocks are decided by the DCVS algorithm +* DCVS starts with NOM as the active corner and LOW SVS (SVS2) as the idle corner and later tunes them based on the performance vs. power tradeoff and the user-given max active time (if provided). +* DCVS picks the LOW SVS (SVS2) clock corner when there is no activity.
+ +![screenshot](../../images/HAP_set_dcvs_v3_duty_cycle_full_dcvs_control_mode.png) + +####DCVS Duty Cycle Helper APIs +Starting with Waipio, DCVS duty cycle helper APIs are provided for ease of configuration. See [HAP_dcvs.h](../../doxygen/HAP_dcvs/index.html) for more information. + +####Sleep latency {#sleep-latency} +The 'set_latency' and 'latency' parameters of structure dcvs_v2 can be used to request a sleep latency in microseconds. + + +
set_latency FALSE No sleep latency request from the client. If FALSE, the default sleep latency vote of 65535 microseconds is considered. +
TRUE The client's sleep latency request is valid and the desired latency is provided in the latency field. +
latency Sleep latency request in microseconds. +
+ +Similarly, the 'set_latency' and 'latency' parameters of structure dcvs_v3 can be used to request a sleep latency in microseconds. + + +
set_latency FALSE No sleep latency request from the client. If FALSE, the latency field is ignored. +
TRUE The client's sleep latency request is valid and the desired latency is provided in the latency field. +
latency Sleep latency request in microseconds. +
+ +NOTE: HAP_power_set provides the following ways to vote for sleep latency: + +1. via HAP_power_set_mips_bw request type: +~~~{.c} +/* For sleep latency */ +mips_bw.set_latency = TRUE; +mips_bw.latency = +~~~ +2. via HAP_power_set_DCVS_v2 request type: +~~~{.c} +/* For sleep latency */ +dcvs_v2.set_latency = TRUE; +dcvs_v2.latency = +~~~ + Or via HAP_power_set_DCVS_v3 request type: +~~~{.c} +/* For sleep latency */ +dcvs_v3.set_latency = TRUE; +dcvs_v3.latency = +~~~ + +Clients should use only one of the above methods to vote for latency, i.e., either via mips_bw or via dcvs_v2/dcvs_v3, but not both. Voting via dcvs_v2/dcvs_v3 does NOT cancel any previous vote done via mips_bw and vice versa. + +The latency value can be set to a minimum of 10 microseconds. The application should vote for a latency that is tolerable. For latency-critical applications, the latency can be set to its minimum value of 10 microseconds. + + +####DCVS params +The set_dcvs_params and dcvs_params parameters of dcvs_v2 can be used to update DCVS thresholds and the target corner vote. +The set_core_params and core_params parameters of dcvs_v3 can be used to update DCVS thresholds and the target corner vote for the core clock; similarly, set_bus_params and bus_params apply to the bus clock. +This structure is valid irrespective of the chosen dcvs_enable and dcvs_option values. A client can request a target_corner even when the dcvs_enable option is set to FALSE. + +When set_dcvs_params/set_core_params/set_bus_params is TRUE, the target_corner, min_corner and max_corner parameters of dcvs_params/core_params/bus_params can take one of the values in ::HAP_dcvs_voltage_corner_t: + +
HAP_dcvs_voltage_corner_t Description +
HAP_DCVS_VCORNER_DISABLE No specific corner request (No Vote) +
HAP_DCVS_VCORNER_SVS2 SVS2 / LOW SVS corner +Note: On targets that don't support this voltage corner, this option will be interpreted as HAP_DCVS_VCORNER_SVS +
HAP_DCVS_VCORNER_SVS SVS corner +
HAP_DCVS_VCORNER_SVS_PLUS SVS Plus corner +Note: On targets that don't support this voltage corner, this option will be interpreted as HAP_DCVS_VCORNER_SVS +
HAP_DCVS_VCORNER_NOM NOMINAL corner +
HAP_DCVS_VCORNER_NOM_PLUS NOMINAL Plus corner +Note: On targets that don't support this voltage corner, this option will be interpreted as HAP_DCVS_VCORNER_NOM +
HAP_DCVS_VCORNER_TURBO TURBO corner +
HAP_DCVS_VCORNER_TURBO_PLUS TURBO Plus corner +Note: On targets released up to Kailua, this option selects the clock frequencies defined under corners TURBO_PLUS and above (TURBO_L2 / L3) and falls back to TURBO when there is no clock frequency available at these corners. On targets after Kailua, this option selects clock frequencies defined under TURBO_PLUS (or TURBO when no frequency is defined under TURBO_PLUS). Frequencies defined under the TURBO_L2 / L3 corners can be selected via the new HAP_DCVS_VCORNER_TURBO_L2 / L3 options. +
HAP_DCVS_VCORNER_TURBO_L2 TURBO L2 corner +Note: On targets released up to Kailua, this option is interpreted as HAP_DCVS_VCORNER_TURBO_PLUS. On targets after Kailua, this option selects the closest TURBO clock frequency (corresponding to HAP_DCVS_VCORNER_TURBO_PLUS / TURBO) when there is no clock frequency defined under the TURBO_L2 voltage corner. +
HAP_DCVS_VCORNER_TURBO_L3 TURBO L3 corner +Note: On targets released up to Kailua, this option is interpreted as HAP_DCVS_VCORNER_TURBO_PLUS. On targets after Kailua, this option selects the closest TURBO clock frequency (corresponding to HAP_DCVS_VCORNER_TURBO_L2 / TURBO_PLUS / TURBO) when there is no clock frequency defined under the TURBO_L3 voltage corner. +
HAP_DCVS_VCORNER_MAX MAX possible corner defined for maximum performance. +
+
+ +
dcvs_params/core_params/bus_params target_corner Type: HAP_dcvs_voltage_corner_t. +Alternative to the HAP_power_set_mips_bw MIPS and bandwidth request. HAP_power_set provides the following ways to vote for sleep latency and core/bus clocks. +1. via HAP_power_set_mips_bw request type: +~~~{.c} +/* For core clock */ +mips_bw.set_mips = TRUE; +mips_bw.mipsPerThread = +mips_bw.mipsTotal = +/* For bus clock */ +mips_bw.set_bus_bw = TRUE; +mips_bw.bwBytePerSec = +mips_bw.busbwUsagePercentage = +/* For sleep latency */ +mips_bw.set_latency = TRUE; +mips_bw.latency = +~~~ +2. via HAP_power_set_DCVS_v2 request type: +~~~{.c} +/* For core and bus clock */ +dcvs_v2.set_dcvs_params = TRUE; +dcvs_v2.dcvs_params.target_corner = +/* For sleep latency */ +dcvs_v2.set_latency = TRUE; +dcvs_v2.latency = +~~~ +or + +3. via HAP_power_set_DCVS_v3 request type: +~~~{.c} +/* For core clock */ +dcvs_v3.set_core_params = TRUE; +dcvs_v3.core_params.target_corner = +/* For bus clock */ +dcvs_v3.set_bus_params = TRUE; +dcvs_v3.bus_params.target_corner = +/* For sleep latency */ +dcvs_v3.set_latency = TRUE; +dcvs_v3.latency = +~~~ +A client can request the core and bus clocks to run at a particular voltage corner instead of providing MIPS and bandwidth (bytes per second) requests. DCVS will convert the requested voltage corner value to appropriate core clock and bus clock votes and forward the request to the power manager on the client's behalf. Clients should use only one of the above methods to vote, i.e., either via mips_bw or via dcvs_v2/dcvs_v3, but not both. Voting via dcvs_v2/dcvs_v3 does NOT cancel any previous vote done via mips_bw and vice versa. To switch between these methods, cancel any previous vote done via the other method before requesting. + +When target_corner = HAP_DCVS_VCORNER_DISABLE (no vote), DSP DCVS doesn't request any core or bus clocks at the time of the API call and it is the client's responsibility to vote for core and bus clocks using the HAP_power_set_mips_bw request type. + +If enabled (> HAP_DCVS_VCORNER_DISABLE), DSP DCVS logic will pick the highest available frequency plan for both core and bus clocks at the given voltage corner and request these clock frequencies synchronously in the API context on the client's behalf. When the HAP_power_set API returns with success, the core and bus clock frequencies will have been set by DSP DCVS on a valid target_corner request. +
min_corner Type: HAP_dcvs_voltage_corner_t. + +If disabled (min_corner == HAP_DCVS_VCORNER_DISABLE), the lower threshold/minimum value to which DCVS can correct the clock remains unchanged. If enabled (> HAP_DCVS_VCORNER_DISABLE), DSP DCVS picks the lowest core clock frequency at the given voltage corner and uses it as the lower threshold/minimum value that DCVS can correct the clock to, irrespective of the dcvs_option selected. + +min_corner should always be less than or equal to target_corner and max_corner unless they are disabled (HAP_DCVS_VCORNER_DISABLE). + +For clients requesting dcvs_enable as FALSE and using target_corner, min_corner should be equal to target_corner. + +
max_corner Type: HAP_dcvs_voltage_corner_t. + +If disabled (max_corner == HAP_DCVS_VCORNER_DISABLE), the upper threshold/maximum value to which DCVS can correct the clock remains unchanged; typically, that would be HAP_DCVS_VCORNER_MAX in this case. If enabled (> HAP_DCVS_VCORNER_DISABLE), DSP DCVS picks the highest core and bus clock frequencies at the given voltage corner and uses them as the upper threshold/maximum value that DCVS can correct the clocks to, irrespective of the dcvs_option selected. + +DSP DCVS logic overrides the max_corner vote from a client to MAX in the presence of a concurrency. Concurrency is defined as a scenario where two or more dynamically loaded FastRPC clients are active, or Audio/Voice sessions are active with an MPPS load greater than a pre-defined threshold. + +max_corner should always be greater than or equal to the target_corner and min_corner votes, or should be disabled (HAP_DCVS_VCORNER_DISABLE). + +
param1 Type: HAP_dcvs_voltage_corner_t. + +NOTE: Set this option to HAP_DCVS_VCORNER_DISABLE unless required. + +This parameter allows the user to set the CPU L3 clock frequency to the requested corner. It is valid only on the CDSP subsystem in targets with CPU L3 cache and IO-coherency enabled (SDM845, SDM710, SM8150...), and ignored elsewhere. On CDSP, based on the requested target_corner, the CPU L3 clock vote from CDSP is set to a balanced level (with minimal power impact) to start with, and DCVS (if enabled) increases the vote based on need to attain higher performance. This option is useful to peg the CPU L3 clock at a higher level (at the cost of higher power) than that of the default balanced vote and that of the DCVS algorithm votes. This option is for advanced users and should be configured to the default (HAP_DCVS_VCORNER_DISABLE) unless there is a need to explicitly set the CPU L3 clock frequency based on performance and power analysis/characterization. + +
param2 Reserved. +
param3 Reserved. +
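+ +As an illustration of the param1 option, here is a minimal sketch (a hypothetical configuration, not taken from the SDK examples; ctx is a unique identifier, explained [here](#usage)) that pegs the CPU L3 clock at NOM alongside a NOM target corner: +~~~{.c} +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated. +request.type = HAP_power_set_DCVS_v2; +request.dcvs_v2.dcvs_enable = TRUE; +request.dcvs_v2.dcvs_option = HAP_DCVS_V2_POWER_SAVER_MODE; +request.dcvs_v2.set_dcvs_params = TRUE; +request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_SVS; +request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_TURBO; +request.dcvs_v2.dcvs_params.target_corner = HAP_DCVS_VCORNER_NOM; +/* Advanced option (illustrative value): peg the CPU L3 clock at NOM */ +request.dcvs_v2.dcvs_params.param1 = HAP_DCVS_VCORNER_NOM; +retVal = HAP_power_set(ctx, &request); +~~~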
+ +####Clock frequency level selection at given target corner +By default, DCVS picks the highest available frequency for a given core/bus clock target corner. On the latest chipsets (released after Palima), APIs are added to allow the user to specify the frequency level (highest/lowest) for a given core/bus clock target corner. See [HAP_dcvs.h](../../doxygen/HAP_dcvs/index.html) for more information. + +####DCVS vote aggregation logic in case of concurrency +The following explains the aggregation logic for min and target corner votes when there are multiple requesting clients: +~~~{.c} +DCVS min_corner vote = MAX (min_corner vote client 1, client 2, ...) +DCVS target_corner vote = MAX (target_corner vote client 1, client 2, ...) +~~~ +The following scenarios are treated as a concurrency in the DCVS vote aggregation logic, where the DCVS max corner vote is set to MAX by DCVS: +* More than one active HAP client, with or without active Audio/Voice clients. +* One active HAP client and active Audio/Voice clients with an MPPS load greater than a pre-defined threshold. +~~~{.c} + DCVS max_corner vote = HAP_DCVS_VCORNER_MAX +~~~ + +Note that DCVS overrides the client's max corner vote to MAX to accommodate any concurrency requirement. A max corner vote of MAX doesn't necessarily mean that DCVS will push the clocks to the MAX corner; the max corner vote just sets the upper threshold for the DCVS vote logic. DCVS will only bump up the clocks on a need basis, per the selected DCVS option. + +####Sleep Disable {#sleep_disable} +The 'set_sleep_disable' and 'sleep_disable' parameters of the dcvs_v3 structure enable the user to control low-power mode (LPM) selection in the DSP. + +In general, applications are expected to vote for their latency tolerance via the [latency](#sleep-latency) parameter in the dcvs_v3/dcvs_v2 options. The aggregated latency vote across clients is used in selecting the appropriate low-power mode (LPM) of the DSP subsystem. LPM saves power when the DSP subsystem is idle by reducing leakage current. Deeper LPMs typically have higher wake-up latencies, which will increase interrupt service delays and add to inter-processor communication latencies. Though the latency vote controls the selection of low-power modes, the vote required for disabling/allowing certain LPMs is difficult to calculate, as the wakeup latency associated with these LPMs can change from chipset to chipset and between runs within the same chipset. + +The 'sleep_disable' parameter in dcvs_v3 allows the user to directly prevent certain LPM levels of the DSP subsystem. By default, there is no restriction placed on LPMs, i.e., all the LPMs are enabled and the aggregated latency vote (along with other system parameters) is used in LPM selection. The 'sleep_disable' parameter in dcvs_v3 is for advanced developers who would like to disable certain low-power modes explicitly irrespective of the latency vote. Developers need to consider their power-performance tradeoff requirements and, if necessary, profile the results before voting using this parameter. Regular users should choose the default, i.e., 'HAP_DCVS_LPM_ENABLE_ALL'. + +If a particular LPM level is not supported on the DSP subsystem, the nearest shallower LPM level is enabled. For example, in the absence of 'HAP_DCVS_LPM_LEVEL3', 'HAP_DCVS_LPM_LEVEL2' is selected, which is the nearest shallower LPM level to 'HAP_DCVS_LPM_LEVEL3'. + +
set_sleep_disable FALSE No low-power mode request from the client. If FALSE, the sleep_disable field is ignored. +
TRUE The client's low-power mode request is valid and the desired option is provided in the sleep_disable field. +
sleep_disable HAP_DCVS_LPM_LEVEL1 To disable sleep/low-power modes. +
HAP_DCVS_LPM_LEVEL2 To enable only standalone APCR. +
HAP_DCVS_LPM_LEVEL3 To enable RPM assisted APCR. +
HAP_DCVS_LPM_ENABLE_ALL To enable all low-power modes (enables full power collapse). +
+ +***NOTE:*** Up to Palima, only HAP_DCVS_LPM_LEVEL1 and HAP_DCVS_LPM_ENABLE_ALL are supported. + +####Illustrations (DCVS_V2) +NOTE: +For a working example, refer to the `$HEXAGON_SDK_ROOT\examples\common\benchmark_v65` application; see benchmark_setClocks() in src_dsp\benchmark_imp.c + +1. Requirement: Enable DCVS in PERFORMANCE mode, set sleep latency to 1000 microseconds, vote NOM in Target with SVS as Min and TURBO as Max. +~~~{.c} +//Vote + +/* Populate request structure */ +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated. +request.type = HAP_power_set_DCVS_v2; +request.dcvs_v2.dcvs_enable = TRUE; +request.dcvs_v2.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; +request.dcvs_v2.set_latency = TRUE; +request.dcvs_v2.latency = 1000; +request.dcvs_v2.set_dcvs_params = TRUE; +request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_SVS; +request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_TURBO; +request.dcvs_v2.dcvs_params.target_corner = HAP_DCVS_VCORNER_NOM; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +... +/* + * Processing block + */ +... +//To remove the vote +memset(&request, 0, sizeof(HAP_power_request_t)); //Remove all votes. +request.type = HAP_power_set_DCVS_v2; +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +~~~ + +2. Requirement: Disable DCVS; do NOT vote for any corners/latency +~~~{.c} +//Vote + +/* Populate request structure */ +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated. +request.type = HAP_power_set_DCVS_v2; +request.dcvs_v2.dcvs_enable = FALSE; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +~~~ + +3. Requirement: Enable DCVS in Power saver mode. Do NOT vote for any target corner/latency, but set MIN and MAX thresholds for DCVS to SVS and TURBO respectively. Clock voting will be done via the HAP_power_set_mips_bw request. +~~~{.c} +//Vote + +/* Populate request structure with dcvs_v2 request*/ +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated. +request.type = HAP_power_set_DCVS_v2; +request.dcvs_v2.dcvs_enable = TRUE; +request.dcvs_v2.dcvs_option = HAP_DCVS_V2_POWER_SAVER_MODE; +request.dcvs_v2.set_dcvs_params = TRUE; +request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_SVS; +request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_TURBO; +request.dcvs_v2.dcvs_params.target_corner = HAP_DCVS_VCORNER_DISABLE; //no vote +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage).
*/ +retVal = HAP_power_set(ctx, &request); +/* Populate (reuse) request structure with mips_bw request */ +memset(&request, 0, sizeof(HAP_power_request_t)); +request.type = HAP_power_set_mips_bw; +request.mips_bw.set_mips = TRUE; +request.mips_bw.mipsPerThread = 150; +request.mips_bw.mipsTotal = 600; +request.mips_bw.set_bus_bw = TRUE; +request.mips_bw.bwBytePerSec = 10*1000*1000; +request.mips_bw.busbwUsagePercentage = 50; +request.mips_bw.set_latency = TRUE; +request.mips_bw.latency = 1000; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); // Core and bus clocks will be set by this request. +... +/* + * Processing block + */ +... +//To remove the dcvs_v2 vote +memset(&request, 0, sizeof(HAP_power_request_t)); //Remove all votes. +request.type = HAP_power_set_DCVS_v2; +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +//To remove the mips_bw vote +memset(&request, 0, sizeof(HAP_power_request_t)); //Remove all votes +request.type = HAP_power_set_mips_bw; +request.mips_bw.set_mips = TRUE; +request.mips_bw.set_bus_bw = TRUE; +request.mips_bw.set_latency = TRUE; +request.mips_bw.latency = 65535; +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +~~~ + +4. Requirement: Enable DCVS in DUTY CYCLE mode, vote TURBO in Target with SVS as Min. +~~~{.c} +//Vote + +/* Populate request structure */ +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated. +request.type = HAP_power_set_DCVS_v2; +request.dcvs_v2.dcvs_enable = TRUE; +request.dcvs_v2.dcvs_option = HAP_DCVS_V2_DUTY_CYCLE_MODE; +request.dcvs_v2.set_latency = TRUE; +request.dcvs_v2.latency = 1000; +request.dcvs_v2.set_dcvs_params = TRUE; +request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_SVS; +request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_TURBO; +request.dcvs_v2.dcvs_params.target_corner = HAP_DCVS_VCORNER_TURBO; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +... +/* + * Processing block + */ +... +//To remove the vote +memset(&request, 0, sizeof(HAP_power_request_t)); //Remove all votes. +request.type = HAP_power_set_DCVS_v2; +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +~~~ + +####Illustrations (DCVS_V3) + + +1. Requirement: Enable DCVS in POWER SAVER mode, set sleep latency to 1000 microseconds, vote NOM in Target with SVS as Min and TURBO as Max for the core clock, vote TURBO in Target with NOM as Min and TURBO PLUS as Max for the bus clock. Later change the bus clock vote to SVS_PLUS in Target with SVS as Min and NOM as Max. +~~~{.c} +//Vote + +/* Populate request structure */ +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated.
+request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_dcvs_enable = TRUE; +request.dcvs_v3.dcvs_enable = TRUE; +request.dcvs_v3.dcvs_option = HAP_DCVS_V2_POWER_SAVER_MODE; +request.dcvs_v3.set_latency = TRUE; +request.dcvs_v3.latency = 1000; +request.dcvs_v3.set_core_params = TRUE; +request.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_SVS; +request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_TURBO; +request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_NOM; +request.dcvs_v3.set_bus_params = TRUE; +request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_NOM; +request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_TURBO_PLUS; +request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_TURBO; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +... +/* + * Processing block 1 + */ +... +//To update bus clock votes while keeping core clock and other parameters of dcvs_v3 request intact. +memset(&request, 0, sizeof(HAP_power_request_t)); +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_bus_params = TRUE; +request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_SVS; +request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_NOM; +request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_SVS_PLUS; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +... +/* + * Processing block 2 + */ +... +//To remove the vote +memset(&request, 0, sizeof(HAP_power_request_t)); +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_dcvs_enable = TRUE; +request.dcvs_v3.set_latency = TRUE; +request.dcvs_v3.latency = 65535; +request.dcvs_v3.set_core_params = TRUE; +request.dcvs_v3.set_bus_params = TRUE; +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +~~~ + +2. Requirement: Enable DCVS in PERFORMANCE mode, vote TURBO in Target with NOM as Min and TURBO PLUS as Max for the core clock, do NOT vote for latency or the bus clock. +~~~{.c} +//Vote + +/* Populate request structure */ +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated. +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_dcvs_enable = TRUE; +request.dcvs_v3.dcvs_enable = TRUE; +request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; +request.dcvs_v3.set_core_params = TRUE; +request.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_NOM; +request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_TURBO_PLUS; +request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_TURBO; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +... +/* + * Processing block + */ +... +//To remove the vote +memset(&request, 0, sizeof(HAP_power_request_t)); +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_dcvs_enable = TRUE; +request.dcvs_v3.set_core_params = TRUE; +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +~~~ + +3. Requirement: Disable DCVS; do NOT vote for any corners/latency.
+~~~{.c} +//Vote + +/* Populate request structure */ +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated. +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_dcvs_enable = TRUE; +request.dcvs_v3.dcvs_enable = FALSE; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +~~~ + +4. Requirement: Disable sleep (all low power modes) and re-enable it after task completion. +~~~{.c} +//Vote + +/* Populate request structure */ +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated. +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_sleep_disable = TRUE; +request.dcvs_v3.sleep_disable = HAP_DCVS_LPM_LEVEL1; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +... +/* + * Processing block + */ +... +//To re-enable sleep. +memset(&request, 0, sizeof(HAP_power_request_t)); +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_sleep_disable = TRUE; +request.dcvs_v3.sleep_disable = HAP_DCVS_LPM_ENABLE_ALL; +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +~~~ + +5. Requirement: Enable DCVS in PERFORMANCE mode. Do NOT vote for any target corner/latency, but set MIN and MAX DCVS thresholds for the core clock to NOM and TURBO respectively, and set MIN and MAX DCVS thresholds for the bus clock to SVS and NOM respectively. Clock voting will be done via the HAP_power_set_mips_bw request. +~~~{.c} +//Vote + +/* Populate request structure with dcvs_v3 request*/ +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated. +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_dcvs_enable = TRUE; +request.dcvs_v3.dcvs_enable = TRUE; +request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; +request.dcvs_v3.set_core_params = TRUE; +request.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_NOM; +request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_TURBO; +request.dcvs_v3.set_bus_params = TRUE; +request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_SVS; +request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_NOM; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +/* Populate (reuse) request structure with mips_bw request */ +memset(&request, 0, sizeof(HAP_power_request_t)); +request.type = HAP_power_set_mips_bw; +request.mips_bw.set_mips = TRUE; +request.mips_bw.mipsPerThread = 150; +request.mips_bw.mipsTotal = 600; +request.mips_bw.set_bus_bw = TRUE; +request.mips_bw.bwBytePerSec = 10*1000*1000; +request.mips_bw.busbwUsagePercentage = 50; +request.mips_bw.set_latency = TRUE; +request.mips_bw.latency = 1000; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); // Core and bus clocks will be set by this request. +... +/* + * Processing block + */ +...
+//To remove the dcvs_v3 vote +memset(&request, 0, sizeof(HAP_power_request_t)); +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_dcvs_enable = TRUE; +request.dcvs_v3.set_core_params = TRUE; +request.dcvs_v3.set_bus_params = TRUE; +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +//To remove the mips_bw vote +memset(&request, 0, sizeof(HAP_power_request_t)); //Remove all votes +request.type = HAP_power_set_mips_bw; +request.mips_bw.set_mips = TRUE; +request.mips_bw.set_bus_bw = TRUE; +request.mips_bw.set_latency = TRUE; +request.mips_bw.latency = 65535; +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +~~~ + +6. Requirement: Use wrapper APIs to: Enable DCVS in POWER SAVER AGGRESSIVE mode, set sleep latency to 1000 microseconds, vote NOM in Target with SVS as Min and TURBO as Max for the core clock, vote TURBO in Target with NOM as Min and TURBO PLUS as Max for the bus clock. +~~~{.c} +//Vote + +/* Populate request structure */ +int retVal; +HAP_power_request_t request; +HAP_power_set_dcvs_v3_init(&request); +retVal = HAP_power_set_dcvs_option(NULL, TRUE, HAP_DCVS_V2_POWER_SAVER_AGGRESSIVE_MODE); +retVal = HAP_power_set_sleep_latency(NULL, 1000); +retVal = HAP_power_set_core_corner(NULL, HAP_DCVS_VCORNER_NOM, HAP_DCVS_VCORNER_SVS, HAP_DCVS_VCORNER_TURBO); +retVal = HAP_power_set_bus_corner(NULL, HAP_DCVS_VCORNER_TURBO, HAP_DCVS_VCORNER_NOM, HAP_DCVS_VCORNER_TURBO_PLUS); +... +/* + * Processing block + */ +... +//To remove the vote +HAP_power_set_dcvs_v3_init(&request); +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +~~~ + +7. Requirement: Enable DCVS in DUTY CYCLE mode, vote TURBO_PLUS in Target with SVS as Min for the core and bus clocks. +~~~{.c} +//Vote + +/* Populate request structure */ +int retVal; +HAP_power_request_t request; +memset(&request, 0, sizeof(HAP_power_request_t)); //Important to clear the structure if only selected fields are updated. +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_dcvs_enable = TRUE; +request.dcvs_v3.dcvs_enable = TRUE; +request.dcvs_v3.dcvs_option = HAP_DCVS_V2_DUTY_CYCLE_MODE; +request.dcvs_v3.set_latency = TRUE; +request.dcvs_v3.latency = 1000; +request.dcvs_v3.set_core_params = TRUE; +request.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_SVS; +request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_TURBO_PLUS; +request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_TURBO_PLUS; +request.dcvs_v3.set_bus_params = TRUE; +request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_SVS; +request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_TURBO_PLUS; +request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_TURBO_PLUS; +/* Call HAP_power_set API with the updated request structure */ +/* ctx is a unique identifier, explained [here](#usage). */ +retVal = HAP_power_set(ctx, &request); +... +/* + * Processing block + */ +... +//To remove the vote +memset(&request, 0, sizeof(HAP_power_request_t)); +request.type = HAP_power_set_DCVS_v3; +request.dcvs_v3.set_dcvs_enable = TRUE; +request.dcvs_v3.set_latency = TRUE; +request.dcvs_v3.latency = 65535; +request.dcvs_v3.set_core_params = TRUE; +request.dcvs_v3.set_bus_params = TRUE; +/* ctx is a unique identifier, explained [here](#usage).
*/ +retVal = HAP_power_set(ctx, &request); +~~~ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_process.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_process.h new file mode 100755 index 0000000000000..a7aacf0287159 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_process.h @@ -0,0 +1,31 @@ +#ifndef HAP_PROCESS_H +#define HAP_PROCESS_H +/*============================================================================== + Copyright (c) 2024 Qualcomm Technologies Incorporated. + All Rights Reserved Qualcomm Technologies Proprietary + + Export of this technology or software is regulated by the U.S. + Government. Diversion contrary to U.S. law prohibited. +==============================================================================*/ + +/** @defgroup process_type Process type + * @{ + */ +/** Return values for HAP_get_pd_type + Returns any one of the below values depending on the type of PD spawned */ +enum process_type { + ROOT_PD = 0, + AUDIO_STATIC_PD = 1, + SENSOR_STATIC_PD = 2, + DYNAMIC_SIGNED_PD = 3, + DYNAMIC_UNSIGNED_PD = 4, + DYNAMIC_CPZ_PD = 5, + SECURE_PD = 6, + DYNAMIC_SYS_UNSIGNED_PD = 7, + OIS_STATIC_PD = 8, + MAX_PD_TYPE = 9 /**< Maximum number of supported PD types */ +}; +/** + * @} // process_type + */ +#endif \ No newline at end of file diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_ps.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_ps.h new file mode 100755 index 0000000000000..89eac2a080350 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_ps.h @@ -0,0 +1,164 @@ +#ifndef HAP_PS_H +#define HAP_PS_H +/*============================================================================== + Copyright (c) 2012-2019,2024 Qualcomm Technologies Incorporated. + All Rights Reserved Qualcomm Technologies Proprietary + + Export of this technology or software is regulated by the U.S. + Government. Diversion contrary to U.S. law prohibited. +==============================================================================*/ + +#include "AEEStdDef.h" +#include "HAP_process.h" + +/** + * Maximum allowed remote process name length + */ +#define PROCESS_NAME_LEN 56 + + +/** @defgroup manage_dynamic_list Manage Dynamic List. + * @{ + */ + +typedef struct HAP_process HAP_process; +struct HAP_process { + char name[PROCESS_NAME_LEN]; + int32 asid; + int32 hlos_pid; +}; + +/** + * Get list of active processes + * @param[out] num_processes : Number of active processes + * @param[out] processes : Pointer to the list of processes + * @return 0 on success, valid non-zero error code on failure + */ +int HAP_get_process_list(uint32* num_processes, HAP_process** processes); + +/** + * Add new entry to process list + * @param[in] process : Pointer to node to be added to the process list + * @return 0 on success, valid non-zero error code on failure + */ +int HAP_add_to_process_list(HAP_process* process); + +/** + * Remove entry from process list + * @param[in] hlos_pid : HLOS process ID of entry to be removed from the process list + * @return 0 on success, valid non-zero error code on failure + */ +int HAP_remove_from_process_list(int hlos_pid); + +/** + * Set name of current process + * @param[in] name : Name of process + * @return 0 on success, valid non-zero error code on failure + */ +int HAP_set_process_name(char *name); + +/** + * API deprecated from SM8150 onwards. 
*/ +int HAP_thread_migrate(int tidQ); + +/** + * @} + */ + + +/** @defgroup early_wakeup Signal early wakeup + * @{ + */ + + +/** Send signal to CPU for early wake up + * + * Send signal to CPU for early wake up with approximate time to complete the job. + * This signal helps to reduce FastRPC latency. + * + * Args: + * @param[in] tidQ : QuRT thread id of a skel invoke thread. Use qurt_thread_get_id() + * to retrieve the thread ID. + * @param[in] earlyWakeTime : approximate time (in us) to complete job after sending the signal + * Returns: 0 on success, valid non-zero error code on failure + */ +int HAP_send_early_signal(uint32_t tidQ, uint32_t earlyWakeTime); + +/** + * API deprecated from Lahaina onwards. Use HAP_send_early_signal() instead + */ +int fastrpc_send_early_signal(uint32_t tidQ, uint32_t earlyWakeTime); + +/** + * @} + */ + + + +/** @defgroup thread_priority_ceiling Enquire thread priority ceiling + * @{ + */ + + +/** Return the ceiling thread priority for the current process + * + * Return the thread priority ceiling for the current process. QuRT thread priorities + * run from 1 to 255, with 1 being the highest. Unprivileged user processes will + * have a ceiling priority of 64. + * + * Args: None + * Returns: Thread priority ceiling value (between 1 and 255) on success, -1 on failure + */ +int HAP_get_thread_priority_ceiling(void); + +/** + * Identifies the HAP request user pd parameters type + * @param HAP_req_get_orig_apps_pid : Returns the process original apps pid. + */ +typedef enum { + HAP_req_get_orig_apps_pid = 1, +} HAP_req_userpd_params_type; + +/** + * Data type to get requested value from the DSP + * @param type : Identifies the request type. + * @param orig_apps_pid : Returns the process original apps pid. + */ +typedef struct { + HAP_req_userpd_params_type type; + union { + int orig_apps_pid; + }; +} HAP_req_userpd_params_t; + +/** + * Method to retrieve user process values from the DSP. This API is supported from SM8750 onwards. + * @param [in] request : Request params. + * @return Returns 0 for success, error code on failure.
+ */ +int HAP_get_userpd_params(HAP_req_userpd_params_t *request); + +/** + * @} + */ + +/** @defgroup HAP_get_pd_type Query the PD type of the process + * @{ + */ + + +/** Function to get PD type of the spawned process + * + * Args: + * @param[out] pd_type : Pointer to enum process_type to get PD type + * @return 0 on success, valid non-zero error code on failure + */ +int HAP_get_pd_type(enum process_type* pd_type); + +/** + * @} + */ + +#endif /*HAP_PS_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_ps.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_ps.md new file mode 100755 index 0000000000000..eed59f7adb3ce --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_ps.md @@ -0,0 +1,33 @@ +# Introduction {#intro} + +These APIs allow a user to perform the following actions: +* Manage the dynamic list of processes running on the current DSP +* Send a wakeup call to the CPU in order to decrease its response time upon returning from a FastRPC call +* Enquire about the thread priority ceiling for the current process + + +## API Overview {#api-overview} + +The HAP_ps.h APIs include the following functions: + +* ::HAP_get_process_list + +* ::HAP_add_to_process_list + +* ::HAP_remove_from_process_list + +* ::HAP_set_process_name + +* ::HAP_thread_migrate + +* ::HAP_send_early_signal + +* ::fastrpc_send_early_signal + +* ::HAP_get_thread_priority_ceiling + +* ::HAP_get_userpd_params + +* ::HAP_get_pd_type + +Header file: @b HAP_ps.h diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_traceme.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_traceme.h new file mode 100755 index 0000000000000..a46c56a835479 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_traceme.h @@ -0,0 +1,28 @@ +#ifndef HAP_TRACEME_H +#define HAP_TRACEME_H +/*============================================================================== + Copyright (c) 2012-2013 Qualcomm Technologies Incorporated. + All Rights Reserved Qualcomm Technologies Proprietary + + Export of this technology or software is regulated by the U.S. + Government. Diversion contrary to U.S. law prohibited. +==============================================================================*/ + +#include "AEEStdDef.h" +#include "HAP_debug.h" + +#if defined(_DEBUG) + +static __inline void HAP_traceme(void) +{ + (void)HAP_debug_ptrace(HAP_DEBUG_TRACEME, 0, 0, 0); +} + +#else /* #if defined(_DEBUG) */ + +#define HAP_traceme() + +#endif /* #if defined(_DEBUG) */ + +#endif /* #ifndef HAP_TRACEME_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_user_pmu.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_user_pmu.h new file mode 100755 index 0000000000000..cde225749d3c8 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_user_pmu.h @@ -0,0 +1,221 @@ +/*----------------------------------------------------------------------------- + Copyright (c) 2019-2020 QUALCOMM Technologies, Incorporated. + All Rights Reserved. + QUALCOMM Proprietary. +-----------------------------------------------------------------------------*/ + +#ifndef HAP_USER_PMU_H_ +#define HAP_USER_PMU_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @file HAP_user_pmu.h + * @brief HAP user PMU API + */ + +/** @defgroup Constants constants + * @{ + */ + +/** Error value for unsupported APIs. */ +#define HAP_USER_PMU_READ_NOT_SUPPORTED 0x80000FFF + +/** Error value for PMU read failure. 
*/ +#define HAP_USER_PMU_READ_FAILED 0xDEADDEAD + +/** @} + */ + +/** @defgroup Types Data types + * @{ + */ + +/** + * Input parameter type used when a group of PMU events must be read via + * HAP_register_pmu_group(), HAP_read_pmu_group() and HAP_deregister_pmu_group(). + + * The user must fill in the pmu_events[] array field of this structure with the + * specified PMU events to track and update the num_events field with the number + * of events to track. Only four unique PMU events can be tracked. + */ +typedef struct { + int contextId; + /**< Return value after registering the PMU group via HAP_register_pmu_group. */ + + unsigned int num_events; + /**< Input parameter specifying the number of PMU events to register.*/ + + unsigned short pmu_events[4]; + /**< Input parameter specifying the list of PMU events to register.*/ + + unsigned int pmu_value[4]; + /**< Output parameter containing values of PMU events registered. */ +} HAP_pmu_group_config_t; + +/** @} + */ + +/** + * @cond DEV + */ +int __attribute__((weak)) __HAP_register_pmu_group(HAP_pmu_group_config_t* pmu_config); +int __attribute__((weak)) __HAP_deregister_pmu_group(int contextId); +int __attribute__((weak)) __HAP_read_pmu_group(HAP_pmu_group_config_t* pmu_config); +int __attribute__((weak)) __HAP_register_pmu_event(unsigned short pmu_event); +int __attribute__((weak)) __HAP_deregister_pmu_event(unsigned short pmu_event); +unsigned int __attribute__((weak)) __HAP_read_pmu_event(unsigned short pmu_event); + +/** + * @endcond + */ + +/** @defgroup GroupFunc API for reading a group of PMUs + * These APIs expose a way to register and read an array of PMU events + * (maximum of four PMU events) by using the #HAP_pmu_group_config_t structure. + * Alternatively, the user can use a different set of APIs explained in the next + * section to configure and read a single PMU event. + * @{ + */ + +/** + * Registers a group of PMU events to read. + * + * Call this function from the DSP user process to register a set of PMU events + * (maximum of four) for tracking. Fill in the pmu_events[] array field of + * @p pmu_config with the specified PMU events to track (maximum of four) and + * update the num_events field of @p pmu_config with the number of PMU events + * written into the pmu_events[] array. + * + * @param pmu_config Pointer to HAP_pmu_group_config_t structure with + * pmu_events[] array and num_events fields updated. + * + * @return 0 upon success. Updates the contextId field of @p pmu_config. + * @par + * The same pmu_config structure should be used for reading the PMU + * counter values #HAP_read_pmu_group() corresponding to the + * configured events and for de-registration #HAP_deregister_pmu_group(). + */ +static inline int HAP_register_pmu_group(HAP_pmu_group_config_t* pmu_config) { + if(__HAP_register_pmu_group) + return __HAP_register_pmu_group(pmu_config); + + return HAP_USER_PMU_READ_NOT_SUPPORTED; +} + +/** + * Reads the PMU values of registered PMU events. + * + * Call this function after successfully calling HAP_register_pmu_group() with the + * same structure pointer, @p pmu_config. + * This API uses the context_id field of the input @p pmu_config + * structure, which is set in a successful HAP_register_pmu_group(). + * + * @param pmu_config Pointer to the #HAP_pmu_group_config_t structure used in + * #HAP_register_pmu_group() call. + * @return + * 0 upon success. Updates the pmu_value[] array corresponding to the + * configured pmu_events[] in the structure pointed to by @p pmu_config.
+ * pmu_value[x] is updated to HAP_USER_PMU_READ_FAILED if the corresponding pmu_event[x] + * configuration has failed or is invalid. + * @par + * Other values upon failure. \n + * @par + * #HAP_USER_PMU_READ_NOT_SUPPORTED when unsupported. + */ +static inline int HAP_read_pmu_group(HAP_pmu_group_config_t* pmu_config) { + if(__HAP_read_pmu_group) + return __HAP_read_pmu_group(pmu_config); + + return HAP_USER_PMU_READ_NOT_SUPPORTED; +} + +/** + * De-registers a group of PMU events registered via HAP_register_pmu_group(). + * + * @param pmu_config Pointer to the #HAP_pmu_group_config_t structure used in the + * HAP_register_pmu_group() call. + + * @return + * 0 upon success. \n + * Other values upon failure. + */ +static inline int HAP_deregister_pmu_group(HAP_pmu_group_config_t* pmu_config) { + if(__HAP_deregister_pmu_group) + return __HAP_deregister_pmu_group(pmu_config->contextId); + + return HAP_USER_PMU_READ_NOT_SUPPORTED; +} + +/** + * @} + */ + +/** @defgroup singleFunc API for reading single PMU event + * These APIs allow the user to configure and read single PMU events. + * The PMU event is used as an input to the register, read, and de-register APIs. + * Up to four unique PMU event requests can be served. + * @{ + */ + +/** + * Registers a PMU event for read. + * + * @param pmu_event PMU event to register. + * + * @return + * 0 upon success. \n + * Other values upon failure. + */ +static inline int HAP_register_pmu_event(unsigned short pmu_event) { + if(__HAP_register_pmu_event) + return __HAP_register_pmu_event(pmu_event); + + return HAP_USER_PMU_READ_NOT_SUPPORTED; +} + +/** + * Reads the PMU event registered via HAP_register_pmu_event(). + * + * @param pmu_event PMU event to read. Should already be registered via + * HAP_register_pmu_event(). + * + * @return + * The value of the PMU counter corresponding to the pmu_event. \n + * - HAP_USER_PMU_READ_NOT_SUPPORTED -- API is unsupported. \n + * - HAP_USER_PMU_READ_FAILED -- The given @p pmu_event read fails. + */ +static inline unsigned int HAP_read_pmu_event(unsigned short pmu_event) { + if(__HAP_read_pmu_event) + return __HAP_read_pmu_event(pmu_event); + + return HAP_USER_PMU_READ_NOT_SUPPORTED; +} + +/** + * De-registers the PMU event registered via HAP_register_pmu_event(). + * + * @param pmu_event PMU event to de-register. It should already be registered + * via #HAP_register_pmu_event(). + * + * @return + * 0 upon success. \n + * Other values upon failure. \n + * HAP_USER_PMU_READ_NOT_SUPPORTED when not supported. + */ +static inline int HAP_deregister_pmu_event(unsigned short pmu_event) { + if(__HAP_deregister_pmu_event) + return __HAP_deregister_pmu_event(pmu_event); + + return HAP_USER_PMU_READ_NOT_SUPPORTED; +} + +/** @} + */ + +#ifdef __cplusplus +} +#endif +#endif /*HAP_USER_PMU_H_*/ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_user_pmu.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_user_pmu.md new file mode 100755 index 0000000000000..1deab8f3af81d --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_user_pmu.md @@ -0,0 +1,26 @@ +# Performance monitoring unit + +The DSP subsystem has a PMU (Performance Monitoring Unit) with counters to track +hardware events (called PMU events). The HAP PMU framework exposes a +set of APIs to read these PMU counters configured with specified PMU events. PMU +events are Hexagon DSP architecture specific and the most common PMU events are briefly +described in the Hexagon DSP architecture documentation.
+The [itrace](../../doxygen/itrace/index.html) library's header file `itrace_dsp_events_pmu.h` +provides a complete list of all available public PMU events alongside their descriptions. + +***NOTE:*** +* aDSP and cDSP DCVS relies on a set of PMU events to monitor DSP +statistics and make necessary decisions. Using these HAP APIs to register PMU +events results in DCVS no longer being able to track these events. This might +lead DCVS to make incorrect decisions. +* HAP PMU APIs only work on [debug-enabled](../../tools/sign.html#test-device) devices. + +The HAP PMU APIs are not accessible from unsigned PD. + +## Supported chipsets + +SM8250 and beyond + +## Framework APIs + +Header file: @b HAP_user_pmu.h diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_vtcm_mgr.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_vtcm_mgr.h new file mode 100755 index 0000000000000..dbd66f0bf88d6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_vtcm_mgr.h @@ -0,0 +1,214 @@ +/*----------------------------------------------------------------------------- + * Copyright (c) 2016-2020 Qualcomm Technologies, Inc. + * All Rights Reserved. + * Confidential and Proprietary - Qualcomm Technologies, Inc. +-----------------------------------------------------------------------------*/ + +#ifndef HAP_VTCM_MGR_H_ +#define HAP_VTCM_MGR_H_ + +#ifdef __cplusplus +extern "C" { +#endif + + +void* __attribute__((weak)) HAP_request_async_VTCM(unsigned int size, unsigned int single_page_flag, unsigned int timeout_us); + +/** + * @defgroup vtcmapi HAP VTCM manager API. + * This section describes the HAP VTCM manager API to allocate and release VTCM. + * @{ + */ + +/** + * @file HAP_vtcm_mgr.h + * @brief APIs used to allocate, release, and query Vector TCM (VTCM) memory. + * VTCM is a high-performance, tightly-coupled memory in the cDSP + * subsystem. It can be used for Hexagon Vector eXtensions (HVX) + * scatter/gather instructions, the Hexagon Matrix eXtension (HMX) engine + * (available in some cDSPs starting with Lahaina), or as high-performance + * scratch memory for other HVX workloads. + */ + +/** + * Request VTCM memory of a specified size and single page requirement. + * + * @param[in] size Size of the request in bytes. \n + * If (@p single_page_flag == 0), the size is aligned to 4 KB. \n + * If (@p single_page_flag == 1), the size is aligned to + * the closest possible page size: 4 KB, 16 KB, 64 KB, 256 KB, + * 1 MB, 4 MB, 16 MB. + * @param[in] single_page_flag Single page requirement for this allocation: + * 1 for single page requests, 0 otherwise. + * Single page requests are mandatory for + * scatter/gather operations because the operations + * must be contained within a single page of memory. + * (The memory region used by scatter/gather + * HVX instructions must reside in VTCM and cannot + * cross a page boundary). + * + * @return + * @c void* pointer to the allocated memory region on success. \n + * 0 on failure. + * + * @par Example + * @code + * // Request for a single page of 4000 bytes + * void *pVTCM = HAP_request_VTCM(4000, 1); + * if (0 != pVTCM) + * { + * // Allocation is successful. Try a release + * int result = HAP_release_VTCM(pVTCM); + * if (0 == result) + * { + * //Release successful + * } + * } + * @endcode + */ +void* HAP_request_VTCM(unsigned int size, unsigned int single_page_flag); + + /** + * Request VTCM memory of a specified size and single page requirement with a + * timeout option. + * + * This API can be used to wait for the provided timeout.
The calling thread is + * suspended until the requested VTCM memory is available or until the timeout, + * whichever happens first. + * + * @b NOTE: A deadlock might occur when calling this API if the same + * thread holds a part of, or the entire, VTCM memory prior to this call. + * This API is @a not supported from secure and CPZ PDs. + * + * @param[in] size Size of the request in bytes. \n + * If (@p single_page_flag == 0), the size is aligned to 4 KB. \n + * If (@p single_page_flag == 1), the size is aligned to + * the closest possible page size: 4 KB, 16 KB, 64 KB, 256 KB, + * 1 MB, 4 MB, 16 MB. + * @param[in] single_page_flag Single page requirement for this allocation: + * 1 for single page requests, 0 otherwise. + * Single page requests are mandatory for + * scatter/gather operations because the operations + * must be contained within a single page of memory. + * (The memory region used by scatter/gather + * instructions must reside in VTCM and cannot + * cross a page boundary). + * @param[in] timeout_us Timeout in microseconds. If the request is readily + * available, return success with a void pointer. If the + * request cannot be served, wait for the available VTCM + * memory until the timeout, or return failure on the + * timeout. This value must be greater than 200 for the + * timeout implementation to work; otherwise, it is treated + * like HAP_request_VTCM(). + * + * @return + * @c void* pointer to the allocated memory region on success. \n + * 0 on failure. + * + * @par Example + * @code + * // Request for a single page of 256 * 1024 bytes with + * // timeout set to 5 milliseconds + * void *pVTCM = HAP_request_async_VTCM(256 * 1024, 1, 5000); + * if (0 != pVTCM) + * { + * // Allocation is successful. Try a release + * int result = HAP_release_VTCM(pVTCM); + * if (0 == result) + * { + * //Release successful + * } + * } + * @endcode + */ +void* HAP_request_async_VTCM(unsigned int size, + unsigned int single_page_flag, + unsigned int timeout_us); + +/** + * Release a successful request for VTCM memory by providing the pointer + * to the previously allocated VTCM block. + * + * @param[in] pVA Pointer returned by a successful VTCM request call. + * + * @return + * @c int 0 on success. \n + * Non-zero on failure. + */ +int HAP_release_VTCM(void* pVA); + +/** + * Query for the VTCM size defined on target. + * + * @param[out] page_size Pointer to an @c unsigned @c int variable. + * If this parameter is non-zero on success, the memory + * location contains the maximum possible page size + * allocation (in bytes) in VTCM. + * @param[out] page_count Pointer to an @c unsigned @c int variable. + * If @p page_size is non-zero on success, the memory + * location contains the number of @p page_size + * blocks in VTCM. + * + * @return + * @c int 0 on success. \n + * Non-zero on failure. + * + * @par Example + * @code + * unsigned int page_size, page_count; + * if (0 == HAP_query_total_VTCM(&page_size, &page_count)) + * { + * // Query successful. + * // For SM8150 cDSP: + * // page_size will be 256 * 1024. + * // page_count will be 1. + * // VTCM memory defined for this chipset (256 KB) + * unsigned int total_vtcm = page_size * page_count; + * } + * @endcode + */ +int HAP_query_total_VTCM(unsigned int* page_size, unsigned int* page_count); + +/** + * API to query VTCM allocation status. + * + * @param[out] avail_block_size Pointer to an @c unsigned @c int variable.
+ * If this parameter is non-zero on success, the + * memory location contains the maximum contiguous + * memory chunk (in bytes) available in VTCM. + * @param[out] max_page_size Pointer to an @c unsigned @c int variable. + * If this parameter is non-zero, the memory location + * contains the maximum possible page size allocation + * (in bytes) in the available portion of VTCM. + * @param[out] num_pages Pointer to an @c unsigned @c int variable. + * If this parameter is non-zero on success, the memory + * location contains the number of @p max_page_size + * pages available in VTCM. + * + * @return + * @c int 0 on success. \n + * Non-zero on failure. + * + * @par Example + * @code + * unsigned int avail_block_size, max_page_size, num_pages; + * if (0 == HAP_query_avail_VTCM(&avail_block_size, &max_page_size, &num_pages)) + * { + * // Query successful. + * // Use avail_block_size, max_page_size, num_pages + * } + * @endcode + */ +int HAP_query_avail_VTCM(unsigned int* avail_block_size, + unsigned int* max_page_size, + unsigned int* num_pages); + +/** + * @} + */ + + +#ifdef __cplusplus +} +#endif + +#endif //HAP_VTCM_MGR_H_ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_vtcm_mgr.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_vtcm_mgr.md new file mode 100755 index 0000000000000..3d5542c05202a --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/HAP_vtcm_mgr.md @@ -0,0 +1,21 @@ +# VTCM manager + +Vector TCM (VTCM) is available on supported targets with cDSP. VTCM +is a high-performance, tightly-coupled memory in the cDSP subsystem that can be used +for Hexagon Vector eXtensions (HVX) scatter/gather instructions, the Hexagon Matrix +eXtension (HMX) (available in some cDSPs starting with Lahaina), or as +high-performance scratch memory for other HVX workloads. + +The VTCM manager exposes APIs from the `HAP_vtcm_mgr.h` file to allocate, free, and query the availability of VTCM. + +***NOTE:*** +Starting with Lahaina, use the [compute resource manager](../../doxygen/HAP_compute_res/index.html){target=_blank} API for VTCM allocations instead of this legacy VTCM manager API. The compute resource manager is expanded to provide user options to do the following: + +* Allocate other compute resources (including VTCM) +* Manage application IDs, which control VTCM partitions and privileges +* Send release callbacks, which can be invoked when a high priority client requires the resource +* Release and reacquire the same VTCM size and page configuration +* Request VTCM with granular sizes (minimum and maximum required) and specific page size requirements + +The VTCM manager API is restricted to allocate VTCM only from the +primary VTCM partition (if the partition is defined).
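+ +As a minimal usage sketch of this legacy API (the sizes here are illustrative only), a client can query the defined VTCM configuration, request a single page, and release it: +~~~{.c} +#include "HAP_vtcm_mgr.h" + +unsigned int page_size = 0, page_count = 0; +if (0 == HAP_query_total_VTCM(&page_size, &page_count)) +{ + // Request one single page (single page requests are required for scatter/gather) + void *pVTCM = HAP_request_VTCM(page_size, 1); + if (0 != pVTCM) + { + // ... use VTCM as HVX scratch or for scatter/gather ... + HAP_release_VTCM(pVTCM); + } +} +~~~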
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/adsp_mmap.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/adsp_mmap.h new file mode 100755 index 0000000000000..7e5fd438b0902 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/adsp_mmap.h @@ -0,0 +1,25 @@ +#ifndef ADSP_MMAP_H +#define ADSP_MMAP_H + +#ifdef __cplusplus +extern "C" { +#endif +#include "AEEStdDef.h" +/** + * @param buf, the buffer virtual address + * @param bufLen, the length + * @param flags, the flags it was mapped with, 0 by default + */ +int adsp_addref_mmap(void* buf, int bufLen, uint32 flags); + +/** + * @param buf, the buffer virtual address + * @param bufLen, the length + */ +int adsp_release_mmap(void* buf, int bufLen); + + +#ifdef __cplusplus +} +#endif +#endif// ADSP_MMAP_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/apps_mem.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/apps_mem.h new file mode 100755 index 0000000000000..bb9ed189aa41b --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/apps_mem.h @@ -0,0 +1,39 @@ +#ifndef _APPS_MEM_H +#define _APPS_MEM_H +#include "AEEStdDef.h" +#ifndef __QAIC_HEADER +#define __QAIC_HEADER(ff) ff +#endif //__QAIC_HEADER + +#ifndef __QAIC_HEADER_EXPORT +#define __QAIC_HEADER_EXPORT +#endif // __QAIC_HEADER_EXPORT + +#ifndef __QAIC_HEADER_ATTRIBUTE +#define __QAIC_HEADER_ATTRIBUTE +#endif // __QAIC_HEADER_ATTRIBUTE + +#ifndef __QAIC_IMPL +#define __QAIC_IMPL(ff) ff +#endif //__QAIC_IMPL + +#ifndef __QAIC_IMPL_EXPORT +#define __QAIC_IMPL_EXPORT +#endif // __QAIC_IMPL_EXPORT + +#ifndef __QAIC_IMPL_ATTRIBUTE +#define __QAIC_IMPL_ATTRIBUTE +#endif // __QAIC_IMPL_ATTRIBUTE +#ifdef __cplusplus +extern "C" { +#endif +__QAIC_HEADER_EXPORT int __QAIC_HEADER(apps_mem_request_map)(int heapid, uint32 ion_flags, uint32 rflags, uint32 vin, int32 len, uint32* vapps, uint32* vadsp) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(apps_mem_request_unmap)(uint32 vadsp, int32 len) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(apps_mem_request_map64)(int heapid, uint32 ion_flags, uint32 rflags, uint64 vin, int64 len, uint64* vapps, uint64* vadsp) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(apps_mem_request_unmap64)(uint64 vadsp, int64 len) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(apps_mem_share_map)(int fd, int size, uint64* vapps, uint64* vadsp) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(apps_mem_share_unmap)(uint64 vadsp, int size) __QAIC_HEADER_ATTRIBUTE; +#ifdef __cplusplus +} +#endif +#endif //_APPS_MEM_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/domain.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/domain.h new file mode 100755 index 0000000000000..62c6ecdadb0af --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/domain.h @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2021 QUALCOMM Technologies Inc. + * All Rights Reserved. + * Confidential and Proprietary - Qualcomm Technologies, Inc. + * + */ + +#include "remote.h" + +#ifdef _AUTO + #include "domain_auto.h" +#else + #include "domain_default.h" +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/domain_default.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/domain_default.h new file mode 100755 index 0000000000000..efb741af8faee --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/domain_default.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2021 QUALCOMM Technologies Inc. + * All Rights Reserved. + * Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ *
+ */
+
+#include "remote.h"
+
+domain supported_domains[] = {
+ {ADSP_DOMAIN_ID, ADSP_DOMAIN},
+ {MDSP_DOMAIN_ID, MDSP_DOMAIN},
+ {SDSP_DOMAIN_ID, SDSP_DOMAIN},
+ {CDSP_DOMAIN_ID, CDSP_DOMAIN},
+ {CDSP1_DOMAIN_ID, CDSP1_DOMAIN}
+};
+
+bool is_CDSP(int domain_id) {
+ return (domain_id == CDSP_DOMAIN_ID || domain_id == CDSP1_DOMAIN_ID);
+}
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/dspqueue.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/dspqueue.h
new file mode 100755
index 0000000000000..91a3a4b737b5e
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/dspqueue.h
@@ -0,0 +1,455 @@
+/*
+ Copyright (c) 2020 Qualcomm Technologies, Inc.
+ All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+*/
+
+
+/** @file
+ Asynchronous DSP Packet Queue API.
+*/
+
+#ifndef DSPQUEUE_H
+#define DSPQUEUE_H
+
+#include <AEEStdDef.h>
+#include <stdint.h>
+#include <stddef.h>
+
+
+/** @defgroup dspqueue_consts Asynchronous DSP Packet Queue API Constants
+ * @{
+ */
+
+/** Infinite timeout */
+#define DSPQUEUE_TIMEOUT_NONE 0xffffffff
+
+
+/**
+ * Packet flags. The flags are used as a bitfield in packet read/write operations.
+ */
+enum dspqueue_packet_flags {
+ DSPQUEUE_PACKET_FLAG_MESSAGE = 0x0001, /**< Packet contains a message */
+ DSPQUEUE_PACKET_FLAG_BUFFERS = 0x0002, /**< Packet contains buffer references */
+ DSPQUEUE_PACKET_FLAG_WAKEUP = 0x0004, /**< Early wakeup packet */
+ DSPQUEUE_PACKET_FLAG_DRIVER_READY = 0x0008, /**< Packet is ready for driver consumption. Currently unused. */
+ DSPQUEUE_PACKET_FLAG_USER_READY = 0x0010, /**< Packet is ready for userspace library consumption */
+ DSPQUEUE_PACKET_FLAG_RESERVED_ZERO = 0xffe0
+};
+
+/**
+ * Buffer flags. The flags are used in dspqueue_buffer.flags as a bitfield.
+ */
+enum dspqueue_buffer_flags {
+ /* 1 and 2 reserved */
+ DSPQUEUE_BUFFER_FLAG_REF = 0x00000004, /**< Add a reference to a previously mapped buffer */
+ DSPQUEUE_BUFFER_FLAG_DEREF = 0x00000008, /**< Remove a reference from a previously mapped buffer */
+ DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER = 0x00000010, /**< Flush buffer from sender caches */
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_SENDER = 0x00000020, /**< Invalidate buffer from sender caches */
+ DSPQUEUE_BUFFER_FLAG_FLUSH_RECIPIENT = 0x00000040, /**< Flush buffer from recipient caches */
+ DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT = 0x00000080, /**< Invalidate buffer from recipient caches */
+ DSPQUEUE_BUFFER_FLAG_RESERVED_ZERO = 0xffffff00
+};
+
+
+/**
+ * Statistics readable with dspqueue_get_stat()
+ */
+enum dspqueue_stat {
+ DSPQUEUE_STAT_READ_QUEUE_PACKETS = 1, /**< Number of packets in the read queue */
+ DSPQUEUE_STAT_READ_QUEUE_BYTES, /**< Number of bytes in the read queue */
+ DSPQUEUE_STAT_WRITE_QUEUE_PACKETS, /**< Number of packets in the write queue */
+ DSPQUEUE_STAT_WRITE_QUEUE_BYTES, /**< Number of bytes in the write queue */
+
+ DSPQUEUE_STAT_EARLY_WAKEUP_WAIT_TIME, /**< Total accumulated early wakeup wait time in microseconds.
+ Developers can use this value to tune their early wakeup
+ request timing; the target should be to have this value as
+ close to zero as possible while minimizing signaling latency.
+ For more information on tuning early wakeup requests, see the
+ "Performance Considerations" section in the main Hexagon SDK
+ "Asynchronous Packet Queue" document. */
+
+ DSPQUEUE_STAT_EARLY_WAKEUP_MISSES /**< Accumulated number of packets missed in the early wakeup loop.
+ Developers can use this value to tune their early wakeup
+ request timing.
If this value is above zero it indicates the + early wakeup request was sent too early and it expired before + the corresponding packet was received. + For more information on tuning early wakeup requests, see the + "Performance Considerations" section in the main Hexagon SDK + "Asynchronous Packet Queue" document. */ +}; + +/** @} + */ + + +/** @defgroup dspqueue_types Asynchronous DSP Packet Queue API Data Types + * @{ + */ + +struct dspqueue; +typedef struct dspqueue* dspqueue_t; /**< Queue handle */ + + +/** + * Buffer reference in a packet. + * The buffer must already be mapped to the DSP using the same file descriptor. + * The subsection of the buffer as specified by #offset and #size must fit + * entirely within the mapped buffer. + * Note that buffer references are tracked based on the buffer file descriptor, + * and taking/releasing a reference to a buffer applies to the entire buffer as + * mapped to the DSP, not just the subsection specified. + */ +struct dspqueue_buffer { + uint32_t fd; /**< Buffer file descriptor */ + uint32_t size; /**< Buffer size in bytes. The client can set this field + to zero when writing packets; in this case the + framework will set the field to the size of the + buffer as mapped. */ + uint32_t offset; /**< Offset within the buffer in bytes as allocated and mapped. + The virtual address #ptr includes the offset */ + uint32_t flags; /**< Buffer flags, see enum #dspqueue_buffer_flags */ + union { + void *ptr; /**< Buffer virtual address; NULL if not mapped in the local context */ + uint64_t address; + }; +}; + + +/** + * Callback function type for all queue callbacks + * + * @param queue Queue handle from dspqueue_create() / dspqueue_import() + * @param error Error code + * @param context Client-provided context pointer + */ +typedef void (*dspqueue_callback_t)(dspqueue_t queue, AEEResult error, void *context); + +/** @} + */ + + +#ifdef __cplusplus +extern "C" { +#endif + + +/** @defgroup dspqueue_funcs Asynchronous DSP Packet Queue API Functions + * @{ + */ + +/** + * Create a new queue to communicate with the DSP. Queues can only be + * created on the host CPU. + * + * @param [in] domain DSP to communicate with (CDSP_DOMAIN_ID in remote.h for cDSP) + * @param [in] flags Queue creation flags + * @param [in] req_queue_size Total request queue memory size in bytes; use 0 for system default + * @param [in] resp_queue_size Total response queue memory size in bytes; use 0 for system default + * @param [in] packet_callback Callback function called when there are new packets to read. + * The call will be done in a different thread's context. + * NULL to disable the callback. Clients cannot use blocking read + * calls if a packet callback has been set. + * @param [in] error_callback Callback function called on unrecoverable errors. NULL to disable. + * @param [in] callback_context Context pointer for callback functions + * @param [out] queue Queue handle + * + * @return 0 on success, error code on failure. + * - AEE_ENOMEMORY: Not enough memory available + * - AEE_EUNSUPPORTED: Message queue not supported on the given DSP + * - AEE_EBADPARM: Bad parameters, e.g. Invalid domain (use CDSP_DOMAIN_ID for cDSP), Too many queues open for the DSP in this process + * - AEE_ERPC: Internal RPC error, e.g. 
queue list corrupt
+ * - AEE_EBADSTATE: Bad internal state
+ */
+AEEResult dspqueue_create(int domain,
+ uint32_t flags,
+ uint32_t req_queue_size, uint32_t resp_queue_size,
+ dspqueue_callback_t packet_callback,
+ dspqueue_callback_t error_callback,
+ void *callback_context,
+ dspqueue_t *queue);
+
+/**
+ * Close a queue and free all memory associated with it. The
+ * function can be called on the host CPU with queue handles from
+ * dspqueue_create() or on the DSP with handles from
+ * dspqueue_import().
+ *
+ * @param [in] queue Queue handle from dspqueue_create() or dspqueue_import().
+ *
+ * @return 0 on success, error code on failure.
+ * - AEE_EBADPARM: Bad parameters, e.g. the queue is open on the DSP when attempting to close it on the host CPU
+ * - AEE_EBADSTATE: Bad internal state
+ */
+AEEResult dspqueue_close(dspqueue_t queue);
+
+/**
+ * Export a queue to the DSP. The CPU-side client calls this function
+ * and passes the resulting ID to the DSP, which can then call
+ * dspqueue_import() to access the queue.
+ *
+ * @param [in] queue Queue handle from dspqueue_create()
+ * @param [out] queue_id Queue ID
+ *
+ * @return 0 on success, error code on failure.
+ */
+AEEResult dspqueue_export(dspqueue_t queue, uint64_t *queue_id);
+
+/**
+ * Import a queue on the DSP based on an ID passed in from the host
+ * CPU. The DSP client can use the returned queue handle to access the
+ * queue and communicate with its host CPU counterpart.
+ *
+ * @param [in] queue_id Queue ID from dspqueue_export().
+ * @param [in] packet_callback Callback function called when there are new packets to read.
+ * The call will be done in a different thread's context.
+ * NULL to disable the callback.
+ * @param [in] error_callback Callback function called on unrecoverable errors. NULL to disable.
+ * @param [in] callback_context Context pointer for callback functions
+ * @param [out] queue Queue handle
+ *
+ * @return 0 on success, error code on failure.
+ * - AEE_EITEMBUSY: The queue has already been imported
+ * - AEE_EQURTTHREADCREATE: Unable to create callback thread; the system may have
+ * reached its thread limit.
+ * - AEE_EBADSTATE: Bad internal state
+ */
+AEEResult dspqueue_import(uint64_t queue_id,
+ dspqueue_callback_t packet_callback,
+ dspqueue_callback_t error_callback,
+ void *callback_context,
+ dspqueue_t *queue);
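+
+/*
+ * Illustrative sketch (not part of this header): a minimal DSP-side receive
+ * loop built on dspqueue_import() and the blocking dspqueue_read() declared
+ * below. Error handling is trimmed, and the queue ID is assumed to arrive
+ * through an application-defined FastRPC call.
+ *
+ *   dspqueue_t q;
+ *   if (0 == dspqueue_import(queue_id, NULL, NULL, NULL, &q)) {
+ *       uint32_t flags, nbufs, msglen;
+ *       struct dspqueue_buffer bufs[4];
+ *       uint8_t msg[64];
+ *       while (0 == dspqueue_read(q, &flags, 4, &nbufs, bufs,
+ *                                 sizeof(msg), &msglen, msg,
+ *                                 DSPQUEUE_TIMEOUT_NONE)) {
+ *           // Process the message and any buffer references here.
+ *       }
+ *       dspqueue_close(q);
+ *   }
+ */
+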
+/**
+ * Write a packet to a queue. This variant of the function will not
+ * block, and will instead return AEE_EWOULDBLOCK if the queue does not have
+ * enough space for the packet.
+ *
+ * With this function the client can pass separate pointers to the
+ * buffer references and message to include in the packet and the
+ * library copies the contents directly to the queue.
+ *
+ * @param [in] queue Queue handle from dspqueue_create() or dspqueue_import()
+ * @param [in] flags Packet flags. See enum #dspqueue_packet_flags
+ * @param [in] num_buffers Number of buffer references to insert to the packet;
+ * zero if there are no buffer references
+ * @param [in] buffers Pointer to buffer references
+ * @param [in] message_length Message length in bytes;
+ * zero if the packet contains no message
+ * @param [in] message Pointer to packet message
+ *
+ * @return 0 on success, error code on failure.
+ * - AEE_EWOULDBLOCK: The queue is full
+ * - AEE_EBADPARM: Bad parameters, e.g. buffers is NULL when num_buffers > 0, or
+ * the packet is too long to fit in the queue. The call will never succeed.
+ * - AEE_ENOSUCHMAP: Attempt to refer to an unmapped buffer. Buffers must be mapped to the DSP
+ * with fastrpc_mmap() before they can be used in queue packets.
+ * - AEE_EBADSTATE: Bad internal state
+ */
+AEEResult dspqueue_write_noblock(dspqueue_t queue, uint32_t flags,
+ uint32_t num_buffers, struct dspqueue_buffer *buffers,
+ uint32_t message_length, const uint8_t *message);
+
+/**
+ * Write a packet to a queue. If the queue is full this function will
+ * block until space becomes available or the request times out.
+ *
+ * With this function the client can pass separate pointers to the
+ * buffer references and message to include in the packet and the
+ * library copies the contents directly to the queue.
+ *
+ * @param [in] queue Queue handle from dspqueue_create() or dspqueue_import()
+ * @param [in] flags Packet flags. See enum #dspqueue_packet_flags
+ * @param [in] num_buffers Number of buffer references to insert to the packet;
+ * zero if there are no buffer references
+ * @param [in] buffers Pointer to buffer references
+ * @param [in] message_length Message length in bytes;
+ * zero if the packet contains no message
+ * @param [in] message Pointer to packet message
+ * @param [in] timeout_us Timeout in microseconds; use DSPQUEUE_TIMEOUT_NONE to
+ * block indefinitely until space is available or
+ * zero for non-blocking behavior.
+ *
+ * @return 0 on success, error code on failure.
+ * - AEE_EBADPARM: Bad parameters, e.g. buffers is NULL when num_buffers > 0, or
+ * the packet is too long to fit in the queue. The call will never succeed.
+ * - AEE_ENOSUCHMAP: Attempt to refer to an unmapped buffer. Buffers must be mapped to the DSP
+ * with fastrpc_mmap() before they can be used in queue packets.
+ * - AEE_EEXPIRED: Request timed out
+ * - AEE_EINTERRUPTED: The request was canceled
+ * - AEE_EBADSTATE: Bad internal state
+ */
+AEEResult dspqueue_write(dspqueue_t queue, uint32_t flags,
+ uint32_t num_buffers, struct dspqueue_buffer *buffers,
+ uint32_t message_length, const uint8_t *message,
+ uint32_t timeout_us);
+
+/**
+ * Read a packet from a queue. This variant of the function will not
+ * block, and will instead return AEE_EWOULDBLOCK if the queue has no
+ * packets available to read.
+ *
+ * This function will read packet contents directly into
+ * client-provided buffers. The buffers must be large enough to fit
+ * contents from the packet or the call will fail.
+ *
+ * @param [in] queue Queue handle from dspqueue_create() or dspqueue_import()
+ * @param [out] flags Packet flags. See enum #dspqueue_packet_flags
+ * @param [in] max_buffers The maximum number of buffer references that can fit in the "buffers" parameter
+ * @param [out] num_buffers The number of buffer references in the packet
+ * @param [out] buffers Buffer reference data from the packet
+ * @param [in] max_message_length Maximum message length that can fit in the "message" parameter
+ * @param [out] message_length Message length in bytes
+ * @param [out] message Packet message
+ *
+ * @return 0 on success, error code on failure.
+ * - AEE_EBADPARM: Bad parameters, e.g. the packet is too large to fit in the provided buffers
+ * - AEE_ENOSUCHMAP: The packet refers to an unmapped buffer. Buffers must be mapped to the DSP
+ * with fastrpc_mmap() before they can be used in queue packets.
+ * - AEE_EWOULDBLOCK: The queue is empty; try again later
+ * - AEE_EBADITEM: The queue contains a corrupted packet. Internal error.
+ * - AEE_EBADSTATE: Bad internal state + */ +AEEResult dspqueue_read_noblock(dspqueue_t queue, uint32_t *flags, + uint32_t max_buffers, uint32_t *num_buffers, struct dspqueue_buffer *buffers, + uint32_t max_message_length, uint32_t *message_length, uint8_t *message); + +/** + * Read a packet from a queue. If the queue is empty this function + * will block until a packet is available or the request times out. + * The queue must not have a packet callback set. + * + * This function will read packet contents directly into + * client-provided buffers. The buffers must be large enough to fit + * contents from the packet or the call will fail. + * + * @param [in] queue Queue handle from dspqueue_create() or dspqueue_import() + * @param [out] flags Packet flags. See enum #dspqueue_packet_flags + * @param [in] max_buffers The maximum number of buffer references that can fit in the "buffers" parameter + * @param [out] num_buffers The number of buffer references in the packet + * @param [out] buffers Buffer reference data from the packet + * @param [in] max_message_length Maximum message length that can fit in the "message" parameter + * @param [out] message_length Message length in bytes + * @param [out] message Packet message + * @param [in] timeout_us Timeout in microseconds; use DSPQUEUE_TIMEOUT_NONE to + * block indefinitely until a packet is available or + * zero for non-blocking behavior. + * + * @return 0 on success, error code on failure. + * - AEE_EBADPARM: Bad parameters, e.g. The packet is too large to fit in the provided buffers + * - AEE_ENOSUCHMAP: The packet refers to an unmapped buffer. Buffers must be mapped to the DSP + * with fastrpc_mmap() before they can be used in queue packets. + * - AEE_EBADITEM: The queue contains a corrupted packet. Internal error. + * - AEE_EBADSTATE: Bad internal state + * - AEE_EEXPIRED: Request timed out + * - AEE_EINTERRUPTED: The request was canceled + */ +AEEResult dspqueue_read(dspqueue_t queue, uint32_t *flags, + uint32_t max_buffers, uint32_t *num_buffers, struct dspqueue_buffer *buffers, + uint32_t max_message_length, uint32_t *message_length, uint8_t *message, + uint32_t timeout_us); + +/** + * Retrieve information for the next packet if available, without reading + * it from the queue and advancing the read pointer. This function + * will not block, but will instead return an error if the queue is + * empty. + * + * @param [in] queue Queue handle from dspqueue_create() or dspqueue_import(). + * @param [out] flags Packet flags. See enum #dspqueue_packet_flags + * @param [out] num_buffers Number of buffer references in packet + * @param [out] message_length Packet message length in bytes + * + * @return 0 on success, error code on failure. + * - AEE_EWOULDBLOCK: The queue is empty; try again later + * - AEE_EBADITEM: The queue contains a corrupted packet. Internal error. + * - AEE_EBADSTATE: Bad internal state + */ +AEEResult dspqueue_peek_noblock(dspqueue_t queue, uint32_t *flags, uint32_t *num_buffers, + uint32_t *message_length); + +/** + * Retrieve information for the next packet, without reading it from the + * queue and advancing the read pointer. If the queue is empty this + * function will block until a packet is available or the request + * times out. + * + * @param [in] queue Queue handle from dspqueue_create() or dspqueue_import(). + * @param [out] flags Packet flags. 
See enum #dspqueue_packet_flags
+ * @param [out] num_buffers Number of buffer references in packet
+ * @param [out] message_length Packet message length in bytes
+ * @param [in] timeout_us Timeout in microseconds; use DSPQUEUE_TIMEOUT_NONE to
+ * block indefinitely until a packet is available or
+ * zero for non-blocking behavior.
+ *
+ * @return 0 on success, error code on failure.
+ * - AEE_EEXPIRED: Request timed out
+ * - AEE_EINTERRUPTED: The request was canceled
+ * - AEE_EBADITEM: The queue contains a corrupted packet. Internal error.
+ * - AEE_EBADSTATE: Bad internal state
+ */
+AEEResult dspqueue_peek(dspqueue_t queue, uint32_t *flags, uint32_t *num_buffers,
+ uint32_t *message_length, uint32_t timeout_us);
+
+
+/**
+ * Write an early wakeup packet to the queue. Early wakeup packets are used
+ * to bring the recipient out of a low-power state in anticipation of a real
+ * message packet being available shortly, and are typically used from the DSP
+ * to signal that an operation is almost complete.
+ *
+ * This function will return immediately if the queue is full. There is no
+ * blocking variant of this function; if the queue is full the other endpoint
+ * should already be processing data and an early wakeup would not be useful.
+ *
+ *
+ * @param [in] queue Queue handle from dspqueue_create() or dspqueue_import()
+ * @param [in] wakeup_delay Wakeup time in microseconds; this indicates how soon
+ * the real message packet should be available. Zero if not known.
+ * The recipient can use this information to determine how to
+ * wait for the packet.
+ * @param [in] packet_flags Flags for the upcoming packet if known.
+ * The framework can use this information to optimize its
+ * behavior if the flags match the upcoming packet; if not known
+ * set to zero.
+ * See enum #dspqueue_packet_flags
+ *
+ * @return 0 on success, error code on failure.
+ * - AEE_EWOULDBLOCK: The queue is full
+ * - AEE_EBADSTATE: Bad internal state
+ */
+AEEResult dspqueue_write_early_wakeup_noblock(dspqueue_t queue, uint32_t wakeup_delay, uint32_t packet_flags);
+
+
+/**
+ * Retrieve statistics from a queue. Statistics are relative to the queue
+ * as viewed from the current endpoint (e.g. "read queue" refers to the
+ * queue as being read by the current endpoint).
+ *
+ * Reading an accumulating statistic (such as early wakeup wait time)
+ * will reset it to zero.
+ *
+ * Note that statistics values are only valid at the time when they're
+ * read. By the time this function returns the values may have
+ * changed due to actions from another thread or the other queue
+ * endpoint.
+ *
+ * @param [in] queue Queue handle from dspqueue_create() or dspqueue_import()
+ * @param [in] stat Statistic to read, see enum dspqueue_stat
+ * @param [out] value Statistic value. Reading a statistic will reset it to zero
+ *
+ * @return 0 on success, error code on failure.
+ * - AEE_EBADPARM: Invalid statistic
+ */
+
+AEEResult dspqueue_get_stat(dspqueue_t queue, enum dspqueue_stat stat, uint64_t *value);
+
+
+/** @}
+ */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //DSPQUEUE_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/dspqueue.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/dspqueue.md
new file mode 100755
index 0000000000000..3f1ff3206156b
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/dspqueue.md
@@ -0,0 +1,36 @@
+# Asynchronous DSP Packet Queue
+
+## API Overview {#api-overview}
+
+The Asynchronous DSP Packet Queue is accessed through a simple C
+API. Most of the API is identical on both the host CPU and DSP with
+the exception that queues can only be created on the CPU.
+
+* dspqueue_create(): Create a new queue. Queues can only be created
+  on the host CPU.
+
+* dspqueue_close(): Close a queue
+
+* dspqueue_export(): Export a queue on the host CPU, creating a
+  handle that can be used with dspqueue_import() on the DSP.
+
+* dspqueue_import(): Import a queue for use on the DSP, using a
+  handle returned from dspqueue_export() on the CPU.
+
+* dspqueue_write() / dspqueue_write_noblock(): Write a packet to a
+  queue. Writes can either block if the queue is full or return an
+  error (dspqueue_write_noblock()); blocking writes can optionally
+  have a timeout.
+
+* dspqueue_read() / dspqueue_read_noblock(): Read a packet from a
+  queue.
+
+* dspqueue_peek() / dspqueue_peek_noblock(): Retrieve information
+  about the next packet without consuming it.
+
+* dspqueue_write_early_wakeup_noblock(): Write an early wakeup packet to the
+  queue.
+
+* dspqueue_get_stat(): Retrieve queue statistics, including the number
+  of packets queued and statistics about early wakeup.
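+
+As an illustrative sketch (not taken from the SDK documentation), a host-side
+client typically creates and exports a queue, hands the ID to its DSP
+counterpart through an application-defined FastRPC call (`my_skel_set_queue()`
+and `handle` below are hypothetical), and then exchanges packets:
+
+    dspqueue_t q;
+    uint64_t qid;
+    // Default queue sizes, no callbacks: use blocking reads/writes.
+    if (0 == dspqueue_create(CDSP_DOMAIN_ID, 0, 0, 0,
+                             NULL, NULL, NULL, &q)) {
+        dspqueue_export(q, &qid);
+        my_skel_set_queue(handle, qid);  // hypothetical FastRPC call
+        // ... dspqueue_write() requests, dspqueue_read() responses ...
+        dspqueue_close(q);
+    }
+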
Most of the API is identical on both the host CPU and DSP with +the exception that queues can only be created on the CPU. + +* dspqueue_create(): Create a new queue. Queues can only be created + on the host CPU. + +* dspqueue_close(): Close a queue + +* dspqueue_export(): Export a queue on the host CPU, creating a + handle that be used with dspqueue_import() on the DSP. + +* dspqueue_import(): Import a queue for use on the DSP, using a + handle returned from dspqueue_export() on the CPU. + +* dspqueue_write() / dspqueue_write_noblock(): Write a packet to a + queue. Writes can either block if the queue is full or return an + error (dspqueue_write_noblock()); blocking writes can optionally + have a timeout. + +* dspqueue_read() / dspqueue_read_noblock(): Read a packet from a + queue. + +* dspqueue_peek() / dspqueue_peek_noblock(): Retrieve information + about the next packet without consuming it. + +* dspqueue_write_early_wakeup_noblock(): Write an early wakeup packet to the + queue. + +* dspqueue_get_stat(): Retrieve queue statistics, including the number + of packets queued and statistics about early wakeup. + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/dynsymbols.lst b/prebuilts/Hexagon_SDK/6.2.0.1/incs/dynsymbols.lst new file mode 100755 index 0000000000000..17663f8ddb672 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/dynsymbols.lst @@ -0,0 +1,1364 @@ +{ +___dladdr; +___dlclose; +___dlerror; +___dlopen; +___dlsym; +__assert; +__builtin_mr_assignment; +__builtin_pseudo_barrier; +__builtinfunction_bitrev_update1_for_load; +__builtinfunction_bitrev_update1_for_store; +__builtinfunction_bitrev_update2_for_load; +__builtinfunction_bitrev_update2_for_store; +__builtinfunction_bitrev_update4_for_load; +__builtinfunction_bitrev_update4_for_store; +__builtinfunction_bitrev_update8_for_load; +__builtinfunction_bitrev_update8_for_store; +__builtinfunction_bitreverse; +__builtinfunction_bitrevupdate; +__builtinfunction_circular_update1_for_load; +__builtinfunction_circular_update1_for_store; +__builtinfunction_circular_update1I_for_load; +__builtinfunction_circular_update1I_for_store; +__builtinfunction_circular_update2_for_load; +__builtinfunction_circular_update2_for_store; +__builtinfunction_circular_update2I_for_load; +__builtinfunction_circular_update2I_for_store; +__builtinfunction_circular_update4_for_load; +__builtinfunction_circular_update4_for_store; +__builtinfunction_circular_update4I_for_load; +__builtinfunction_circular_update4I_for_store; +__builtinfunction_circular_update8_for_load; +__builtinfunction_circular_update8_for_store; +__builtinfunction_circular_update8I_for_load; +__builtinfunction_circular_update8I_for_store; +__builtinfunction_circupdate; +__CTOR_END__; +__cxa_atexit; +__cxa_finalize; +__cxa_finalize_stub; +__cxa_guard_abort; +__cxa_guard_acquire; +__cxa_guard_release; +__cxa_pure_virtual; +__deallocframe; +__default_hash; +__deregister_frame_info_bases; +__divdc3; +__divsc3; +__divxc3; +__dladdr; +__dlclose; +__dlerror; +__dlopen; +__dlsym; +__dso_handle; +__DTOR_LIST__; +__eh_nodes; +__hexagon_adddf3; +__hexagon_addsf3; +__hexagon_cmpdf2; +__hexagon_cmpsf2; +__hexagon_cmpxdf2; +__hexagon_cmpxsf2; +__hexagon_divdf3; +__hexagon_divdi3; +__hexagon_divsf3; +__hexagon_divsi3; +__hexagon_eqdf2; +__hexagon_eqsf2; +__hexagon_extendsfdf2; +__hexagon_fast2_adddf3; +__hexagon_fast2_divdf3; +__hexagon_fast2_divsf3; +__hexagon_fast2_muldf3; +__hexagon_fast2_sqrt; +__hexagon_fast2_sqrtdf2; +__hexagon_fast2_sqrtf; +__hexagon_fast2_subdf3; +__hexagon_fast_adddf3; 
+__hexagon_fast_divdf3; +__hexagon_fast_divsf3; +__hexagon_fast_gtdf2; +__hexagon_fast_ltdf2; +__hexagon_fast_muldf3; +__hexagon_fast_negdf2; +__hexagon_fast_sqrt; +__hexagon_fast_sqrtdf2; +__hexagon_fast_sqrtf; +__hexagon_fast_subdf3; +__hexagon_fixdfdi; +__hexagon_fixdfsi; +__hexagon_fixdfti; +__hexagon_fixsfdi; +__hexagon_fixsfsi; +__hexagon_fixsfti; +__hexagon_fixunsdfdi; +__hexagon_fixunsdfsi; +__hexagon_fixunsdfti; +__hexagon_fixunssfdi; +__hexagon_fixunssfsi; +__hexagon_fixunssfti; +__hexagon_floatdidf; +__hexagon_floatdisf; +__hexagon_floatsidf; +__hexagon_floatsisf; +__hexagon_floattidf; +__hexagon_floattisf; +__hexagon_floatundidf; +__hexagon_floatundisf; +__hexagon_floatunsidf; +__hexagon_floatunsisf; +__hexagon_fmadf4; +__hexagon_gedf2; +__hexagon_gesf2; +__hexagon_gtdf2; +__hexagon_gtsf2; +__hexagon_ledf2; +__hexagon_lesf2; +__hexagon_ltdf2; +__hexagon_ltsf2; +__hexagon_maxdf3; +__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes; +__hexagon_mindf3; +__hexagon_moddi3; +__hexagon_modsi3; +__hexagon_muldf3; +__hexagon_mulsf3; +__hexagon_nedf2; +__hexagon_negdf2; +__hexagon_negsf2; +__hexagon_nesf2; +__hexagon_sqrt; +__hexagon_sqrtdf2; +__hexagon_sqrtf; +__hexagon_subdf3; +__hexagon_subsf3; +__hexagon_truncdfsf2; +__hexagon_udivdi3; +__hexagon_udivmoddi4; +__hexagon_udivmodsi4; +__hexagon_udivsi3; +__hexagon_umoddi3; +__hexagon_umodsi3; +__hexagon_unorddf2; +__hexagon_unordsf2; +__ieee754_j0; +__ieee754_j1; +__ieee754_jn; +__ieee754_log; +__ieee754_scalb; +__ieee754_y0; +__ieee754_y1; +__ieee754_yn; +__muldc3; +__mulsc3; +__mulxc3; +__qdsp_adddf3; +__qdsp_addsf3; +__qdsp_cmpdf2; +__qdsp_cmpsf2; +__qdsp_cmpxdf2; +__qdsp_cmpxsf2; +__qdsp_divdf3; +__qdsp_divdi3; +__qdsp_divsf3; +__qdsp_divsi3; +__qdsp_eqdf2; +__qdsp_eqsf2; +__qdsp_extendsfdf2; +__qdsp_fast_gtdf2; +__qdsp_fast_ltdf2; +__qdsp_fast_negdf2; +__qdsp_fixdfdi; +__qdsp_fixdfsi; +__qdsp_fixdfti; +__qdsp_fixsfdi; +__qdsp_fixsfsi; +__qdsp_fixsfti; +__qdsp_fixunsdfdi; +__qdsp_fixunsdfsi; +__qdsp_fixunsdfti; +__qdsp_fixunssfdi; +__qdsp_fixunssfsi; +__qdsp_fixunssfti; +__qdsp_floatdidf; +__qdsp_floatdisf; +__qdsp_floatsidf; +__qdsp_floatsisf; +__qdsp_floattidf; +__qdsp_floattisf; +__qdsp_floatundidf; +__qdsp_floatundisf; +__qdsp_floatunsidf; +__qdsp_floatunsisf; +__qdsp_fmadf5; +__qdsp_gedf2; +__qdsp_gesf2; +__qdsp_gtdf2; +__qdsp_gtsf2; +__qdsp_ledf2; +__qdsp_lesf2; +__qdsp_ltdf2; +__qdsp_ltsf2; +__qdsp_maxdf3; +__qdsp_memcpy_likely_aligned_min32bytes_mult8bytes; +__qdsp_mindf3; +__qdsp_moddi3; +__qdsp_modsi3; +__qdsp_muldf3; +__qdsp_mulsf3; +__qdsp_nedf2; +__qdsp_negdf2; +__qdsp_negsf2; +__qdsp_nesf2; +__qdsp_sqrt; +__qdsp_sqrtdf2; +__qdsp_sqrtf; +__qdsp_subdf3; +__qdsp_subsf3; +__qdsp_truncdfsf2; +__qdsp_udivdi3; +__qdsp_udivmoddi4; +__qdsp_udivmodsi4; +__qdsp_udivsi3; +__qdsp_umoddi3; +__qdsp_umodsi3; +__qdsp_unorddf2; +__qdsp_unordsf2; +__register_frame_info_bases; +__registerx; +__restore_r16_through_r17_and_deallocframe; +__restore_r16_through_r17_and_deallocframe_before_tailcall; +__restore_r16_through_r19_and_deallocframe; +__restore_r16_through_r19_and_deallocframe_before_tailcall; +__restore_r16_through_r21_and_deallocframe; +__restore_r16_through_r21_and_deallocframe_before_tailcall; +__restore_r16_through_r23_and_deallocframe; +__restore_r16_through_r23_and_deallocframe_before_tailcall; +__restore_r16_through_r25_and_deallocframe; +__restore_r16_through_r25_and_deallocframe_before_tailcall; +__restore_r16_through_r27_and_deallocframe; +__restore_r16_through_r27_and_deallocframe_before_tailcall; 
+__restore_r24_through_r25_and_deallocframe; +__restore_r24_through_r25_and_deallocframe_before_tailcall; +__restore_r24_through_r27_and_deallocframe; +__restore_r24_through_r27_and_deallocframe_before_tailcall; +__restore_r27_through_r16_and_deallocframe; +__restore_r27_through_r16_and_deallocframe_before_sibcall; +__restore_r27_through_r18_and_deallocframe; +__restore_r27_through_r18_and_deallocframe_before_sibcall; +__restore_r27_through_r20_and_deallocframe; +__restore_r27_through_r20_and_deallocframe_before_sibcall; +__restore_r27_through_r22_and_deallocframe; +__restore_r27_through_r22_and_deallocframe_before_sibcall; +__restore_r27_through_r24_and_deallocframe; +__restore_r27_through_r24_and_deallocframe_before_sibcall; +__restore_r27_through_r26_and_deallocframe; +__restore_r27_through_r26_and_deallocframe_before_sibcall; +__save_r16_through_r17; +__save_r16_through_r19; +__save_r16_through_r21; +__save_r16_through_r23; +__save_r16_through_r25; +__save_r16_through_r27; +__save_r24_through_r25; +__save_r24_through_r27; +__save_r27_through_r16; +__save_r27_through_r18; +__save_r27_through_r20; +__save_r27_through_r22; +__save_r27_through_r24; +__sqrtf; +__stack_chk_fail; +__stack_chk_guard; +__tls_get_addr; +__wrap_calloc; +__wrap_free; +__wrap_malloc; +__wrap_memalign; +__wrap_realloc; +_Aldata; +_Assert; +_Atan; +_AtcountPrivate; +_AtcountPublic; +_Atdata; +_Atexit; +_Atfuns; +_Atrealloc; +_Btowc; +_C_tolower_; +_C_toupper_; +_Caddcc; +_Caddcr; +_Cbuild; +_Cdivcc; +_Cdivcr; +_Clearlocks; +_Clocale; +_Closreg; +_Cmulcc; +_Cmulcr; +_Cosh; +_CStrftime; +_CStrxfrm; +_Csubcc; +_Csubcr; +_CTinfo; +_CurrentTimeLocale; +_CWcsxfrm; +_Daysto; +_Dbl; +_Dclass; +_DefaultTimeLocale; +_Defloc; +_Denorm; +_Dint; +_Dnorm; +_Dscale; +_Dsign; +_Dtentox; +_Dtest; +_Dunscale; +_Eps; +_Erf_one; +_Erf_small; +_Erfc; +_Exit; +_Exp; +_FAtan; +_FCaddcc; +_FCaddcr; +_FCbuild; +_FCdivcc; +_FCdivcr; +_FCmulcc; +_FCmulcr; +_FCosh; +_FCsubcc; +_FCsubcr; +_FDclass; +_FDenorm; +_FDint; +_FDnorm; +_FDscale; +_FDsign; +_FDtentox; +_FDtest; +_FDunscale; +_Fenv0; +_FEps; +_Feraise; +_FErf_one; +_FErf_small; +_FErfc; +_FExp; +_FFpcomp; +_FGamma_big; +_Fgpos; +_FHypot; +_Files; +_Findloc; +_FInf; +_fini; +_FLog; +_FLogpoly; +_Flt; +_Fltrounds; +_FNan; +_Fofind; +_Fofree; +_Fopen; +_Foprep; +_Force_raise; +_Fpcomp; +_FPoly; +_FPow; +_FQuad; +_FQuadph; +_Freeloc; +_FRint; +_Frprep; +_FRteps; +_FSin; +_FSinh; +_FSnan; +_Fspos; +_FTgamma; +_Fwprep; +_FXbig; +_FXp_addh; +_FXp_addx; +_FXp_getw; +_FXp_invx; +_FXp_ldexpx; +_FXp_movx; +_FXp_mulh; +_FXp_mulx; +_FXp_setw; +_FXp_sqrtx; +_FXp_subx; +_FZero; +_Gamma_big; +_Genld; +_Gentime; +_Get_eh_data; +_Getcloc; +_Getdst; +_Geterrno; +_Getfld; +_Getfloat; +_Getint; +_Getlname; +_Getloc; +_Getmbcurmax; +_Getmem; +_Getnloc; +_Getpcostate; +_Getpctype; +_Getpmbstate; +_Getptimes; +_Getptolower; +_Getptoupper; +_Getpwcostate; +_Getpwcstate; +_Getpwctrtab; +_Getpwctytab; +_Getstr; +_Gettime; +_Getzone; +_Hugeval; +_Hypot; +_Inf; +_init; +_Init_db; +_Initlocks; +_Isdst; +_Iswctype; +_LAtan; +_LCaddcc; +_LCaddcr; +_LCbuild; +_LCdivcc; +_LCdivcr; +_LCmulcc; +_LCmulcr; +_LCosh; +_LCsubcc; +_LCsubcr; +_Ldbl; +_LDclass; +_LDenorm; +_LDint; +_LDscale; +_LDsign; +_LDtentox; +_LDtest; +_Ldtob; +_LDunscale; +_LEps; +_LErf_one; +_LErf_small; +_LErfc; +_LExp; +_LFpcomp; +_LGamma_big; +_LHypot; +_LInf; +_Litob; +_LLog; +_LLogpoly; +_LNan; +_Lockfilelock; +_Locksyslock; +_Locsum; +_Loctab; +_Locterm; +_Locvar; +_Log; +_Logpoly; +_LPoly; +_LPow; +_LQuad; +_LQuadph; +_LRint; +_LRteps; +_LSin; 
+_LSinh; +_LSnan; +_LTgamma; +_LXbig; +_LXp_addh; +_LXp_addx; +_LXp_getw; +_LXp_invx; +_LXp_ldexpx; +_LXp_movx; +_LXp_mulh; +_LXp_mulx; +_LXp_setw; +_LXp_sqrtx; +_LXp_subx; +_LZero; +_Makeloc; +_Makestab; +_Makewct; +_Mbtowc; +_Mbtowcx; +_Nan; +_Nats; +_Nnl; +_Parse_cie; +_Parse_csd; +_Parse_fde; +_Parse_fde_instr; +_Parse_lsda; +_Poly; +_Pow; +_Printf; +_Putfld; +_Putstr; +_Puttxt; +_Quad; +_Quadph; +_Read_enc_ptr; +_Read_sleb; +_Read_uleb; +_Readloc; +_Rint; +_Rteps; +_Scanf; +_Setloc; +_Sin; +_Sinh; +_Size_block; +_Skip; +_Snan; +_start; +_Stderr; +_Stdin; +_Stdout; +_Stod; +_Stodx; +_Stof; +_Stoflt; +_Stofx; +_Stold; +_Stoldx; +_Stoll; +_Stollx; +_Stolx; +_Stopfx; +_Stoul; +_Stoull; +_Stoullx; +_Stoulx; +_Stoxflt; +_Strcollx; +_Strerror; +_Strxfrmx; +_Tgamma; +_tolower; +_tolower_tab_; +_toupper; +_toupper_tab_; +_Towctrans; +_Ttotm; +_Tzoff; +_Unlockfilelock; +_Unlocksyslock; +_Unwind_DeleteException; +_Unwind_ForcedUnwind; +_Unwind_GetDataRelBase; +_Unwind_GetGR; +_Unwind_GetIP; +_Unwind_GetLanguageSpecificData; +_Unwind_GetRegionStart; +_Unwind_GetTextRelBase; +_Unwind_RaiseException; +_Unwind_Resume; +_Unwind_Resume_or_Rethrow; +_Unwind_SetGR; +_Unwind_SetIP; +_Vacopy; +_Valbytes; +_Wcscollx; +_Wcsftime; +_Wcsxfrmx; +_Wctob; +_Wctomb; +_Wctombx; +_WFrprep; +_WFwprep; +_WGenld; +_WGetfld; +_WGetfloat; +_WGetint; +_WGetstr; +_WLdtob; +_WLitob; +_WPrintf; +_WPutfld; +_WPutstr; +_WPuttxt; +_WScanf; +_WStod; +_WStodx; +_WStof; +_WStoflt; +_WStofx; +_WStold; +_WStoldx; +_WStoll; +_WStopfx; +_WStoul; +_WStoull; +_WStoxflt; +_Xbig; +_Xp_addh; +_Xp_addx; +_Xp_getw; +_Xp_invx; +_Xp_ldexpx; +_Xp_movx; +_Xp_mulh; +_Xp_mulx; +_Xp_setw; +_Xp_sqrtx; +_Xp_subx; +_Zero; +a64l; +abort; +abs; +access; +acos; +acosf; +acosh; +acoshf; +acoshl; +acosl; +AHB_User_Base; +ahbb; +alarm; +alarm_handler; +alarmx; +asctime; +asctime_r; +asin; +asinf; +asinh; +asinhf; +asinhl; +asinl; +atan; +atan2; +atan2f; +atan2l; +atanf; +atanh; +atanhf; +atanhl; +atanl; +atexit; +atof; +atoi; +atol; +atoll; +bcmp; +bcopy; +bsearch; +btowc; +bzero; +c16rtomb; +c32rtomb; +cabs; +cabsf; +cabsl; +cacos; +cacosf; +cacosh; +cacoshf; +cacoshl; +cacosl; +carg; +cargf; +cargl; +casin; +casinf; +casinh; +casinhf; +casinhl; +casinl; +catan; +catanf; +catanh; +catanhf; +catanhl; +catanl; +cbrt; +cbrtf; +cbrtl; +ccos; +ccosf; +ccosh; +ccoshf; +ccoshl; +ccosl; +ceil; +ceilf; +ceill; +cexp; +cexpf; +cexpl; +cimag; +cimagf; +cimagl; +clearerr; +clock; +clog; +clog10; +clog10f; +clog10l; +clogf; +clogl; +close; +closedir; +conj; +conjf; +conjl; +copysign; +copysignf; +copysignl; +cos; +cosf; +cosh; +coshf; +coshl; +cosl; +cpow; +cpowf; +cpowl; +cproj; +cprojf; +cprojl; +creal; +crealf; +creall; +create_qdouble; +create_qdouble_li; +csin; +csinf; +csinh; +csinhf; +csinhl; +csinl; +csqrt; +csqrtf; +csqrtl; +ctan; +ctanf; +ctanh; +ctanhf; +ctanhl; +ctanl; +ctime; +ctime_r; +d2qd; +dadd; +dadd_asm; +daylight; +difftime; +div; +dladdr; +dlclose; +dlerror; +dlinfo; +dlopen; +dlopenbuf; +dlsym; +dmpy; +dmpy_asm; +drand48; +drecip; +drecipsqrt; +dsub; +dsub_asm; +ecvt; +environ; +erand48; +erf; +erfc; +erfcf; +erfcl; +erff; +erfl; +err_Fatal_internal0; +execve; +exit; +exp; +exp10f; +exp2; +exp2f; +exp2l; +expf; +expl; +expm1; +expm1f; +expm1l; +fabs; +fabsf; +fabsl; +fast2_d2qd; +fast2_d2qld; +fast2_dadd; +fast2_dadd_asm; +fast2_dmpy; +fast2_dmpy_asm; +fast2_drecip; +fast2_drecipsqrt; +fast2_dsub; +fast2_dsub_asm; +fast2_f2qd; +fast2_f2qd_asm; +fast2_ldadd; +fast2_ldadd_asm; +fast2_ldmpy; +fast2_ldmpy_asm; +fast2_ldrecip; +fast2_ldrecipsqrt; 
+fast2_ldsub; +fast2_ldsub_asm; +fast2_qd2f; +fast2_qd2f_asm; +fast2_qld2d; +fast2_recipsqrtTable_qd; +fast2_recipsqrtTable_qld; +fast2_recipTable_qd; +fast2_recipTable_qld; +fclose; +fcntl; +fcvt; +fdim; +fdimf; +fdiml; +fdopen; +feclearexcept; +fegetenv; +fegetexceptflag; +fegetround; +fegettrapenable; +feholdexcept; +feof; +feraiseexcept; +ferror; +fesetenv; +fesetexceptflag; +fesetround; +fesettrapenable; +fetestexcept; +feupdateenv; +fflush; +ffs; +fgetc; +fgetpos; +fgets; +fgetwc; +fgetws; +fileno; +floor; +floorf; +floorl; +fma; +fmaf; +fmax; +fmaxf; +fmaxl; +fmemcpy_asm; +fmin; +fminf; +fminl; +fmod; +fmodf; +fmodl; +fopen; +fork; +fprintf; +fputc; +fputs; +fputwc; +fputws; +fread; +freopen; +frexp; +frexpf; +frexpl; +fscanf; +fseek; +fseeko; +fsetpos; +fstat; +ftell; +ftello; +ftruncate; +fwide; +fwprintf; +fwrite; +fwscanf; +gcvt; +get_exp_qd; +get_mant_qd; +getc; +getc_unlocked; +getchar; +getchar_unlocked; +getcwd; +getenv; +getopt; +getpid; +gets; +getsubopt; +gettimeofday; +getw; +getwc; +getwchar; +gmtime; +gmtime_r; +h_acosf; +h_asinf; +h_atanf; +h_cosf; +h_exp10f; +h_exp2f; +h_expf; +h_log10f; +h_log2f; +h_logf; +h_sinf; +h_tanf; +hcreate; +hdestroy; +hexagon_buffer_clean; +hexagon_buffer_cleaninv; +hexagon_buffer_inv; +hexagon_cache_cleaninv; +hexagon_cache_inva; +hexagon_memcpy_forward_vp4cp4n2; +hexagon_reg_clear_timer; +hexagon_reg_end_timer; +hexagon_reg_init_timer; +hexagon_reg_prof_off; +hexagon_reg_prof_on; +hexagon_reg_read_pcycles; +hexagon_reg_read_rev; +hexagon_reg_read_syscfg; +hexagon_reg_show_timer; +hexagon_reg_start_timer; +hsearch; +hypot; +hypotf; +hypotl; +ilogb; +ilogbf; +ilogbl; +imaxabs; +imaxdiv; +index; +isalnum; +isalpha; +isascii; +isatty; +isblank; +iscntrl; +isdigit; +isgraph; +isinf; +islower; +isnan; +isprint; +ispunct; +isspace; +isupper; +iswalnum; +iswalpha; +iswblank; +iswcntrl; +iswctype; +iswdigit; +iswgraph; +iswlower; +iswprint; +iswpunct; +iswspace; +iswupper; +iswxdigit; +isxdigit; +j0; +j1; +jn; +jrand48; +l64a; +l64a_r; +labs; +lcong48; +ldexp; +ldexpf; +ldexpl; +ldiv; +lgamma; +lgammaf; +lgammal; +llabs; +lldiv; +llrint; +llrintf; +llrintl; +llround; +llroundf; +llroundl; +localeconv; +localtime; +localtime_r; +log; +log10; +log10f; +log10l; +log1p; +log1pf; +log1pl; +log2; +log2f; +log2l; +logb; +logbf; +logbl; +logf; +logl; +longjmp; +lrand48; +lrint; +lrintf; +lrintl; +lround; +lroundf; +lroundl; +lseek; +mblen; +mbrlen; +mbrtoc16; +mbrtoc32; +mbrtowc; +mbsinit; +mbsnrtowcs; +mbsrtowcs; +mbstowcs; +mbtowc; +memccpy; +memchr; +memcmp; +memcpy; +memcpy_c; +memcpy_v; +memmove; +memmove_c; +memmove_v; +memscpy; +memset; +memset_c; +memset_s; +memset_v; +memsmove; +mkdir; +mkstemp; +mktemp; +mktime; +modf; +modff; +modfl; +mrand48; +nan; +nanf; +nanl; +nearbyint; +nearbyintf; +nearbyintl; +nextafter; +nextafterf; +nextafterl; +nexttoward; +nexttowardf; +nexttowardl; +norm; +normf; +norml; +npa_query_by_name; +nrand48; +open; +opendir; +optarg; +opterr; +optind; +optopt; +perror; +pow; +powf; +powl; +printf; +putc; +putc_unlocked; +putchar; +putchar_unlocked; +putenv; +puts; +putw; +putwc; +putwchar; +q6_buffer_clean; +q6_buffer_cleaninv; +q6_buffer_inv; +q6reg_clear_timer; +q6reg_end_timer; +q6reg_init_timer; +q6reg_prof_off; +q6reg_prof_on; +q6reg_read_pcycles; +q6reg_read_rev; +q6reg_read_syscfg; +q6reg_show_timer; +q6reg_start_timer; +qd2d; +qd_add; +qd_add_dq; +qd_add_qd; +qd_div; +qd_div_dq; +qd_div_qd; +qd_fabs; +qd_gt; +qd_gt_dq; +qd_gt_qd; +qd_lt; +qd_lt_dq; +qd_lt_qd; +qd_mul; +qd_mul_dq; +qd_mul_qd; +qd_neg; +qd_self_div; 
+qd_self_div_dd; +qd_self_increment; +qd_self_increment_qd; +qd_self_mul; +qd_self_mul_qd; +qd_self_sub; +qd_self_sub_dd; +qd_sqrt; +qd_sub_dq; +qd_sub_qd; +qsort; +raise; +rand; +rand_r; +read; +readdir; +recipsqrtTable_qd; +recipTable_qd; +remainder; +remainderf; +remainderl; +remove; +remquo; +remquof; +remquol; +rename; +rewind; +rindex; +rint; +rintf; +rintl; +rmdir; +rmemcpy_asm; +round; +roundf; +roundl; +sbrk; +scalb; +scalbln; +scalblnf; +scalblnl; +scalbn; +scalbnf; +scalbnl; +scanf; +seed48; +set_exp_qd; +set_mant_qd; +setbuf; +setjmp; +setlocale; +setvbuf; +signal; +sin; +sinf; +sinh; +sinhf; +sinhl; +sinl; +sleep; +snprintf; +sprintf; +sqrt; +sqrtf; +sqrtl; +srand; +srand48; +sscanf; +start; +stat; +statvfs; +strcasecmp; +strcat; +strchr; +strcmp; +strcmp_c; +strcoll; +strcpy; +strcspn; +strdup; +strerror; +strerror_r; +strftime; +strlcat; +strlcpy; +strlen; +strncasecmp; +strncat; +strncmp; +strncpy; +strpbrk; +strptime; +strrchr; +strsep; +strspn; +strstr; +strtod; +strtof; +strtoimax; +strtok; +strtok_r; +strtol; +strtold; +strtoll; +strtoul; +strtoull; +strtoumax; +strxfrm; +suboptarg; +swab; +swprintf; +swscanf; +sys_Mtxinit; +sys_Mtxlock; +sys_Mtxunlock; +sys_Tlsalloc; +sys_Tlsget; +sys_Tlsset; +sysconf; +system; +tan; +tanf; +tanh; +tanhf; +tanhl; +tanl; +tdelete; +tempnam; +tfind; +tgamma; +tgammaf; +tgammal; +time; +times; +timezone; +tmpfile; +tmpnam; +toascii; +todouble; +tolower; +toqdouble; +toupper; +towctrans; +towlower; +towupper; +trunc; +truncf; +truncl; +tsearch; +twalk; +tzname; +tzset; +u2g_client_open; +ungetc; +ungetwc; +unlink; +vasprintf; +vfprintf; +vfscanf; +vfwprintf; +vfwscanf; +vprintf; +vscanf; +vsnprintf; +vsprintf; +vsscanf; +vswprintf; +vswscanf; +vwprintf; +vwscanf; +wcrtomb; +wcscat; +wcschr; +wcscmp; +wcscoll; +wcscpy; +wcscspn; +wcsftime; +wcslen; +wcsncat; +wcsncmp; +wcsncpy; +wcsnrtombs; +wcspbrk; +wcsrchr; +wcsrtombs; +wcsspn; +wcsstr; +wcstod; +wcstof; +wcstoimax; +wcstok; +wcstol; +wcstold; +wcstoll; +wcstombs; +wcstoul; +wcstoull; +wcstoumax; +wcsxfrm; +wctob; +wctomb; +wctrans; +wctype; +wmemchr; +wmemcmp; +wmemcpy; +wmemmove; +wmemset; +wprintf; +write; +wscanf; +y0; +y1; +yn; +}; diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/qtest_stdlib.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/qtest_stdlib.h new file mode 100755 index 0000000000000..0da356fd1a46c --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/qtest_stdlib.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2012-2013,2021 QUALCOMM Technologies Inc. All Rights Reserved. 
+ * Qualcomm Technologies Confidential and Proprietary
+ *
+ */
+#ifndef QTEST_STDLIB_H
+#define QTEST_STDLIB_H
+
+#include <assert.h>
+#include "rpcmem.h"
+
+#define WHILE(a) \
+__pragma(warning(suppress:4127)) while(a)
+
+#define FREEIF(pv) \
+ do {\
+ if(pv) { \
+ void* tmp = (void*)pv;\
+ pv = 0;\
+ FREE(tmp);\
+ } \
+ } WHILE(0)
+
+#define ALIGNED_FREEIF(pv) \
+ do {\
+ if(pv) { \
+ void* tmp = (void*)pv;\
+ pv = 0;\
+ ALIGNED_FREE(tmp);\
+ } \
+ } WHILE(0)
+
+#ifndef FASTRPC_DMA_FREE
+#define FASTRPC_DMA_FREE(pv) rpcmem_free(pv)
+#endif
+
+#define FASTRPC_DMA_FREEIF(pv) \
+ do {\
+ if(pv) { \
+ void* tmp = (void*)pv;\
+ pv = 0;\
+ FASTRPC_DMA_FREE(tmp);\
+ } \
+ } WHILE(0)
+
+#ifndef QASSERT
+#define QASSERT(st) assert(st)
+#endif
+
+// copied here so as not to bring in a1qtest headers
+#if (((defined __linux__) && !(defined ANDROID)) || (defined __APPLE__))
+#include <execinfo.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+static __inline char* stacktrace(void) {
+ int bufsz = 0, sz = 0;
+ char* buf = 0;
+ void* callstack[256];
+ int i, frames = backtrace(callstack, 256);
+ char** strs = backtrace_symbols(callstack, frames);
+ bufsz += snprintf(0, 0, "\n");
+ for (i = 0; i < frames; ++i) {
+ bufsz += snprintf(0, 0, "%s\n", strs[i]);
+ }
+ buf = malloc(bufsz);
+ assert(buf != 0);
+ sz += snprintf(buf + sz, bufsz, "\n");
+ bufsz -= sz;
+ for (i = 0; i < frames && bufsz > 0; ++i) {
+ sz += snprintf(buf + sz, bufsz, "%s\n", strs[i]);
+ bufsz -= sz;
+ }
+ free(strs);
+ return buf;
+}
+
+#else
+
+static __inline char* stacktrace(void) {
+ return 0;
+}
+
+
+#endif //ANDROID
+
+
+#ifndef QTEST
+// default implementation for stdlib
+#include <stdlib.h>
+
+
+#define IF_QTEST(vv) (void)0
+
+#ifndef QASSERT
+#define QASSERT(st) (void)0
+#endif
+
+#ifndef MALLOC
+#define MALLOC malloc
+#endif
+
+#ifndef FASTRPC_DMA_MALLOC
+#define FASTRPC_DMA_MALLOC(heapid, flags, size) rpcmem_alloc(heapid, flags, size)
+#endif
+
+#ifndef CALLOC
+#define CALLOC calloc
+#endif
+
+#ifndef FREE
+#define FREE free
+#endif
+
+#ifndef REALLOC
+#define REALLOC(pv, nsz, osz) realloc(pv, nsz)
+#endif
+
+#ifndef ALIGNED_REALLOC
+#define ALIGNED_REALLOC(pv, nsz, osz, aln) _aligned_realloc(pv, nsz, aln)
+#endif
+
+#ifndef FASTRPC_DMA_REALLOC
+#define FASTRPC_DMA_REALLOC(pv, nsz, osz, aln) fastrpc_dma_realloc(pv, nsz, osz)
+#endif
+
+#ifndef ALIGNED_FREE
+#define ALIGNED_FREE(pv) _aligned_free(pv)
+#endif
+
+#define qtest_set_failure_mask(mask) (void)mask
+#define qtest_get_failure_mask(mask) (void)mask
+#define qtest_set_pass_count(cnt) (void)cnt
+#define qtest_done() (void)0
+#define qtest_test_failure() 0
+#define qtest_atexit(pfn,ctx) (void)pfn; (void)ctx
+
+#else // QTEST
+
+#include "AEEStdDef.h"
+
+#define IF_QTEST(vv) do {\
+ vv \
+} while (0)
+
+// causes alloc to fail when mask & 0x1 is true
+// each test shifts the mask to the right
+void qtest_set_failure_mask(uint32 mask);
+void qtest_get_failure_mask(uint32* mask);
+
+// causes alloc to fail when count == 0
+// each test decrements the count
+void qtest_set_pass_count(int count);
+
+// returns 0 if it succeeds, and shifts the mask
+// useful for generating controlled failures in functions
+int qtest_test_failure(void);
+
+void qtest_atexit(void (*pfnAtExit)(void* pCtx), void* pvCxt);
+
+void qtest_done(void);
+
+void* qtest_malloc(const char* name, char* stack, int sz);
+
+void* qtest_calloc(const char* name, char* stack, int cnt, int sz);
+
+void* qtest_realloc(const char* name, char* stack, void* ptr, int sz);
+
+void qtest_free(const char* name, char* stack, void* rv);
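+
+/*
+ * Illustrative sketch (not part of this header): using the failure mask to
+ * force the third allocation inside a function under test to fail, e.g. to
+ * exercise an out-of-memory path. function_under_test() is hypothetical;
+ * the mask shifts right on every allocation check.
+ *
+ *   qtest_set_failure_mask(0x1 << 2);  // third alloc fails
+ *   int nErr = function_under_test();
+ *   QASSERT(0 != nErr);                // the OOM path must report an error
+ *   qtest_done();
+ */
+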
+#define MALLOC(sz) qtest_malloc(__FILE_LINE__, stacktrace(), sz)
+#define CALLOC(cnt, sz) qtest_calloc(__FILE_LINE__, stacktrace(), cnt, sz)
+#define REALLOC(ptr, sz) qtest_realloc(__FILE_LINE__, stacktrace(), ptr, sz)
+#define FREE(ptr) qtest_free(__FILE_LINE__, stacktrace(), ptr)
+
+#endif //QTEST
+#endif //QTEST_STDLIB_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote.h
new file mode 100755
index 0000000000000..be2e864346b29
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote.h
@@ -0,0 +1,1430 @@
+/*
+ * Copyright (c) 2012-2014,2016,2017,2019-2022,2023 Qualcomm Technologies, Inc.
+ * All Rights Reserved.
+ * Confidential and Proprietary - Qualcomm Technologies, Inc.
+ */
+#ifndef REMOTE_H
+#define REMOTE_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef __QAIC_REMOTE
+#define __QAIC_REMOTE(ff) ff
+#endif ///__QAIC_REMOTE
+
+#ifndef __QAIC_REMOTE_EXPORT
+#ifdef _WIN32
+#ifdef _USRDLL
+#define __QAIC_REMOTE_EXPORT __declspec(dllexport)
+#elif defined(STATIC_LIB)
+#define __QAIC_REMOTE_EXPORT /** Define for static lib */
+#else ///STATIC_LIB
+#define __QAIC_REMOTE_EXPORT __declspec(dllimport)
+#endif ///_USRDLL
+#else
+#define __QAIC_REMOTE_EXPORT
+#endif ///_WIN32
+#endif ///__QAIC_REMOTE_EXPORT
+
+#ifndef __QAIC_RETURN
+#ifdef _WIN32
+#define __QAIC_RETURN _Success_(return == 0)
+#else
+#define __QAIC_RETURN
+#endif ///_WIN32
+#endif ///__QAIC_RETURN
+
+#ifndef __QAIC_IN
+#ifdef _WIN32
+#define __QAIC_IN _In_
+#else
+#define __QAIC_IN
+#endif ///_WIN32
+#endif ///__QAIC_IN
+
+#ifndef __QAIC_IN_CHAR
+#ifdef _WIN32
+#define __QAIC_IN_CHAR _In_z_
+#else
+#define __QAIC_IN_CHAR
+#endif ///_WIN32
+#endif ///__QAIC_IN_CHAR
+
+#ifndef __QAIC_IN_LEN
+#ifdef _WIN32
+#define __QAIC_IN_LEN(len) _Inout_updates_bytes_all_(len)
+#else
+#define __QAIC_IN_LEN(len)
+#endif ///_WIN32
+#endif ///__QAIC_IN_LEN
+
+#ifndef __QAIC_OUT
+#ifdef _WIN32
+#define __QAIC_OUT _Out_
+#else
+#define __QAIC_OUT
+#endif ///_WIN32
+#endif ///__QAIC_OUT
+
+#ifndef __QAIC_INT64PTR
+#ifdef _WIN32
+#define __QAIC_INT64PTR uintptr_t
+#else
+#define __QAIC_INT64PTR uint64_t
+#endif ///_WIN32
+#endif ///__QAIC_INT64PTR
+
+#ifndef __QAIC_REMOTE_ATTRIBUTE
+#define __QAIC_REMOTE_ATTRIBUTE
+#endif ///__QAIC_REMOTE_ATTRIBUTE
+
+/** Retrieves method attribute from the scalars parameter */
+#define REMOTE_SCALARS_METHOD_ATTR(dwScalars) (((dwScalars) >> 29) & 0x7)
+
+/** Retrieves method index from the scalars parameter */
+#define REMOTE_SCALARS_METHOD(dwScalars) (((dwScalars) >> 24) & 0x1f)
+
+/** Retrieves number of input buffers from the scalars parameter */
+#define REMOTE_SCALARS_INBUFS(dwScalars) (((dwScalars) >> 16) & 0x0ff)
+
+/** Retrieves number of output buffers from the scalars parameter */
+#define REMOTE_SCALARS_OUTBUFS(dwScalars) (((dwScalars) >> 8) & 0x0ff)
+
+/** Retrieves number of input handles from the scalars parameter */
+#define REMOTE_SCALARS_INHANDLES(dwScalars) (((dwScalars) >> 4) & 0x0f)
+
+/** Retrieves number of output handles from the scalars parameter */
+#define REMOTE_SCALARS_OUTHANDLES(dwScalars) ((dwScalars) & 0x0f)
+
+/** Makes the scalar using the method attr, index and number of io buffers and handles */
+#define REMOTE_SCALARS_MAKEX(nAttr,nMethod,nIn,nOut,noIn,noOut) \
+ ((((uint32_t) (nAttr) & 0x7) << 29) | \
+ (((uint32_t) (nMethod) & 0x1f) << 24) | \
+ (((uint32_t) (nIn) & 0xff) << 16) | \
+ (((uint32_t) (nOut) & 0xff) << 8) | \
+ (((uint32_t) (noIn) & 0x0f) << 4) | \
+ ((uint32_t) (noOut) & 0x0f))
+
+#define REMOTE_SCALARS_MAKE(nMethod,nIn,nOut)
REMOTE_SCALARS_MAKEX(0,nMethod,nIn,nOut,0,0) + +/** Retrieves number of io buffers and handles */ +#define REMOTE_SCALARS_LENGTH(sc) (REMOTE_SCALARS_INBUFS(sc) +\ + REMOTE_SCALARS_OUTBUFS(sc) +\ + REMOTE_SCALARS_INHANDLES(sc) +\ + REMOTE_SCALARS_OUTHANDLES(sc)) + +/** Defines the domain IDs for supported DSPs */ +#define ADSP_DOMAIN_ID 0 +#define MDSP_DOMAIN_ID 1 +#define SDSP_DOMAIN_ID 2 +#define CDSP_DOMAIN_ID 3 +#define CDSP1_DOMAIN_ID 4 + +/** Defines the domain names for supported DSPs*/ +#define ADSP_DOMAIN_NAME "adsp" +#define MDSP_DOMAIN_NAME "mdsp" +#define SDSP_DOMAIN_NAME "sdsp" +#define CDSP_DOMAIN_NAME "cdsp" +#define CDSP1_DOMAIN_NAME "cdsp1" + +/** Defines to prepare URI for multi-domain calls */ +#define ADSP_DOMAIN "&_dom=adsp" +#define MDSP_DOMAIN "&_dom=mdsp" +#define SDSP_DOMAIN "&_dom=sdsp" +#define CDSP_DOMAIN "&_dom=cdsp" +#define CDSP1_DOMAIN "&_dom=cdsp1" + +/** Internal transport prefix */ +#define ITRANSPORT_PREFIX "'\":;./\\" + +/** Maximum length of URI for remote_handle_open() calls */ +#define MAX_DOMAIN_URI_SIZE 12 + +/** Token to specify the priority of a handle */ +#define FASTRPC_URI_PRIORITY_TOKEN "&_hpriority=" + +/** Macro to generate token string for priority */ +#define FASTRPC_HANDLE_PRIORITY_LEVEL(priority) \ + FASTRPC_URI_PRIORITY_TOKEN #priority + +/** + * The following defines are used to specify the priority level of a handle. + * Priority levels range from 1 to 7. Lower numbers indicate higher priority. + * For example, a priority of 1 indicates the highest priority while a priority + * of 7 indicates the lowest priority. + * + * If no priority level is specified, then handles are opened with highest + * priority. + */ +#define FASTRPC_HANDLE_PRIORITY_MIN 7 +#define FASTRPC_HANDLE_PRIORITY_MAX 1 + +/** Domain type for multi-domain RPC calls */ +typedef struct domain_t { + /** Domain ID */ + int id; + /** URI for remote_handle_open */ + char uri[MAX_DOMAIN_URI_SIZE]; +} domain; + +/** Remote handle parameter for RPC calls */ +typedef uint32_t remote_handle; + +/** Remote handle parameter for multi-domain RPC calls */ +typedef uint64_t remote_handle64; + +/** 32-bit Remote buffer parameter for RPC calls */ +typedef struct { + /** Address of a remote buffer */ + void *pv; + /** Size of a remote buffer */ + size_t nLen; +} remote_buf; + +/** 64-bit Remote buffer parameter for RPC calls */ +typedef struct { + /** Address of a remote buffer */ + uint64_t pv; + /** Size of a remote buffer */ + int64_t nLen; +} remote_buf64; + +/** 32-bit Remote DMA handle parameter for RPC calls */ +typedef struct { + /** File descriptor of a remote buffer */ + int32_t fd; + /** Offset of the file descriptor */ + uint32_t offset; +} remote_dma_handle; + +/** 64-bit Remote DMA handle parameter for RPC calls */ +typedef struct { + /** File descriptor of a remote buffer */ + int32_t fd; + /** Offset of the file descriptor */ + uint32_t offset; + /** Size of buffer */ + uint32_t len; +} remote_dma_handle64; + +/** 32-bit Remote Arg structure for RPC calls */ +typedef union { + /** 32-bit remote buffer */ + remote_buf buf; + /** non-domains remote handle */ + remote_handle h; + /** multi-domains remote handle */ + remote_handle64 h64; + /** 32-bit remote dma handle */ + remote_dma_handle dma; +} remote_arg; + +/** 64-bit Remote Arg structure for RPC calls */ +typedef union { + /** 64-bit remote buffer */ + remote_buf64 buf; + /** non-domains remote handle */ + remote_handle h; + /** multi-domains remote handle */ + remote_handle64 h64; + /** 64-bit remote dma handle */ + 
remote_dma_handle64 dma;
+} remote_arg64;
+
+/** Async response type */
+enum fastrpc_async_notify_type {
+ /** No notification required */
+ FASTRPC_ASYNC_NO_SYNC,
+
+ /** Callback notification using fastrpc_async_callback */
+ FASTRPC_ASYNC_CALLBACK,
+
+ /** User will poll for the notification */
+ FASTRPC_ASYNC_POLL,
+
+/** Update FASTRPC_ASYNC_TYPE_MAX when adding new value to this enum */
+};
+
+/** Job id of an async job queued to the DSP */
+typedef uint64_t fastrpc_async_jobid;
+
+/** Async callback response type, input structure */
+typedef struct fastrpc_async_callback {
+ /** Callback function for async notification */
+ void (*fn)(fastrpc_async_jobid jobid, void* context, int result);
+ /** Current context to identify the callback */
+ void *context;
+} fastrpc_async_callback_t;
+
+/** Async descriptor to submit an async job */
+typedef struct fastrpc_async_descriptor {
+ /** Async response type */
+ enum fastrpc_async_notify_type type;
+ /** Job id of the async job queued to the DSP */
+ fastrpc_async_jobid jobid;
+ /** Async callback response type */
+ fastrpc_async_callback_t cb;
+} fastrpc_async_descriptor_t;
+
+
+/**
+ * Flags used in struct remote_rpc_control_latency
+ * for request ID DSPRPC_CONTROL_LATENCY
+ * in the remote handle control interface
+ **/
+enum remote_rpc_latency_flags {
+
+ /** Flag to disable QOS */
+ RPC_DISABLE_QOS,
+
+ /** Control CPU low-power modes based on RPC activity in a 100 ms window.
+ * Recommended for latency-sensitive use cases.
+ */
+ RPC_PM_QOS,
+
+ /** The DSP driver predicts the completion time of a method and sends a CPU
+ * wake-up signal to reduce wake-up latency.
+ * Recommended for moderately latency-sensitive use cases. It is more power
+ * efficient compared to the PM QoS control.
+ */
+ RPC_ADAPTIVE_QOS,
+
+ /**
+ * After sending an invocation to the DSP, the CPU will enter polling mode
+ * instead of waiting for a glink response. This will boost fastrpc
+ * performance by reducing the CPU wakeup and scheduling times. Enabled only
+ * for sync RPC calls. Using this option also enables PM QoS with a latency
+ * of 100 us.
+ */
+ RPC_POLL_QOS,
+};
+
+/**
+ * Structure used for request ID `DSPRPC_CONTROL_LATENCY`
+ * in the remote handle control interface
+ **/
+struct remote_rpc_control_latency {
+/** Enable latency optimization techniques to meet the requested latency. Use remote_rpc_latency_flags */
+ uint32_t enable;
+
+/**
+ * Latency in microseconds.
+ *
+ * When used with RPC_PM_QOS or RPC_ADAPTIVE_QOS, the user should pass the
+ * maximum RPC latency that can be tolerated. It is not guaranteed that fastrpc
+ * will meet this requirement. A latency of 0 us is ignored. The recommended
+ * value is 100.
+ *
+ * When used with RPC_POLL_QOS, the user needs to pass the expected execution
+ * time of the method on the DSP. The CPU will poll for a DSP response for that
+ * specified duration, after which it times out and falls back to waiting for
+ * a glink response.
+ * The max value that can be passed is 10000 (10 ms).
+ */
+ uint32_t latency;
+};
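+
+/*
+ * Illustrative sketch (not part of this header): enabling PM QoS with a
+ * 100 us latency target on an open multi-domain handle. This assumes the
+ * remote_handle64_control() declaration that appears later in this file and
+ * the DSPRPC_CONTROL_LATENCY request ID defined below; `handle` is a
+ * hypothetical handle from remote_handle64_open().
+ *
+ *   struct remote_rpc_control_latency lat = { RPC_PM_QOS, 100 };
+ *   remote_handle64_control(handle, DSPRPC_CONTROL_LATENCY,
+ *                           &lat, sizeof(lat));
+ */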
+ * Max value that can be passed is 10000 (10 ms) + */ + uint32_t latency; +}; + +/** + * @struct fastrpc_capability + * @brief Argument to query DSP capability with request ID DSPRPC_GET_DSP_INFO + */ +typedef struct remote_dsp_capability { + /** @param[in] : DSP domain ADSP_DOMAIN_ID, SDSP_DOMAIN_ID, or CDSP_DOMAIN_ID */ + uint32_t domain; + /** @param[in] : One of the DSP/kernel attributes from enum remote_dsp_attributes */ + uint32_t attribute_ID; + /** @param[out] : Result of the DSP/kernel capability query based on attribute_ID */ + uint32_t capability; +}fastrpc_capability; + + +/** + * @enum remote_dsp_attributes + * @brief Different types of DSP capabilities queried via remote_handle_control + * using DSPRPC_GET_DSP_INFO request id. + * DSPRPC_GET_DSP_INFO should only be used with remote_handle_control() as a handle + * is not needed to query DSP capabilities. + * To query DSP capabilities fill out 'domain' and 'attribute_ID' from structure + * remote_dsp_capability. DSP capability will be returned on variable 'capability'. + */ +enum remote_dsp_attributes { + /** Check if DSP supported: supported = 1, + unsupported = 0 */ + DOMAIN_SUPPORT, + + /** DSP unsigned PD support: supported = 1, + unsupported = 0 */ + UNSIGNED_PD_SUPPORT, + + /** Number of HVX 64B support */ + HVX_SUPPORT_64B, + + /** Number of HVX 128B support */ + HVX_SUPPORT_128B, + + /** Max page size allocation possible in VTCM */ + VTCM_PAGE, + + /** Number of page_size blocks available */ + VTCM_COUNT, + + /** Hexagon processor architecture version */ + ARCH_VER, + + /** HMX Support Depth */ + HMX_SUPPORT_DEPTH, + + /** HMX Support Spatial */ + HMX_SUPPORT_SPATIAL, + + /** Async FastRPC Support */ + ASYNC_FASTRPC_SUPPORT, + + /** DSP User PD status notification Support */ + STATUS_NOTIFICATION_SUPPORT , + + /** Multicast widget programming */ + MCID_MULTICAST, + + /** Mapping in extended address space on DSP */ + EXTENDED_MAP_SUPPORT, + + /** DSP support for handle priority */ + HANDLE_PRIORITY_SUPPORT , + + /** Update FASTRPC_MAX_DSP_ATTRIBUTES when adding new value to this enum */ +}; + +/** Macro for backward compatibility. Clients can compile wakelock request code + * in their app only when this is defined + */ +#define FASTRPC_WAKELOCK_CONTROL_SUPPORTED 1 + +/** + * Structure used for request ID `DSPRPC_CONTROL_WAKELOCK` + * in remote handle control interface + **/ +struct remote_rpc_control_wakelock { + /** enable control of wake lock */ + uint32_t enable; +}; + +/** + * Structure used for request ID `DSPRPC_GET_DOMAIN` + * in remote handle control interface. + * Get domain ID associated with an opened handle to remote interface of type remote_handle64. + * remote_handle64_control() returns domain for a given handle + * remote_handle_control() API returns default domain ID + */ +typedef struct remote_rpc_get_domain { + /** @param[out] : domain ID associcated with handle */ + int domain; +} remote_rpc_get_domain_t; + +/** + * Structure used for request IDs `DSPRPC_SET_PATH` and + * `DSPRPC_GET_PATH` in remote handle control interface. 
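+ *
+ * A minimal usage sketch (editor's illustration, not part of the original
+ * SDK header; the key "mydir" and the path value are hypothetical):
+ *
+ *   char value[] = "/data/local/tmp";
+ *   struct remote_control_custom_path p = { sizeof(value), "mydir", value };
+ *   int err = remote_handle_control(DSPRPC_SET_PATH, (void *)&p, sizeof(p));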
+ */
+struct remote_control_custom_path {
+    /** value size including the NULL char */
+    int32_t value_size;
+    /** key used for storing the path */
+    const char* path;
+    /** value which will be used for file operations when the corresponding key is specified in the file URI */
+    char* value;
+};
+
+/**
+ * Request IDs for remote handle control interface
+ **/
+enum handle_control_req_id {
+    /** Reserved */
+    DSPRPC_RESERVED,
+
+    /** Request ID to enable/disable QOS */
+    DSPRPC_CONTROL_LATENCY,
+
+    /** Request ID to get DSP capabilities from kernel and Hexagon */
+    DSPRPC_GET_DSP_INFO,
+
+    /** Request ID to enable wakelock for the given domain */
+    DSPRPC_CONTROL_WAKELOCK,
+
+    /** Request ID to get the default domain or the domain associated with an existing handle */
+    DSPRPC_GET_DOMAIN,
+
+    /** Request ID to add a custom path to the hash table */
+    DSPRPC_SET_PATH,
+
+    /** Request ID to read a custom path from the hash table */
+    DSPRPC_GET_PATH,
+
+};
+
+/**
+ * Structure used for request ID `FASTRPC_THREAD_PARAMS`
+ * in remote session control interface
+ **/
+struct remote_rpc_thread_params {
+    /** Remote subsystem domain ID, pass -1 to set params for all domains */
+    int domain;
+    /** User thread priority (1 to 255), pass -1 to use default */
+    int prio;
+    /** User thread stack size, pass -1 to use default */
+    int stack_size;
+};
+
+/**
+ * Structure used for request ID `DSPRPC_CONTROL_UNSIGNED_MODULE`
+ * in remote session control interface
+ **/
+struct remote_rpc_control_unsigned_module {
+    /** Remote subsystem domain ID, -1 to set params for all domains */
+    int domain;
+    /** Enable unsigned module loading */
+    int enable;
+};
+
+/**
+ * Structure used for request ID `FASTRPC_RELATIVE_THREAD_PRIORITY`
+ * in remote session control interface
+ **/
+struct remote_rpc_relative_thread_priority {
+    /** Remote subsystem domain ID, pass -1 to update priority for all domains */
+    int domain;
+    /** The value by which the default thread priority needs to increase/decrease.
+     * DSP thread priorities run from 1 to 255, with 1 being the highest thread priority.
+     * So a negative relative thread priority value will 'increase' the thread priority,
+     * a positive value will 'decrease' the thread priority.
+     */
+    int relative_thread_priority;
+};
+
+/**
+ * When a remote invocation does not return,
+ * call "remote_session_control" with the FASTRPC_REMOTE_PROCESS_KILL request ID
+ * and the appropriate remote domain ID. Once the remote process is successfully
+ * killed, and before attempting to create a new session, the user is expected to
+ * close all open handles for shared objects in the case of domains.
+ * The user is also expected to unload all shared objects, including
+ * libcdsprpc.so/libadsprpc.so/libmdsprpc.so/libsdsprpc.so, in the case of non-domains.
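+ *
+ * A minimal recovery sketch (editor's illustration, not part of the
+ * original SDK header; error handling elided):
+ *
+ *   struct remote_rpc_process_clean_params kp = { CDSP_DOMAIN_ID };
+ *   int err = remote_session_control(FASTRPC_REMOTE_PROCESS_KILL,
+ *                                    (void *)&kp, sizeof(kp));
+ *   // on success, close all open handles before creating a new session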
+ */ +struct remote_rpc_process_clean_params { + /** Domain ID to recover process */ + int domain; +}; + +/** + * Structure used for request ID `FASTRPC_SESSION_CLOSE` + * in remote session control interface + **/ +struct remote_rpc_session_close { + /** Remote subsystem domain ID, -1 to close all handles for all domains */ + int domain; +}; + +/** + * Structure used for request ID `FASTRPC_CONTROL_PD_DUMP` + * in remote session control interface + * This is used to enable/disable PD dump for userPDs on the DSP + **/ +struct remote_rpc_control_pd_dump { + /** Remote subsystem domain ID, -1 to set params for all domains */ + int domain; + /** Enable PD dump of user PD on the DSP */ + int enable; +}; + +/** + * Structure used for request ID `FASTRPC_REMOTE_PROCESS_EXCEPTION` + * in remote session control interface + * This is used to trigger exception in the userPDs running on the DSP + **/ +typedef struct remote_rpc_process_clean_params remote_rpc_process_exception; + +/** + * Process types + * Return values for FASTRPC_REMOTE_PROCESS_TYPE control req ID for remote_handle_control + * Return values denote the type of process on remote subsystem +**/ +enum fastrpc_process_type { + /** Signed PD running on the DSP */ + PROCESS_TYPE_SIGNED, + + /** Unsigned PD running on the DSP */ + PROCESS_TYPE_UNSIGNED, + +}; + +/** + * Structure for remote_session_control, + * used with FASTRPC_REMOTE_PROCESS_TYPE request ID + * to query the type of PD running defined by enum fastrpc_process_type + * @param[in] : Domain of process + * @param[out] : Process_type belonging to enum fastrpc_process_type + */ +struct remote_process_type { + /** @param[in] : Domain of process */ + int domain; + /** @param[out] : Process_type belonging to enum fastrpc_process_type */ + int process_type; +}; + +/** + * DSP user PD status notification flags + * Status flags for the user PD on the DSP returned by the status notification function + * +**/ +typedef enum remote_rpc_status_flags { + /** DSP user process is up */ + FASTRPC_USER_PD_UP, + + /** DSP user process exited */ + FASTRPC_USER_PD_EXIT, + + /** DSP user process forcefully killed. Happens when DSP resources needs to be freed. */ + FASTRPC_USER_PD_FORCE_KILL, + + /** Exception in the user process of DSP. */ + FASTRPC_USER_PD_EXCEPTION, + + /** Subsystem restart of the DSP, where user process is running. 
*/
+    FASTRPC_DSP_SSR,
+
+} remote_rpc_status_flags_t;
+
+/**
+ * fastrpc_notif_fn_t
+ * Notification callback function
+ *
+ * @param context, context used in the registration
+ * @param domain, domain of the user process
+ * @param session, session id of user process
+ * @param status, status of user process
+ * @retval, 0 on success
+ */
+typedef int (*fastrpc_notif_fn_t)(void *context, int domain, int session, remote_rpc_status_flags_t status);
+
+
+/**
+ * Structure for remote_session_control,
+ * used with FASTRPC_REGISTER_STATUS_NOTIFICATIONS request ID
+ * to receive status notifications of the user PD on the DSP
+**/
+typedef struct remote_rpc_notif_register {
+    /** @param[in] : Context of the client */
+    void *context;
+    /** @param[in] : DSP domain ADSP_DOMAIN_ID, SDSP_DOMAIN_ID, or CDSP_DOMAIN_ID */
+    int domain;
+    /** @param[in] : Notification function pointer */
+    fastrpc_notif_fn_t notifier_fn;
+} remote_rpc_notif_register_t;
+
+/**
+ * Structure for remote_session_control,
+ * used with FASTRPC_PD_INITMEM_SIZE request ID
+ * to set the signed userpd initial memory size
+ **/
+struct remote_rpc_pd_initmem_size {
+    /** Remote subsystem domain ID, pass -1 to set params for all domains **/
+    int domain;
+    /** Initial memory allocated for remote userpd, minimum value: 3MB, maximum value: 200MB **/
+    /** Unsupported for unsigned user PD; for unsigned user PD the init mem size is fixed at 5MB **/
+    uint32_t pd_initmem_size;
+};
+
+/**
+ * Structure for remote_session_control,
+ * used with FASTRPC_RESERVE_SESSION request ID
+ * to reserve a new fastrpc session of the user PD on the DSP.
+ * Default session is always 0 and remains available for any module opened without a Session ID.
+ * New session reservation starts with session ID 1.
+**/
+typedef struct remote_rpc_reserve_new_session {
+    /** @param[in] : Domain name of the DSP on which the session needs to be reserved */
+    char *domain_name;
+    /** @param[in] : Domain name length, without NULL character */
+    uint32_t domain_name_len;
+    /** @param[in] : Session name of the reserved session */
+    char *session_name;
+    /** @param[in] : Session name length, without NULL character */
+    uint32_t session_name_len;
+    /** @param[out] : Effective Domain ID is the identifier of the session.
+     * Effective Domain ID is the unique identifier representing the session(PD) on the DSP.
+     * Effective Domain ID needs to be used in place of Domain ID when an application has multiple sessions.
+     */
+    uint32_t effective_domain_id;
+    /** @param[out] : Session ID of the reserved session.
+     * An application can have multiple sessions(PDs) created on the DSP.
+     * session_id 0 is the default session. Clients can reserve sessions starting from 1.
+     * Currently only 2 sessions are supported: session_id 0 and session_id 1.
+     */
+    uint32_t session_id;
+} remote_rpc_reserve_new_session_t;
+
+/**
+ * Structure for remote_session_control,
+ * used with FASTRPC_GET_EFFECTIVE_DOMAIN_ID request ID
+ * to get the effective domain id of a fastrpc session on the user PD of the DSP
+**/
+typedef struct remote_rpc_effective_domain_id {
+    /** @param[in] : Domain name of DSP */
+    char *domain_name;
+    /** @param[in] : Domain name length, without NULL character */
+    uint32_t domain_name_len;
+    /** @param[in] : Session ID of the reserved session.
0 can be used for Default session */ + uint32_t session_id; + /** @param[out] : Effective Domain ID of session */ + uint32_t effective_domain_id; +} remote_rpc_effective_domain_id_t; + +/** + * Structure for remote_session_control, + * used with FASTRPC_GET_URI request ID + * to get the URI needed to load the module in the fastrpc user PD on the DSP +**/ +typedef struct remote_rpc_get_uri { + /** @param[in] : Domain name of DSP */ + char *domain_name; + /** @param[in] : Domain name length, without NULL character */ + uint32_t domain_name_len; + /** @param[in] : Session ID of the reserved session. 0 can be used for Default session */ + uint32_t session_id; + /** @param[in] : URI of the module, found in the auto-generated header file*/ + char *module_uri ; + /** @param[in] : Module URI length, without NULL character */ + uint32_t module_uri_len; + /** @param[out] : URI containing module, domain and session. + * Memory for uri need to be pre-allocated with session_uri_len size. + * Typically session_uri_len is 30 characters more than Module URI length. + * If size of uri is beyond session_uri_len, remote_session_control fails with AEE_EBADSIZE + */ + char *uri ; + /** @param[in] : URI length */ + uint32_t uri_len; +} remote_rpc_get_uri_t; + +/** + * Structure for remote_session_control, used with FASTRPC_CONTEXT_CREATE request, + * to create a multidomain fastrpc context +**/ +typedef struct fastrpc_context_create { + /** @param[in] : List of effective domain IDs on which session needs to be + created. Needs to be allocated and populated by user. + A new effective domain id CANNOT be added to an existing context. */ + uint32_t *effec_domain_ids; + + /** @param[in] : Number of domain ids. + Size of effective domain ID array. */ + uint32_t num_domain_ids; + + /** @param[in] : Type of create request (unused) */ + uint64_t flags; + + /** @param[out] : Multi-domain context handle */ + uint64_t ctx; +} fastrpc_context_create; + +/** struct to be used with FASTRPC_CONTEXT_DESTROY request ID */ +typedef struct fastrpc_context_destroy { + /** @param[in] : Fastrpc multi-domain context */ + uint64_t ctx; + + /** @param[in] : Type of destroy request (unused) */ + uint64_t flags; +} fastrpc_context_destroy; + +/** + * Structure used for request ID `FASTRPC_MAX_THREAD_PARAM` + * in remote session control interface, to set max threads for + * unsigned PD. + **/ +struct remote_rpc_set_max_thread { +/** @param[in] : CDSP_DOMAIN_ID */ + int domain; +/** @param[in] : Max thread config for unsigned PD Minimum value : 128, maximum value 256. 
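+ *
+ * A minimal usage sketch (editor's illustration, not part of the original
+ * SDK header), raising the limit before the first remote_handle_open():
+ *
+ *   struct remote_rpc_set_max_thread cfg = { CDSP_DOMAIN_ID, 192 };
+ *   int err = remote_session_control(FASTRPC_MAX_THREAD_PARAM,
+ *                                    (void *)&cfg, sizeof(cfg));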
*/ + unsigned int max_num_threads; +}; + +/** + * Request IDs for remote session control interface + **/ +enum session_control_req_id { + /** Reserved */ + FASTRPC_RESERVED_1, + + /** Set thread parameters like priority and stack size */ + FASTRPC_THREAD_PARAMS, + + /** Handle the unsigned module offload request, to be called before remote_handle_open() */ + DSPRPC_CONTROL_UNSIGNED_MODULE, + + /** Reserved */ + FASTRPC_RESERVED_2, + + /** To increase/decrease default thread priority */ + FASTRPC_RELATIVE_THREAD_PRIORITY, + + /** Reserved */ + FASTRPC_RESERVED_3, + + /** Kill remote process */ + FASTRPC_REMOTE_PROCESS_KILL, + + /** Close all open handles of requested domain */ + FASTRPC_SESSION_CLOSE, + + /** Enable PD dump feature */ + FASTRPC_CONTROL_PD_DUMP, + + /** Trigger Exception in the remote process */ + FASTRPC_REMOTE_PROCESS_EXCEPTION, + + /** Query type of process defined by enum fastrpc_process_type */ + FASTRPC_REMOTE_PROCESS_TYPE, + + /** Enable DSP User process status notifications */ + FASTRPC_REGISTER_STATUS_NOTIFICATIONS, + + /** Set signed userpd initial memory size */ + FASTRPC_PD_INITMEM_SIZE, + + /** Reserve new FastRPC session */ + FASTRPC_RESERVE_NEW_SESSION, + + /** Get effective domain ID of a FastRPC session */ + FASTRPC_GET_EFFECTIVE_DOMAIN_ID, + + /** Creates the URI needed to load a module in the DSP User PD */ + FASTRPC_GET_URI, + + /** Set max thread value for unsigned PD */ + FASTRPC_MAX_THREAD_PARAM, + + /** Create or attaches to remote session(s) on one or more domains */ + FASTRPC_CONTEXT_CREATE, + + /** Destroy or detach from remote sessions */ + FASTRPC_CONTEXT_DESTROY, +}; + + +/** + * Memory map control flags for using with remote_mem_map() and remote_mem_unmap() + **/ +enum remote_mem_map_flags { +/** + * Create static memory map on remote process with default cache configuration (writeback). + * Same remoteVirtAddr will be assigned on remote process when fastrpc call made with local virtual address. + * @Map lifetime + * Life time of this mapping is until user unmap using remote_mem_unmap or session close. + * No reference counts are used. Behavior of mapping multiple times without unmap is undefined. + * @Cache maintenance + * Driver clean caches when virtual address passed through RPC calls defined in IDL as a pointer. + * User is responsible for cleaning cache when remoteVirtAddr shared to DSP and accessed out of fastrpc method invocations on DSP. + * @recommended usage + * Map buffers which are reused for long time or until session close. This helps to reduce fastrpc latency. + * Memory shared with remote process and accessed only by DSP. + */ + REMOTE_MAP_MEM_STATIC, + +/** Update REMOTE_MAP_MAX_FLAG when adding new value to this enum **/ + }; + +/** + * @enum fastrpc_map_flags for fastrpc_mmap and fastrpc_munmap + * @brief Types of maps with cache maintenance + */ +enum fastrpc_map_flags { + /** + * Map memory pages with RW- permission and CACHE WRITEBACK. + * Driver will clean cache when buffer passed in a FastRPC call. + * Same remote virtual address will be assigned for subsequent + * FastRPC calls. + */ + FASTRPC_MAP_STATIC, + + /** Reserved for compatibility with deprecated flag */ + FASTRPC_MAP_RESERVED, + + /** + * Map memory pages with RW- permission and CACHE WRITEBACK. + * Mapping tagged with a file descriptor. User is responsible for + * maintenance of CPU and DSP caches for the buffer. Get virtual address + * of buffer on DSP using HAP_mmap_get() and HAP_mmap_put() functions. 
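+ *
+ * A minimal usage sketch (editor's illustration, not part of the original
+ * SDK header; buf_fd, buf_va and buf_len are hypothetical values obtained
+ * from the buffer allocator):
+ *
+ *   int err = fastrpc_mmap(CDSP_DOMAIN_ID, buf_fd, buf_va, 0, buf_len,
+ *                          FASTRPC_MAP_FD);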
+ */ + FASTRPC_MAP_FD, + + /** + * Mapping delayed until user calls HAP_mmap() and HAP_munmap() + * functions on DSP. User is responsible for maintenance of CPU and DSP + * caches for the buffer. Delayed mapping is useful for users to map + * buffer on DSP with other than default permissions and cache modes + * using HAP_mmap() and HAP_munmap() functions. + */ + FASTRPC_MAP_FD_DELAYED, + + /** Reserved for compatibility **/ + FASTRPC_MAP_RESERVED_4, + FASTRPC_MAP_RESERVED_5, + FASTRPC_MAP_RESERVED_6, + FASTRPC_MAP_RESERVED_7, + FASTRPC_MAP_RESERVED_8, + FASTRPC_MAP_RESERVED_9, + FASTRPC_MAP_RESERVED_10, + FASTRPC_MAP_RESERVED_11, + FASTRPC_MAP_RESERVED_12, + FASTRPC_MAP_RESERVED_13, + FASTRPC_MAP_RESERVED_14, + FASTRPC_MAP_RESERVED_15, + + /** + * This flag is used to skip CPU mapping, + * otherwise behaves similar to FASTRPC_MAP_FD_DELAYED flag. + */ + FASTRPC_MAP_FD_NOMAP, + + /** + * The below two flags work the same as FASTRPC_MAP_FD and FASTRPC_MAP_FD_DELAYED + * but allow the user to map into the extended address space on DSP + */ + + FASTRPC_MAP_FD_EXTENDED, + FASTRPC_MAP_FD_DELAYED_EXTENDED, + + /** Update FASTRPC_MAP_MAX when adding new value to this enum **/ +}; + +#define MAX_DOMAIN_NAMELEN 30 + +/* Position of domain type in flags */ +#define DOMAINS_LIST_FLAGS_TYPE_POS 5 + +/* Helper macro to set domain type in flags */ +#define DOMAINS_LIST_FLAGS_SET_TYPE(flags, type) (flags | (type & ((1 << DOMAINS_LIST_FLAGS_TYPE_POS) - 1))) + +/** + * @enum fastrpc_domain_type + * @brief Indicates the type of domains (DSPs) present in the system + */ +typedef enum { + /** Flag to be used to query list of all available domains */ + ALL_DOMAINS, + NSP, + LPASS, + SDSP, + MODEM, + HPASS, +} fastrpc_domain_type; + +/** + * @struct fastrpc_domain + * @brief Describes the details of each domain + */ +typedef struct { + /** + * @param : Logical domain id of the dsp. + * This can be used to query the capabilities of the dsp and + * can change with every reboot of device depending on the order + * of domain enumeration. + * This is NOT the same as effective domain id. To get the effective + * domain id of a particular session on this domain, pass the corresponding + * domain name with the `FASTRPC_GET_EFFECTIVE_DOMAIN_ID` request. + */ + int id; + + /** + * @param : Name of domain. + * To be appended with module uri while opening remote handle, + * or for querying the effective domain id on a specific session + * on this domain. + */ + char name[MAX_DOMAIN_NAMELEN]; + + /** + * @param : Type of DSP, of 'fastrpc_domain_type'. + */ + fastrpc_domain_type type; + + /** + * @param : Status of domain: 0 if domain is down + * non-zero if domain is up + */ + int status; + + /** + * @param : Card on which domain is present (for future use). + */ + uint32_t card; + + /** + * @param : SoC on which domain is present (for future use). + */ + uint32_t soc_id; +} fastrpc_domain; + +/** + * @struct fastrpc_domain_info + * @brief Structure used with 'FASTRPC_GET_DOMAINS' request id + * to query the list of available domains in the system. + */ +typedef struct { + /** + * @param[in/out] : Domains-info array pointer. + * Array needs to be allocated by client with size of array specified + * in 'max_domains'. Array will be populated by fastrpc with list of + * available domains. + * To query number of domains first, pass NULL pointer. + */ + fastrpc_domain *domains; + + /** + * @param[in] : Size of the 'domains' array allocated by user. 
+     * This has to be greater than or equal to the actual number of available
+     * domains. To query the number of domains first, pass 0 in this field.
+     */
+    int max_domains;
+
+    /**
+     * @param[out] : This field will be populated with the total number
+     * of available domains. While reading the domains-info in the array,
+     * read only until 'num_domains' elements.
+     */
+    int num_domains;
+
+    /**
+     * @param[in] : Bit-mask for the type of request, to be populated by the client.
+     * Bits 0-4 : Type of domains to be queried, of 'fastrpc_domain_type'.
+     * Only domains of this type will be returned in the 'domains' array.
+     * To get the list of all available domains, use the 'ALL_DOMAINS' type.
+     * Other bits are reserved for future use.
+     */
+    uint64_t flags;
+} fastrpc_domains_info;
+
+/**
+ * @enum system_req_id
+ * @brief Request ID to obtain information about available domains
+ */
+typedef enum {
+    /** Query list of available domains */
+    FASTRPC_GET_DOMAINS = 0
+} system_req_id;
+
+/**
+ * @struct system_req_payload
+ * @brief Payload for remote_system_request API
+ */
+typedef struct {
+    system_req_id id;
+    union {
+        fastrpc_domains_info sys;
+    };
+} system_req_payload;
+
+/**
+ * remote_system_request
+ * API to get system info like the list of available domains
+ * @param req, payload containing system info and request ID
+ * @return, 0 on Success
+ */
+int remote_system_request(system_req_payload *req);
+
+/**
+ * Attributes for remote_register_buf_attr/remote_register_buf_attr2
+ **/
+#define FASTRPC_ATTR_NONE 0 /** No attribute to set.*/
+#define FASTRPC_ATTR_NON_COHERENT 2 /** Attribute to map a buffer as dma non-coherent;
+                                        the driver performs cache maintenance.*/
+#define FASTRPC_ATTR_COHERENT 4 /** Attribute to map a buffer as dma coherent;
+                                        the driver skips cache maintenance.
+                                        It will be ignored if a device is marked as dma-coherent in the device tree.*/
+#define FASTRPC_ATTR_KEEP_MAP 8 /** Attribute to keep the buffer persistent
+                                        until unmap is called explicitly.*/
+#define FASTRPC_ATTR_NOMAP 16 /** Attribute for secure buffers to skip
+                                        smmu mapping in the fastrpc driver*/
+#define FASTRPC_ATTR_FORCE_NOFLUSH 32 /** Attribute to map a buffer such that flush by the driver is skipped for that particular buffer;
+                                        the client has to perform cache maintenance*/
+#define FASTRPC_ATTR_FORCE_NOINVALIDATE 64 /** Attribute to map a buffer such that invalidate by the driver is skipped for that particular buffer;
+                                        the client has to perform cache maintenance */
+#define FASTRPC_ATTR_TRY_MAP_STATIC 128 /** Attribute for persistently mapping a buffer
+                                        to the remote DSP process during buffer registration
+                                        with the FastRPC driver. This buffer will be automatically
+                                        mapped during fastrpc session open and unmapped either
+                                        at unregister or session close. The FastRPC library tries
+                                        to map buffers and ignores errors in case of failure.
+                                        Pre-mapping a buffer reduces the FastRPC latency.
+                                        This flag is recommended only for buffers used with
+                                        latency critical rpc calls */
+
+
+/**
+ * REMOTE_MODE_PARALLEL used with remote_set_mode
+ * This is the default mode for the driver. While the driver is in parallel
+ * mode it will try to invalidate output buffers after it transfers control
+ * to the dsp. This allows the invalidate operations to overlap with the
+ * dsp processing the call. This mode should be used when output buffers
+ * are only read on the application processor and only written on the aDSP.
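+ *
+ * A minimal usage sketch (editor's illustration, not part of the original
+ * SDK header); REMOTE_MODE_SERIAL is the safer choice when the CPU also
+ * writes to output buffers:
+ *
+ *   int err = remote_set_mode(REMOTE_MODE_SERIAL);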
+ */
+#define REMOTE_MODE_PARALLEL 0
+
+/**
+ * REMOTE_MODE_SERIAL used with remote_set_mode
+ * When operating in SERIAL mode the driver will invalidate output buffers
+ * before calling into the dsp. This mode should be used when output
+ * buffers have been written to somewhere besides the aDSP.
+ */
+#define REMOTE_MODE_SERIAL 1
+
+
+#ifdef _WIN32
+#include "remote_wos_ext.h" /** For function pointers of remote APIs */
+#endif
+
+
+/**
+ * remote_handle()_open
+ * Opens a remote_handle "name"
+ * returns 0 on success
+ **/
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle_open)(__QAIC_IN_CHAR const char* name, __QAIC_OUT remote_handle *ph) __QAIC_REMOTE_ATTRIBUTE;
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle64_open)( __QAIC_IN_CHAR const char* name, __QAIC_OUT remote_handle64 *ph) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * invokes the remote handle
+ * see the retrieve macros above for the dwScalars format
+ * pra, contains the arguments in the following order: inbufs, outbufs, inhandles, outhandles.
+ * implementors should ignore and pass along, as is, any values that the transport doesn't understand.
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle_invoke)(__QAIC_IN remote_handle h, __QAIC_IN uint32_t dwScalars, __QAIC_IN remote_arg *pra) __QAIC_REMOTE_ATTRIBUTE;
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle64_invoke)(__QAIC_IN remote_handle64 h, __QAIC_IN uint32_t dwScalars, __QAIC_IN remote_arg *pra) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * remote_handle()_close
+ * closes the remote handle
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle_close)(__QAIC_IN remote_handle h) __QAIC_REMOTE_ATTRIBUTE;
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle64_close)(__QAIC_IN remote_handle64 h) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * remote_handle_control
+ * Set remote handle control parameters
+ *
+ * @param req, request ID defined by handle_control_req_id
+ * @param data, address of structure with parameters
+ * @param datalen, length of data
+ * @retval, 0 on success
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle_control)(__QAIC_IN uint32_t req, __QAIC_IN_LEN(datalen) void* data, __QAIC_IN uint32_t datalen) __QAIC_REMOTE_ATTRIBUTE;
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle64_control)(__QAIC_IN remote_handle64 h, __QAIC_IN uint32_t req, __QAIC_IN_LEN(datalen) void* data, __QAIC_IN uint32_t datalen) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * remote_session_control
+ * Set remote session parameters
+ *
+ * @param req, request ID
+ * @param data, address of structure with parameters
+ * @param datalen, length of data
+ * @retval, 0 on success
+ * For remote_session_control with the FASTRPC_REMOTE_PROCESS_KILL req ID, the possible error codes
+ * are AEE_ENOSUCH, AEE_EBADPARM, AEE_EINVALIDDOMAIN. Error codes other than these are treated as
+ * returned from the FastRPC framework.
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_session_control)(__QAIC_IN uint32_t req, __QAIC_IN_LEN(datalen) void *data, __QAIC_IN uint32_t datalen) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * remote_handle()_invoke_async
+ * invokes the remote handle asynchronously
+ *
+ * desc, descriptor containing the type of async job, context and callback function (if any)
+ * see the retrieve macros above for the dwScalars format
+ * pra, contains the arguments in the following order: inbufs, outbufs, inhandles, outhandles.
+ * all outbufs need to be either allocated using rpcmem_alloc or registered ION buffers using register_buf
+ * implementors should ignore and pass along, as is, any values that the transport doesn't understand.
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle_invoke_async)(__QAIC_IN remote_handle h, __QAIC_IN fastrpc_async_descriptor_t *desc, __QAIC_IN uint32_t dwScalars, __QAIC_IN remote_arg *pra) __QAIC_REMOTE_ATTRIBUTE;
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle64_invoke_async)(__QAIC_IN remote_handle64 h, __QAIC_IN fastrpc_async_descriptor_t *desc, __QAIC_IN uint32_t dwScalars, __QAIC_IN remote_arg *pra) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * fastrpc_async_get_status
+ * Get the status of an async job. This can be used to query the status of an async job
+ *
+ * @param jobid, jobid returned during async job submission.
+ * @param timeout_us, timeout in microseconds
+ *                    timeout = 0, returns immediately with status/result
+ *                    timeout > 0, waits for the specified time and then returns with status/result
+ *                    timeout < 0, waits indefinitely until the job completes
+ * @param result, integer pointer for the result of the job
+ *                0 on success
+ *                error code on failure
+ * @retval, 0 on job completion, and the result of the job is part of @param result
+ *          AEE_EBUSY, if the job status is pending and is not returned from the DSP
+ *          AEE_EBADPARM, if the job id is invalid
+ *          AEE_EFAILED, FastRPC internal error
+ *
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(fastrpc_async_get_status)(__QAIC_IN fastrpc_async_jobid jobid,__QAIC_IN int timeout_us,__QAIC_OUT int *result);
+
+
+/**
+ * fastrpc_release_async_job
+ * Release an async job after receiving its status either through callback/poll
+ *
+ * @param jobid, jobid returned during async job submission.
+ * @retval, 0 on success
+ *          AEE_EBUSY, if the job status is pending and is not returned from the DSP
+ *          AEE_EBADPARM, if the job id is invalid
+ *
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(fastrpc_release_async_job)(__QAIC_IN fastrpc_async_jobid jobid);
+
+
+/**
+ * remote_mmap
+ * map memory to the remote domain
+ *
+ * @param fd, fd associated with this memory
+ * @param flags, flags to be used for the mapping
+ * @param vaddrin, input address
+ * @param size, size of buffer
+ * @param vaddrout, output address
+ * @retval, 0 on success
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_mmap)(__QAIC_IN int fd, __QAIC_IN uint32_t flags, __QAIC_IN uint32_t vaddrin, __QAIC_IN int size, __QAIC_OUT uint32_t* vaddrout) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * remote_munmap
+ * unmap memory from the remote domain
+ *
+ * @param vaddrout, remote address mapped
+ * @param size, size to unmap. Unmapping a range partially may not be supported.
+ * @retval, 0 on success, may fail if memory is still mapped
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_munmap)(__QAIC_IN uint32_t vaddrout, __QAIC_IN int size) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * remote_mem_map
+ * Map memory to the remote process on a selected DSP domain
+ *
+ * @domain: DSP domain ID. Use -1 for the default domain.
+ *          The default domain is selected based on the lib(a/m/s/c)dsprpc.so library linked to the application.
+ * @fd: file descriptor of memory
+ * @flags: enum remote_mem_map_flags type of flag
+ * @virtAddr: virtual address of buffer
+ * @size: buffer length
+ * @remoteVirtAddr[out]: remote process virtual address
+ * @retval, 0 on success
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_mem_map)(__QAIC_IN int domain, __QAIC_IN int fd, __QAIC_IN int flags, __QAIC_IN uint64_t virtAddr, __QAIC_IN size_t size, __QAIC_OUT uint64_t* remoteVirtAddr) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * remote_mem_unmap
+ * Unmap memory from the remote process on a selected DSP domain
+ *
+ * @domain: DSP domain ID. Use -1 for the default domain. Get the domain from a multi-domain handle if required.
+ * @remoteVirtAddr: remote process virtual address received from remote_mem_map
+ * @size: buffer length
+ * @retval, 0 on success
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_mem_unmap)(__QAIC_IN int domain, __QAIC_IN uint64_t remoteVirtAddr, __QAIC_IN size_t size) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * remote_mmap64
+ * map memory to the remote domain
+ *
+ * @param fd, fd associated with this memory
+ * @param flags, flags to be used for the mapping
+ * @param vaddrin, input address
+ * @param size, size of buffer
+ * @param vaddrout, output address
+ * @retval, 0 on success
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_mmap64)(__QAIC_IN int fd, __QAIC_IN uint32_t flags, __QAIC_IN __QAIC_INT64PTR vaddrin, __QAIC_IN int64_t size, __QAIC_OUT __QAIC_INT64PTR *vaddrout) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * remote_munmap64
+ * unmap memory from the remote domain
+ *
+ * @param vaddrout, remote address mapped
+ * @param size, size to unmap. Unmapping a range partially may not be supported.
+ * @retval, 0 on success, may fail if memory is still mapped
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_munmap64)(__QAIC_IN __QAIC_INT64PTR vaddrout, __QAIC_IN int64_t size) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * fastrpc_mmap
+ * Creates a mapping on the remote process for an ION buffer with a file descriptor. A new fastrpc session
+ * will be opened if not already opened for the domain.
+ *
+ * @param domain, DSP domain ID of a fastrpc session
+ * @param fd, ION memory file descriptor
+ * @param addr, buffer virtual address on the cpu
+ * @param offset, offset from the beginning of the buffer
+ * @param length, size of buffer in bytes
+ * @param flags, controls mapping functionality on the DSP. Refer to the fastrpc_map_flags enum definition for more information.
+ *
+ * @return, 0 on success, error code on failure.
+ *          AEE_EALREADY Buffer already mapped. Multiple mappings for the same buffer are not supported.
+ *          AEE_EBADPARM Bad parameters
+ *          AEE_EFAILED Failed to map buffer
+ *          AEE_ENOMEMORY Out of memory (internal error)
+ *          AEE_EUNSUPPORTED Unsupported API on the target
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(fastrpc_mmap)(__QAIC_IN int domain, __QAIC_IN int fd, __QAIC_IN void *addr, __QAIC_IN int offset, __QAIC_IN size_t length, __QAIC_IN enum fastrpc_map_flags flags)__QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * fastrpc_munmap
+ * Removes a mapping associated with a file descriptor.
+ *
+ * @param domain, DSP domain ID of a fastrpc session
+ * @param fd, file descriptor
+ * @param addr, buffer virtual address used for mapping creation
+ * @param length, buffer length
+ *
+ * @return, 0 on success, error code on failure.
+ * AEE_EBADPARM Bad parameters + * AEE_EINVALIDFD Mapping not found for specified fd + * AEE_EFAILED Failed to map buffer + * AEE_EUNSUPPORTED Unsupported API on the target + */ +__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(fastrpc_munmap)(__QAIC_IN int domain, __QAIC_IN int fd, __QAIC_IN void *addr, __QAIC_IN size_t length)__QAIC_REMOTE_ATTRIBUTE; + + +/** + * remote_register_buf/remote_register_buf_attr + * Register a file descriptor for a buffer. + * Users of fastrpc should register zero-copy buffer to enable + * sharing that buffer to the dsp via the SMMU. The API is limited + * to register buffer less than 2 GB only. Recommendation is to use + * remote_register_buf_attr2 instead. API remote_register_buf_attr2 + * can now accept size up to 2 power(8*sizeof(size_t)). + * + * Some versions of libcdsprpc.so lack this + * function, so users should set this symbol as weak. + * + * #pragma weak remote_register_buf + * #pragma weak remote_register_buf_attr + * + * @param buf, virtual address of the buffer + * @param size, size of the buffer + * @fd, the file descriptor, callers can use -1 to deregister. + * @attr, map buffer as coherent or non-coherent + */ +__QAIC_REMOTE_EXPORT __QAIC_RETURN void __QAIC_REMOTE(remote_register_buf)(__QAIC_IN_LEN(size) void* buf, __QAIC_IN int size, __QAIC_IN int fd) __QAIC_REMOTE_ATTRIBUTE; +__QAIC_REMOTE_EXPORT __QAIC_RETURN void __QAIC_REMOTE(remote_register_buf_attr)(__QAIC_IN_LEN(size) void* buf, __QAIC_IN int size, __QAIC_IN int fd, __QAIC_IN int attr) __QAIC_REMOTE_ATTRIBUTE; + + +/** + * remote_register_buf_attr2 + * Register a file descriptor for a buffer. Users of fastrpc should + * register zero-copy buffer to enable sharing that buffer to the + * dsp via the SMMU. + * + * Some versions of libcdsprpc.so lack this + * function, so users should set this symbol as weak. + * + * #pragma weak remote_register_buf_attr2 + * + * @param buf, virtual address of the buffer + * @param size, size of the buffer + * @fd, the file descriptor, callers can use -1 to deregister. + * @attr, setting attribute for the mapped buffer + * refer to "Attributes for remote_register_buf_attr/remote_register_buf_attr2" + * to set the required attribute value. + */ +__QAIC_REMOTE_EXPORT __QAIC_RETURN void __QAIC_REMOTE(remote_register_buf_attr2)(__QAIC_IN_LEN(size) void* buf, __QAIC_IN size_t size, __QAIC_IN int fd, __QAIC_IN int attr) __QAIC_REMOTE_ATTRIBUTE; + + +/** + * remote_register_dma_handle/remote_register_dma_handle_attr + * Register a dma handle with fastrpc. + * This is only valid on Android with ION allocated memory. + * Users of fastrpc should register a file descriptor allocated with + * ION to enable sharing that memory to the dsp via the SMMU. + * + * Some versions of libadsprpc.so lack this function, + * so users should set this symbol as weak. + * + * #pragma weak remote_register_dma_handle + * #pragma weak remote_register_dma_handle_attr + * + * @fd, the file descriptor, callers can use -1 to deregister. + * @param len, size of the buffer + * @attr, map buffer as coherent or non-coherent or no-map + */ +__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_register_dma_handle)(__QAIC_IN int fd,__QAIC_IN uint32_t len) __QAIC_REMOTE_ATTRIBUTE; +__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_register_dma_handle_attr)(__QAIC_IN int fd,__QAIC_IN uint32_t len,__QAIC_IN uint32_t attr) __QAIC_REMOTE_ATTRIBUTE; + + +/** + * remote_set_mode + * Set the mode of operation. 
+ *
+ * Some versions of libadsprpc.so lack this function,
+ * so users should set this symbol as weak.
+ *
+ * #pragma weak remote_set_mode
+ *
+ * @param mode, the mode
+ * @retval, 0 on success
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_set_mode)(__QAIC_IN uint32_t mode) __QAIC_REMOTE_ATTRIBUTE;
+
+
+/**
+ * remote_register_fd
+ * Register a file descriptor.
+ * This can be used when users do not have a mapping to pass to the
+ * RPC layer. The generated address is a mapping with PROT_NONE, any
+ * access to this memory will fail, so it should only be used as an
+ * ID to identify this file descriptor to the RPC layer. This API is
+ * limited to buffer sizes less than 2 GB. The recommendation is to use
+ * remote_register_fd2 for larger buffers, up to 2 power(8*sizeof(size_t)).
+ *
+ * To deregister use remote_register_buf(addr, size, -1).
+ *
+ * #pragma weak remote_register_fd
+ *
+ * @param fd, the file descriptor.
+ * @param size, size of the buffer
+ * @retval, (void*)-1 on failure, address on success.
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN void *__QAIC_REMOTE(remote_register_fd)(__QAIC_IN int fd,__QAIC_IN int size) __QAIC_REMOTE_ATTRIBUTE;
+
+/**
+ * remote_register_fd2
+ * Register a file descriptor.
+ * This can be used when users do not have a mapping to pass to
+ * the RPC layer. The generated address is a mapping with PROT_NONE,
+ * any access to this memory will fail, so it should only be used
+ * as an ID to identify this file descriptor to the RPC layer.
+ *
+ * To deregister use remote_register_buf(addr, size, -1).
+ *
+ * #pragma weak remote_register_fd2
+ *
+ * @param fd, the file descriptor.
+ * @param size, size of the buffer
+ * @retval, (void*)-1 on failure, address on success.
+ */
+__QAIC_REMOTE_EXPORT __QAIC_RETURN void *__QAIC_REMOTE(remote_register_fd2)(__QAIC_IN int fd,__QAIC_IN size_t size) __QAIC_REMOTE_ATTRIBUTE;
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /// REMOTE_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote.idl b/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote.idl
new file mode 100755
index 0000000000000..65e5b162660b6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote.idl
@@ -0,0 +1,32 @@
+interface remote_handle64 {
+   /**
+    * Opens the handle in the specified domain. If this is the first
+    * handle, this creates the session. Typically this means opening
+    * the device, aka open("/dev/adsprpc-smd"), then calling ioctl
+    * device APIs to create a PD on the DSP to execute our code in,
+    * then asking that PD to dlopen the .so and dlsym the skel function.
+    *
+    * @param uri, _URI"&_dom=aDSP"
+    *             _URI is a QAIC generated uri, or
+    *             "file:///<sofilename>?<interface>_skel_handle_invoke&_modver=1.0"
+    *             If the _dom parameter is not present, _dom=DEFAULT is assumed
+    *             but not forwarded.
+    *             Reserved uri keys:
+    *               [0]: first unnamed argument is the skel invoke function
+    *               _dom: execution domain name, _dom=mDSP/aDSP/DEFAULT
+    *               _modver: module version, _modver=1.0
+    *               _*: any other key name starting with an _ is reserved
+    *             Unknown uri keys/values are forwarded as is.
+    * @param h, resulting handle
+    * @retval, 0 on success
+    */
+   long open(in string uri, rout remote_handle64 h);
+   /**
+    * Closes a handle. If this is the last handle to close, the session
+    * is closed as well, releasing all the allocated resources.
+
+    * @param h, the handle to close
+    * @retval, 0 on success, should always succeed
+    */
+   long close(in remote_handle64 h);
+};
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote.md
new file mode 100755
index 0000000000000..0c3d9daca2aca
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote.md
@@ -0,0 +1,120 @@
+# Remote session API to interface with FastRPC
+
+
+## Overview
+
+FastRPC exposes a set of APIs enabling the following functionality:
+
+ - open, configure and close a remote session on the DSP
+ - enable unsigned PD offload to the compute DSP
+ - enable and manage QoS mode
+ - make synchronous or asynchronous remote calls
+ - query DSP capabilities
+ - map or unmap pages onto the DSP
+
+The 64-bit version of the API (`handle64`) enables multi-domain modules.
+It is recommended for applications to use the multi-domain framework,
+which provides multiple benefits over the older single-domain framework. The remote_handle_*
+APIs should be used for single-domain applications. For more information on multi-domain support,
+refer to the RPC section in the Hexagon SDK documentation.
+
+# `remote_handle_open`, `remote_handle64_open`
+Loads the shared object on the remote process domain.
+
+# `remote_handle_invoke`, `remote_handle64_invoke`
+Executes a method on the remote domain.
+
+# `remote_handle_close`, `remote_handle64_close`
+Closes a remote handle opened with the corresponding open call.
+
+# `remote_handle_control`, `remote_handle64_control`
+Manages the remote session.
+This API allows the user to control or query the remote session:
+- Control latency
+ The user can vote for a specific latency requirement per session. This latency is not guaranteed by the driver. The driver will try to
+ meet this requirement with the options available on a given target. Based on the arguments, either PM-QoS [Power Management] or adaptive
+ QoS can be enabled.
+
+ PM-QoS is recommended for latency-sensitive use cases, whereas adaptive QoS is recommended for moderately latency-sensitive use cases.
+ Adaptive QoS is more power-efficient than PM-QoS.
+
+ If PM-QoS is enabled, CPU low-power modes will be disabled.
+
+ If adaptive QoS is enabled, the remote DSP starts keeping track of the method execution times for that process. Once enough data is available,
+ the DSP will try to predict when the method will finish executing and will send a "ping" to wake up the CPU prior to the completion of the
+ DSP task so that there is no extra overhead due to CPU wakeup time.
+
+- Enable wake lock
+ Keeps the CPU up until a response from the remote invocation call is received. Disabling the wake-lock feature allows the CPU to enter suspend mode.
+
+- Query DSP Capabilities
+ Queries DSP support for:
+
+ * domains available
+ * unsigned PD
+ * HVX, VTCM, HMX
+ * async FastRPC
+ * remote PD status notification
+
+- Get DSP domain
+ Returns the current DSP domain.
+
+# `remote_session_control`
+
+Sets remote session parameters such as thread stack size or unsigned PD mode. It can also kill a remote process, close sessions on the DSP,
+generate a PD dump, or trigger remote process exceptions.
+
+- [Stack thread parameters](structremote__rpc__thread__params.html)
+ Parameters to configure a thread: priority and stack size. + +- [Unsigned PD](structremote__rpc__control__unsigned__module.html)
+ Flag to configure the session as unsigned. This allows third-party applications to run
+ compute-intensive tasks on the compute DSP for better performance.
+
+- [Kill remote process](structremote__rpc__process__clean__params.html)
+ Kills the remote process running on the DSP. + +- [Session close](structremote__rpc__session__close.html)
+ Closes all sessions open on a given domain. + +- [PD dump](structremote__rpc__control__pd__dump.html)
+ Enables PD dump feature. + +- [Remote process exception](structremote__rpc__process__clean__params.html)
+ Introduces an exception in the remote process. + +- [Query process type](structremote__process__type.html)
+ Queries the type of process (signed or unsigned) running on the remote DSP.
+
+- [Relative thread priority](structremote__rpc__relative__thread__priority.html)
+ Set a lower or higher priority than the default thread priority, for the user threads on the DSP. + +# `fastrpc_mmap`, `fastrpc_munmap` +Creates a new mapping of an ION memory into the virtual address space of a remote process on the DSP and associates the mapping with the +provided file descriptor. The parameter `flags` of type `fastrpc_map_flags` allows the user to control the page permissions and other +properties of the memory map. These mappings can be destroyed with `fastrpc_munmap()` API using the file descriptor. APIs `fastrpc_mmap` +and `fastrpc_munmap` are available and their use is recommended for Lahaina and later chipsets. + +# `remote_mem_map`, `remote_mem_unmap` +Maps/unmaps large buffers statically on a given DSP. +Mapping the buffers statically saves the latency for the corresponding remote calls associated with these buffers. +These APIs are available only on SM8250 (Kona) or later targets. + +# `remote_handle_invoke_async`, `remote_handle64_invoke_async` +Make remote invocations asynchronous. Running asynchronously does not improve the latency but improves the throughput by enabling the DSP +to run successive tasks continuously. This feature is supported on Lahaina and onward targets. + +# `fastrpc_async_get_status` +Queries the status of the asynchronous job. + +# `fastrpc_release_async_job` +Releases the asynchronous job after receiving the status either through callback or poll. + +# `remote_register_buf`, `remote_register_buf_attr` +Registers a file descriptor for a buffer allocated with ION memory to share the memory with the DSP via SMMU. + +# `remote_register_dma_handle`, `remote_register_dma_handle_attr` +Registers a DMA handle allocated with ION memory to share the memory with the DSP via SMMU. + +Header file: @b remote.h + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote64.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote64.h new file mode 100755 index 0000000000000..07cf3d5d6e769 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/remote64.h @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2014, 2022 Qualcomm Technologies, Inc. + * All Rights Reserved. + * Confidential and Proprietary - Qualcomm Technologies, Inc. + */ +#ifndef REMOTE64_H +#define REMOTE64_H + +#include "remote.h" + +/* +All the functions declared here are moved to remote.h, remote64.h will be deleted in future. +*/ +#endif // REMOTE64_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStd.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStd.md new file mode 100755 index 0000000000000..3a02dd209389b --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStd.md @@ -0,0 +1,11 @@ +# Standard definitions and error codes + +## Standard definitions + +AEEStdDef.h contains definitions of common data types used on the Hexagon DSPs and the application processor. It also has definitions of MIN, MAX values for common data types. + +## Standard error codes + +AEEStdErr.h file contains error codes returned by functions running on the DSPs and the application processor. +The application invoking these APIs is expected to check for the error codes and implement appropriate error handling. Each API in the header files will have required documentation on the error codes it can return. 
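+
+As an illustrative sketch (editor's example: `AEE_SUCCESS` comes from
+AEEStdErr.h, while `my_dsp_call` is a hypothetical stub function), a caller
+is expected to check the returned code explicitly:
+
+```c
+#include "AEEStdErr.h"
+
+int my_dsp_call(void);                  // hypothetical remote stub
+
+int run(void) {
+    int nErr = my_dsp_call();           // returns an AEEStdErr code
+    if (nErr != AEE_SUCCESS) {
+        return nErr;                    // log or map the error, then propagate
+    }
+    return AEE_SUCCESS;
+}
+```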
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStdDef.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStdDef.h
new file mode 100755
index 0000000000000..fe252dd6f0590
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStdDef.h
@@ -0,0 +1,464 @@
+#ifndef AEESTDDEF_H
+#define AEESTDDEF_H
+/*
+=======================================================================
+
+FILE: AEEStdDef.h
+
+DESCRIPTION: definition of basic types, constants,
+             preprocessor macros
+
+=======================================================================
+*/
+/*==============================================================================
+ Copyright (c) 2005,2007,2012-2013, 2020 Qualcomm Technologies, Inc.
+ All rights reserved.
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its contributors
+ may be used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+==============================================================================*/
+
+#include <stdint.h>
+
+#if defined(COMDEF_H) /* guards against a known re-definer */
+#define _BOOLEAN_DEFINED
+#define _UINT32_DEFINED
+#define _UINT16_DEFINED
+#define _UINT8_DEFINED
+#define _INT32_DEFINED
+#define _INT16_DEFINED
+#define _INT8_DEFINED
+#define _UINT64_DEFINED
+#define _INT64_DEFINED
+#define _BYTE_DEFINED
+#endif /* #if defined(COMDEF_H) */
+
+/* -----------------------------------------------------------------------
+** Standard Types
+** ----------------------------------------------------------------------- */
+
+/* The following definitions are the same across platforms. This first
+** group are the sanctioned types.
+*/
+/** @defgroup stddef standard data type definitions
+* @{
+*/
+#ifndef _BOOLEAN_DEFINED
+typedef unsigned char boolean; /**< Boolean value type.
*/ +#define _BOOLEAN_DEFINED +#endif + +#ifndef _UINT32_DEFINED +typedef uint32_t uint32; /**< Unsigned 32-bit value */ +#define _UINT32_DEFINED +#endif + +#ifndef _UINT16_DEFINED +typedef unsigned short uint16; /**< Unsigned 16-bit value */ +#define _UINT16_DEFINED +#endif + +#ifndef _UINT8_DEFINED +typedef unsigned char uint8; /**< Unsigned 8-bit value */ +#define _UINT8_DEFINED +#endif + +#ifndef _INT32_DEFINED +typedef int32_t int32; /**< Signed 32-bit value */ +#define _INT32_DEFINED +#endif + +#ifndef _INT16_DEFINED +typedef signed short int16; /**< Signed 16-bit value */ +#define _INT16_DEFINED +#endif + +#ifndef _INT8_DEFINED +typedef signed char int8; /**< Signed 8-bit value */ +#define _INT8_DEFINED +#endif + +#ifndef _INT64_DEFINED +#if defined(__GNUC__) +#define __int64 long long +#endif +typedef __int64 int64; /**< Signed 64-bit value */ +#define _INT64_DEFINED +#endif + +#ifndef _UINT64_DEFINED +typedef unsigned __int64 uint64; /**< Unsigned 64-bit value */ +#define _UINT64_DEFINED +#endif + +#ifndef _BYTE_DEFINED +typedef unsigned char byte; /**< byte type */ +#define _BYTE_DEFINED +#endif + +/** + * @} + */ + + /** @defgroup stdret standard return values +* @{ +*/ + +//! @cond Doxygen_Suppress +#ifndef _AEEUID_DEFINED +typedef uint32 AEEUID; +#define _AEEUID_DEFINED +#endif + +#ifndef _AEEIID_DEFINED +typedef uint32 AEEIID; +#define _AEEIID_DEFINED +#endif + +#ifndef _AEECLSID_DEFINED +typedef uint32 AEECLSID; +#define _AEECLSID_DEFINED +#endif + +#ifndef _AEEPRIVID_DEFINED +typedef uint32 AEEPRIVID; +#define _AEEPRIVID_DEFINED +#endif + +#ifndef _AECHAR_DEFINED +typedef uint16 AECHAR; +#define _AECHAR_DEFINED +#endif +//! @endcond + +/** + * @brief Return value of functions indicating success or failure. return value 0 indicates success. A non zero value indicates a failure. Any data in rout parameters is not propagated back. + */ +#ifndef _AEERESULT_DEFINED +typedef int AEEResult; +#define _AEERESULT_DEFINED +#endif + +/** + * @} + */ + + +/* ----------------------------------------------------------------------- +** Function Calling Conventions +** ----------------------------------------------------------------------- */ + +#ifndef CDECL +#ifdef _MSC_VER +#define CDECL __cdecl +#else +#define CDECL +#endif /* _MSC_VER */ +#endif /* CDECL */ + +/* ----------------------------------------------------------------------- +** Constants +** ----------------------------------------------------------------------- */ + /** @defgroup stdminmax Standard Min and Max for all data types +* @{ +*/ + +#ifndef TRUE +#define TRUE 1 /**< Boolean true value. */ +#endif + +#ifndef FALSE +#define FALSE 0 /**< Boolean false value. */ +#endif + +#ifndef NULL +#define NULL 0 /**< NULL = 0. 
*/ +#endif + +#ifndef MIN_INT8 +#define MIN_INT8 -128 /**< MIN 8-bit integer */ +#endif +#ifndef MIN_INT16 +#define MIN_INT16 -32768 /**< MIN 16-bit integer */ +#endif +#ifndef MIN_INT32 +#define MIN_INT32 (~0x7fffffff) /**< MIN 32-bit unsigned */ +#endif +#ifndef MIN_INT64 +#define MIN_INT64 (~0x7fffffffffffffffLL) /**< MIN 64-bit integer */ +#endif + +#ifndef MAX_INT8 +#define MAX_INT8 127 /**< MAX 8-bit integer */ +#endif +#ifndef MAX_INT16 +#define MAX_INT16 32767 /**< MAX 16-bit integer */ +#endif +#ifndef MAX_INT32 +#define MAX_INT32 2147483647 /**< MAX 32-bit integer */ +#endif +#ifndef MAX_INT64 +#define MAX_INT64 9223372036854775807LL /**< MAX 64-bit integer */ +#endif + +#ifndef MAX_UINT8 +#define MAX_UINT8 255 /**< MAX 8-bit unsigned integer */ +#endif +#ifndef MAX_UINT16 +#define MAX_UINT16 65535 /**< MAX 16-bit unsigned integer */ +#endif +#ifndef MAX_UINT32 +#define MAX_UINT32 4294967295u /**< MAX 32-bit unsigned integer */ +#endif +#ifndef MAX_UINT64 +#define MAX_UINT64 18446744073709551615uLL /**< MAX 64-bit unsigned integer */ +#endif + +//! @cond Doxygen_Suppress +#ifndef MIN_AECHAR +#define MIN_AECHAR 0 +#endif + +#ifndef MAX_AECHAR +#define MAX_AECHAR 65535 +#endif + +//! @endcond + +/** + * @} + */ + +/* ----------------------------------------------------------------------- +** Preprocessor helpers +** ----------------------------------------------------------------------- */ +#define __STR__(x) #x +#define __TOSTR__(x) __STR__(x) +#define __FILE_LINE__ __FILE__ ":" __TOSTR__(__LINE__) + +/* ----------------------------------------------------------------------- +** Types for code generated from IDL +** ----------------------------------------------------------------------- */ + + /** @defgroup QIDL data types +* @{ +*/ +//! @cond Doxygen_Suppress +#ifndef __QIDL_WCHAR_T_DEFINED__ +#define __QIDL_WCHAR_T_DEFINED__ +typedef uint16 _wchar_t; +#endif + + +/* __STRING_OBJECT__ will be deprecated in the future */ + + +#if !defined(__QIDL_STRING_OBJECT_DEFINED__) && !defined(__STRING_OBJECT__) +#define __QIDL_STRING_OBJECT_DEFINED__ +#define __STRING_OBJECT__ + +/** + * @brief This structure is used to represent an IDL string when used inside a + sequence or union. + */ +typedef struct _cstring_s { + char* data; + int dataLen; + int dataLenReq; +} _cstring_t; + +/** + * @brief This structure is used to represent an IDL wstring when used inside a + sequence or union. + */ + +typedef struct _wstring_s { + _wchar_t* data; + int dataLen; + int dataLenReq; +} _wstring_t; +#endif /* __QIDL_STRING_OBJECT_DEFINED__ */ +//! @endcond +/** + * @} + */ +/* +======================================================================= + DATA STRUCTURES DOCUMENTATION +======================================================================= + +======================================================================= + +AEEUID + +Description: + This is a BREW unique ID. Used to express unique types, interfaces, classes + groups and privileges. The BREW ClassID Generator generates + unique IDs that can be used anywhere you need a new AEEIID, AEECLSID, + or AEEPRIVID. + +Definition: + typedef uint32 AEEUID + +======================================================================= + +AEEIID + +Description: + This is an interface ID type, used to denote a BREW interface. It is a special case + of AEEUID. + +Definition: + typedef uint32 AEEIID + +======================================================================= + +AEECLSID + +Description: + This is a classe ID type, used to denote a BREW class. 
+   case of AEEUID.
+
+Definition:
+   typedef uint32 AEECLSID
+
+=======================================================================
+
+AEEPRIVID
+
+Description:
+   This is a privilege ID type, used to express a privilege. It is a
+   special case of AEEUID.
+
+Definition:
+   typedef uint32 AEEPRIVID
+
+=======================================================================
+
+AECHAR
+
+Description:
+   This is a 16-bit character type.
+
+Definition:
+   typedef uint16 AECHAR
+
+=======================================================================
+
+AEEResult
+
+Description:
+   This is the standard result type.
+
+Definition:
+   typedef int AEEResult
+
+=======================================================================
+
+_wchar_t
+
+Description:
+   This is a 16-bit character type corresponding to the IDL 'wchar'
+   type.
+
+Definition:
+   typedef uint16 _wchar_t
+
+See Also:
+   _cstring_t
+   _wstring_t
+
+=======================================================================
+
+_cstring_t
+
+Description:
+   This structure is used to represent an IDL string when used inside a
+   sequence or union.
+
+Definition:
+   typedef struct _cstring_s {
+      char* data;
+      int dataLen;
+      int dataLenReq;
+   } _cstring_t;
+
+Members:
+   data       : A pointer to the NULL-terminated string.
+   dataLen    : The size, in chars, of the buffer pointed to by 'data',
+                including the NULL terminator. This member is only used
+                when the structure is part of a rout or inrout
+                parameter, but must be supplied by the caller as an
+                input in these cases.
+   dataLenReq : The size that would have been required to store the
+                entire result string. This member is only used when the
+                structure is part of a rout or inrout parameter, when
+                it is an output value set by the callee. The length of
+                the returned string (including the NULL terminator)
+                after a call is the minimum of dataLen and dataLenReq.
+
+See Also:
+   _wchar_t
+   _wstring_t
+
+=======================================================================
+
+_wstring_t
+
+Description:
+   This structure is used to represent an IDL wstring when used inside a
+   sequence or union.
+
+Definition:
+   typedef struct _wstring_s {
+      _wchar_t* data;
+      int dataLen;
+      int dataLenReq;
+   } _wstring_t;
+
+Members:
+   data       : A pointer to the NULL-terminated wide string.
+   dataLen    : The size, in 16-bit characters, of the buffer pointed to
+                by 'data', including the NULL terminator. This member
+                is only used when the structure is part of a rout or
+                inrout parameter, but must be supplied by the caller as
+                an input in these cases.
+   dataLenReq : The number of 16-bit characters that would have been
+                required to store the entire result string. This member
+                is only used when the structure is part of a rout or
+                inrout parameter, when it is an output value set by the
+                callee. The length of the returned wstring (including
+                the NULL terminator) after a call is the minimum of
+                dataLen and dataLenReq.
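+
+Example:
+   A caller passing a _wstring_t as a rout parameter supplies 'data' and
+   'dataLen', then checks 'dataLenReq' to detect truncation. A minimal
+   sketch (IFoo_GetName() and 'h' are hypothetical):
+
+      _wchar_t buf[32];
+      _wstring_t ws;
+      ws.data = buf;
+      ws.dataLen = 32;    /* capacity in 16-bit chars, incl. NULL terminator */
+      ws.dataLenReq = 0;  /* set by the callee */
+      if (0 == IFoo_GetName(h, &ws) && ws.dataLenReq > ws.dataLen) {
+         /* result truncated; retry with a buffer of ws.dataLenReq chars */
+      }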
+ +See Also: + _cstring_t + _wchar_t + +======================================================================= +*/ + +#endif /* #ifndef AEESTDDEF_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStdDef.idl b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStdDef.idl new file mode 100755 index 0000000000000..ac224152bcaf9 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStdDef.idl @@ -0,0 +1,91 @@ +#ifndef AEESTDDEF_IDL +#define AEESTDDEF_IDL +//============================================================================ +/// @file AEEStdDef.idl +/// +/// This file contains definitions of primitive types. + //qidl copyright +//% Copyright (c) 2006-2014, 2020 Qualcomm Technologies, Inc. + //qidl nested=false +//% All Rights Reserved. +//% Redistribution and use in source and binary forms, with or without +//% modification, are permitted provided that the following conditions are met: +//% +//% 1. Redistributions of source code must retain the above copyright notice, +//% this list of conditions and the following disclaimer. +//% +//% 2. Redistributions in binary form must reproduce the above copyright notice, +//% this list of conditions and the following disclaimer in the documentation +//% and/or other materials provided with the distribution. +//% +//% 3. Neither the name of the copyright holder nor the names of its contributors +//% may be used to endorse or promote products derived from this software without +//% specific prior written permission. +//% +//% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +//% AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +//% IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +//% ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +//% LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +//% CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +//% SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +//% INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +//% CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +//% ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +//% POSSIBILITY OF SUCH DAMAGE. +//============================================================================ + +/* NOTE: THIS FILE SHOULD NEVER BE COMPILED DIRECTLY. That is, code should + * never be generated from these definitions, as they will conflict with the + * "real" hand-written AEEStdDef.h. Note also that if the definitions here + * become out of sync with the hand-written AEEStdDef.h, bad things will + * happen. + */ + +/** + * @name Primitive Types + */ +/*@{*/ + +typedef octet byte; ///< Alternate alias for an unsigned + ///< 8-bit integer +/*@}*/ + +/** + * @name Types + */ +/*@{*/ + +/** + * This is a unique ID type. Used to express types, + * interfaces, classes, and privileges. The class ID generator generates + * unique IDs that can be used anywhere a new #AEEIID, #AEECLSID, or + * #AEEPRIVID is needed. + */ +typedef uint32 AEEUID; + +/** + * This is an interface ID type, used to denote an interface. It is a special + case of #AEEUID. + */ +typedef uint32 AEEIID; + +/** + * This is a class ID type, used to denote a class. It is a special case of + #AEEUID. + */ +typedef uint32 AEECLSID; + +/** + * This is a privilege ID type, used to express a privilege. It is a special + * case of #AEEUID. 
+ */ +typedef uint32 AEEPRIVID; + +typedef wchar AECHAR; ///< Wide character type + +typedef long AEEResult; ///< Common return type + +/*@}*/ + +#endif /* #ifndef AEESTDDEF_IDL */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStdErr.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStdErr.h new file mode 100755 index 0000000000000..bc1706abe5886 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stddef/AEEStdErr.h @@ -0,0 +1,339 @@ +/* + * Copyright (c) 2005-2007, 2012-2013, 2019-2020 Qualcomm Technologies, Inc. + * All Rights Reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef AEESTDERR_H
+#define AEESTDERR_H
+//
+// Basic Error Codes
+//
+//
+#if defined(__hexagon__)
+   #define AEE_EOFFSET 0x80000400
+#else
+   #define AEE_EOFFSET 0x00000000
+#endif
+/** @defgroup stdbasicerror Basic error codes
+ * @{
+ */
+#define AEE_SUCCESS              0                     ///< No error
+#define AEE_EUNKNOWN             -1                    ///< Unknown error (should not use this)
+
+#define AEE_EFAILED              (AEE_EOFFSET + 0x001) ///< General failure
+#define AEE_ENOMEMORY            (AEE_EOFFSET + 0x002) ///< Memory allocation failed because of insufficient RAM
+#define AEE_ECLASSNOTSUPPORT     (AEE_EOFFSET + 0x003) ///< Specified class unsupported
+#define AEE_EVERSIONNOTSUPPORT   (AEE_EOFFSET + 0x004) ///< Version not supported
+#define AEE_EALREADYLOADED       (AEE_EOFFSET + 0x005) ///< Object already loaded
+#define AEE_EUNABLETOLOAD        (AEE_EOFFSET + 0x006) ///< Unable to load object/applet
+#define AEE_EUNABLETOUNLOAD      (AEE_EOFFSET + 0x007) ///< Unable to unload
+                                                       ///< object/applet
+#define AEE_EALARMPENDING        (AEE_EOFFSET + 0x008) ///< Alarm is pending
+#define AEE_EINVALIDTIME         (AEE_EOFFSET + 0x009) ///< Invalid time
+#define AEE_EBADCLASS            (AEE_EOFFSET + 0x00A) ///< NULL class object
+#define AEE_EBADMETRIC           (AEE_EOFFSET + 0x00B) ///< Invalid metric specified
+#define AEE_EEXPIRED             (AEE_EOFFSET + 0x00C) ///< App/Component Expired
+#define AEE_EBADSTATE            (AEE_EOFFSET + 0x00D) ///< Process or thread is not in expected state
+#define AEE_EBADPARM             (AEE_EOFFSET + 0x00E) ///< Invalid parameter
+#define AEE_ESCHEMENOTSUPPORTED  (AEE_EOFFSET + 0x00F) ///< Invalid URL scheme
+#define AEE_EBADITEM             (AEE_EOFFSET + 0x010) ///< Value out of range
+#define AEE_EINVALIDFORMAT       (AEE_EOFFSET + 0x011) ///< Invalid format
+#define AEE_EINCOMPLETEITEM      (AEE_EOFFSET + 0x012) ///< Incomplete item, e.g. the length of a string is less than expected
+#define AEE_ENOPERSISTMEMORY     (AEE_EOFFSET + 0x013) ///< Insufficient flash
+#define AEE_EUNSUPPORTED         (AEE_EOFFSET + 0x014) ///< API not implemented
+#define AEE_EPRIVLEVEL           (AEE_EOFFSET + 0x015) ///< Privileges are insufficient
+                                                       ///< for this operation
+#define AEE_ERESOURCENOTFOUND    (AEE_EOFFSET + 0x016) ///< Unable to find specified
+                                                       ///< resource
+#define AEE_EREENTERED           (AEE_EOFFSET + 0x017) ///< Non re-entrant API
+                                                       ///< re-entered
+#define AEE_EBADTASK             (AEE_EOFFSET + 0x018) ///< API called in wrong task
+                                                       ///< context
+#define AEE_EALLOCATED           (AEE_EOFFSET + 0x019) ///< App/Module left memory
+                                                       ///< allocated when released.
+#define AEE_EALREADY             (AEE_EOFFSET + 0x01A) ///< Operation is already in
+                                                       ///< progress
+#define AEE_EADSAUTHBAD          (AEE_EOFFSET + 0x01B) ///< ADS mutual authorization
+                                                       ///< failed
+#define AEE_ENEEDSERVICEPROG     (AEE_EOFFSET + 0x01C) ///< Need service programming
+#define AEE_EMEMPTR              (AEE_EOFFSET + 0x01D) ///< Bad memory pointer; expected to be NULL
+#define AEE_EHEAP                (AEE_EOFFSET + 0x01E) ///< An internal heap error was detected
+#define AEE_EIDLE                (AEE_EOFFSET + 0x01F) ///< Context (system, interface,
+                                                       ///< etc.) is idle
+#define AEE_EITEMBUSY            (AEE_EOFFSET + 0x020) ///< Context (system, interface,
+                                                       ///< etc.) is busy
+#define AEE_EBADSID              (AEE_EOFFSET + 0x021) ///< Invalid subscriber ID
+#define AEE_ENOTYPE              (AEE_EOFFSET + 0x022) ///< No type detected/found
+#define AEE_ENEEDMORE            (AEE_EOFFSET + 0x023) ///< Need more data/info
+#define AEE_EADSCAPS             (AEE_EOFFSET + 0x024) ///< ADS Capabilities do not
+                                                       ///< match those required for phone
+#define AEE_EBADSHUTDOWN         (AEE_EOFFSET + 0x025) ///< App failed to close properly
+#define AEE_EBUFFERTOOSMALL      (AEE_EOFFSET + 0x026) ///< Destination buffer given is
+                                                       ///< too small
+#define AEE_ENOSUCH              (AEE_EOFFSET + 0x027) ///< No such name, port, socket
+                                                       ///< or service exists or is
+                                                       ///< valid
+#define AEE_EACKPENDING          (AEE_EOFFSET + 0x028) ///< ACK pending on application
+#define AEE_ENOTOWNER            (AEE_EOFFSET + 0x029) ///< Not an owner authorized to
+                                                       ///< perform the operation
+#define AEE_EINVALIDITEM         (AEE_EOFFSET + 0x02A) ///< Current item is invalid, it can be a switch case or a pointer to memory
+#define AEE_ENOTALLOWED          (AEE_EOFFSET + 0x02B) ///< Not allowed to perform the
+                                                       ///< operation
+#define AEE_EINVHANDLE           (AEE_EOFFSET + 0x02C) ///< Invalid handle - adding here as its defined in vendor AEEStdErr.h - needed to check valid handle in stub.c
+#define AEE_EOUTOFHANDLES        (AEE_EOFFSET + 0x02D) ///< Out of handles (Handle list is already full)
+//Hole here
+#define AEE_ENOMORE              (AEE_EOFFSET + 0x02F) ///< No more items available --
+                                                       ///< reached end
+#define AEE_ECPUEXCEPTION        (AEE_EOFFSET + 0x030) ///< A CPU exception occurred
+#define AEE_EREADONLY            (AEE_EOFFSET + 0x031) ///< Cannot change read-only
+                                                       ///< object or parameter (parameter is in protected mode)
+#define AEE_ERPC                 (AEE_EOFFSET + 0x200) ///< Error due to fastrpc implementation
+#define AEE_EFILE                (AEE_EOFFSET + 0x201) ///= 200000 && !defined(__APCS_ADSABI)) || \
+    (defined(__GNUC__) && defined(__arm__) && defined(__ARM_EABI__))
+
+# define __AEEVA_ATPCS 0
+
+#else
+
+# define __AEEVA_ATPCS 1
+
+#endif
+
+typedef void* AEEVaList;
+
+#define __AEEVA_ARGALIGN(t) (((char*)(&((struct{char c;t x;}*)1)->x))-((char*)1))
+#define __AEEVA_ARGSIZE(t) ((sizeof(t)+sizeof(int)-1) & ~(sizeof(int)-1))
+
+static __inline void __cpy(char*d, const char*s, int len)
+{
+   while (len-- > 0) *d++ = *s++;
+}
+
+static __inline AEEVaList __AEEVa_Arg(AEEVaList args, void* pv, int nVSize,
+                                      int nArgSize, int nArgAlign)
+{
+   int nArgs = (int)args & ~1;
+   char* pcArgs = (char*)args;
+   int bATPCS = (int)args & 1;
+   int nArgsOffset = 0;
+   int nVOffset = 0;
+
+   if (!bATPCS) { /* caller was compiled with AAPCS */
+
+      if (nArgAlign > (int)sizeof(int)) {
+         nArgAlign--; /* make a mask */
+         pcArgs += ((nArgs + nArgAlign) & (int)~(unsigned)nArgAlign) - nArgs;
+         /* move pv to next alignment */
+      }
+   }
+
+#if defined(AEE_BIGENDIAN)
+   if (nArgSize < (int)sizeof(int)) {
+      nArgsOffset = (int)sizeof(int) - nArgSize;
+   }
+   nVOffset = nVSize - nArgSize;
+#else
+   (void)nVSize;
+#endif /* AEE_BIGENDIAN */
+
+   __cpy((char*)pv + nVOffset, (pcArgs - bATPCS) + nArgsOffset, nArgSize);
+
+   /* round up */
+   nArgSize = (nArgSize+(int)sizeof(int)-1) & ~((int)sizeof(int)-1);
+
+   return pcArgs + nArgSize; /* increment va */
+}
+
+#define AEEVA_START(va,v) ((va) = (char*)&(v) + __AEEVA_ARGSIZE(v) + __AEEVA_ATPCS)
+#define AEEVA_ARG(va,v,t) ((void)((va) = __AEEVa_Arg(va,&v,sizeof(v),sizeof(t),__AEEVA_ARGALIGN(t))))
+#define AEEVA_END(va) ((va) = (AEEVaList)0)
+#define AEEVA_COPY(dest, src) ((void)((dest) = (src)))
+
+#else /* !defined(__clang__) && (defined(__ARMCC_VERSION) || (defined(__GNUC__) && defined(__arm__))) */
+
+#include <stdarg.h>
+
+typedef va_list AEEVaList;
+
+#define AEEVA_START(va,v) (va_start((va), (v)))
+#define AEEVA_ARG(va,v,t) ((v) = va_arg((va),t))
+#define AEEVA_END(va) (va_end((va)))
+#define AEEVA_COPY(dest, src) (va_copy((dest),(src)))
+
+#endif/* !defined(__clang__) && (defined(__ARMCC_VERSION) || (defined(__GNUC__) && defined(__arm__))) */
+
+#endif /* #ifndef AEEVALIST_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/stringl.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stringl.h
new file mode 100755
index 0000000000000..94bcba0a701a4
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stringl.h
@@ -0,0 +1,695 @@
+/*
+ * $Header: //components/rel/core.qdsp6/8.2/api/common/kernel/libstd/stringl/stringl.h#1 $
+ * $DateTime: 2023/05/10 09:48:16 $
+ */
+
+/* $OpenBSD: string.h,v 1.17 2006/01/06 18:53:04 millert Exp $ */
+/* $NetBSD: string.h,v 1.6 1994/10/26 00:56:30 cgd Exp $ */
+
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *  @(#)string.h 5.10 (Berkeley) 3/9/91
+ */
+
+#ifndef _STRINGL_H_
+#define _STRINGL_H_
+
+#include <stddef.h>
+#include <string.h>
+#include <stdio.h>
+
+/** @defgroup error_codes Error Codes
+ * @{
+ */
+//
+// AEEstd.h header error codes
+//
+#ifndef STD_NODIGITS
+  #define STD_NODIGITS 1 /**< See std_scanul(). */
+#endif
+
+#ifndef STD_NEGATIVE
+  #define STD_NEGATIVE 2 /**< See std_scanul(). */
+#endif
+
+#ifndef STD_OVERFLOW
+  #define STD_OVERFLOW 3 /**< See std_scanul(). */
+#endif
+
+#ifndef STD_BADPARAM
+  #define STD_BADPARAM 4 /**< See std_scanul(). */
+#endif
+
+/**
+ * @}
+ */
+
+/**< UTF-16 2-byte wide char type */
+typedef unsigned short wchar;
+
+#ifdef __cplusplus
+namespace std
+{
+  extern "C"
+  {
+#endif //__cplusplus
+
+/**
+Added these macros to support compilation in Windows-based software
+development environments such as VC and .NET.
+*/
+#ifdef _WIN32
+  #define snprintf  _snprintf
+  #define vsnprintf _vsnprintf
+#endif
+
+/** @defgroup str_apis String Operation APIs
+ * @{
+ */
+
+/**
+  strlcat - Size bounded string concatenation.
+
+  Concatenates the source string to the destination string.
+
+  This function ensures that the destination string will
+  not be improperly terminated and that there will be
+  no concatenation beyond the size of the destination buffer.
+
+  @param[in,out] dst Destination buffer.
+  @param[in] src Source string.
+  @param[in] siz Size of the destination buffer in bytes.
+
+  @return
+  The length of the string that was attempted to be created,
+  i.e., the sum of the lengths of the source and destination strings.
+
+  @dependencies
+  None.
+*/
+size_t strlcat(char *dst, const char *src, size_t siz);
+
+/**
+ * @}
+ */
+
+ /** @defgroup wstr_apis Wide Char String Operation APIs
+ * @{
+ */
+/**
+  wcslcat - Size bounded wide string concatenation using
+  C standard wide character data type wchar_t.
+
+  Concatenates the source string to the destination string.
+
+  This function ensures that the destination string will
+  not be improperly terminated and that there will be
+  no concatenation beyond the size of the destination buffer.
+
+  @param[in,out] dst Destination buffer.
+  @param[in] src Source string.
+  @param[in] siz Size of the destination buffer in units of wchar_t.
+
+  @return
+  The length of the string that was attempted to be created,
+  i.e., the sum of the lengths of the source and destination strings.
+
+  @note It has been observed that wchar_t on some platforms is
+  2 bytes wide (UTF-16) and on others is 4 bytes wide (UTF-32).
+  So carefully consider this when using the data type wchar_t
+  and this API in your application.
+
+  @dependencies
+  None.
+*/
+
+size_t wcslcat(wchar_t *dst, const wchar_t *src, size_t siz);
+
+/**
+  wstrlcat - Size bounded wide string concatenation using 2 byte
+  wide (UTF-16) character data type wchar.
+
+  Concatenates the source string to the destination string.
+
+  This function ensures that the destination string will
+  not be improperly terminated and that there will be
+  no concatenation beyond the size of the destination buffer.
+
+  @param[in,out] dst Destination buffer.
+  @param[in] src Source string.
+  @param[in] siz Size of the destination buffer in units of wchar.
+
+  @return
+  The length of the string that was attempted to be created,
+  i.e., the sum of the lengths of the source and destination strings.
+
+  @dependencies
+  None.
+*/
+size_t wstrlcat(wchar* dst, const wchar* src, size_t siz);
+
+/**
+ * @}
+ */
+
+ /** @addtogroup str_apis
+ @{ */
+
+/**
+  strlcpy - Size bounded string copy.
+
+  Copies the source string to the destination buffer.
+
+  This function ensures that the destination buffer will always
+  be NULL terminated and that there will not be a copy beyond
+  the size of the destination buffer.
+
+  @param[out] dst Destination buffer.
+  @param[in] src Source String.
+  @param[in] siz Size of the destination buffer in bytes.
+
+  @return
+  The length of the source string.
+
+  @dependencies
+  None.
+*/
+size_t strlcpy(char *dst, const char *src, size_t siz);
+
+/** @} */
+
+ /** @addtogroup wstr_apis
+ @{ */
+/**
+  wcslcpy - Size bounded wide string copy using
+  C standard wide character data type wchar_t.
+
+  Copies the source string to the destination buffer.
+
+  This function ensures that the destination buffer will always
+  be NULL terminated and that there will not be a copy beyond
+  the size of the destination buffer.
+
+  @param[out] dst Destination buffer.
+  @param[in] src Source String.
+  @param[in] siz Size of the destination buffer in units of wchar_t.
+
+  @return
+  The length of the source string.
+
+  @note It has been observed that wchar_t on some platforms is
+  2 bytes wide (UTF-16) and on others is 4 bytes wide (UTF-32).
+  So carefully consider this when using the data type wchar_t
+  and this API in your application.
+
+  @dependencies
+  None.
+*/
+
+size_t wcslcpy(wchar_t *dst, const wchar_t *src, size_t siz);
+
+/**
+  wstrlcpy - Size bounded wide string copy using 2 byte
+  wide (UTF-16) character data type wchar.
+
+  Copies the source string to the destination buffer.
+
+  This function ensures that the destination buffer will always
+  be NULL terminated and that there will not be a copy beyond
+  the size of the destination buffer.
+
+  @param[out] dst Destination buffer.
+  @param[in] src Source String.
+  @param[in] siz Size of the destination buffer in units of wchar.
+
+  @return
+  The length of the source string.
+
+  @dependencies
+  None.
+*/
+size_t wstrlcpy(wchar* dst, const wchar* src, size_t siz);
+
+/**
+  wstrlen - Returns the number of characters in the source string.
+  Used for strings based on the wchar data type, i.e. 2 byte wide (UTF-16)
+  characters.
+
+  @param[in] src Source String.
+
+  @return
+  The number of characters in the source string.
+
+  @dependencies
+  None.
+*/
+size_t wstrlen(const wchar *src);
+
+/**
+  wstrcmp - Compares wchar (UTF-16) string s1 to the wchar string s2.
+
+  This function starts comparing the first character of each string.
+  If they are equal to each other, it continues with the following
+  pairs until the characters differ or until a terminating
+  null-character is reached.
+
+  @param[in] s1 String to be compared.
+  @param[in] s2 String to be compared against.
+
+  @return
+  0  - Indicates that the strings are equal.
+  >0 - Indicates that the strings are not equal and a character in s1 is
+       greater than the corresponding character in s2.
+  <0 - Indicates that the strings are not equal and a character in s1 is
+       less than the corresponding character in s2.
+
+  @dependencies
+  None.
+*/
+int wstrcmp(const wchar *s1, const wchar *s2);
+
+/**
+  wstrncmp - Compares up to n wchar (UTF-16) characters in string s1
+  to the wchar string s2.
+
+  This function starts comparing the first character of each string.
+  If they are equal to each other, it continues with the following
+  pairs until the characters differ, until a terminating
+  null-character is reached, or until n comparisons have been performed.
+
+  @param[in] s1 String to be compared.
+  @param[in] s2 String to be compared against.
+  @param[in] n  Number of characters to be compared.
+
+  @return
+  0  - Indicates that the strings are equal.
+  >0 - Indicates that the strings are not equal and a character in s1 is
+       greater than the corresponding character in s2.
+  <0 - Indicates that the strings are not equal and a character in s1 is
+       less than the corresponding character in s2.
+
+  @dependencies
+  None.
+*/
+int wstrncmp(const wchar *s1, const wchar *s2, size_t n);
+
+/** @} */
+
+ /** @addtogroup str_apis
+ @{ */
+
+/**
+  strcasecmp - Compare two strings ignoring case.
+
+  @param[in] s1 First string.
+  @param[in] s2 Second string.
+
+  @return
+  The strcasecmp() and strncasecmp() functions return an integer
+  less than, equal to, or greater than zero if s1 (or the first
+  n bytes thereof) is found, respectively, to be less than, to
+  match, or be greater than s2.
+
+  @dependencies
+  None.
+*/
+int strcasecmp(const char * s1, const char * s2);
+
+/**
+  strncasecmp - Compare two strings ignoring case (sized).
+
+  @param[in] s1 First string.
+  @param[in] s2 Second string.
+  @param[in] n  The number of characters to compare (from the
+                beginning).
+
+  @return
+  The strcasecmp() and strncasecmp() functions return an integer
+  less than, equal to, or greater than zero if s1 (or the first
+  n bytes thereof) is found, respectively, to be less than, to
+  match, or be greater than s2.
+
+  @dependencies
+  None.
+*/
+int strncasecmp(const char * s1, const char * s2, size_t n);
+
+/**
+std_scanul()
+
+Description:
+
+  std_scanul() converts an ASCII representation of a number
+  to an unsigned long. It expects strings that match the
+  following pattern:
+
+      spaces [+|-] digits
+
+  'Spaces' is zero or more ASCII space or tab characters.
+
+  'Digits' is any number of digits valid in the radix. Letters
+  'A' through 'Z' are treated as digits with values 10 through
+  35. 'Digits' may begin with "0x" when a radix of 0 or 16 is
+  specified.
+
+  Upper and lower case letters can be used interchangeably.
+
+  @param[in] pchBuf The start of the string to scan.
+
+  @param[in] nRadix The numeric radix (or base) of the
+             number. Valid values are 2 through 36 or zero,
+             which implies auto-detection. Auto-detection
+             examines the digits field. If it begins with
+             "0x", radix 16 is selected. Otherwise, if it
+             begins with "0" radix 8 is selected.
+             Otherwise, radix 10 is selected.
+
+  @param[out] ppchEnd If ppchEnd is not NULL, *ppchEnd
+              points to the first character that did not
+              match the expected pattern shown above,
+              except on STD_BADPARAM and STD_OVERFLOW when
+              it is set to the start of the string.
+
+  @param[out] pnError If pnError is not NULL, *pnError
+              holds the error code, which is one of the
+              following:
+
+              0 : Numeric value is from 0 to
+              MAX_UINT32.
+
+              STD_NEGATIVE : The scanned value was negative and its absolute value was
+              from 1 to MAX_UINT32. The result is the negated value
+              (cast to a uint32).
+
+              STD_NODIGITS : No digits were found. The result is zero.
+
+              STD_OVERFLOW : The absolute value exceeded MAX_UINT32. The result
+              is set to MAX_UINT32 and *ppchEnd is set to pchBuf.
+
+              STD_BADPARAM : An improper value for nRadix was received. The result
+              is set to zero, and *ppchEnd is set to pchBuf.
+
+  @return
+  The converted numeric result.
+
+  @dependencies
+  None.
+
+*/
+unsigned int std_scanul(const char * pchBuf, int nRadix, const char ** ppchEnd, int *pnError);
+
+/** @} */
+
+/** @defgroup mem_ops Memory Operation APIs
+ * @{
+ */
+
+/**
+  memscpy - Size bounded memory copy.
+
+  Copies bytes from the source buffer to the destination buffer.
+
+  This function ensures that there will not be a copy beyond
+  the size of the destination buffer.
+
+  The result of calling this on overlapping source and destination
+  buffers is undefined.
+
+  @param[out] dst      Destination buffer.
+  @param[in]  dst_size Size of the destination buffer in bytes.
+  @param[in]  src      Source buffer.
+  @param[in]  src_size Number of bytes to copy from source buffer.
+
+  @return
+  The number of bytes copied to the destination buffer. It is the
+  caller's responsibility to check for truncation if it cares about it -
+  truncation has occurred if the return value is less than src_size.
+
+  @dependencies
+  None.
+*/
+
+size_t memscpy(void *dst, size_t dst_size, const void *src, size_t src_size);
+
+/**
+  memscpy_i - Inline function for size bounded memory copy.
+
+  @see memscpy()
+*/
+
+static __inline size_t memscpy_i
+(
+  void       *dst,
+  size_t     dst_size,
+  const void *src,
+  size_t     src_size
+)
+{
+  size_t copy_size = (dst_size <= src_size) ? dst_size : src_size;
+
+  memcpy(dst, src, copy_size);
+
+  return copy_size;
+}
+
+/**
+  memsmove - Size bounded memory move.
+
+  Moves bytes from the source buffer to the destination buffer.
+
+  This function ensures that there will not be a copy beyond
+  the size of the destination buffer.
+
+  This function should be used in preference to memscpy() if there
+  is the possibility of source and destination buffers overlapping.
+  The result of the operation is defined to be as if the copy were from
+  the source to a temporary buffer that overlaps neither source nor
+  destination, followed by a copy from that temporary buffer to the
+  destination.
+
+  @param[out] dst      Destination buffer.
+  @param[in]  dst_size Size of the destination buffer in bytes.
+  @param[in]  src      Source buffer.
+  @param[in]  src_size Number of bytes to copy from source buffer.
+
+  @return
+  The number of bytes copied to the destination buffer. It is the
+  caller's responsibility to check for truncation if it cares about it -
+  truncation has occurred if the return value is less than src_size.
+
+  @dependencies
+  None.
+*/
+
+size_t memsmove(void *dst, size_t dst_size, const void *src, size_t src_size);
+
+/**
+  memsmove_i - Inline function for size bounded memory move.
+
+  @see memsmove()
+*/
+
+static __inline size_t memsmove_i
+(
+  void       *dst,
+  size_t     dst_size,
+  const void *src,
+  size_t     src_size
+)
+{
+  size_t copy_size = (dst_size <= src_size) ? dst_size : src_size;
+
+  memmove(dst, src, copy_size);
+
+  return copy_size;
+}
+
+/**
+  secure_memset - Memset functionality that won't be optimized away by the compiler.
+
+  Memsets a memory location to a given value in a way that is unlikely to be
+  removed by the compiler.
+
+  A classic compiler optimization is to remove references to instructions that
+  assign a value to a variable where that variable is never used after the
+  assignment. However, this means that compilers will often remove memset
+  instructions which are used to "zero" sensitive information in stack or heap
+  memory, which can cause a security risk. This function performs a basic
+  memset operation, but should always be instantiated in its own file; this
+  means file-level optimizers will not be able to optimize its use, and linkers
+  do not have sufficient intelligence to optimize calls between files.
+
+  This function should be used when clearing sensitive information in memory.
+
+  @param[in] ptr   Points to the memory area to be set.
+  @param[in] value The value to be set.
+  @param[in] len   The number of bytes to be set.
+
+  @return
+  This function returns the pointer to the memory area ptr.
+
+  @dependencies
+  None.
+*/
+
+void* secure_memset(void* ptr, int value, size_t len);
+
+/**
+ * @}
+ */
+
+
+/** @defgroup time_safe_ops Time Safe Memory Operation APIs
+ * @{
+ */
+
+/**
+  timesafe_memcmp - Constant-time memory comparison.
+
+  Compares bytes at two different sections in memory in constant time.
+
+  This function compares the len bytes starting at ptr1 with the len
+  bytes starting at ptr2. The function returns 1 if the two sections
+  of memory are different and 0 if the two sections of memory are
+  identical. The function always scans the entire range of memory to
+  ensure the function runs in constant time.
+
+  This function should be used when comparing confidential information
+  in memory as it prevents timing attacks. A traditional memcmp() exits
+  after finding non-equal bytes and this can be used to determine the value
+  of confidential data in memory. Example uses include password checks,
+  MAC checks, decryption checks, and checks on private user information.
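+
+  For example, a constant-time MAC check might look as follows
+  (illustrative sketch; 'expected' and 'computed' are hypothetical
+  32-byte buffers):
+
+  @code
+  if (timesafe_memcmp(expected, computed, 32) != 0) {
+      // reject the message; timing reveals nothing about 'expected'
+  }
+  @endcode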
+
+  @param[in] ptr1 Points to the first memory bytes to be compared.
+  @param[in] ptr2 Points to the second memory bytes to be compared.
+  @param[in] len  The number of bytes to be compared.
+
+  @return
+  This function returns 1 if the two buffers are different and
+  0 if the two buffers are identical.
+
+  @dependencies
+  None.
+*/
+
+int timesafe_memcmp(const void* ptr1, const void* ptr2, size_t len);
+
+/**
+  timesafe_strncmp - Constant-time string comparison.
+
+  Compares bytes in two different string buffers in constant time.
+
+  This function compares the contents of the string buffer at ptr1 with
+  the string buffer at ptr2 up to a maximum of len bytes. The function
+  does not compare bytes beyond the first occurrence of a NULL byte. The
+  function returns 1 if the two strings are different and 0 if the strings
+  are identical. The function always scans the entire range of memory to
+  ensure the function runs in constant time.
+
+  This function should be used when comparing strings that contain confidential
+  information as it prevents timing attacks. A traditional strncmp() exits
+  after finding non-equal bytes or a NULL byte and this can be used to
+  determine the value of confidential data in memory.
+
+  @param[in] ptr1 Points to the first string to be compared.
+  @param[in] ptr2 Points to the second string to be compared.
+  @param[in] len  The number of bytes to be compared.
+
+  @return
+  This function returns 1 if the strings are different and
+  0 if the strings are the same.
+
+  @dependencies
+  None.
+*/
+
+int timesafe_strncmp(const char* ptr1, const char* ptr2, size_t len);
+/**
+ * @}
+ */
+
+ /** @addtogroup str_apis
+ @{ */
+/**
+  strnlen - Determine the length of a fixed-size string.
+
+  This function takes a maxlen length parameter and stops looking for a NULL
+  character once it has examined maxlen bytes. It is considered safer than
+  strlen because it will not read beyond maxlen bytes if the source string
+  is a bad string without NULL termination.
+
+  @param[in] str    Points to the source string.
+  @param[in] maxlen The maximum number of bytes to count.
+
+  @return
+  This function returns the number of bytes in the string pointed to by str,
+  excluding the terminating NULL byte ('\0'), but at most maxlen.
+
+  @dependencies
+  None.
+*/
+#ifndef _WIN32
+size_t strnlen(const char *str, size_t maxlen);
+#endif
+
+/**
+ * @}
+ */
+#ifdef __cplusplus
+  } //extern "C"
+} //namespace std
+#endif //__cplusplus
+
+//Explicit export of the libstd implemented functions
+#ifdef __cplusplus
+#ifdef _WIN32
+  using std::strlcat;
+  using std::strlcpy;
+  using std::strcasecmp;
+  using std::strncasecmp;
+#endif
+  using std::wcslcat;
+  using std::wstrlcat;
+  using std::wcslcpy;
+  using std::wstrlcpy;
+  using std::wstrcmp;
+  using std::wstrncmp;
+  using std::wstrlen;
+  using std::memscpy;
+  using std::memsmove;
+  using std::secure_memset;
+  using std::timesafe_memcmp;
+  using std::timesafe_strncmp;
+#ifndef _WIN32
+  using std::strnlen;
+#endif
+#endif //__cplusplus
+
+#endif /* _STRINGL_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/stringl.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stringl.md
new file mode 100755
index 0000000000000..a80878d828e28
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/stringl.md
@@ -0,0 +1,51 @@
+# Prototypes for string manipulation functions
+
+## Introduction {#introduction}
+
+`stringl.h` contains function prototypes for various string manipulation functions.
+These are considered safer than the standard C string manipulation functions and were developed as part of the [OpenBSD project](https://www.openbsd.org/).
+
+## API Overview {#api-overview}
+
+The stringl.h APIs include the following functions:
+
+* ::strlcat
+
+* ::wcslcat
+
+* ::wstrlcat
+
+* ::strlcpy
+
+* ::wcslcpy
+
+* ::wstrlcpy
+
+* ::wstrlen
+
+* ::wstrcmp
+
+* ::wstrncmp
+
+* ::strcasecmp
+
+* ::strncasecmp
+
+* ::std_scanul
+
+* ::memscpy
+
+* ::memscpy_i
+
+* ::memsmove
+
+* ::memsmove_i
+
+* ::secure_memset
+
+* ::timesafe_memcmp
+
+* ::timesafe_strncmp
+
+* ::strnlen
+
+Header file: @b stringl.h
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/synx.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/synx.md
new file mode 100755
index 0000000000000..21bc9a9a18386
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/synx.md
@@ -0,0 +1,7 @@
+## Overview
+
+Heterogeneous systems involve more than one core to efficiently process a task. Nowadays, there are many advanced use cases that require computation across multiple cores. One example of such a use case is for the camera core to capture an image, pass it on to the DSP and/or GPU cores for post-processing, and send the final output to the display subsystem for rendering. Such use cases involve transferring control points and sharing buffers across multiple cores. This type of application use case drives the need for a generic synchronization framework which explicitly describes dependencies between different asynchronous operations across the SoC.
+
+The synx framework helps to capture such dependencies across cores. It notifies task completion and/or buffer-ready information between a producer and consumers.
+
+***Note***: There are many synchronization/fence mechanisms available today, but those work best within a single core/device. In Android systems, buffers are usually allocated as ION buffers so that these buffers can be shared across various components (UMD, KMD, and HW). If another core needs to access such buffers, we need to synchronously transfer control (along with the data payload) from one core to the other. This explicit transfer of control from one core to the other eliminates possibilities for optimization and hence drives the idea of introducing a synx handle for synchronization.
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_cachelock.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_cachelock.h
new file mode 100755
index 0000000000000..e46e7d7f25a29
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_cachelock.h
@@ -0,0 +1,109 @@
+/*-----------------------------------------------------------------------------
+   Copyright (c) 2017-2020 QUALCOMM Technologies, Incorporated.
+   All Rights Reserved.
+   QUALCOMM Proprietary.
+-----------------------------------------------------------------------------*/
+
+#ifndef SYSMON_CACHELOCK_H_
+#define SYSMON_CACHELOCK_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @file sysmon_cachelock.h
+ * @brief CDSP L2 cache locking manager API
+ */
+
+/**
+ * Allocates a memory buffer, locks it in L2 cache, and returns the locked
+ * virtual address.
+ *
+ * @param[in] size Memory size (in bytes) to lock.
+ * @param[out] paddr_ptr Pointer to @c unsigned @c long @c long
+ *                       variable to get the locked 64-bit physical address upon
+ *                       success. NULL if the allocation and cache lock failed.
+ *
+ * @return
+ * @c void* Virtual address of the locked memory region. \n
+ * 0 if the requested buffer size could not be allocated and locked in the L2 cache.
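+ *
+ * A minimal usage sketch (illustrative; error handling elided):
+ * @code
+ * unsigned long long paddr = 0;
+ * void *vaddr = HAP_cache_lock(4096, &paddr);
+ * if (vaddr) {
+ *     // ... access the locked buffer ...
+ *     HAP_cache_unlock(vaddr);
+ * }
+ * @endcode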
+ */
+void* HAP_cache_lock(unsigned int size, unsigned long long *paddr_ptr);
+
+
+/**
+ * Unlocks cache and deallocates memory for a virtual address returned by
+ * the corresponding HAP_cache_lock() call.
+ *
+ * @param[in] vaddr_ptr Virtual address of the memory block to unlock.
+ *
+ * @return
+ * 0 upon success. \n
+ * Other values upon failure.
+ */
+int HAP_cache_unlock(void* vaddr_ptr);
+
+/**
+ * Locks the cache for a given virtual address and memory size (in bytes).
+ *
+ * Align the address and size to 32 bytes. The size should not be more
+ * than 64 KB, and at any point of time, only one such request is honored
+ * (this restriction has been removed from SM8250 onwards).
+ *
+ * Use this function to lock an existing memory block, for example,
+ * to lock a code segment or data buffer. Note that whenever possible, it is
+ * preferable to let the driver allocate the memory to be locked in L2 via the
+ * HAP_cache_lock API, as it can often avoid the fragmentation likely to occur
+ * when the user provides the memory ranges to be locked.
+ *
+ * @param[in] vaddr_ptr Virtual address of the memory block to lock; should be
+ *                      32-byte aligned.
+ * @param[in] size Memory size (in bytes) to lock; should be 32-byte aligned.
+ *                 The maximum size limit is 64 KB. From SM8250, this size limit is
+ *                 the same as HAP_cache_lock().
+ *
+ * @return
+ * 0 upon success. \n
+ * Other values upon failure.
+ */
+int HAP_cache_lock_addr(void* vaddr_ptr, unsigned int size);
+
+/**
+ * Unlocks the cache for a given virtual address.
+ *
+ * Use this function together with HAP_cache_lock_addr().
+ *
+ * @param[in] vaddr_ptr Virtual address of the memory block to unlock.
+ *
+ * @return
+ * 0 upon success. \n
+ * Other values upon failure.
+ */
+int HAP_cache_unlock_addr(void* vaddr_ptr);
+
+/**
+ * Queries the size of the largest contiguous memory block available for
+ * cache locking.
+ *
+ * @return
+ * Available size in bytes upon success. \n
+ * -1 upon failure.
+ */
+int HAP_query_avail_cachelock(void);
+
+/**
+ * Queries the total locked cache size.
+ *
+ * @return
+ * Total locked cache size in bytes upon success. \n
+ * -1 upon failure.
+ */
+int HAP_query_total_cachelock(void);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SYSMON_CACHELOCK_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_cachelock.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_cachelock.md
new file mode 100755
index 0000000000000..44efd95724811
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_cachelock.md
@@ -0,0 +1,25 @@
+# Cache locking manager
+
+The cache locking manager locks a section of the L2 cache from the
+cDSP and subsequently releases this lock.
+
+The cache locking manager replaces the HAP_power_set APIs that are now deprecated.
+This new cache locking manager utilizes the available L2 cache by
+allocating memory with an appropriately aligned address based on L2 cache
+availability and the request size. The cache locking manager also limits the
+maximum cache that can be locked to guarantee performance of the guest OS and
+FastRPC threads.
+
+The cache locking manager monitors cache locking usage by providing
+APIs to get the maximum available cache size for locking and the total
+currently locked cache.
+
+Finally, a set of APIs passes the address of the memory to lock
+along with its size information. These APIs are useful for applications where a
+linker-defined section (code/library) must be locked into cache.
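+
+For example, locking an existing, 32-byte-aligned buffer might look as
+follows (illustrative sketch; `buf` and `size` are caller-supplied and
+meet the alignment rules above):
+
+    if (HAP_cache_lock_addr(buf, size) == 0) {
+        // ... run the latency-critical code ...
+        HAP_cache_unlock_addr(buf);
+    }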
+
+The cache locking manager APIs are not accessible from an unsigned PD.
+
+## Framework APIs
+
+Header file: @b sysmon_cachelock.h
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_marker.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_marker.h
new file mode 100755
index 0000000000000..bcf34d1f80059
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_marker.h
@@ -0,0 +1,46 @@
+/*-----------------------------------------------------------------------
+   Copyright (c) 2017-2020 QUALCOMM Technologies, Incorporated.
+   All Rights Reserved.
+   QUALCOMM Proprietary.
+-----------------------------------------------------------------------*/
+#ifndef SYSMON_MARKER_H
+#define SYSMON_MARKER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @file sysmon_marker.h
+ * @brief Sysmon profiling marker API
+ *        Allows the user to profile a piece of code or
+ *        algorithm of interest.
+ */
+
+/**
+ * Enables or disables a profiling marker.
+
+ * @param[in] marker Any unique, customer-defined, unsigned number to identify profiling
+                     data mapped to a section of code.
+ * @param[in] enable Flag to enable (1) or disable (0) the profiling marker.
+ *
+ * For example:
+ * @code
+ * #include <sysmon_marker.h>
+ * // or, alternatively,
+ * // extern void HP_profile(unsigned int marker, unsigned char enable);
+ *
+ * HP_profile(10, 1);
+ * // ...
+ * // User code to profile
+ * // ...
+ * HP_profile(10, 0);
+ * @endcode
+ */
+void HP_profile(unsigned int marker, unsigned char enable);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*SYSMON_MARKER_H*/
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_marker.md b/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_marker.md
new file mode 100755
index 0000000000000..d25fbdd30f102
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/sysmon_marker.md
@@ -0,0 +1,36 @@
+# sysMon marker
+
+The sysMon marker API profiles a specific code region to study its load on the processor compute resources and the bus bandwidth, and captures various other profiling metrics associated with that specific code region.
+This approach is useful when measuring performance, debugging performance-related issues, or identifying possible optimizations for a specific code region instead of profiling the entire application.
+
+This API is not supported in unsigned PD and CPZ.
+
+## Collect profiling data
+
+Once the code has been instrumented with the sysMon marker APIs to enable and disable profiling of specific code regions, the [sysMonApp profiler](../../tools/sysmon_app.html#profiler-service)
+must run to collect sysMon marker data.
+
+If DCVS is enabled by the user, the decisions taken by DCVS with profiling markers enabled might not be the same as without markers.
+
+## Parsing profiling data with sysMon marker
+
+Profiling data captured using the [sysMonApp profiler](../../tools/sysmon_app.html#profiler-service) can be parsed using the [sysMon parser](../../tools/sysmon_parser.html).
+Refer to the [STID and markers data](../../tools/sysmon_parser.html#stid-and-markers-data) section for the output files generated by the sysMon parser when sysMon markers are enabled.
+
+## Limitations
+
+* Nested markers are not supported.
+
+    For example, the following piece of code does not produce the expected profiling data; its behavior is undefined.
+
+        HP_profile(10, 1);
+        // ... user code ...
+        HP_profile(11, 1);
+        // ... user code ...
+        HP_profile(11, 0);
+        // ... user code ...
+        HP_profile(10, 0);
+
+* Enabling profiling markers forces collection of profiling data on all hardware threads.
+
+    Profiling statistics are collected for any entity running in parallel with the piece of code where markers are defined.
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/utils.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/utils.h
new file mode 100755
index 0000000000000..57a5dd96b2930
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/utils.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2020 QUALCOMM Technologies Inc. All Rights Reserved.
+ * Qualcomm Technologies Confidential and Proprietary
+ *
+ */
+
+#include <stddef.h> /* for size_t */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+size_t memscpy(void* dst, size_t dst_size, const void* src, size_t src_size);
+
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/verify.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/verify.h
new file mode 100755
index 0000000000000..dea9f22172089
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/verify.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2012-2013, 2020 QUALCOMM Technologies Inc.
+ * All Rights Reserved.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef VERIFY_H
+#define VERIFY_H
+
+
+#ifndef _WIN32
+#define C_ASSERT(test) \
+    switch(0) {\
+      case 0:\
+      case test:;\
+    }
+#endif // _WIN32
+
+#ifndef __V_STR__
+  #define __V_STR__(x) #x ":"
+#endif //__STR__
+#ifndef __V_TOSTR__
+  #define __V_TOSTR__(x) __V_STR__(x)
+#endif // __TOSTR__
+#ifndef __V_FILE_LINE__
+  #define __V_FILE_LINE__ __FILE__ ":" __V_TOSTR__(__LINE__)
+#endif /*__FILE_LINE__*/
+
+
+#ifdef __ANDROID__
+/*android */
+#if (defined VERIFY_PRINT_INFO) || (defined VERIFY_PRINT_ERROR)
+#include <android/log.h>
+#endif
+
+#ifdef VERIFY_PRINT_INFO
+#define VERIFY_IPRINTF(format, ...) __android_log_print(ANDROID_LOG_DEBUG , "adsprpc", __V_FILE_LINE__ format, ##__VA_ARGS__)
+#endif
+
+#ifdef VERIFY_PRINT_ERROR
+#define VERIFY_EPRINTF(format, ...) __android_log_print(ANDROID_LOG_ERROR , "adsprpc", __V_FILE_LINE__ format, ##__VA_ARGS__)
+#endif
+
+/* end android */
+#elif (defined __hexagon__) || (defined __qdsp6__)
+/* q6 */
+
+#ifdef VERIFY_PRINT_INFO
+  #define FARF_VERIFY_LOW 1
+  #define FARF_VERIFY_LOW_LEVEL HAP_LEVEL_LOW
+  #define VERIFY_IPRINTF(args...) FARF(VERIFY_LOW, args)
+#endif
+
+#ifdef VERIFY_PRINT_ERROR
+  #define FARF_VERIFY_ERROR 1
+  #define FARF_VERIFY_ERROR_LEVEL HAP_LEVEL_ERROR
+  #define VERIFY_EPRINTF(args...) FARF(VERIFY_ERROR, args)
+#endif
+
+#if (defined VERIFY_PRINT_INFO) || (defined VERIFY_PRINT_ERROR)
+  #include "HAP_farf.h"
+#endif
+
+/* end q6 */
+#elif (defined USE_SYSLOG)
+/* syslog */
+#if (defined VERIFY_PRINT_INFO) || (defined VERIFY_PRINT_ERROR)
+#include <syslog.h>
+#endif
+
+#ifdef VERIFY_PRINT_INFO
+#define VERIFY_IPRINTF(format, ...) syslog(LOG_USER|LOG_INFO, __V_FILE_LINE__ format, ##__VA_ARGS__)
+#endif
+
+#ifdef VERIFY_PRINT_ERROR
+#define VERIFY_EPRINTF(format, ...) syslog(LOG_USER|LOG_ERR, __V_FILE_LINE__ format, ##__VA_ARGS__)
+#endif
+
+/* end syslog */
+#else
+/* generic */
+
+#if (defined VERIFY_PRINT_INFO) || (defined VERIFY_PRINT_ERROR)
+#include <stdio.h>
+#endif
+
+#ifdef VERIFY_PRINT_INFO
+#define VERIFY_IPRINTF(format, ...) printf(__V_FILE_LINE__ format "\n", ##__VA_ARGS__)
+#endif
+
+#ifdef VERIFY_PRINT_ERROR
+#define VERIFY_EPRINTF(format, ...) printf(__V_FILE_LINE__ format "\n", ##__VA_ARGS__)
+#endif
+
+/* end generic */
+#endif
+
+#ifndef VERIFY_PRINT_INFO
+#define VERIFY_IPRINTF(format, ...) (void)0
+#endif
+
+#ifndef VERIFY_PRINT_ERROR
+#define VERIFY_EPRINTF(format, ...) (void)0
+#endif
+
+#ifndef VERIFYC
+  #define VERIFYC(val,err_code) \
+    do {\
+      VERIFY_IPRINTF(":info: calling: %s", #val);\
+      if(0 == (val)) {\
+        nErr = err_code;\
+        VERIFY_EPRINTF(":error: %d: %s", nErr, #val);\
+        goto bail;\
+      } else {\
+        VERIFY_IPRINTF(":info: passed: %s", #val);\
+      }\
+    } while(0)
+#endif //VERIFYC
+
+#ifndef VERIFY
+  #define VERIFY(val) \
+    do {\
+      VERIFY_IPRINTF(":info: calling: %s", #val);\
+      if(0 == (val)) {\
+        nErr = nErr == 0 ? -1 : nErr;\
+        VERIFY_EPRINTF(":error: %d: %s", nErr, #val);\
+        goto bail;\
+      } else {\
+        VERIFY_IPRINTF(":info: passed: %s", #val);\
+      }\
+    } while(0)
+#endif //VERIFY
+
+#endif //VERIFY_H
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/version.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/version.h
new file mode 100755
index 0000000000000..2e3f0ad3278ff
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/version.h
@@ -0,0 +1,91 @@
+#ifndef VERSION_H
+#define VERSION_H
+/*===========================================================================
+
+FILE: version.h
+
+SERVICES: Hexagon Access Program (HAP) SDK version_string
+
+GENERAL DESCRIPTION:
+   Definitions for versioning
+
+   Copyright © 2012 QUALCOMM Incorporated.
+   All Rights Reserved.
+ QUALCOMM Proprietary/GTDR +===========================================================================*/ + +#define VERSION_MAJOR 6 +#define VERSION_MINOR 2 +#define VERSION_MAINT 0 +#define VERSION_BUILD 1 + +#define VERSION_STRING "HAP SDK 6.2.0.1 (srvr=qtcp406;br=main;cl=1242374)" + + +/* +======================================================================= +MACROS DOCUMENTATION +======================================================================= + +VERSION_MAJOR + +Description: + Defines the major release number + +Comments: + It has to be a valid numerical value +======================================================================= + +VERSION_MINOR + +Description: + Defines the minor release number + +Comments: + It has to be a valid numerical value +======================================================================= + +VERSION_MAINT + +Description: + Defines the maintenance release + +Comments: + It has to be a valid numerical value +======================================================================= + +VERSION_BUILD + +Description: + Defines the build ID + +Comments: + It has to be a valid numerical value +======================================================================= + +VERSION_STRING + +Description: + Defines the version string + +Definition: + + #define VERSION_STRING "a.b.c.d (name=value;name=value;...)" + where a=major release number + b=minor release number + c=maintenance release number + d=build number + + name=value pair provides additional information about the build. + Example: + patch/feature=comma separated list of features/patches that have been installed. + br=p4 branch that was used for the build + cl=p4 change list number + machine=hostname of the machine that was used for the build. + +Comments: + +======================================================================= +*/ + +#endif // VERSION_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/incs/version_note.h b/prebuilts/Hexagon_SDK/6.2.0.1/incs/version_note.h new file mode 100755 index 0000000000000..10e498546cc98 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/incs/version_note.h @@ -0,0 +1,19 @@ +/*============================================================================== + Copyright (c) 2022, 2023 Qualcomm Technologies, Inc. + All rights reserved. Qualcomm Proprietary and Confidential. +==============================================================================*/ + +#ifndef VERSION_NOTE_H +#define VERSION_NOTE_H +#define VERSION_NOTE_LENGTH 100 + + typedef struct { + int sizename; //Size of the NOTE section + int sizedesc; // Size of the descriptor(unused) + int type; // Type of section(unused)//stores version and library name + char name[VERSION_NOTE_LENGTH]; // Name of NOTE section(version of shared object) + int desc[3]; // used for labeling note segment version (lib.ver.V1.V2.V3) + } lib_ver_note_t; + +#endif //VERSION_NOTE_H + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/incs/mmdefs.h b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/incs/mmdefs.h new file mode 100755 index 0000000000000..3fdc11dcb810e --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/incs/mmdefs.h @@ -0,0 +1,48 @@ +#ifndef _MMDEFS_H +#define _MMDEFS_H +/*============================================================================== + Copyright (c) 2012-2013 Qualcomm Technologies Incorporated. + All Rights Reserved Qualcomm Technologies Proprietary + + Export of this technology or software is regulated by the U.S. + Government. Diversion contrary to U.S. law prohibited. 
+==============================================================================*/ + +/*------------------------------------------------------------------------------ + Standard Integer Types +------------------------------------------------------------------------------*/ + +#include "stdint.h" + +/*------------------------------------------------------------------------------ + Constants +------------------------------------------------------------------------------*/ + +#undef TRUE +#undef FALSE + +#define TRUE (1) /* Boolean true value */ +#define FALSE (0) /* Boolean false value */ + +#ifndef NULL + #define NULL (0) +#endif + +/*------------------------------------------------------------------------------ + Character and boolean +------------------------------------------------------------------------------*/ + +typedef char char_t; /* Character type */ +typedef unsigned char bool_t; /* Boolean value type */ + +/*============================================================================== + FUNCTION : align_to_8_byte + DESCRIPTION: Ceil to the next multiple of 8 +==============================================================================*/ +static inline uint32_t align_to_8_byte(const uint32_t num) +{ + return ((num + 7) & (0xFFFFFFF8)); +} + +#endif /* _MMDEFS_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libadsprpc.so b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libadsprpc.so new file mode 100755 index 0000000000000..572796d03718b Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libadsprpc.so differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libadsprpc_system.so b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libadsprpc_system.so new file mode 100755 index 0000000000000..e6b5daae76dbb Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libadsprpc_system.so differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libcdsprpc.so b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libcdsprpc.so new file mode 100755 index 0000000000000..266cb63c8ded7 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libcdsprpc.so differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libcdsprpc_system.so b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libcdsprpc_system.so new file mode 100755 index 0000000000000..c9a05c047d815 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libcdsprpc_system.so differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libmdsprpc.so b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libmdsprpc.so new file mode 100755 index 0000000000000..78fa7b7e84b9d Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libmdsprpc.so differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libmdsprpc_system.so b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libmdsprpc_system.so new file mode 100755 index 0000000000000..3b9783f2bc1d3 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libmdsprpc_system.so differ diff --git 
a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libsdsprpc.so b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libsdsprpc.so new file mode 100755 index 0000000000000..d7e3d46f9b141 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libsdsprpc.so differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libsdsprpc_system.so b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libsdsprpc_system.so new file mode 100755 index 0000000000000..588ed0b72f2fb Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/remote/ship/android_aarch64/libsdsprpc_system.so differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/rpcmem/inc/rpcmem.h b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/rpcmem/inc/rpcmem.h new file mode 100755 index 0000000000000..281890ef5fbf0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/rpcmem/inc/rpcmem.h @@ -0,0 +1,248 @@ +/*============================================================================== + Copyright (c) 2012-2013, 2020 Qualcomm Technologies, Inc. + All rights reserved. + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + 3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +==============================================================================*/ + +#ifndef RPCMEM_H +#define RPCMEM_H + +#include "AEEStdDef.h" +#include "stddef.h" + +/** + * @file rpcmem.h + * @brief APIs used to manage memory allocated by the application processor and shared with the DSP. + */ + +/** @defgroup rpcmem_const RPCMEM API macros and enumerations + * @{ + */ + +/** + * Allocate memory with the same properties as the ION_FLAG_CACHED flag. + */ +#ifdef ION_FLAG_CACHED +#define RPCMEM_DEFAULT_FLAGS ION_FLAG_CACHED +#else +#define RPCMEM_DEFAULT_FLAGS 1 +#endif + +/** + * The FastRPC library tries to map buffers allocated with this flag to the remote process of all current and new + * FastRPC sessions. In case of failure to map, the FastRPC library ignores the error and continues to open the session + * without pre-mapping the buffer. 
In case of success, buffers allocated with this flag will be pre-mapped to reduce
+ * the latency of upcoming FastRPC calls. This flag is recommended only for buffers that are used with latency-critical
+ * FastRPC methods. Pre-mapped buffers will be unmapped during either buffer free or session close.
+ */
+#define RPCMEM_TRY_MAP_STATIC 0x04000000
+
+/**
+ * Supported RPCMEM heap IDs.
+ *
+ * If you are not using any of the RPCMEM-defined heap IDs,
+ * you are responsible for ensuring that you are passing
+ * a valid ION heap ID.
+ */
+enum rpc_heap_ids {
+/**
+ * Memory for secure use cases only.
+ * * Secure heap is to be used only by clients migrating to CPZ
+ */
+    RPCMEM_HEAP_ID_SECURE  = 9,
+/**
+ * Contiguous physical memory:
+ * * Very limited memory is available (< 8 MB)
+ * * Recommended for subsystems without SMMU (sDSP and mDSP)
+ * * Contiguous heap memory will be deprecated from archs after v73
+ */
+    RPCMEM_HEAP_ID_CONTIG  = 22,
+/**
+ * Non-contiguous system physical memory.
+ * * Recommended for all use cases that do not require using a specific heap
+ * * Used with subsystems with SMMU (cDSP and aDSP)
+ */
+    RPCMEM_HEAP_ID_SYSTEM  = 25,
+ };
+
+/**
+ * Use uncached memory.
+ */
+#define RPCMEM_FLAG_UNCACHED 0
+
+/**
+ * Use cached memory.
+ */
+#define RPCMEM_FLAG_CACHED RPCMEM_DEFAULT_FLAGS
+
+/**
+ * @}
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @defgroup rpcmem_api RPCMEM API functions
+ * @{
+ */
+
+/**
+ * Initialize the RPCMEM Library.
+ *
+ * Only call this function once before using the RPCMEM Library.
+ *
+ * This API is mandatory on pre-Lahaina targets IF the client has linked to the
+ * rpcmem.a static library. If the client has only linked libadsprpc.so,
+ * libcdsprpc.so, or libsdsprpc.so, then the rpcmem_init call is not required
+ * on any target and other rpcmem APIs such as rpcmem_alloc can be called
+ * directly.
+ *
+ * NOTE: This function is not thread safe.
+ */
+void rpcmem_init(void);
+
+/**
+ * Deinitialize the RPCMEM Library.
+ *
+ * Only call this function once when the RPCMEM Library is no longer required.
+ *
+ * This API is mandatory on pre-Lahaina targets IF the client has linked to the
+ * rpcmem.a static library. If the client has only linked libadsprpc.so,
+ * libcdsprpc.so, or libsdsprpc.so, then the rpcmem_deinit call is not required
+ * on any target.
+ *
+ * NOTE: This function is not thread safe.
+ */
+void rpcmem_deinit(void);
+
+/**
+ * Allocate a zero-copy buffer of up to 2 GB with the FastRPC framework.
+ * Buffers larger than 2 GB must be allocated with rpcmem_alloc2.
+ * @param[in] heapid Heap ID to use for memory allocation.
+ * @param[in] flags  ION flags to use for memory allocation.
+ * @param[in] size   Buffer size to allocate.
+ * @return Pointer to the buffer on success; NULL on failure.
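+ *
+ * A minimal allocation lifecycle, as an illustrative sketch (error handling
+ * and the actual FastRPC call are elided; the 4096-byte size is arbitrary):
+ * @code
+ * void *buf = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, 4096);
+ * if (buf != NULL) {
+ *     int fd = rpcmem_to_fd(buf);  // descriptor that can be shared with the DSP
+ *     // ... use buf / fd with FastRPC methods ...
+ *     rpcmem_free(buf);
+ * }
+ * @endcode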
+ * + * Examples: + * + * * Default memory attributes, 2 KB + * @code + * rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, 2048); + * @endcode + * Or + * @code + * rpcmem_alloc_def(2048); + * @endcode + * + * * Heap 22, uncached, 1 KB + * @code + * rpcmem_alloc(22, 0, 1024); + * @endcode + * Or + * @code + * rpcmem_alloc(22, RPCMEM_FLAG_UNCACHED, 1024); + * @endcode + * + * * Heap 21, cached, 2 KB + * @code + * rpcmem_alloc(21, RPCMEM_FLAG_CACHED, 2048); + * @endcode + * Or + * @code + * #include + * rpcmem_alloc(21, ION_FLAG_CACHED, 2048); + * @endcode + * + * * Default memory attributes but from heap 18, 4 KB + * @code + * rpcmem_alloc(18, RPCMEM_DEFAULT_FLAGS, 4096); + * @endcode + */ +void* rpcmem_alloc(int heapid, uint32 flags, int size); + +/** + * Allocate a zero-copy buffer with the FastRPC framework. + * @param[in] heapid Heap ID to use for memory allocation. + * @param[in] flags ION flags to use for memory allocation. + * @param[in] size Buffer size to allocate. + * @return Pointer to the buffer on success; NULL on failure. + * + * Examples: + * + * * The usage examples are same as rpcmem_alloc. + */ +void* rpcmem_alloc2(int heapid, uint32 flags, size_t size); + +/** + * Allocate a buffer with default settings. + * @param[in] size Size of the buffer to be allocated. + * @return Pointer to the allocated memory buffer. + */ + #if !defined(WINNT) && !defined (_WIN32_WINNT) +__attribute__((unused)) +#endif +static __inline void* rpcmem_alloc_def(int size) { + return rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, size); +} + +/** + * Free a buffer and ignore invalid buffers. + */ +void rpcmem_free(void* po); + +/** + * Return an associated file descriptor. + * @param[in] po Data pointer for an RPCMEM-allocated buffer. + * @return Buffer file descriptor. + */ +int rpcmem_to_fd(void* po); + +/** + * @} + */ + +#ifdef __cplusplus +} +#endif + +//! @cond Doxygen_Suppress +/** These macros are deprecated. + */ +#define RPCMEM_DEFAULT_HEAP -1 +#define RPCMEM_HEAP_DEFAULT 0x80000000 +#define RPCMEM_HEAP_NOREG 0x40000000 +#define RPCMEM_HEAP_UNCACHED 0x20000000 +#define RPCMEM_HEAP_NOVA 0x10000000 +#define RPCMEM_HEAP_NONCOHERENT 0x08000000 +#define RPCMEM_FORCE_NOFLUSH 0x01000000 +#define RPCMEM_FORCE_NOINVALIDATE 0x02000000 +// Use macros from libion instead +#define ION_SECURE_FLAGS ((1 << 31) | (1 << 19)) +//! @endcond + +#endif //RPCMEM_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/rpcmem/src/verify.h b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/rpcmem/src/verify.h new file mode 100755 index 0000000000000..71b37828c0646 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/ipc/fastrpc/rpcmem/src/verify.h @@ -0,0 +1,164 @@ +/** + * Copyright (c) 2012-2020 Qualcomm Technologies, Inc. + * All Rights Reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef VERIFY_H
+#define VERIFY_H
+
+
+//#define VERIFY_PRINT_ERROR
+//#define VERIFY_PRINT_INFO
+
+
+#ifndef _WIN32
+#define C_ASSERT(test) \
+    switch(0) {\
+      case 0:\
+      case test:;\
+    }
+#endif // _WIN32
+
+#ifndef __V_STR__
+   #define __V_STR__(x) #x ":"
+#endif //__STR__
+#ifndef __V_TOSTR__
+   #define __V_TOSTR__(x) __V_STR__(x)
+#endif // __TOSTR__
+#ifndef __V_FILE_LINE__
+   #define __V_FILE_LINE__ __FILE__ ":" __V_TOSTR__(__LINE__)
+#endif /*__FILE_LINE__*/
+
+
+// TODO: sunny - enable extra prints
+#define VERIFY_PRINT_INFO
+#define VERIFY_PRINT_ERROR
+
+
+#ifdef __ANDROID__
+/* android */
+#if (defined VERIFY_PRINT_INFO) || (defined VERIFY_PRINT_ERROR)
+#include <android/log.h>
+#endif
+
+extern const char* __progname;
+#ifdef VERIFY_PRINT_INFO
+#ifdef __ANDROID__
+#define VERIFY_IPRINTF(format, ...) __android_log_print(ANDROID_LOG_DEBUG , __progname, __V_FILE_LINE__ format, ##__VA_ARGS__)
+#else /* !__ANDROID__ */
+#define VERIFY_IPRINTF(format, ...) fprintf(stderr,"%s:%d " format "\n", __func__, __LINE__ , ##__VA_ARGS__)
+#endif /* __ANDROID__ */
+#endif
+
+#ifdef VERIFY_PRINT_ERROR
+#ifdef __ANDROID__
+#define VERIFY_EPRINTF(format, ...) __android_log_print(ANDROID_LOG_ERROR , __progname, __V_FILE_LINE__ format, ##__VA_ARGS__)
+#else /* !__ANDROID__ */
+#define VERIFY_EPRINTF(format, ...) fprintf(stderr,"%s:%d " format "\n", __func__, __LINE__ , ##__VA_ARGS__)
+#endif /* __ANDROID__ */
+#endif
+
+/* end android */
+#elif (defined __hexagon__) || (defined __qdsp6__)
+/* q6 */
+
+#ifdef VERIFY_PRINT_INFO
+   #define FARF_VERIFY_LOW 1
+   #define FARF_VERIFY_LOW_LEVEL HAP_LEVEL_LOW
+   #define VERIFY_IPRINTF(args...) FARF(VERIFY_LOW, args)
+#endif
+
+#ifdef VERIFY_PRINT_ERROR
+   #define FARF_VERIFY_ERROR 1
+   #define FARF_VERIFY_ERROR_LEVEL HAP_LEVEL_ERROR
+   #define VERIFY_EPRINTF(args...) FARF(VERIFY_ERROR, args)
+#endif
+
+#if (defined VERIFY_PRINT_INFO) || (defined VERIFY_PRINT_ERROR)
+   #include "HAP_farf.h"
+#endif
+
+/* end q6 */
+#else
+/* generic */
+
+#if (defined VERIFY_PRINT_INFO) || (defined VERIFY_PRINT_ERROR)
+#include <stdio.h>
+#endif
+
+#ifdef VERIFY_PRINT_INFO
+#define VERIFY_IPRINTF(format, ...) printf(__V_FILE_LINE__ format, ##__VA_ARGS__)
+#endif
+
+#ifdef VERIFY_PRINT_ERROR
+#define VERIFY_EPRINTF(format, ...) printf(__V_FILE_LINE__ format, ##__VA_ARGS__)
+#endif
+
+/* end generic */
+#endif
+
+#ifndef VERIFY_PRINT_INFO
+#define VERIFY_IPRINTF(format, ...) (void)0
+#endif
+
+#ifndef VERIFY_PRINT_ERROR
+#define VERIFY_EPRINTF(format, ...) (void)0
+#endif
+
+#ifndef VERIFY
+   #define VERIFY(val) \
+      do {\
+         VERIFY_IPRINTF(":info: calling: " #val "\n");\
+         if(0 == (val)) {\
+            nErr = nErr == 0 ? -1 : nErr;\
+            VERIFY_EPRINTF(":error: %d: " #val "\n", nErr);\
+            goto bail;\
+         } else {\
+            VERIFY_IPRINTF(":info: passed: " #val "\n");\
+         }\
+      } while(0)
+#endif //VERIFY
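+
+/* Usage sketch (illustrative, not part of the original header): VERIFY
+ * expects the enclosing function to declare `int nErr = 0;` and to provide
+ * a `bail:` cleanup label, e.g.:
+ *
+ *    int read_config(const char *path) {   // hypothetical caller
+ *       int nErr = 0;
+ *       FILE *fp = fopen(path, "r");
+ *       VERIFY(fp != NULL);                // on failure: nErr = -1, goto bail
+ *       // ... use fp ...
+ *    bail:
+ *       if (fp) fclose(fp);
+ *       return nErr;
+ *    }
+ *
+ * VERIFYC (below) behaves the same but records a caller-supplied error code.
+ */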
+
+#ifndef VERIFYC
+   #define VERIFYC(val,err_code) \
+      do {\
+         VERIFY_IPRINTF(":info: calling: " #val "\n");\
+         if(0 == (val)) {\
+            nErr = err_code;\
+            VERIFY_EPRINTF(":error: %x: " #val "\n", nErr);\
+            goto bail;\
+         } else {\
+            VERIFY_IPRINTF(":info: passed: " #val "\n");\
+         }\
+      } while(0)
+#endif //VERIFYC
+
+
+#endif /* VERIFY_H */
+
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/libs/atomic/inc/AEEatomic.h b/prebuilts/Hexagon_SDK/6.2.0.1/libs/atomic/inc/AEEatomic.h
new file mode 100755
index 0000000000000..0b4a7b9cb6be9
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/libs/atomic/inc/AEEatomic.h
@@ -0,0 +1,173 @@
+#ifndef AEEATOMIC_H
+#define AEEATOMIC_H
+/*
+=======================================================================
+
+FILE:         AEEatomic.h
+
+SERVICES:     atomic
+
+DESCRIPTION:  Fast atomic ops
+
+=======================================================================
+   Copyright 2005, 2007 Qualcomm Technologies Incorporated.
+   All Rights Reserved.
+   QUALCOMM Confidential and Proprietary
+=======================================================================
+*/
+
+#include "AEEStdDef.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* #ifdef __cplusplus */
+
+uint32 atomic_Add(uint32 * volatile puDest, int nAdd);
+uint32 atomic_Exchange(uint32 * volatile puDest, uint32 uVal);
+uint32 atomic_CompareAndExchange(uint32 * volatile puDest, uint32 uExchange, uint32 uCompare);
+uint32 atomic_CompareOrAdd(uint32 * volatile puDest, uint32 uCompare, int nAdd);
+
+uint64 atomic_CompareAndExchange64(uint64 * volatile puDest, uint64 uExchange, uint64 uCompare);
+uintptr_t atomic_CompareAndExchangeUP(uintptr_t * volatile puDest, uintptr_t uExchange, uintptr_t uCompare);
+#ifdef __cplusplus
+}
+#endif /* #ifdef __cplusplus */
+
+/*=====================================================================
+INTERFACE DOCUMENTATION
+=======================================================================
+atomic Interface
+
+   The atomic interface provides fast "atomic" operations. The
+   operations are defined to be atomic with respect to each other.
+
+=======================================================================
+
+=======================================================================
+
+atomic_Add()
+
+Description:
+
+   Performs an atomic add operation.
+
+Prototype:
+
+   uint32 atomic_Add(uint32 * volatile puDest, int nAdd);
+
+Parameters:
+   puDest [in|out] : Points to the unsigned number to which nAdd is added; the sum is stored back to *puDest.
+   nAdd            : Value to add.
+
+Return Value:
+   The resulting (new) value at *puDest.
+
+Comments:
+   None
+
+Side Effects:
+   None
+
+See Also:
+   None
+
+=======================================================================
+
+atomic_Exchange()
+
+Description:
+
+   Atomic exchange of a 32-bit value. Performs an atomic operation of:
+      write uVal to *puDest
+      return the previous value in *puDest
+
+Prototype:
+
+   uint32 atomic_Exchange(uint32 * volatile puDest, uint32 uVal);
+
+Parameters:
+   puDest [in|out] : Points to the unsigned number to be exchanged.
+   uVal            : New value to write.
+
+Return Value:
+   Previous value at *puDest.
+
+Comments:
+   None
+
+Side Effects:
+   May cause an exception if puDest is not a 32-bit aligned address.
+
+See Also:
+   None
+=======================================================================
+
+atomic_CompareAndExchange()
+
+Description:
+
+   Performs an atomic operation of:
+      if (*puDest == uCompare) {
+         *puDest = uExchange;
+      }
+
+   returns the previous value in *puDest
+
+Prototype:
+
+   uint32 atomic_CompareAndExchange(uint32 * volatile puDest, uint32 uExchange,
+                                    uint32 uCompare);
+
+Parameters:
+   puDest [in|out] : Points to the unsigned number.
+   uExchange       : New value to write to *puDest.
+   uCompare        : Comparand.
+
+Return Value:
+   Previous value at *puDest.
+
+Comments:
+   None
+
+Side Effects:
+   May cause an exception if puDest is not a 32-bit aligned address.
+
+See Also:
+   None
+
+=======================================================================
+atomic_CompareOrAdd()
+
+Description:
+
+   Performs an atomic operation of:
+      if (*puDest != uCompare) {
+         *puDest += nAdd;
+      }
+
+   returns the new value in *puDest
+
+Prototype:
+
+   uint32 atomic_CompareOrAdd(uint32 * volatile puDest, uint32 uCompare, int nAdd);
+
+Parameters:
+   puDest [in|out] : Points to the unsigned number.
+   uCompare        : Comparand.
+   nAdd            : Value to add to *puDest.
+
+Return Value:
+   New value at *puDest.
+
+Comments:
+   None
+
+Side Effects:
+   May cause an exception if puDest is not a 32-bit aligned address.
+
+See Also:
+   None
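+
+=======================================================================
+
+Usage sketch
+
+   (An illustrative sketch, not normative.) Lock-free updates are
+   typically built as a compare-and-exchange retry loop. For example,
+   incrementing a shared counter puCounter (a hypothetical
+   uint32 * volatile):
+
+      uint32 uPrev;
+      do {
+         uPrev = *puCounter;
+      } while (atomic_CompareAndExchange(puCounter, uPrev + 1, uPrev) != uPrev);
+
+   The loop retries whenever another thread changed *puCounter between
+   the read and the exchange; this particular case is equivalent to
+   atomic_Add(puCounter, 1).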
+=======================================================================*/
+
+#endif /* #ifndef AEEATOMIC_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/libs/qprintf/inc/qprintf.h b/prebuilts/Hexagon_SDK/6.2.0.1/libs/qprintf/inc/qprintf.h
new file mode 100755
index 0000000000000..cfa7b98d050bc
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/libs/qprintf/inc/qprintf.h
@@ -0,0 +1,170 @@
+/**=============================================================================
+
+@file
+   qprintf.h
+
+@brief
+   API, macros and struct definitions for qprintf utilities available from C.
+
+Copyright (c) 2017, 2020 QUALCOMM Technologies Incorporated.
+All Rights Reserved Qualcomm Proprietary
+=============================================================================**/
+
+#ifndef qprintf_H
+#define qprintf_H
+
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include "hexagon_types.h"
+#include "stdlib.h"
+
+#ifdef BUILDING_SO
+/// MACRO enables function to be visible in shared-library case.
+#define qprintf_API __attribute__ ((visibility ("default")))
+#else
+/// MACRO empty for non-shared-library case.
+#define qprintf_API
+#endif
+
+/**
+ * @defgroup Masks Common masks controlling which bytes to display.
+ * @{
+ */
+
+/// Display all bytes
+#define QPRINTF_MASK_ALL -1ull
+
+/// Display no bytes
+#define QPRINTF_MASK_NONE 0ull
+
+/// Display even bytes
+#define QPRINTF_MASK_EVEN_8 0x5555555555555555ull
+
+/// Display odd bytes
+#define QPRINTF_MASK_ODD_8 0xaaaaaaaaaaaaaaaaull
+
+/// Display even 16-bit elements
+#define QPRINTF_MASK_EVEN_16 0x3333333333333333ull
+
+/// Display odd 16-bit elements
+#define QPRINTF_MASK_ODD_16 0xccccccccccccccccull
+
+/// Display even 32-bit elements
+#define QPRINTF_MASK_EVEN_32 0x0f0f0f0f0f0f0f0full
+
+/// Display odd 32-bit elements
+#define QPRINTF_MASK_ODD_32 0xf0f0f0f0f0f0f0f0ull
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup C_functions qprintf functions
+ * @{
+ */
+//---------------------------------------------------------------------------
+/// @brief
+/// Set the mask controlling which bytes to display when printing out an HVX
+/// register.
+///
+/// If the nth bit of mask is set, the nth byte of HVX will be displayed.
+/// When printing HVX as 16-bit or 32-bit elements, only the bit corresponding +/// to the lowest byte of the element controls whether the element will be +/// printed out or not. +/// +/// @param high +/// Mask for upper 64 bytes of HVX vector. +/// +/// @param low +/// Mask for lower 64 bytes of HVX vector. +/// +/// @return +/// None. +/// +/// Example: +/// +/// * Display the 32-bit odd elements of the 64 most significant bytes and the even +/// bytes of the 64 least significant bytes of HVX vectors printed with option %%m. +/// @code +/// // From C before invoking your assembly routine +/// qprintf_set_mask(QPRINTF_MASK_ODD_32,QPRINTF_MASK_EVEN_8); +/// +/// // From assembly +/// qprintf("v0: %mx",v0); +/// @endcode +/// +/// See also \ref assembly-hvx-registers for more assembly examples using %%m. +//--------------------------------------------------------------------------- +qprintf_API void qprintf_set_mask(unsigned long long high, unsigned long long low); + +//--------------------------------------------------------------------------- +/// @brief +/// Print a V register. +/// +/// @param msg +/// Character string used to display V register. +/// +/// @param V +/// HVX vector register to display. +/// +/// @return +/// None. +/// +/// Example: See \ref c-hvx-registers for usage examples. +//--------------------------------------------------------------------------- +void qprintf_V(const char* msg, HVX_Vector V); + +//--------------------------------------------------------------------------- +/// @brief +/// Print a Q register. see documentation for details on supported format. +/// +/// @param msg +/// Character string used to display Q register. +/// +/// @param Q +/// HVX predicate register to display. +/// +/// @return +/// None. +/// +/// Example: See \ref c-predicate-registers for usage examples. +//--------------------------------------------------------------------------- +void qprintf_Q(const char* msg, HVX_VectorPred Q); + +//--------------------------------------------------------------------------- +/// @brief +/// Display all HVX registers. +/// +/// @return +/// None. +/// +/// Example: See \ref c-register-dump for usage examples. +//--------------------------------------------------------------------------- +extern qprintf_API void qprintf_V_all(void); + + +//--------------------------------------------------------------------------- +/// @brief +/// Display all scalar registers. +/// +/// @return +/// None. +/// +/// Example: See \ref c-register-dump for usage examples. +//--------------------------------------------------------------------------- +extern qprintf_API void qprintf_R_all(void); + +/** + * @} + */ + +#ifdef __cplusplus +} +#endif + +#endif // #ifndef qprintf_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/libs/qprintf/inc/qprintf_asm.h b/prebuilts/Hexagon_SDK/6.2.0.1/libs/qprintf/inc/qprintf_asm.h new file mode 100755 index 0000000000000..9fb27cc8ff8d6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/libs/qprintf/inc/qprintf_asm.h @@ -0,0 +1,73 @@ +/**============================================================================= + +@file + qprintf_asm.h + +@brief + Extend printf support to assembly. + +Copyright (c) 2017,2020 QUALCOMM Technologies Incorporated. 
+All Rights Reserved Qualcomm Proprietary
+=============================================================================**/
+
+/**
+ * @defgroup ASM_function qprintf routine assembly-callable
+ * @{
+ */
+
+//---------------------------------------------------------------------------
+/// @brief
+/// Assembly macro for displaying registers along with a message and
+/// filename[linenumber].
+///
+/// @param MSG
+/// Message to display.
+///
+/// @return
+/// none.
+///
+/// Example: See \ref assembly-support for usage examples.
+//---------------------------------------------------------------------------
+#define qprintf(MSG,...) qprintf_macro #__FILE__, #__LINE__, MSG, #__VA_ARGS__
+
+/**
+ * @}
+ */
+
+//! @cond Doxygen_Suppress
+
+.set STACK_SIZE, 24
+.macro qprintf_macro FILE_NAME LINE_NUMBER MSG ARGS
+.data
+1:
+.string "\MSG\()\0"
+2:
+.string "\ARGS\()\0"
+3:
+.string "\LINE_NUMBER\()\0"
+4:
+.string "\FILE_NAME\()\0"
+.text
+    {
+        allocframe(#STACK_SIZE)                // sp[STACK_SIZE]=r31:30 (sp referring to sp after stack allocation)
+        memd(r29 + #(-STACK_SIZE-8)) = r29:28  // sp[0]=r29:28
+        r28 = ADD(PC,##1b@PCREL)
+    } {
+        memw(r29 + #8) = r28                   // sp[8]=&msg
+        r28 = ADD(PC,##2b@PCREL)
+    } {
+        memw(r29 + #12) = r28                  // sp[12]=&args
+        r28 = #\LINE_NUMBER\()
+    } {
+        memw(r29 + #16) = r28                  // sp[16]=line_number
+        r28 = ADD(PC,##4b@PCREL)
+    } {
+        memw(r29 + #20) = r28                  // sp[20]=&file_name
+        call qprintf_asm
+    } {
+        r28 = memw(r29 + #0)
+        deallocframe
+    }
+.endm
+
+//! @endcond
\ No newline at end of file
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/bits/confname.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/bits/confname.h
new file mode 100755
index 0000000000000..d9ca3135501e3
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/bits/confname.h
@@ -0,0 +1,528 @@
+#ifndef CONFNAME_H
+#define CONFNAME_H
+/**
+  @file confname.h
+  @brief Named literals for the 'name' argument of sysconf, pathconf
+
+EXTERNAL FUNCTIONS
+  None
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  Don't include this header directly. Instead include unistd.h. For now, since
+  the toolchain doesn't provide a hook by including bits/confname.h, we stick
+  this header in QuRT's sys/types.h.
+
+Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+/* Values for the NAME argument to `pathconf' and `fpathconf'.
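   For example, a caller
+   might query the longest permitted filename with pathconf("/", _PC_NAME_MAX) and
+   fall back to a fixed buffer size if the call returns -1 (an illustrative
+   sketch; the path argument is arbitrary).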
*/ +enum +{ + _PC_LINK_MAX, +#define _PC_LINK_MAX _PC_LINK_MAX + _PC_MAX_CANON, +#define _PC_MAX_CANON _PC_MAX_CANON + _PC_MAX_INPUT, +#define _PC_MAX_INPUT _PC_MAX_INPUT + _PC_NAME_MAX, +#define _PC_NAME_MAX _PC_NAME_MAX + _PC_PATH_MAX, +#define _PC_PATH_MAX _PC_PATH_MAX + _PC_PIPE_BUF, +#define _PC_PIPE_BUF _PC_PIPE_BUF + _PC_CHOWN_RESTRICTED, +#define _PC_CHOWN_RESTRICTED _PC_CHOWN_RESTRICTED + _PC_NO_TRUNC, +#define _PC_NO_TRUNC _PC_NO_TRUNC + _PC_VDISABLE, +#define _PC_VDISABLE _PC_VDISABLE + _PC_SYNC_IO, +#define _PC_SYNC_IO _PC_SYNC_IO + _PC_ASYNC_IO, +#define _PC_ASYNC_IO _PC_ASYNC_IO + _PC_PRIO_IO, +#define _PC_PRIO_IO _PC_PRIO_IO + _PC_SOCK_MAXBUF, +#define _PC_SOCK_MAXBUF _PC_SOCK_MAXBUF + _PC_FILESIZEBITS, +#define _PC_FILESIZEBITS _PC_FILESIZEBITS + _PC_REC_INCR_XFER_SIZE, +#define _PC_REC_INCR_XFER_SIZE _PC_REC_INCR_XFER_SIZE + _PC_REC_MAX_XFER_SIZE, +#define _PC_REC_MAX_XFER_SIZE _PC_REC_MAX_XFER_SIZE + _PC_REC_MIN_XFER_SIZE, +#define _PC_REC_MIN_XFER_SIZE _PC_REC_MIN_XFER_SIZE + _PC_REC_XFER_ALIGN, +#define _PC_REC_XFER_ALIGN _PC_REC_XFER_ALIGN + _PC_ALLOC_SIZE_MIN, +#define _PC_ALLOC_SIZE_MIN _PC_ALLOC_SIZE_MIN + _PC_SYMLINK_MAX, +#define _PC_SYMLINK_MAX _PC_SYMLINK_MAX + _PC_2_SYMLINKS +#define _PC_2_SYMLINKS _PC_2_SYMLINKS +}; + +/* Values for the argument to `sysconf'. */ +enum +{ + _SC_ARG_MAX, +#define _SC_ARG_MAX _SC_ARG_MAX + _SC_CHILD_MAX, +#define _SC_CHILD_MAX _SC_CHILD_MAX + _SC_CLK_TCK, +#define _SC_CLK_TCK _SC_CLK_TCK + _SC_NGROUPS_MAX, +#define _SC_NGROUPS_MAX _SC_NGROUPS_MAX + _SC_OPEN_MAX, +#define _SC_OPEN_MAX _SC_OPEN_MAX + _SC_STREAM_MAX, +#define _SC_STREAM_MAX _SC_STREAM_MAX + _SC_TZNAME_MAX, +#define _SC_TZNAME_MAX _SC_TZNAME_MAX + _SC_JOB_CONTROL, +#define _SC_JOB_CONTROL _SC_JOB_CONTROL + _SC_SAVED_IDS, +#define _SC_SAVED_IDS _SC_SAVED_IDS + _SC_REALTIME_SIGNALS, +#define _SC_REALTIME_SIGNALS _SC_REALTIME_SIGNALS + _SC_PRIORITY_SCHEDULING, +#define _SC_PRIORITY_SCHEDULING _SC_PRIORITY_SCHEDULING + _SC_TIMERS, +#define _SC_TIMERS _SC_TIMERS + _SC_ASYNCHRONOUS_IO, +#define _SC_ASYNCHRONOUS_IO _SC_ASYNCHRONOUS_IO + _SC_PRIORITIZED_IO, +#define _SC_PRIORITIZED_IO _SC_PRIORITIZED_IO + _SC_SYNCHRONIZED_IO, +#define _SC_SYNCHRONIZED_IO _SC_SYNCHRONIZED_IO + _SC_FSYNC, +#define _SC_FSYNC _SC_FSYNC + _SC_MAPPED_FILES, +#define _SC_MAPPED_FILES _SC_MAPPED_FILES + _SC_MEMLOCK, +#define _SC_MEMLOCK _SC_MEMLOCK + _SC_MEMLOCK_RANGE, +#define _SC_MEMLOCK_RANGE _SC_MEMLOCK_RANGE + _SC_MEMORY_PROTECTION, +#define _SC_MEMORY_PROTECTION _SC_MEMORY_PROTECTION + _SC_MESSAGE_PASSING, +#define _SC_MESSAGE_PASSING _SC_MESSAGE_PASSING + _SC_SEMAPHORES, +#define _SC_SEMAPHORES _SC_SEMAPHORES + _SC_SHARED_MEMORY_OBJECTS, +#define _SC_SHARED_MEMORY_OBJECTS _SC_SHARED_MEMORY_OBJECTS + _SC_AIO_LISTIO_MAX, +#define _SC_AIO_LISTIO_MAX _SC_AIO_LISTIO_MAX + _SC_AIO_MAX, +#define _SC_AIO_MAX _SC_AIO_MAX + _SC_AIO_PRIO_DELTA_MAX, +#define _SC_AIO_PRIO_DELTA_MAX _SC_AIO_PRIO_DELTA_MAX + _SC_DELAYTIMER_MAX, +#define _SC_DELAYTIMER_MAX _SC_DELAYTIMER_MAX + _SC_MQ_OPEN_MAX, +#define _SC_MQ_OPEN_MAX _SC_MQ_OPEN_MAX + _SC_MQ_PRIO_MAX, +#define _SC_MQ_PRIO_MAX _SC_MQ_PRIO_MAX + _SC_VERSION, +#define _SC_VERSION _SC_VERSION + _SC_PAGESIZE, +#define _SC_PAGESIZE _SC_PAGESIZE +#define _SC_PAGE_SIZE _SC_PAGESIZE + _SC_RTSIG_MAX, +#define _SC_RTSIG_MAX _SC_RTSIG_MAX + _SC_SEM_NSEMS_MAX, +#define _SC_SEM_NSEMS_MAX _SC_SEM_NSEMS_MAX + _SC_SEM_VALUE_MAX, +#define _SC_SEM_VALUE_MAX _SC_SEM_VALUE_MAX + _SC_SIGQUEUE_MAX, +#define _SC_SIGQUEUE_MAX _SC_SIGQUEUE_MAX + _SC_TIMER_MAX, +#define _SC_TIMER_MAX 
_SC_TIMER_MAX + + /* Values for the argument to `sysconf' + corresponding to _POSIX2_* symbols. */ + _SC_BC_BASE_MAX, +#define _SC_BC_BASE_MAX _SC_BC_BASE_MAX + _SC_BC_DIM_MAX, +#define _SC_BC_DIM_MAX _SC_BC_DIM_MAX + _SC_BC_SCALE_MAX, +#define _SC_BC_SCALE_MAX _SC_BC_SCALE_MAX + _SC_BC_STRING_MAX, +#define _SC_BC_STRING_MAX _SC_BC_STRING_MAX + _SC_COLL_WEIGHTS_MAX, +#define _SC_COLL_WEIGHTS_MAX _SC_COLL_WEIGHTS_MAX + _SC_EQUIV_CLASS_MAX, +#define _SC_EQUIV_CLASS_MAX _SC_EQUIV_CLASS_MAX + _SC_EXPR_NEST_MAX, +#define _SC_EXPR_NEST_MAX _SC_EXPR_NEST_MAX + _SC_LINE_MAX, +#define _SC_LINE_MAX _SC_LINE_MAX + _SC_RE_DUP_MAX, +#define _SC_RE_DUP_MAX _SC_RE_DUP_MAX + _SC_CHARCLASS_NAME_MAX, +#define _SC_CHARCLASS_NAME_MAX _SC_CHARCLASS_NAME_MAX + + _SC_2_VERSION, +#define _SC_2_VERSION _SC_2_VERSION + _SC_2_C_BIND, +#define _SC_2_C_BIND _SC_2_C_BIND + _SC_2_C_DEV, +#define _SC_2_C_DEV _SC_2_C_DEV + _SC_2_FORT_DEV, +#define _SC_2_FORT_DEV _SC_2_FORT_DEV + _SC_2_FORT_RUN, +#define _SC_2_FORT_RUN _SC_2_FORT_RUN + _SC_2_SW_DEV, +#define _SC_2_SW_DEV _SC_2_SW_DEV + _SC_2_LOCALEDEF, +#define _SC_2_LOCALEDEF _SC_2_LOCALEDEF + + _SC_PII, +#define _SC_PII _SC_PII + _SC_PII_XTI, +#define _SC_PII_XTI _SC_PII_XTI + _SC_PII_SOCKET, +#define _SC_PII_SOCKET _SC_PII_SOCKET + _SC_PII_INTERNET, +#define _SC_PII_INTERNET _SC_PII_INTERNET + _SC_PII_OSI, +#define _SC_PII_OSI _SC_PII_OSI + _SC_POLL, +#define _SC_POLL _SC_POLL + _SC_SELECT, +#define _SC_SELECT _SC_SELECT + _SC_UIO_MAXIOV, +#define _SC_UIO_MAXIOV _SC_UIO_MAXIOV + _SC_IOV_MAX = _SC_UIO_MAXIOV, +#define _SC_IOV_MAX _SC_IOV_MAX + _SC_PII_INTERNET_STREAM, +#define _SC_PII_INTERNET_STREAM _SC_PII_INTERNET_STREAM + _SC_PII_INTERNET_DGRAM, +#define _SC_PII_INTERNET_DGRAM _SC_PII_INTERNET_DGRAM + _SC_PII_OSI_COTS, +#define _SC_PII_OSI_COTS _SC_PII_OSI_COTS + _SC_PII_OSI_CLTS, +#define _SC_PII_OSI_CLTS _SC_PII_OSI_CLTS + _SC_PII_OSI_M, +#define _SC_PII_OSI_M _SC_PII_OSI_M + _SC_T_IOV_MAX, +#define _SC_T_IOV_MAX _SC_T_IOV_MAX + + /* Values according to POSIX 1003.1c (POSIX threads). 
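   For instance,
+      sysconf(_SC_THREAD_STACK_MIN) reports the smallest stack with which a
+      thread may be created (an illustrative sketch).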
*/ + _SC_THREADS, +#define _SC_THREADS _SC_THREADS + _SC_THREAD_SAFE_FUNCTIONS, +#define _SC_THREAD_SAFE_FUNCTIONS _SC_THREAD_SAFE_FUNCTIONS + _SC_GETGR_R_SIZE_MAX, +#define _SC_GETGR_R_SIZE_MAX _SC_GETGR_R_SIZE_MAX + _SC_GETPW_R_SIZE_MAX, +#define _SC_GETPW_R_SIZE_MAX _SC_GETPW_R_SIZE_MAX + _SC_LOGIN_NAME_MAX, +#define _SC_LOGIN_NAME_MAX _SC_LOGIN_NAME_MAX + _SC_TTY_NAME_MAX, +#define _SC_TTY_NAME_MAX _SC_TTY_NAME_MAX + _SC_THREAD_DESTRUCTOR_ITERATIONS, +#define _SC_THREAD_DESTRUCTOR_ITERATIONS _SC_THREAD_DESTRUCTOR_ITERATIONS + _SC_THREAD_KEYS_MAX, +#define _SC_THREAD_KEYS_MAX _SC_THREAD_KEYS_MAX + _SC_THREAD_STACK_MIN, +#define _SC_THREAD_STACK_MIN _SC_THREAD_STACK_MIN + _SC_THREAD_THREADS_MAX, +#define _SC_THREAD_THREADS_MAX _SC_THREAD_THREADS_MAX + _SC_THREAD_ATTR_STACKADDR, +#define _SC_THREAD_ATTR_STACKADDR _SC_THREAD_ATTR_STACKADDR + _SC_THREAD_ATTR_STACKSIZE, +#define _SC_THREAD_ATTR_STACKSIZE _SC_THREAD_ATTR_STACKSIZE + _SC_THREAD_PRIORITY_SCHEDULING, +#define _SC_THREAD_PRIORITY_SCHEDULING _SC_THREAD_PRIORITY_SCHEDULING + _SC_THREAD_PRIO_INHERIT, +#define _SC_THREAD_PRIO_INHERIT _SC_THREAD_PRIO_INHERIT + _SC_THREAD_PRIO_PROTECT, +#define _SC_THREAD_PRIO_PROTECT _SC_THREAD_PRIO_PROTECT + _SC_THREAD_PROCESS_SHARED, +#define _SC_THREAD_PROCESS_SHARED _SC_THREAD_PROCESS_SHARED + + _SC_NPROCESSORS_CONF, +#define _SC_NPROCESSORS_CONF _SC_NPROCESSORS_CONF + _SC_NPROCESSORS_ONLN, +#define _SC_NPROCESSORS_ONLN _SC_NPROCESSORS_ONLN + _SC_PHYS_PAGES, +#define _SC_PHYS_PAGES _SC_PHYS_PAGES + _SC_AVPHYS_PAGES, +#define _SC_AVPHYS_PAGES _SC_AVPHYS_PAGES + _SC_ATEXIT_MAX, +#define _SC_ATEXIT_MAX _SC_ATEXIT_MAX + _SC_PASS_MAX, +#define _SC_PASS_MAX _SC_PASS_MAX + + _SC_XOPEN_VERSION, +#define _SC_XOPEN_VERSION _SC_XOPEN_VERSION + _SC_XOPEN_XCU_VERSION, +#define _SC_XOPEN_XCU_VERSION _SC_XOPEN_XCU_VERSION + _SC_XOPEN_UNIX, +#define _SC_XOPEN_UNIX _SC_XOPEN_UNIX + _SC_XOPEN_CRYPT, +#define _SC_XOPEN_CRYPT _SC_XOPEN_CRYPT + _SC_XOPEN_ENH_I18N, +#define _SC_XOPEN_ENH_I18N _SC_XOPEN_ENH_I18N + _SC_XOPEN_SHM, +#define _SC_XOPEN_SHM _SC_XOPEN_SHM + + _SC_2_CHAR_TERM, +#define _SC_2_CHAR_TERM _SC_2_CHAR_TERM + _SC_2_C_VERSION, +#define _SC_2_C_VERSION _SC_2_C_VERSION + _SC_2_UPE, +#define _SC_2_UPE _SC_2_UPE + + _SC_XOPEN_XPG2, +#define _SC_XOPEN_XPG2 _SC_XOPEN_XPG2 + _SC_XOPEN_XPG3, +#define _SC_XOPEN_XPG3 _SC_XOPEN_XPG3 + _SC_XOPEN_XPG4, +#define _SC_XOPEN_XPG4 _SC_XOPEN_XPG4 + + _SC_CHAR_BIT, +#define _SC_CHAR_BIT _SC_CHAR_BIT + _SC_CHAR_MAX, +#define _SC_CHAR_MAX _SC_CHAR_MAX + _SC_CHAR_MIN, +#define _SC_CHAR_MIN _SC_CHAR_MIN + _SC_INT_MAX, +#define _SC_INT_MAX _SC_INT_MAX + _SC_INT_MIN, +#define _SC_INT_MIN _SC_INT_MIN + _SC_LONG_BIT, +#define _SC_LONG_BIT _SC_LONG_BIT + _SC_WORD_BIT, +#define _SC_WORD_BIT _SC_WORD_BIT + _SC_MB_LEN_MAX, +#define _SC_MB_LEN_MAX _SC_MB_LEN_MAX + _SC_NZERO, +#define _SC_NZERO _SC_NZERO + _SC_SSIZE_MAX, +#define _SC_SSIZE_MAX _SC_SSIZE_MAX + _SC_SCHAR_MAX, +#define _SC_SCHAR_MAX _SC_SCHAR_MAX + _SC_SCHAR_MIN, +#define _SC_SCHAR_MIN _SC_SCHAR_MIN + _SC_SHRT_MAX, +#define _SC_SHRT_MAX _SC_SHRT_MAX + _SC_SHRT_MIN, +#define _SC_SHRT_MIN _SC_SHRT_MIN + _SC_UCHAR_MAX, +#define _SC_UCHAR_MAX _SC_UCHAR_MAX + _SC_UINT_MAX, +#define _SC_UINT_MAX _SC_UINT_MAX + _SC_ULONG_MAX, +#define _SC_ULONG_MAX _SC_ULONG_MAX + _SC_USHRT_MAX, +#define _SC_USHRT_MAX _SC_USHRT_MAX + + _SC_NL_ARGMAX, +#define _SC_NL_ARGMAX _SC_NL_ARGMAX + _SC_NL_LANGMAX, +#define _SC_NL_LANGMAX _SC_NL_LANGMAX + _SC_NL_MSGMAX, +#define _SC_NL_MSGMAX _SC_NL_MSGMAX + _SC_NL_NMAX, +#define _SC_NL_NMAX _SC_NL_NMAX + 
_SC_NL_SETMAX, +#define _SC_NL_SETMAX _SC_NL_SETMAX + _SC_NL_TEXTMAX, +#define _SC_NL_TEXTMAX _SC_NL_TEXTMAX + + _SC_XBS5_ILP32_OFF32, +#define _SC_XBS5_ILP32_OFF32 _SC_XBS5_ILP32_OFF32 + _SC_XBS5_ILP32_OFFBIG, +#define _SC_XBS5_ILP32_OFFBIG _SC_XBS5_ILP32_OFFBIG + _SC_XBS5_LP64_OFF64, +#define _SC_XBS5_LP64_OFF64 _SC_XBS5_LP64_OFF64 + _SC_XBS5_LPBIG_OFFBIG, +#define _SC_XBS5_LPBIG_OFFBIG _SC_XBS5_LPBIG_OFFBIG + + _SC_XOPEN_LEGACY, +#define _SC_XOPEN_LEGACY _SC_XOPEN_LEGACY + _SC_XOPEN_REALTIME, +#define _SC_XOPEN_REALTIME _SC_XOPEN_REALTIME + _SC_XOPEN_REALTIME_THREADS, +#define _SC_XOPEN_REALTIME_THREADS _SC_XOPEN_REALTIME_THREADS + + _SC_ADVISORY_INFO, +#define _SC_ADVISORY_INFO _SC_ADVISORY_INFO + _SC_BARRIERS, +#define _SC_BARRIERS _SC_BARRIERS + _SC_BASE, +#define _SC_BASE _SC_BASE + _SC_C_LANG_SUPPORT, +#define _SC_C_LANG_SUPPORT _SC_C_LANG_SUPPORT + _SC_C_LANG_SUPPORT_R, +#define _SC_C_LANG_SUPPORT_R _SC_C_LANG_SUPPORT_R + _SC_CLOCK_SELECTION, +#define _SC_CLOCK_SELECTION _SC_CLOCK_SELECTION + _SC_CPUTIME, +#define _SC_CPUTIME _SC_CPUTIME + _SC_THREAD_CPUTIME, +#define _SC_THREAD_CPUTIME _SC_THREAD_CPUTIME + _SC_DEVICE_IO, +#define _SC_DEVICE_IO _SC_DEVICE_IO + _SC_DEVICE_SPECIFIC, +#define _SC_DEVICE_SPECIFIC _SC_DEVICE_SPECIFIC + _SC_DEVICE_SPECIFIC_R, +#define _SC_DEVICE_SPECIFIC_R _SC_DEVICE_SPECIFIC_R + _SC_FD_MGMT, +#define _SC_FD_MGMT _SC_FD_MGMT + _SC_FIFO, +#define _SC_FIFO _SC_FIFO + _SC_PIPE, +#define _SC_PIPE _SC_PIPE + _SC_FILE_ATTRIBUTES, +#define _SC_FILE_ATTRIBUTES _SC_FILE_ATTRIBUTES + _SC_FILE_LOCKING, +#define _SC_FILE_LOCKING _SC_FILE_LOCKING + _SC_FILE_SYSTEM, +#define _SC_FILE_SYSTEM _SC_FILE_SYSTEM + _SC_MONOTONIC_CLOCK, +#define _SC_MONOTONIC_CLOCK _SC_MONOTONIC_CLOCK + _SC_MULTI_PROCESS, +#define _SC_MULTI_PROCESS _SC_MULTI_PROCESS + _SC_SINGLE_PROCESS, +#define _SC_SINGLE_PROCESS _SC_SINGLE_PROCESS + _SC_NETWORKING, +#define _SC_NETWORKING _SC_NETWORKING + _SC_READER_WRITER_LOCKS, +#define _SC_READER_WRITER_LOCKS _SC_READER_WRITER_LOCKS + _SC_SPIN_LOCKS, +#define _SC_SPIN_LOCKS _SC_SPIN_LOCKS + _SC_REGEXP, +#define _SC_REGEXP _SC_REGEXP + _SC_REGEX_VERSION, +#define _SC_REGEX_VERSION _SC_REGEX_VERSION + _SC_SHELL, +#define _SC_SHELL _SC_SHELL + _SC_SIGNALS, +#define _SC_SIGNALS _SC_SIGNALS + _SC_SPAWN, +#define _SC_SPAWN _SC_SPAWN + _SC_SPORADIC_SERVER, +#define _SC_SPORADIC_SERVER _SC_SPORADIC_SERVER + _SC_THREAD_SPORADIC_SERVER, +#define _SC_THREAD_SPORADIC_SERVER _SC_THREAD_SPORADIC_SERVER + _SC_SYSTEM_DATABASE, +#define _SC_SYSTEM_DATABASE _SC_SYSTEM_DATABASE + _SC_SYSTEM_DATABASE_R, +#define _SC_SYSTEM_DATABASE_R _SC_SYSTEM_DATABASE_R + _SC_TIMEOUTS, +#define _SC_TIMEOUTS _SC_TIMEOUTS + _SC_TYPED_MEMORY_OBJECTS, +#define _SC_TYPED_MEMORY_OBJECTS _SC_TYPED_MEMORY_OBJECTS + _SC_USER_GROUPS, +#define _SC_USER_GROUPS _SC_USER_GROUPS + _SC_USER_GROUPS_R, +#define _SC_USER_GROUPS_R _SC_USER_GROUPS_R + _SC_2_PBS, +#define _SC_2_PBS _SC_2_PBS + _SC_2_PBS_ACCOUNTING, +#define _SC_2_PBS_ACCOUNTING _SC_2_PBS_ACCOUNTING + _SC_2_PBS_LOCATE, +#define _SC_2_PBS_LOCATE _SC_2_PBS_LOCATE + _SC_2_PBS_MESSAGE, +#define _SC_2_PBS_MESSAGE _SC_2_PBS_MESSAGE + _SC_2_PBS_TRACK, +#define _SC_2_PBS_TRACK _SC_2_PBS_TRACK + _SC_SYMLOOP_MAX, +#define _SC_SYMLOOP_MAX _SC_SYMLOOP_MAX + _SC_STREAMS, +#define _SC_STREAMS _SC_STREAMS + _SC_2_PBS_CHECKPOINT, +#define _SC_2_PBS_CHECKPOINT _SC_2_PBS_CHECKPOINT + + _SC_V6_ILP32_OFF32, +#define _SC_V6_ILP32_OFF32 _SC_V6_ILP32_OFF32 + _SC_V6_ILP32_OFFBIG, +#define _SC_V6_ILP32_OFFBIG _SC_V6_ILP32_OFFBIG + _SC_V6_LP64_OFF64, +#define 
_SC_V6_LP64_OFF64 _SC_V6_LP64_OFF64 + _SC_V6_LPBIG_OFFBIG, +#define _SC_V6_LPBIG_OFFBIG _SC_V6_LPBIG_OFFBIG + + _SC_HOST_NAME_MAX, +#define _SC_HOST_NAME_MAX _SC_HOST_NAME_MAX + _SC_TRACE, +#define _SC_TRACE _SC_TRACE + _SC_TRACE_EVENT_FILTER, +#define _SC_TRACE_EVENT_FILTER _SC_TRACE_EVENT_FILTER + _SC_TRACE_INHERIT, +#define _SC_TRACE_INHERIT _SC_TRACE_INHERIT + _SC_TRACE_LOG, +#define _SC_TRACE_LOG _SC_TRACE_LOG + + _SC_LEVEL1_ICACHE_SIZE, +#define _SC_LEVEL1_ICACHE_SIZE _SC_LEVEL1_ICACHE_SIZE + _SC_LEVEL1_ICACHE_ASSOC, +#define _SC_LEVEL1_ICACHE_ASSOC _SC_LEVEL1_ICACHE_ASSOC + _SC_LEVEL1_ICACHE_LINESIZE, +#define _SC_LEVEL1_ICACHE_LINESIZE _SC_LEVEL1_ICACHE_LINESIZE + _SC_LEVEL1_DCACHE_SIZE, +#define _SC_LEVEL1_DCACHE_SIZE _SC_LEVEL1_DCACHE_SIZE + _SC_LEVEL1_DCACHE_ASSOC, +#define _SC_LEVEL1_DCACHE_ASSOC _SC_LEVEL1_DCACHE_ASSOC + _SC_LEVEL1_DCACHE_LINESIZE, +#define _SC_LEVEL1_DCACHE_LINESIZE _SC_LEVEL1_DCACHE_LINESIZE + _SC_LEVEL2_CACHE_SIZE, +#define _SC_LEVEL2_CACHE_SIZE _SC_LEVEL2_CACHE_SIZE + _SC_LEVEL2_CACHE_ASSOC, +#define _SC_LEVEL2_CACHE_ASSOC _SC_LEVEL2_CACHE_ASSOC + _SC_LEVEL2_CACHE_LINESIZE, +#define _SC_LEVEL2_CACHE_LINESIZE _SC_LEVEL2_CACHE_LINESIZE + _SC_LEVEL3_CACHE_SIZE, +#define _SC_LEVEL3_CACHE_SIZE _SC_LEVEL3_CACHE_SIZE + _SC_LEVEL3_CACHE_ASSOC, +#define _SC_LEVEL3_CACHE_ASSOC _SC_LEVEL3_CACHE_ASSOC + _SC_LEVEL3_CACHE_LINESIZE, +#define _SC_LEVEL3_CACHE_LINESIZE _SC_LEVEL3_CACHE_LINESIZE + _SC_LEVEL4_CACHE_SIZE, +#define _SC_LEVEL4_CACHE_SIZE _SC_LEVEL4_CACHE_SIZE + _SC_LEVEL4_CACHE_ASSOC, +#define _SC_LEVEL4_CACHE_ASSOC _SC_LEVEL4_CACHE_ASSOC + _SC_LEVEL4_CACHE_LINESIZE, +#define _SC_LEVEL4_CACHE_LINESIZE _SC_LEVEL4_CACHE_LINESIZE + /* Leave room here, maybe we need a few more cache levels some day. */ + + _SC_IPV6 = _SC_LEVEL1_ICACHE_SIZE + 50, +#define _SC_IPV6 _SC_IPV6 + _SC_RAW_SOCKETS, +#define _SC_RAW_SOCKETS _SC_RAW_SOCKETS + + _SC_V7_ILP32_OFF32, +#define _SC_V7_ILP32_OFF32 _SC_V7_ILP32_OFF32 + _SC_V7_ILP32_OFFBIG, +#define _SC_V7_ILP32_OFFBIG _SC_V7_ILP32_OFFBIG + _SC_V7_LP64_OFF64, +#define _SC_V7_LP64_OFF64 _SC_V7_LP64_OFF64 + _SC_V7_LPBIG_OFFBIG, +#define _SC_V7_LPBIG_OFFBIG _SC_V7_LPBIG_OFFBIG + + _SC_SS_REPL_MAX, +#define _SC_SS_REPL_MAX _SC_SS_REPL_MAX + + _SC_TRACE_EVENT_NAME_MAX, +#define _SC_TRACE_EVENT_NAME_MAX _SC_TRACE_EVENT_NAME_MAX + _SC_TRACE_NAME_MAX, +#define _SC_TRACE_NAME_MAX _SC_TRACE_NAME_MAX + _SC_TRACE_SYS_MAX, +#define _SC_TRACE_SYS_MAX _SC_TRACE_SYS_MAX + _SC_TRACE_USER_EVENT_MAX, +#define _SC_TRACE_USER_EVENT_MAX _SC_TRACE_USER_EVENT_MAX + + _SC_XOPEN_STREAMS, +#define _SC_XOPEN_STREAMS _SC_XOPEN_STREAMS + + _SC_THREAD_ROBUST_PRIO_INHERIT, +#define _SC_THREAD_ROBUST_PRIO_INHERIT _SC_THREAD_ROBUST_PRIO_INHERIT + _SC_THREAD_ROBUST_PRIO_PROTECT +#define _SC_THREAD_ROBUST_PRIO_PROTECT _SC_THREAD_ROBUST_PRIO_PROTECT + +}; +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/bits/posix1_lim.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/bits/posix1_lim.h new file mode 100755 index 0000000000000..0739958c5a6c4 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/bits/posix1_lim.h @@ -0,0 +1,34 @@ +#ifndef POSIX1_LIM_H +#define POSIX1_LIM_H +/** + @file posix1_lim.h + @brief POSIX Minimum values + +EXTERNAL FUNCTIONS + None + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None + +TODO + This header should be ideally relocated under api/posix/bits (something that + doesnt exist today) and be included from api/posix/bits/limits.h which inturn + should be 
included from toolchain's limits.h + +Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ + +#ifndef _POSIX_PATH_MAX +/** @brief Maximum number of bytes in a pathname, including the terminating + nul character */ +#define _POSIX_PATH_MAX 256 +#endif + +#ifndef _POSIX_SEM_NSEMS_MAX +/** @brief Maximum number of semaphores that a process may have */ +#define _POSIX_SEM_NSEMS_MAX 16 +#endif +#endif + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/common/time.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/common/time.h new file mode 100755 index 0000000000000..76b0d39ab7039 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/common/time.h @@ -0,0 +1 @@ +#include \ No newline at end of file diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/fcntl.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/fcntl.h new file mode 100755 index 0000000000000..c80ec98a449b6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/fcntl.h @@ -0,0 +1,51 @@ +#ifndef _FCNTL_H +#define _FCNTL_H + +/*========================================================================== + * FILE: fcntl.h + * + * SERVICES: POSIX fcntl.h + * + * DESCRIPTION: The header is needed by the open() and fcntl() + * system calls, which have a variety of parameters and + * flags. They are described here. + * + * The formats of the calls to each of these are: + * + * open(path, oflag [,mode]) open a file + * fcntl(fd, cmd [,arg]) get or set file attributes + * + * Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Oflag values for open(). POSIX Table 6-4. */ +#define POSIX_O_CREAT 0x100 /* creat file if it doesn't exist */ +#define POSIX_O_EXCL 0x200 /* exclusive use flag */ +#define POSIX_O_NOCTTY 0x400 /* do not assign a controlling terminal */ +#define POSIX_O_TRUNC 0x1000 /* truncate flag */ + +/* File status flags for open() and fcntl(). POSIX Table 6-5. */ +#define POSIX_O_APPEND 0x2000 /* set append mode */ +#define POSIX_O_NONBLOCK 0x4000 /* no delay */ + +/* File access modes for open() and fcntl(). POSIX Table 6-6. */ +#define POSIX_O_RDONLY 0 /* open(name, POSIX_O_RDONLY) opens read only */ +#define POSIX_O_WRONLY 1 /* open(name, POSIX_O_WRONLY) opens write only */ +#define POSIX_O_RDWR 2 /* open(name, POSIX_O_RDWR) opens read/write */ + +/* Mask for use with file access modes. POSIX Table 6-7. 
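   For example,
+   (oflag & POSIX_O_ACCMODE) == POSIX_O_RDONLY tests whether a descriptor was
+   opened read-only (an illustrative sketch).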
*/
+#define POSIX_O_ACCMODE 0x3 /* mask for file access modes */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FCNTL_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/hooks/unistd.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/hooks/unistd.h
new file mode 100755
index 0000000000000..1c618bfe36b4f
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/hooks/unistd.h
@@ -0,0 +1,115 @@
+#ifndef UNISTD_H
+#define UNISTD_H
+/**
+  @file posix/hooks/unistd.h
+  @brief POSIX related declarations that are missing in the toolchain
+  header
+
+EXTERNAL FUNCTIONS
+  None
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  DON'T include this header directly! Instead include unistd.h.
+
+Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+#include <sys/types.h> /* For various POSIX ID types from toolchain headers */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern long pathconf (char const * path, int name);
+
+/* Process */
+
+/** The getppid() function shall return the parent process ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return the parent process ID
+ */
+pid_t getppid(void);
+
+/** The getpgid() function shall return the process group ID of the process whose process ID is equal to pid.
+ * Please refer to POSIX standard for details.
+ * @param pid [in] process ID
+ * @return process group ID
+ */
+pid_t getpgid(pid_t pid);
+
+/** The getpgrp() function shall return the process group ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return process group ID of the calling process
+ */
+pid_t getpgrp(void);
+
+/** The getuid() function shall return the real user ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return the real user ID of the calling process
+ */
+uid_t getuid(void);
+
+/** The geteuid() function shall return the effective user ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return effective user ID of the calling process
+ */
+uid_t geteuid(void);
+
+/** The getegid() function shall return the effective group ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return effective group ID of the calling process
+ */
+gid_t getegid(void);
+
+/** The getgid() function shall return the real group ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return real group ID of the calling process
+ */
+gid_t getgid(void);
+
+/** seteuid - set the effective user ID.
+ * Please refer to POSIX standard for details.
+ * @param uid [in] effective user ID
+ * @return Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+int seteuid(uid_t uid);
+
+/** setpgrp - set the process group ID.
+ * Please refer to POSIX standard for details.
+ * @return Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+pid_t setpgrp(void);
+
+/** setuid - set the user ID.
+ * Please refer to POSIX standard for details.
+ * @param uid [in] user ID
+ * @return Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+int setuid(uid_t uid);
+
+/** setpgid - set the process group ID for job control.
+ * Please refer to POSIX standard for details.
+ * @param pid [in] PID of the process
+ * @param pgid [in] PGID to be set
+ * @return Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+int setpgid(pid_t pid, pid_t pgid);
+
+/** setsid - create a session and set the process group ID.
+ * Please refer to POSIX standard for details.
+ * @return Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+pid_t setsid(void);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/mqueue.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/mqueue.h
new file mode 100755
index 0000000000000..74dcc2fa202c6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/mqueue.h
@@ -0,0 +1,203 @@
+#ifndef _POSIX_MQUEUE_H_
+#define _POSIX_MQUEUE_H_
+
+/*==========================================================================
+ * FILE:         mqueue.h
+ *
+ * SERVICES:     POSIX Message Queue API interface
+ *
+ * DESCRIPTION:  POSIX Message Queue API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016, 2023 Qualcomm Technologies, Inc.
+ * All Rights Reserved.
+ * Confidential and Proprietary - Qualcomm Technologies, Inc.
+ *==========================================================================*/
+
+#include <sys/types.h> /* ssize_t */
+#include <time.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MQ_PRIO_MAX     255 /* max priority */
+#define MQ_PRIO_DEFAULT 0   /* default priority */
+
+typedef int mqd_t;
+
+struct mq_attr
+{
+    long mq_flags;   /* message queue flags */
+    long mq_maxmsg;  /* maximum number of messages */
+    long mq_msgsize; /* maximum message size */
+    long mq_curmsgs; /* number of messages currently queued */
+};
+
+typedef struct mq_attr mqueue_attr;
+
+/** \details
+ * This provides the POSIX Message Queue API.
+ *
+ * mq_notify is not supported.
+ *
+ * Since this implementation of the POSIX kernel API is a subset of PSE51,
+ * it only supports message sending and receiving within one process.
+ * Message sending and receiving among processes are not supported.
+ */
+
+/** \defgroup mqueue POSIX Message Queue API */
+/** \ingroup mqueue */
+/** @{ */
+
+/** Open a message queue.
+ * Please refer to POSIX standard for details.
+ */
+mqd_t mq_open(const char *name, int oflag, /* mode_t mode, struct mq_attr *attr */...);
+
+/** Close a message queue.
+ * Please refer to POSIX standard for details.
+ */
+int mq_close(mqd_t mq_desc);
+
+/** Remove a message queue.
+ * Please refer to POSIX standard for details.
+ */
+int mq_unlink(const char *name);
+
+/** Send a message to a message queue.
+ * Please refer to POSIX standard for details.
+ *
+ * If the queue is full, instead of blocking the sender, this function
+ * will return -1 with errno EAGAIN, in this implementation. This behavior
+ * may change in the future.
+ */
+int mq_send(mqd_t mqdes, const char *msg_ptr, size_t msg_len, unsigned int msg_prio);
+
+/** Send a message to a message queue with timeout.
+ * Please refer to POSIX standard for details.
+ * @param abs_timeout [in] Only abs_timeout={0,0} is supported in this
+ *    implementation. This behavior may change in the future.
+ */
+int mq_timedsend(mqd_t mqdes, const char *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout);
+
+/** Receive a message from a message queue.
+ * Please refer to POSIX standard for details.
+ */
+ssize_t mq_receive(mqd_t mqdes, char *msg_ptr, size_t msg_len, unsigned int *msg_prio);
+
+/** Receive a message from a message queue with timeout.
+ * Please refer to POSIX standard for details.
+ * @param abs_timeout [in] Only abs_timeout={0,0} is supported in this
+ *    implementation. This behavior may change in the future.
+ */
+ssize_t mq_timedreceive(mqd_t mqdes, char *restrict msg_ptr, size_t msg_len, unsigned int *restrict msg_prio, const struct timespec *restrict abs_timeout);
+
+/** Get message queue attributes.
+ * Please refer to POSIX standard for details.
+ */
+int mq_getattr(mqd_t mqdes, struct mq_attr *mqstat);
+
+/** Set message queue attributes.
+ * Please refer to POSIX standard for details.
+ */
+int mq_setattr(mqd_t mqdes, const struct mq_attr *restrict mqstat, struct mq_attr *restrict omqstat);
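+
+/** A minimal round trip, as an illustrative sketch (the queue name, sizes,
+ * and flag choice are arbitrary and error handling is elided; the oflag
+ * values assume the POSIX_O_* constants from this SDK's fcntl.h):
+ * @code
+ * struct mq_attr attr = { 0, 8, 64, 0 };  // flags, maxmsg, msgsize, curmsgs
+ * mqd_t mq = mq_open("/demo_q", POSIX_O_CREAT | POSIX_O_RDWR, 0644, &attr);
+ * (void)mq_send(mq, "ping", 5, MQ_PRIO_DEFAULT);
+ * char buf[64];
+ * (void)mq_receive(mq, buf, sizeof(buf), NULL);
+ * mq_close(mq);
+ * mq_unlink("/demo_q");
+ * @endcode
+ */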
+
+/** @} */
+
+#define NBBY 8U /* number of bits in a byte */
+
+/*
+ * Select uses bit masks of file descriptors in longs. These macros
+ * manipulate such bit fields (the filesystem macros use chars).
+ * FD_SETSIZE may be defined by the user, but the default here should
+ * be enough for most uses.
+ */
+#ifndef FD_SETSIZE
+#define FD_SETSIZE 256U
+#endif
+
+typedef unsigned long fd_mask;
+#define NFDBITS (sizeof(fd_mask) * (unsigned int)NBBY) /* bits per mask */
+
+#ifndef howmany
+#define howmany(x, y) (((x) + ((y) - 1U)) / (y))
+#endif
+
+// equivalent of fd_set for the WINNT environment
+typedef struct fd_set
+{
+    fd_mask fds_bits[howmany(FD_SETSIZE, NFDBITS)];
+} fd_set;
+
+/** \addtogroup mqueue */
+/** @{ */
+
+/** Sets the bit for the file descriptor fd in the file descriptor set fdset.
+ */
+#define FD_SET(n, p) ((p)->fds_bits[((unsigned int) (n)) / NFDBITS] |= (1UL << (((unsigned int) (n)) % NFDBITS)))
+
+/** Clears the bit for the file descriptor fd in the file descriptor set fdset.
+ */
+#define FD_CLR(n, p) ((p)->fds_bits[((unsigned int) (n)) / NFDBITS] &= ~(1UL << (((unsigned int) (n)) % NFDBITS)))
+
+/** Returns a non-zero value if the bit for the file descriptor fd is set in the file descriptor set pointed to by fdset, and 0 otherwise.
+ */
+#define FD_ISSET(n, p) ((unsigned long)(p)->fds_bits[((unsigned int) (n)) / NFDBITS] & (unsigned long)((unsigned)1U << (((unsigned int) (n)) % NFDBITS)))
+
+/** Copies the file descriptor set.
+ */
+#define FD_COPY(f, t) (void)(memcpy)((t), (f), sizeof(*(f)))
+
+/** Initializes the file descriptor set fdset to have zero bits for all file descriptors.
+ */
+#define FD_ZERO(p) (void)memset((p), 0, sizeof(*(p)))
+
+/** Error check the file descriptor set.
+ */
+#define FD_BAD(fd) ((fd) < 0 /*|| fd >= fd_arraylen || fd_array[fd].obj == 0*/)
+
+/*! Wait for both message queues and signals. In this implementation, only
+ * message queue file descriptors are supported.
+ * @param nfds [in] This is an integer one more than the maximum of any file
+ *    descriptor in any of the sets. In other words, while you are busy
+ *    adding file descriptors to your sets, you must calculate the maximum
+ *    integer value of all of them, then increment this value by one, and
+ *    then pass this as nfds to select().
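+ *    For instance, if the only descriptor in any set is mq, nfds is
+ *    mq + 1 (illustrative).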
+ * @param readfds [in] the file descriptor set on all message queues. + * @param writefds [in] ignored in this implementation. + * @param errorfds [in] ignored in this implementation. + * @param timeout [in] Only timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +int pselect(int nfds, fd_set *restrict readfds, + fd_set *restrict writefds, fd_set *restrict errorfds, + const struct timespec *restrict timeout, + const sigset_t *restrict sigmask); + +/*! Wait for multiple message queues. In this implementation, only + * message queue file descriptors are supported. + * @param nfds [in] This is an integer one more than the maximum of any file + * descriptor in any of the sets. In other words, while you are busy + * adding file descriptors to your sets, you must calculate the maximum + * integer value of all of them, then increment this value by one, and + * then pass this as nfds to select(). + * @param readfds [in] the file descriptor set on all message queues. + * @param writefds [in] ignored in this implementation. + * @param errorfds [in] ignored in this implementation. + * @param timeout [in] Only timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +int select(int nfds, fd_set *restrict readfds, + fd_set *restrict writefds, fd_set *restrict errorfds, + struct timeval *restrict timeout); + +/** @} */ + +/* this function is needed for test framework which needs to clean up memory when teardown */ +void _mq_teardown(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/pthread.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/pthread.h new file mode 100755 index 0000000000000..f64242e8dc683 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/pthread.h @@ -0,0 +1,287 @@ +#ifndef QURT_PTHREAD_H +#define QURT_PTHREAD_H + +/*========================================================================== + * FILE: pthread.h + * + * SERVICES: POSIX pthread API interface + * + * DESCRIPTION: POSIX pthread API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013,2016,2023 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + *========================================================================== + * + * EDIT HISTORY FOR MODULE + * + * This section contains comments describing changes made to the module. + * Notice that changes are listed in reverse chronological order. + * + * + * + * when who what, where, why + * -------- --- ------------------------------------------------------- + * 10/13/08 cz Initial version. + *==========================================================================*/ + +#include +#include "sys/sched.h" /* For struct sched_param */ +#include "sys/errno.h" /* error values */ +#include +#include +#include +#include +#include +#include "pthread_types.h" +#ifdef __cplusplus +extern "C" { +#endif + +/* the range of the set supported by the kernel data type used to represent CPU sets. */ +#define CONFIG_NR_CPUS QURT_THREAD_CFG_BITMASK_ALL + +#define UNIMPLEMENTED(FUNC, RETURNTYPE, ARGS) static inline RETURNTYPE FUNC ARGS { qurt_printf("Unimplemented: %s... 
exiting\n", __FUNCTION__); exit(1); }
+
+/** @brief Magic (non-portable) value for a stack's address to enable usage
+ of auto-stack feature (if available) */
+#define PTHREAD_AUTO_STACK_MAGIC_ADDR_NP ((void *)0xFFF)
+
+/** \details
+ * This provides the POSIX thread API.
+ *
+ */
+
+/** \defgroup pthread POSIX pthread API */
+/** \ingroup pthread */
+/** @{ */
+
+/** Compare Two Threads.
+ * Please refer to POSIX standard for details.
+ */
+static inline int pthread_equal(pthread_t t1, pthread_t t2)
+{
+    return (t1 == t2) ? 1 : 0;
+}
+
+/** Create Thread.
+ * Please refer to POSIX standard for details.
+ */
+int pthread_create(pthread_t * tid, const pthread_attr_t * attr, void *(*start)(void *), void *arg);
+
+/** Terminate Calling Thread.
+ * Please refer to POSIX standard for details.
+ */
+void pthread_exit(void *value_ptr);
+
+/** Wait for thread termination.
+ * Please refer to POSIX standard for details.
+ * @param thread [in] the thread to be joined
+ * @param value_ptr [out] the pointer of the exit status
+ */
+int pthread_join(pthread_t thread, void **value_ptr);
+
+/** Detach a joinable thread.
+ * Please refer to POSIX standard for details.
+ * @param id [in] id of the thread to be detached.
+ */
+int pthread_detach(pthread_t id);
+
+/** Dynamic package initialisation.
+ * Please refer to POSIX standard for details.
+ */
+int pthread_once(pthread_once_t *once_control, void (*init_routine)(void));
+
+pthread_t pthread_self(void);
+int pthread_cancel(pthread_t thread);
+static inline void pthread_yield(void)
+{
+    return;
+}
+
+int pthread_kill(pthread_t thread, int sig);
+
+/**
+ * @brief Return name of thread
+ * @warning Do not call this in the error handling path as it may cause deadlock
+ * due to underlying OS calls
+ * @param thread [in] thread Thread whose name is to be retrieved
+ * @param name [out] name Buffer used to return thread name
+ * @param len [in] len Number of bytes available in name
+ * @return 0 on success, ESRCH, ERANGE on failure
+ */
+extern int pthread_getname_np (pthread_t thread, char * name, size_t len);
+
+int pthread_getschedparam(pthread_t thread, int *restrict policy, struct sched_param *restrict param);
+int pthread_setschedparam(pthread_t thread, int policy, const struct sched_param *param);
+int pthread_setschedprio(pthread_t thread, int prio);
+int pthread_setcancelstate(int state, int *oldstate);
+int pthread_setcanceltype(int type, int *oldtype);
+
+/* Attribute functions */
+int pthread_attr_init(pthread_attr_t *attr);
+int pthread_attr_destroy(pthread_attr_t *attr);
+int pthread_attr_setschedparam(pthread_attr_t *restrict attr, const sched_param *restrict param);
+int pthread_attr_getschedparam(const pthread_attr_t *restrict attr, sched_param *restrict param);
+int pthread_attr_setstacksize(pthread_attr_t *attr, size_t stacksize);
+int pthread_attr_getstacksize(const pthread_attr_t *attr, size_t *stacksize);
+int pthread_attr_setstackaddr(pthread_attr_t *attr, void * stackaddr);
+int pthread_attr_getstackaddr(const pthread_attr_t *attr, void ** stackaddr);
+int pthread_attr_getdetachstate(const pthread_attr_t *attr, int *detachstate);
+int pthread_attr_setdetachstate(pthread_attr_t *attr, int detachstate);
+int pthread_attr_setstack(pthread_attr_t *attr, void *stackaddr, size_t stacksize);
+int pthread_attr_getstack(const pthread_attr_t *attr, void **stackaddr, size_t *stacksize);
+int pthread_attr_setscope(pthread_attr_t *attr, int scope);
+int pthread_attr_getscope(const pthread_attr_t *attr, int *scope);
+int
pthread_attr_setinheritsched(pthread_attr_t *attr, int inheritsched);
+int pthread_attr_getinheritsched(const pthread_attr_t *attr, int *inheritsched);
+int pthread_attr_getguardsize(const pthread_attr_t * attr, size_t * guardsize);
+int pthread_attr_setautostack(pthread_attr_t *attr);
+int pthread_attr_setbuspriority(pthread_attr_t *attr, unsigned short bus_priority);
+
+/* Qualcomm additions to pthread get/set attribute functions */
+int pthread_attr_setthreadname(pthread_attr_t *attr, const char * name);
+int pthread_attr_getthreadname(const pthread_attr_t *attr, char * name, int size);
+int pthread_attr_settimetestid(pthread_attr_t *attr, unsigned int tid);
+int pthread_attr_gettimetestid(const pthread_attr_t *attr, unsigned int* tid);
+
+/* Mutexes */
+int pthread_mutex_init(pthread_mutex_t *mutex, pthread_mutexattr_t *attr);
+int pthread_mutex_lock(pthread_mutex_t *mutex);
+int pthread_mutex_unlock(pthread_mutex_t *mutex);
+int pthread_mutex_trylock(pthread_mutex_t *mutex);
+int pthread_mutex_destroy(pthread_mutex_t *mutex);
+int pthread_mutex_getprioceiling(const pthread_mutex_t *restrict mutex, int *restrict prioceiling);
+int pthread_mutex_setprioceiling(pthread_mutex_t *restrict mutex, int prioceiling, int *restrict old_ceiling);
+
+/* For a mutex of type PTHREAD_MUTEX_NORMAL, Priority Inheritance is not
+ * supported even if PTHREAD_PRIO_INHERIT is defined, since QURT does not
+ * support this kind of mutex */
+int pthread_mutexattr_init(pthread_mutexattr_t *attr);
+int pthread_mutexattr_destroy(pthread_mutexattr_t *attr);
+int pthread_mutexattr_gettype(const pthread_mutexattr_t *restrict, int *restrict);
+int pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type);
+int pthread_mutexattr_getprotocol(const pthread_mutexattr_t *restrict, int *restrict);
+int pthread_mutexattr_setprotocol(pthread_mutexattr_t *attr, int protocol);
+int pthread_mutexattr_getpshared(const pthread_mutexattr_t *restrict, int *restrict);
+int pthread_mutexattr_setpshared(pthread_mutexattr_t *, int);
+int pthread_mutexattr_getprioceiling(const pthread_mutexattr_t *restrict attr, int *restrict prioceiling);
+int pthread_mutexattr_setprioceiling(pthread_mutexattr_t *attr, int prioceiling);
+
+/* Spinlocks */
+int pthread_spin_init(pthread_spinlock_t *lock, int pshared);
+int pthread_spin_destroy(pthread_spinlock_t *lock);
+int pthread_spin_lock(pthread_spinlock_t *lock);
+int pthread_spin_trylock(pthread_spinlock_t *lock);
+int pthread_spin_unlock(pthread_spinlock_t *lock);
+
+/* Condition variables */
+int pthread_condattr_init(pthread_condattr_t *attr);
+int pthread_condattr_destroy(pthread_condattr_t *attr);
+int pthread_condattr_setpshared(pthread_condattr_t *attr, int pshared);
+int pthread_condattr_getpshared(const pthread_condattr_t *restrict attr, int *restrict pshared);
+int pthread_condattr_setclock(pthread_condattr_t *attr, clockid_t clock);
+int pthread_condattr_getclock(const pthread_condattr_t *restrict attr, clockid_t *restrict clock);
+int pthread_cond_init(pthread_cond_t *cond, pthread_condattr_t *attr);
+int pthread_cond_destroy(pthread_cond_t *cond);
+int pthread_cond_signal(pthread_cond_t *cond);
+int pthread_cond_broadcast(pthread_cond_t *cond);
+int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex);
+int pthread_cond_timedwait(pthread_cond_t * cond, pthread_mutex_t * mutex, const struct timespec *time);
+
+/* Barriers */
+int pthread_barrier_init(pthread_barrier_t *restrict barrier, const pthread_barrierattr_t *restrict attr, unsigned count);
+int
pthread_barrier_destroy(pthread_barrier_t *barrier);
+int pthread_barrier_wait(pthread_barrier_t *barrier);
+int pthread_barrierattr_init(pthread_barrierattr_t *attr);
+int pthread_barrierattr_destroy(pthread_barrierattr_t *attr);
+int pthread_barrierattr_getpshared(const pthread_barrierattr_t *restrict attr, int *restrict pshared);
+
+
+/* Read-Write locks */
+int pthread_rwlock_init(pthread_rwlock_t *, const pthread_rwlockattr_t *);
+int pthread_rwlock_destroy(pthread_rwlock_t *);
+int pthread_rwlockattr_init(pthread_rwlockattr_t *);
+int pthread_rwlockattr_destroy(pthread_rwlockattr_t *);
+int pthread_rwlockattr_getpshared(const pthread_rwlockattr_t *, int *);
+int pthread_rwlockattr_setpshared(pthread_rwlockattr_t *, int);
+int pthread_rwlock_rdlock(pthread_rwlock_t *);
+int pthread_rwlock_tryrdlock(pthread_rwlock_t *);
+int pthread_rwlock_wrlock(pthread_rwlock_t *);
+int pthread_rwlock_trywrlock(pthread_rwlock_t *);
+int pthread_rwlock_unlock(pthread_rwlock_t *);
+
+
+/** Please refer to POSIX standard document.
+ */
+int pthread_barrierattr_setpshared(pthread_barrierattr_t *attr, int pshared);
+
+/** Set the CPU affinity attribute in a thread attributes object.
+
+ * @param attr [in] pthread attributes
+ * @param cpusetsize [in] The argument cpusetsize is the length (in bytes)
+                          of the buffer pointed to by cpuset. Typically,
+                          this argument would be specified as
+                          sizeof(cpu_set_t).
+ * @param cpuset [in] This data set is a bitset where each bit represents
+                      a CPU (hw thread). How the system's CPUs are mapped
+                      to bits in the bitset is system dependent.
+                      For the QURT kernel, bit 0 corresponds to hw
+                      thread 0, and so on. If the corresponding bit is
+                      set to 1, the software thread is eligible to run on
+                      that hw thread. 0x3f means it can run on any hw
+                      thread; 0x0 also means it can run on any hw thread.
+   @return On success, this function returns 0; on error, it returns a
+           non-zero error number.
+           EINVAL - cpuset specified a CPU that was outside the set supported
+                    by the kernel. (The kernel configuration option
+                    CONFIG_NR_CPUS defines the range of the set supported by
+                    the kernel data type used to represent CPU sets.)
+ * @note This function is a non-standard GNU extension; hence the suffix "_np"
+        (non-portable) in the name.
+ */
+int pthread_attr_setaffinity_np(pthread_attr_t *attr, size_t cpusetsize, const cpu_set_t *cpuset);
+
+/** Get the CPU affinity attribute from a thread attributes object.
+ * @param attr [in] pthread attributes
+ * @param cpusetsize [in] The argument cpusetsize is the length (in bytes)
+                          of the buffer pointed to by cpuset. Typically,
+                          this argument would be specified as
+                          sizeof(cpu_set_t).
+ * @param cpuset [out] This data set is a bitset where each bit represents
+                       a CPU (hw thread). How the system's CPUs are mapped
+                       to bits in the bitset is system dependent.
+                       For the QURT kernel, bit 0 corresponds to hw
+                       thread 0, and so on. If the corresponding bit is
+                       set to 1, the software thread is eligible to run on
+                       that hw thread. 0x3f means it can run on any hw
+                       thread; 0x0 also means it can run on any hw thread.
+   @return On success, this function returns 0; on error, it returns a
+           non-zero error number.
+           EINVAL - cpusetsize is smaller than the size of the affinity mask
+                    used by the kernel.
+ * @note This function is a non-standard GNU extension; hence the suffix "_np"
+        (non-portable) in the name.
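+ *
+ * An illustrative sketch (not part of the original header), pinning a
+ * hypothetical worker thread to hw threads 0 and 1:
+ * @code
+ *     pthread_attr_t attr;
+ *     cpu_set_t mask = 0x3;                  // bits 0 and 1 set
+ *     pthread_attr_init(&attr);
+ *     pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &mask);
+ *     // ... pass &attr to pthread_create(), then query it back:
+ *     pthread_attr_getaffinity_np(&attr, sizeof(cpu_set_t), &mask);
+ * @endcode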
+ */
+int pthread_attr_getaffinity_np(pthread_attr_t *attr, size_t cpusetsize, cpu_set_t *cpuset);
+
+/* TLS */
+int pthread_key_create(pthread_key_t *key, void (*destructor)(void*));
+int pthread_key_delete(pthread_key_t key);
+int pthread_setspecific(pthread_key_t key, const void *value);
+void *pthread_getspecific(pthread_key_t key);
+int pthread_getattr_np(pthread_t thread, pthread_attr_t * restrict attr);
+
+/** @} */
+
+/* Non-pthread callers call this function to create a pthread TCB without creating an actual thread */
+int pthread_fake(pthread_t * restrict thread, const pthread_attr_t * restrict attr);
+int pthread_fake_destroy(pthread_t thread);
+
+//amitkulk: move these to unistd.h after we move that header within qurt
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+void exit(int status);
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* QURT_PTHREAD_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/pthread_types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/pthread_types.h
new file mode 100755
index 0000000000000..51c3b9dbca243
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/pthread_types.h
@@ -0,0 +1,193 @@
+#ifndef _PTHREAD_TYPES_H_
+#define _PTHREAD_TYPES_H_
+
+/*==========================================================================
+ * FILE: pthread_types.h
+ *
+ * SERVICES: types used in the POSIX API interface
+ *
+ * DESCRIPTION: POSIX API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2016, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __GNUC__
+#define restrict __restrict__
+#else
+#define restrict
+#endif
+
+#define _SSIZE_T
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+#define PTHREAD_MAX_THREADS 512U
+
+#define PTHREAD_NAME_LEN 16
+#define PTHREAD_MIN_STACKSIZE 512 //4096
+#define PTHREAD_MAX_STACKSIZE 1048576
+#define PTHREAD_DEFAULT_STACKSIZE 16384
+
+#define PTHREAD_STACK_MIN (4096U*2U)
+#define PTHREAD_MIN_PRIORITY 0U
+#define PTHREAD_MAX_PRIORITY 255U
+#define PTHREAD_DEFAULT_PRIORITY 1
+
+/* Mutex initialization status */
+#define PTHREAD_MUTEX_ATTR_UNINITIALIZED 0
+#define PTHREAD_MUTEX_ATTR_INITIALIZED 1
+
+/* Condition attributes initialization status */
+#define PTHREAD_COND_ATTR_UNINITIALIZED 0
+#define PTHREAD_COND_ATTR_INITIALIZED 1
+
+#define PTHREAD_DEFAULT_NAME "Anonymous"
+
+#define PTHREAD_MUTEX_INITIALIZER ((pthread_mutex_t) 0xFFFFFFFFU)
+
+#define PTHREAD_COND_INITIALIZER ((pthread_cond_t) 0xFFFFFFFFU)
+
+/* mutex and cond_var shared */
+#define PTHREAD_PROCESS_PRIVATE 0
+#define PTHREAD_PROCESS_SHARED 1
+
+/* mutex type */
+#define PTHREAD_MUTEX_ERRORCHECK 0
+#define PTHREAD_MUTEX_NORMAL 1
+#define PTHREAD_MUTEX_RECURSIVE 2
+#define PTHREAD_MUTEX_DEFAULT 3
+
+/* mutex protocol */
+#define PTHREAD_PRIO_NONE 0
+#define PTHREAD_PRIO_INHERIT 1
+#define PTHREAD_PRIO_PROTECT 2
+
+#define PTHREAD_SPINLOCK_UNLOCKED 0
+#define PTHREAD_SPINLOCK_LOCKED 1
+
+#define PTHREAD_ONCE_INIT (0)
+
+#define PTHREAD_MUTEX_OPAQUE //ToDo: amitkulk: debug
+
+typedef signed int ssize_t;
+
+/* detachstate of a pthread */
+#define PTHREAD_CREATE_JOINABLE 1
+#define PTHREAD_CREATE_DETACHED 0
+
+/* contention scope */
+#define PTHREAD_SCOPE_PROCESS 1
+#define PTHREAD_SCOPE_SYSTEM 0
+
+/* scheduler */
+#define PTHREAD_INHERIT_SCHED 1
+#define PTHREAD_EXPLICIT_SCHED 0 + +/* + * Types and structure definitions + * + */ +typedef unsigned int cpu_set_t; + +typedef unsigned int pthread_t; + +typedef struct pthread_attr_t +{ + void *stackaddr; + int internal_stack; /* this flag==1 means the stack needs to be freed by posix */ + size_t stacksize; + int priority; + unsigned short timetest_id; + /* This flag indicate if thread will be autostack thread*/ + unsigned short autostack:1; + /* This flag is to indicate thread's bus_priority high/low + bus_priority = 0 -- Bus_priority is low + bus_priority = 1 -- Bus_priority is high + bus_priority = 3 -- Bus_priority is default (takes the default set for the process) + */ + unsigned short bus_priority:2; + unsigned short reserved:13; + cpu_set_t cpumask; + char name[PTHREAD_NAME_LEN]; + /* This flag indicates whether pthread lib should create thread contexts for other OSALs */ + /* This is used internally by POSIX and not available for general usage */ + int ext_context; + int detachstate; +} pthread_attr_t; + +//mutex attr +typedef struct pthread_mutexattr_t pthread_mutexattr_t; +struct pthread_mutexattr_t +{ + int is_initialized; + int type; + int pshared; + int protocol; +}; + +typedef unsigned int pthread_mutex_t; + +typedef unsigned int pthread_spinlock_t; + +typedef struct pthread_condattr_t +{ + int is_initialized; + int pshared; + clockid_t clock_id; +} pthread_condattr_t; + +typedef unsigned int pthread_cond_t; + +typedef struct pthread_barrierattr_t +{ + int is_initialized; + int pshared; +} pthread_barrierattr_t; + +typedef unsigned int pthread_barrier_t; + +typedef int pthread_key_t; + +typedef int pthread_once_t; + + +/*Read-Write locks*/ +#define PTW32_RWLOCK_MAGIC 0xfacade2 +#define PTHREAD_RWLOCK_INITIALIZER ((pthread_rwlock_t)(size_t) -1) + +struct pthread_rwlockattr_t_ +{ + int pshared; +}; + +struct pthread_rwlock_t_ +{ + pthread_mutex_t mtxExclusiveAccess; + pthread_mutex_t mtxSharedAccessCompleted; + pthread_cond_t cndSharedAccessCompleted; + int nSharedAccessCount; + int nExclusiveAccessCount; + int nCompletedSharedAccessCount; + int nMagic; +}; + +typedef struct pthread_rwlock_t_ * pthread_rwlock_t; +typedef struct pthread_rwlockattr_t_ * pthread_rwlockattr_t; +#ifdef __cplusplus +} +#endif + +#endif /* _PTHERAD_TYPES_H_ */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sched.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sched.h new file mode 100755 index 0000000000000..faf3365be9f82 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sched.h @@ -0,0 +1,21 @@ +/*============================================================================= + + sched.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. 
+=============================================================================*/ +#ifndef __SCHED_H__ +#define __SCHED_H__ + +#include "sys/sched.h" + +#endif //__SCHED_H__ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/semaphore.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/semaphore.h new file mode 100755 index 0000000000000..d9145b295ae62 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/semaphore.h @@ -0,0 +1,114 @@ +#ifndef SEMAPHORE_H +#define SEMAPHORE_H + +/*========================================================================== + * FILE: semaphore.h + * + * SERVICES: POSIX semaphore API interface + * + * DESCRIPTION: POSIX semaphore API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ +#include // Get all C sys types - includes POSIX specific +#include "sys/errno.h" // error values + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** User facing semaphore container with opaque pointer to implementation */ +typedef struct +{ + unsigned int *opaque; +} sem_t; +#define _SEM_T + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/* constant definitions */ +#define SEM_FAILED ((sem_t*) 0) + +/* @todo siqbal Should we put such configuration items in a common place + instead of this user-facing header? */ +#define SEM_VALUE_MAX ((unsigned int) 30) // If need be increase this + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/** \details + * POSIX standard comes with two kinds of semaphores: named and unnamed + * semaphores. + * + * This implementation of POSIX kernel API provide unnamed & named semaphore. + * + * + * sem_timedwait() is not provided. + */ + +/** \defgroup semaphore POSIX Semaphore API */ + +/** \ingroup semaphore */ +/** @{ */ + +/** Initialize an unnamed semaphore. + * Please refer to POSIX standard for details. + * @param pshared [in] This implementation does not support non-zero value, + * i.e., semaphore cannot be shared between processes in this implementation. + */ +int sem_init(sem_t *sem, int pshared, unsigned int value); + +/** Lock a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_wait(sem_t *sem); + +/** Lock a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_trywait(sem_t *sem); + +/** Unlock a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_post(sem_t *sem); + +/** Get the value of a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_getvalue(sem_t *sem, int *value); + +/** Destroy an unnamed semaphore. + * Please refer to POSIX standard for details. + */ +int sem_destroy(sem_t *sem); + +/** creates and initializes a named semaphore. + * Please refer to POSIX standard for details. + */ +sem_t * sem_open(const char* name , int oflag , ...); + +/** closes a semaphore. + * Please refer to POSIX standard for details. 
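+ *
+ * An illustrative sketch of the unnamed-semaphore calls above (not part of
+ * the original header):
+ * @code
+ *     sem_t s;
+ *     sem_init(&s, 0, 1);    // pshared must be 0 in this implementation
+ *     sem_wait(&s);
+ *     // ... critical section ...
+ *     sem_post(&s);
+ *     sem_destroy(&s);
+ * @endcode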
+ */
+int sem_close(sem_t *sem);
+
+/** unlinks a named semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_unlink(const char *name);
+/** @} */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SEMAPHORE_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/signal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/signal.h
new file mode 100755
index 0000000000000..35cb1f1a9a319
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/signal.h
@@ -0,0 +1,201 @@
+#ifndef _SIGNAL_H_
+#define _SIGNAL_H_
+
+/*==========================================================================
+ * FILE: signal.h
+ *
+ * SERVICES: POSIX Signal API interface
+ *
+ * DESCRIPTION: POSIX Signal API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016, 2023 Qualcomm Technologies, Inc.
+ * All Rights Reserved.
+ * Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+ *==========================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* POSIX signal bits */
+
+#define POSIX_MSG 7 /* POSIX msg type used in Qube API */
+#define POSIX_NOTIF 8 /* POSIX msg type used in Qube API */
+#define SIGKILL 9 /* kill (cannot be caught or ignored) */
+
+#define SIGRTMIN 10
+#define SIGRTMAX 32
+
+/* Notification Types. */
+/* No asynchronous notification is delivered when the event of interest occurs. */
+#define SIGEV_NONE 0
+/* The signal specified in sigev_signo shall be generated for the process when
+   the event of interest occurs. */
+#define SIGEV_SIGNAL 1
+/* A notification function is called to perform notification. */
+#define SIGEV_THREAD 2
+#define SA_SIGINFO 1
+
+/*
+ * Flags for sigprocmask:
+ */
+#define SIG_BLOCK 1 /* block specified signal set */
+#define SIG_UNBLOCK 2 /* unblock specified signal set */
+#define SIG_SETMASK 3 /* set specified signal set */
+
+typedef unsigned long int sigset_t;
+
+union sigval
+{
+    int sival_int; /* Integer signal value. */
+    void *sival_ptr; /* Pointer signal value. */
+};
+
+typedef struct sigevent sigevent;
+struct sigevent
+{
+    int sigev_notify; /* Notification type. */
+    int sigev_signo; /* Signal number. */
+    union sigval sigev_value; /* Signal value. */
+    void (*sigev_notify_function)(union sigval); /* Notification function. */
+    pthread_attr_t *sigev_notify_attributes;
+};
+
+typedef struct siginfo_t siginfo_t;
+struct siginfo_t
+{
+    int si_signo;
+    int si_code;
+    union sigval si_value;
+/*  int si_errno;
+    pid_t si_pid;
+    uid_t si_uid;
+    void *si_addr;
+    int si_status;
+    long si_band;*/
+};
+struct sigaction
+{
+    void (*sa_handler)(int);
+    sigset_t sa_mask;
+    int sa_flags;
+    void (*sa_sigaction)(int, siginfo_t *, void *);
+};
+
+/* Signal functions */
+
+/** \details
+ * This provides the POSIX Signal API. Please note that this
+ * implementation does not fully comply with the POSIX standard.
+ *
+ * In the POSIX standard, a signal can be used as an 'interrupt', which means
+ * an incoming signal will interrupt a running thread. After the
+ * registered signal handler is executed, the thread will resume.
+ * This behavior cannot be implemented without modifying the L4 or QURT kernel.
+ * On the other hand, applications need to be carefully written to avoid
+ * problems caused by 'interrupting' signals.
+ * + * Therefore, in this implementation of POSIX signal, thread will + * only receive signals when it explicitly waits for signals, i.e., when + * the thread calls either sigwait() or sigsuspend(). + * + * Therefore, pthread_sigmask(), which set or get signal mask for a thread, + * is not supported, since the signal mask will be set by sigwait() and + * sigsuspend(). + * + * Since this implementation of POSIX kernel API is a subset of PSE51, + * only threads can send and receive signals. The functions related to + * signal operations with processes, such as kill(), sigqueue(), + * sigprocmask(), are not provided. + * + * Queued signal is not supported. + * + * Applications will use signals from SIGRTMIN to SIGRTMAX. + * + * SIGEV_SIGNAL and SIGEV_THREAD are supported. SIGEV_NONE is not + * supported. + * + */ + +/** \defgroup signal POSIX Signal API */ +/** \ingroup signal */ +/** @{ */ + +/** Wait for signals. This implementation does not support queued signals. + * + * Please refer to POSIX standard for details. + */ +int sigwait(const sigset_t *restrict set, int *restrict sig); + +/** Examine and Change Signal Action. + * Please refer to POSIX standard for details. + * + * @param act [in] A pointer to the sigaction structure that describes the + * action to be taken for the signal. Can be NULL. + * The following flags for sa_flags field in struct sigaction are not + * supported: SA_NOCLDSTOP, SA_ONSTACK, SA_RESETHAND, SA_RESTART, + * SA_NOCLDWAIT and SA_NODEFER. Only flag SA_SIGINFO is supported. + * + * @note Define sigaction as macro to avoid a warning when included from + * C++ code - it's causing a "sigaction(...) hides constructor for + * 'struct sigaction'" warning. + */ +/*lint -esym(123,sigaction) Suppress "macro used with no arguments" */ +#define sigaction(sig,act,oact) _sigaction((sig),(act),(oact)) + +/** Wait for signals. + * Please refer to POSIX standard for details. + */ +int sigsuspend(const sigset_t *sigmask); + +/** Add Signal to Signal Set. + * Please refer to POSIX standard for details. + */ +int sigaddset(sigset_t *set, int signo); + +/** Delete Signal from Signal Set. + * Please refer to POSIX standard for details. + */ +int sigdelset(sigset_t *set, int signo); + +/** Initialize and Empty Signal Set. + * Please refer to POSIX standard for details. + */ +int sigemptyset(sigset_t *set); + +/** Initialize and Fill Signal Set. + * Please refer to POSIX standard for details. + */ +int sigfillset(sigset_t *set); + +/** Test for Signal in Signal Set. + * Please refer to POSIX standard for details. + */ +int sigismember(const sigset_t *set, int signo); + +/** @} */ + +/* this is not a public api function */ +int _sigaction(int sig, const struct sigaction *act, struct sigaction *oact); + +/* have to move #include here to solve circular include problems between time.h and signal.h */ +#include + +/** Wait for the time interval specified in the timespec structure referenced + * by timeout. This implementation does not support queued signals. + * For struct siginfo_t, si_code and si_value are ignored in this implementation. + * + * Please refer to POSIX standard for details. 
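+ *
+ * An illustrative sketch (not part of the original header), waiting up to
+ * one second for an application signal using the set operations above:
+ * @code
+ *     sigset_t set;
+ *     siginfo_t info;
+ *     struct timespec ts = {1, 0};
+ *     sigemptyset(&set);
+ *     sigaddset(&set, SIGRTMIN);
+ *     sigtimedwait(&set, &info, &ts);
+ * @endcode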
+ */ +int sigtimedwait(const sigset_t *restrict set, siginfo_t *restrict info, + const struct timespec *restrict timeout); + +#ifdef __cplusplus +} +#endif + +#endif /* _POSIX_SIGNAL_H_ */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sys/errno.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sys/errno.h new file mode 100755 index 0000000000000..b9edf57bab6c3 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sys/errno.h @@ -0,0 +1,20 @@ +#ifndef _SYS_ERRNO_H_ +#define _SYS_ERRNO_H_ + +/*========================================================================== + * FILE: errno.h + * + * SERVICES: POSIX errno header file + * + * DESCRIPTION: POSIX errno based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ + +#include +#ifndef EOK +#define EOK 0 +#endif + +#endif /* _SYS_ERRNO_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sys/sched.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sys/sched.h new file mode 100755 index 0000000000000..2acc34d821725 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sys/sched.h @@ -0,0 +1,67 @@ +#ifndef _POSIX_SCHED_H_ +#define _POSIX_SCHED_H_ + +/*========================================================================== + * FILE: sched.c + * + * SERVICES: POSIX Thread sched API interface + * + * DESCRIPTION: POSIX Thread sched API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + + *==========================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define SCHED_FIFO 0 /* First in, first out (FIFO) scheduling policy. */ +#define SCHED_RR 1 /* Round robin scheduling policy. */ +#define SCHED_SPORADIC 2 /* Sporadic server scheduling policy. */ +#define SCHED_OTHER 3 /* Another scheduling policy. */ + +typedef struct sched_param sched_param; +struct sched_param +{ + void *unimplemented; + int sched_priority; +}; + +/** \details + * This provides POSIX sched API. + */ + +/** \defgroup sched POSIX sched API */ +/** \ingroup sched */ +/** @{ */ + +/** Relinquish the CPU. + * Please refer to POSIX standard for details. + */ +static inline int sched_yield(void) +{ + return 0; +} + +/** Get the maximum priority. + * Please refer to POSIX standard for details. + * @param policy [in] SCHED_FIFO is the only valid input for this implementation. + */ +int sched_get_priority_max(int policy); + +/** Get the minimum priority. + * Please refer to POSIX standard for details. + * @param policy [in] SCHED_FIFO is the only valid input for this implementation. 
+ */
+int sched_get_priority_min(int policy);
+
+/** @} */
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _POSIX_SCHED_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sys/types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sys/types.h
new file mode 100755
index 0000000000000..700026f9f9e4e
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/sys/types.h
@@ -0,0 +1,35 @@
+#ifndef _SYS_TYPES_H_
+#define _SYS_TYPES_H_
+
+/*==========================================================================
+ * FILE: types.h
+ *
+ * SERVICES: types used in the POSIX API interface
+ *
+ * DESCRIPTION: POSIX API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#if !defined( _PID_T ) || !defined( __pid_t_defined )
+/* POSIX defines pid_t as a signed 32-bit type. The Hexagon toolchain's header
+   defines it as an unsigned 32-bit type, citing a conflict with QuRT POSIX
+   compatibility later. If any such conflicts exist, we should fix them.
+   pid_t is being defined *BEFORE* inclusion of generic/sys/types.h
+   *INTENTIONALLY* to fix this */
+typedef int pid_t;
+#define _PID_T
+#define __pid_t_defined
+#endif
+#include
+#include
+#include
+#include
+
+#ifndef __DEFINED_off_t
+typedef long off_t;
+#define __DEFINED_off_t
+#endif
+
+#endif /* _SYS_TYPES_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/time.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/time.h
new file mode 100755
index 0000000000000..13aeb1ea9920d
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/posix/time.h
@@ -0,0 +1,142 @@
+#ifndef _POSIX_TIME_H_
+#define _POSIX_TIME_H_
+
+/*==========================================================================
+ * FILE: time.h
+ *
+ * SERVICES: POSIX Timer API interface
+ *
+ * DESCRIPTION: POSIX Timer API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+ *==========================================================================*/
+
+
+#include
+
+typedef int clockid_t; /* ignored */
+#define _CLOCKID_T
+#define _PROVIDE_POSIX_TIME_DECLS 1
+#include
+/* @todo anandj sys/time.h has a definition for struct timeval but is not
+   included by generic/time.h */
+#include
+
+#define CLOCK_FREQ_NOT_DEFINED -1
+/* Frequency of the sclk used */
+#define TIME_CONV_SCLK_FREQ 19200000
+
+#define RES_CONV_FACTOR1 1
+#define RES_CONV_FACTOR2 1000000000
+
+#if !defined(CLOCK_REALTIME)
+# define CLOCK_REALTIME 0
+#endif
+
+#if !defined(CLOCK_MONOTONIC)
+# define CLOCK_MONOTONIC 1
+#endif
+
+#if !defined(CLOCK_THREAD_CPUTIME_ID)
+# define CLOCK_THREAD_CPUTIME_ID 2
+#endif
+
+#if !defined(CLOCK_PROCESS_CPUTIME_ID)
+# define CLOCK_PROCESS_CPUTIME_ID 3
+#endif
+
+#if !defined(CLOCK_MONOTONIC_RAW)
+# define CLOCK_MONOTONIC_RAW 4
+#endif
+
+#if !defined(CLOCK_REALTIME_COARSE)
+# define CLOCK_REALTIME_COARSE 5
+#endif
+
+#if !defined(CLOCK_MONOTONIC_COARSE)
+# define CLOCK_MONOTONIC_COARSE 6
+#endif
+
+#if !defined(CLOCK_BOOTTIME)
+# define CLOCK_BOOTTIME 7
+#endif
+
+struct itimerspec
+{
+    struct timespec it_interval; /* Timer period. */
+    struct timespec it_value; /* Timer expiration.
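+       Illustrative note, not part of the original header: a one-shot
+       timer uses it_interval = {0,0} and it_value = the first expiry,
+       while a periodic timer sets both, e.g.
+
+           struct itimerspec its = { {1, 0}, {1, 0} };  // first after 1 s, then every 1 s
+           timer_settime(tid, 0, &its, 0);              // tid from timer_create()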
*/ +}; + +/* have to move #include here to solve circular include problems between time.h and signal.h */ +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Timer functions */ + +/** \details + * POSIX timers can be either of two types: a one-shot type or a periodic + * type. + * + * A one-shot is an armed timer that is set to an expiration time relative + * to either a current time or an absolute time. The timer expires once and + * is disarmed. + * + * A periodic timer is armed with an initial expiration time and a repetition + * interval. Every time the interval timer + * expires, the timer is reloaded with the repetition interval. The timer + * is then rearmed. + */ + +/** \defgroup timer POSIX Timer API */ + +/** \ingroup timer */ +/** @{ */ + +/** Create a POSIX timer. + * Please refer to POSIX standard for details. + * @param clockid [in] ignored in this implementation + * @param evp [in] if non-NULL, points to a sigevent structure. This + * structure, allocated by the application, defines the asynchronous + * notification to occur when the timer expires. If the evp argument is + * NULL, the effect is as if the evp argument pointed to a sigevent + * structure with the sigev_notify member having the value SIGEV_SIGNAL, + * the sigev_signo having a default signal number (SIGALRM), and the + * sigev_value member having the value of the timer ID. + */ +int timer_create(clockid_t clockid, struct sigevent *restrict evp, + timer_t *restrict timerid); + +/** Delete a POSIX timer. + * Please refer to POSIX standard for details. + */ +int timer_delete(timer_t timerid); + +/** Get the time remaining on a POSIX timer. + * Please refer to POSIX standard for details. + */ +int timer_gettime(timer_t timerid, struct itimerspec *value); + + +/** Set the time remaining on a POSIX timer. + * Please refer to POSIX standard for details. + * @param flags [in] ignored in this implementation + */ +int timer_settime(timer_t timerid, int flags, + const struct itimerspec *restrict value, + struct itimerspec *restrict ovalue); +/** Obtain ID of a process CPU-time clock + * @param pid [in] Process ID + * @param clock_id [out] Clock ID + * @return Error values as per POSIX standard + */ +int clock_getcpuclockid (pid_t pid, clockid_t * clock_id); +/** @} */ + +#ifdef __cplusplus +} +#endif + +#endif /* _POSIX_TIME_H_ */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qube/qube.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qube/qube.h new file mode 100755 index 0000000000000..1e31e2deedb38 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qube/qube.h @@ -0,0 +1,51 @@ +#ifndef QUBE_H +#define QUBE_H +/*============================================================================= + + qube.h -- H E A D E R F I L E + +GENERAL DESCRIPTION + Prototypes of qpd API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ +=============================================================================*/ + + + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* Define Error codes as QuRT error codes preceed with QURT_ */ +#ifndef EOK +#define EOK QURT_EOK +#endif /* EOK */ +#ifndef EVAL +#define EVAL QURT_EVAL +#endif /* EVAL */ +#ifndef EMEM +#define EMEM QURT_EMEM +#endif /* EMEM */ +#ifndef EINVALID +#define EINVALID QURT_EINVALID +#endif /* EINVALID */ + + +/*============================================================================= + FUNCTION DECLARATIONS +=============================================================================*/ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QUBE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/atomic_ops.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/atomic_ops.h new file mode 100755 index 0000000000000..0a9a9f8ba7db5 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/atomic_ops.h @@ -0,0 +1,197 @@ +#ifndef ATOMIC_OPS_H +#define ATOMIC_OPS_H +/** + @file atomic_ops.h + + @brief Type definitions backwards compatible. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. +=============================================================================*/ + + +/* + * Australian Public Licence B (OZPLB) + * + * Version 1-0 + * + * Copyright (c) 2007, Open Kernel Labs, Inc. + * + * All rights reserved. + * + * Developed by: Embedded, Real-time and Operating Systems Program (ERTOS) + * National ICT Australia + * http://www.ertos.nicta.com.au + * + * Permission is granted by National ICT Australia, free of charge, to + * any person obtaining a copy of this software and any associated + * documentation files (the "Software") to deal with the Software without + * restriction, including (without limitation) the rights to use, copy, + * modify, adapt, merge, publish, distribute, communicate to the public, + * sublicense, and/or sell, lend or rent out copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject + * to the following conditions: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimers in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of National ICT Australia, nor the names of its + * contributors, may be used to endorse or promote products derived + * from this Software without specific prior written permission. + * + * EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT + * PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS", AND + * NATIONAL ICT AUSTRALIA AND ITS CONTRIBUTORS MAKE NO REPRESENTATIONS, + * WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS + * REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, + * THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF + * ERRORS, WHETHER OR NOT DISCOVERABLE. 
+ * + * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL + * NATIONAL ICT AUSTRALIA OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL + * THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION OF CONTRACT, + * NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER + * LIABILITY, INCLUDING (WITHOUT LIMITATION) LOSS OF PRODUCTION OR + * OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS + * OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR + * OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT, + * CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN + * CONNECTION WITH THIS LICENCE, THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS WITH THE SOFTWARE, EVEN IF NATIONAL ICT AUSTRALIA OR ITS + * CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS, + * DAMAGES OR OTHER LIABILITY. + * + * If applicable legislation implies representations, warranties, or + * conditions, or imposes obligations or liability on National ICT + * Australia or one of its contributors in respect of the Software that + * cannot be wholly or partly excluded, restricted or modified, the + * liability of National ICT Australia or the contributor is limited, to + * the full extent permitted by the applicable legislation, at its + * option, to: + * a. in the case of goods, any one or more of the following: + * i. the replacement of the goods or the supply of equivalent goods; + * ii. the repair of the goods; + * iii. the payment of the cost of replacing the goods or of acquiring + * equivalent goods; + * iv. the payment of the cost of having the goods repaired; or + * b. in the case of services: + * i. the supplying of the services again; or + * ii. the payment of the cost of having the services supplied again. + * + * The construction, validity and performance of this licence is governed + * by the laws in force in New South Wales, Australia. + */ + +/* + * Author: Malcolm Purvis + * Author: Carlos Dyonisio + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef unsigned int atomic_plain_word_t; + +/*-------------------------------------------------------------------------*/ + /* Atomic Ops API. */ + +/* + * IMPORTANT! + * If you plan to change the structure atomic_word_t, please add the new + * elements after value. For more information, read the comment in + * arch/arm/libs/atomic_ops/v5/src/arm_atomic_ops.spp:66 + */ + +typedef struct { + volatile atomic_plain_word_t value; +} atomic_word_t; + +#define ATOMIC_INIT(i) { (i) } + +static inline void +atomic_init(atomic_word_t *a, atomic_plain_word_t v) +{ + a->value = v; +} + +#if defined(ARCH_ARM) && defined(ARCH_VER) && (ARCH_VER < 6) && \ + (!defined(__ATOMIC_OPS_IN_KERNEL__) || defined(MACHINE_SMP)) + +/* + * If it is ARMv4/v5, the function declarations may change + * and are defined in the arch specific header file, + * as some of then cannot be declared static because of + * the assembler implementation. + */ + +#else + +/* Arithmetic operations. */ + +void atomic_sub(atomic_word_t *target, atomic_plain_word_t v); + +/* Architecture independent definitions. 
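+
+   Illustrative sketch (not part of the original header):
+
+       static atomic_word_t counter = ATOMIC_INIT(0);
+       atomic_init(&counter, 5u);                       // overwrite the stored value
+       atomic_plain_word_t v = atomic_read(&counter);   // v == 5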
*/ + +static inline atomic_plain_word_t atomic_read(atomic_word_t *target) +{ + return target->value; +} + +typedef unsigned long long atomic64_plain_word_t; + +typedef struct { + volatile atomic64_plain_word_t value; +} atomic64_word_t; + +static inline void +atomic64_init(atomic64_word_t *a, atomic64_plain_word_t v) +{ + a->value = v; +} + +/********************* + Support 64-bit + *********************/ + +atomic64_plain_word_t atomic64_set(atomic64_word_t* target, + atomic64_plain_word_t value); + +void atomic64_xor(atomic64_word_t* target, + atomic64_plain_word_t mask); + +/*---------------------------------------------------------------------------*/ + +/* Architecture independent definitions. */ + +static inline atomic64_plain_word_t atomic64_read(atomic64_word_t *target) +{ + return target->value; +} + +#endif + + +/* Architecture dependent definitions. */ +#include + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* ATOMIC_OPS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/atomic_ops_plat.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/atomic_ops_plat.h new file mode 100755 index 0000000000000..b54b3ff83d978 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/atomic_ops_plat.h @@ -0,0 +1,86 @@ +#ifndef ATOMIC_OPS_PLAT_H +#define ATOMIC_OPS_PLAT_H +/** + @file atomic_ops_plat.h + + @brief Prototypes of atomic operations API backwards compatible. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. +=============================================================================*/ + + +#include + +#ifdef __cplusplus +extern "C" { +#endif +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define atomic_set(a,b) qurt_atomic_set((unsigned int *)(a),(unsigned int)(b)) +#define atomic_and(a,b) qurt_atomic_and((unsigned int *)(a),(unsigned int)(b)) +#define atomic_and_return(a,b) qurt_atomic_and_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_or(a,b) qurt_atomic_or((unsigned int *)(a),(unsigned int)(b)) +#define atomic_or_return(a,b) qurt_atomic_or_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_xor(a,b) qurt_atomic_xor((unsigned int *)(a),(unsigned int)(b)) +#define atomic_xor_return(a,b) qurt_atomic_xor_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_set_bit(a,b) qurt_atomic_set_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_clear_bit(a,b) qurt_atomic_clear_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_change_bit(a,b) qurt_atomic_change_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add(a,b) qurt_atomic_add((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add_return(a,b) qurt_atomic_add_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add_unless(a,b,c) qurt_atomic_add_unless((unsigned int *)(a),(unsigned int)(b),(unsigned int)(c)) +#define atomic_sub(a,b) qurt_atomic_sub((unsigned int *)(a),(unsigned int)(b)) +#define atomic_sub_return(a,b) qurt_atomic_sub_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_inc(a) qurt_atomic_inc((unsigned int *)(a)) +#define atomic_inc_return(a) qurt_atomic_inc_return((unsigned int *)(a)) +#define atomic_dec(a) qurt_atomic_dec((unsigned 
int *)(a)) +#define atomic_dec_return(a) qurt_atomic_dec_return((unsigned int *)(a)) +#define atomic_compare_and_set(a,b,c) qurt_atomic_compare_and_set((unsigned int *)(a),(unsigned int)(b),(unsigned int)(c)) +#define atomic_barrier qurt_atomic_barrier +#define atomic_barrier_write qurt_atomic_barrier_write +#define atomic_barrier_write_smp qurt_atomic_barrier_write_smp +#define atomic_barrier_read_smp qurt_atomic_barrier_read_smp +#define atomic_barrier_smp qurt_atomic_barrier_smp + +/*============================ + * 64 bits support + *============================ */ +#define atomic64_set(a,b) qurt_atomic64_set((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_and(a,b) qurt_atomic64_and((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_and_return(a,b) qurt_atomic64_and_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_or(a,b) qurt_atomic64_or((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_or_return(a,b) qurt_atomic64_or_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_xor(a,b) qurt_atomic64_xor((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_xor_return(a,b) qurt_atomic64_xor_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_set_bit(a,b) qurt_atomic64_set_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_clear_bit(a,b) qurt_atomic64_clear_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_change_bit(a,b) qurt_atomic64_change_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_add(a,b) qurt_atomic64_add((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_add_return(a,b) qurt_atomic64_add_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_sub(a,b) qurt_atomic64_sub((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_sub_return(a,b) qurt_atomic64_sub_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_inc(a) qurt_atomic64_inc((unsigned long long *)(a)) +#define atomic64_inc_return(a) qurt_atomic64_inc_return((unsigned long long *)(a)) +#define atomic64_dec(a) qurt_atomic64_dec((unsigned long long *)(a)) +#define atomic64_dec_return(a) qurt_atomic64_dec_return((unsigned long long *)(a)) +#define atomic64_compare_and_set(a,b,c) qurt_atomic64_compare_and_set((unsigned long long *)(a),(unsigned long long )(b),(unsigned long long )(c)) +#define atomic64_barrier qurt_atomic64_barrier +#define atomic64_barrier_write qurt_atomic64_barrier_write +#define atomic64_barrier_write_smp qurt_atomic64_barrier_write_smp +#define atomic64_barrier_read_smp qurt_atomic64_barrier_read_smp +#define atomic64_barrier_smp qurt_atomic64_barrier_smp + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* ATOMIC_OPS_PLAT_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt.h new file mode 100755 index 0000000000000..4d25c9b2b6243 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt.h @@ -0,0 +1,111 @@ +#ifndef QURT_H +#define QURT_H + +/** + @file qurt.h + @brief Contains kernel header files that provide kernel OS API functions, constants, and + definitions + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013,2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ +/*====================================================================== + * + * EDIT HISTORY FOR FILE + * + * This section contains comments describing changes made to the + * module. Notice that changes are listed in reverse chronological + * order. + * + * + * + * + * when who what, where, why + * ---------- --- ------------------------------------------------ + * 2011-02-25 op Add Header file + 2012-12-16 cm (Tech Pubs) Edited/added Doxygen comments and markup. + ======================================================================*/ + + +#ifdef __cplusplus +extern "C" { +#endif + +#include "qurt_consts.h" +#include "qurt_api_version.h" +#include "qurt_alloc.h" +#include "qurt_futex.h" +#include "qurt_mutex.h" +#include "qurt_pipe.h" +#include "qurt_printf.h" +#include "qurt_assert.h" +#include "qurt_thread.h" +#include "qurt_trace.h" +#include "qurt_cycles.h" +#include "qurt_profile.h" +#include "qurt_sem.h" +#include "qurt_cond.h" +#include "qurt_barrier.h" +#include "qurt_fastint.h" +#include "qurt_allsignal.h" +#include "qurt_anysignal.h" +#include "qurt_signal.h" +#include "qurt_rmutex.h" +#include "qurt_pimutex.h" +#include "qurt_signal2.h" +#include "qurt_rmutex2.h" +#include "qurt_pimutex2.h" +#include "qurt_int.h" +#include "qurt_lifo.h" +#include "qurt_power.h" +#include "qurt_event.h" +#include "qurt_pmu.h" +#include "qurt_stid.h" +//#include "qurt_version.h" +#include "qurt_tlb.h" +#include "qurt_vtlb.h" +#include "qurt_memory.h" +#include "qurt_qdi.h" +#include "qurt_sclk.h" +#include "qurt_space.h" +#include "qurt_process.h" +#include "qurt_timer.h" +#include "qurt_tls.h" +#include "qurt_thread_context.h" +#include "qurt_hvx.h" +#include "qurt_hmx.h" +#include "qurt_mailbox.h" +#include "qurt_island.h" +#include "qurt_qdi_proxy.h" +#include "qurt_l2cfg.h" +#include "qurt_mmap.h" +#include "qurt_isr.h" +#include "qurt_busywait.h" +#include "qurt_ecc.h" +#include "qurt_callback.h" +#include "qurt_error.h" +#include "qurt_except.h" +#include "qurt_mq.h" +#include "qurt_user_dma.h" +#include "qurt_fs_hub.h" +#include "qurt_os_services.h" + +#ifndef MAIN_ONLY +#define INCLUDE_ISLAND_CONTENTS +#endif +#ifndef ISLAND_ONLY +#define INCLUDE_MAIN_CONTENTS +#endif + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_alloc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_alloc.h new file mode 100755 index 0000000000000..da37a4c0a714e --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_alloc.h @@ -0,0 +1,145 @@ +#ifndef QURT_ALLOC_H +#define QURT_ALLOC_H + +/** + @file qurt_alloc.h + @brief Prototypes of kernel memory allocation API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +/*======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_malloc + Dynamically allocates the specified array on the QuRT system heap. + The return value is the address of the allocated memory area. 
+
+   @note1hang The allocated memory area is automatically initialized to zero.
+
+   @param[in] size Size (in bytes) of the memory area.
+
+   @return
+   Nonzero -- Pointer to the allocated memory area. \n
+   0 -- Not enough memory in heap to allocate the memory area.
+
+   @dependencies
+   None.
+
+ */
+/* ======================================================================*/
+void *qurt_malloc( unsigned int size);
+
+/*======================================================================*/
+/**@ingroup func_qurt_calloc
+   Dynamically allocates the specified array on the QuRT system heap.
+   The return value is the address of the allocated array.
+
+   @note1hang The allocated memory area is automatically initialized to zero.
+
+   @param[in] elsize Size (in bytes) of each array element.
+   @param[in] num Number of array elements.
+
+   @return
+   Nonzero -- Pointer to allocated array.\n
+   Zero -- Not enough memory in heap to allocate array.
+
+   @dependencies
+   None.
+
+ */
+ /* ======================================================================*/
+void *qurt_calloc(unsigned int elsize, unsigned int num);
+
+/*======================================================================*/
+/**@ingroup func_qurt_realloc
+   Reallocates memory on the heap. \n
+   Changes the size of a memory area that is already allocated on the QuRT system heap.
+   The reallocate memory operation is functionally similar to realloc. It accepts a pointer
+   to an existing memory area on the heap, and resizes the memory area to the specified size
+   while preserving the original contents of the memory area.
+
+   @note1hang This function might change the address of the memory area.
+   If the value of ptr is NULL, this function is equivalent to
+   qurt_malloc().
+   If the value of newsize is 0, it is equivalent to qurt_free().
+   If the memory area is expanded, the added memory is not initialized.
+
+   @param[in] *ptr Pointer to the address of the memory area.
+   @param[in] newsize Size (in bytes) of the reallocated memory area.
+
+   @return
+   Nonzero -- Pointer to reallocated memory area. \n
+   0 -- Not enough memory in heap to reallocate the memory area.
+
+   @dependencies
+   None.
+
+ */
+ /* ======================================================================*/
+void *qurt_realloc(void *ptr, int newsize);
+
+/*======================================================================*/
+/**@ingroup func_qurt_free
+   Frees allocated memory from the heap.\n
+   Deallocates the specified memory from the QuRT system heap.
+
+   @param[in] *ptr Pointer to the address of the memory to deallocate.
+
+   @return
+   None.
+
+   @dependencies
+   The memory item that the ptr value specifies must have been previously
+   allocated using one of the qurt_calloc(),
+   qurt_malloc(), or qurt_realloc() memory allocation functions.
+   Otherwise the behavior of QuRT is undefined.
+
+ */
+ /* ======================================================================*/
+void qurt_free( void *ptr);
+
+
+void *qurt_memalign(unsigned int alignment, unsigned int size);
+
+/*
+|| Macro to define a static heap for a QuRT program.
+||
+|| Usage:
+|| Declare at the top level of any C source file that
+|| is part of the build (and is guaranteed
+|| to actually be pulled into the build). Place
+|| it in the same file with main():
+||
+|| QURT_DECLARE_STATIC_HEAP(512000);
+||
+|| The only argument is the size in bytes, and it is
+|| rounded up to the nearest 64 bytes (size of an
+|| L2 cache block).
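+||
+|| Illustrative sketch (not part of the original header), e.g. in the
+|| file that defines main():
+||
+||     QURT_DECLARE_STATIC_HEAP(512000);   // rounded up to a 64-byte multiple
+||
+||     int main(void)
+||     {
+||         char *buf = qurt_malloc(128);   // served from the heap declared above
+||         if (buf != 0) qurt_free(buf);
+||         return 0;
+||     }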
+|| +*/ + +#define QURT_DECLARE_STATIC_HEAP(sz) \ + static struct qurt_static_heap { \ + char space[(sz)] __attribute__((aligned(64))); \ + } static_heap[1]; \ + void * const override_heap_Base = &static_heap[0]; \ + void * const override_heap_Limit = &static_heap[1] + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ALLOC_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_allsignal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_allsignal.h new file mode 100755 index 0000000000000..5dc89e495130d --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_allsignal.h @@ -0,0 +1,176 @@ + +#ifndef QURT_ALLSIGNAL_H +#define QURT_ALLSIGNAL_H + +/** + @file qurt_allsignal.h + @brief Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup all_signal_types +@{ */ +/*===================================================================== + Typedefs + ======================================================================*/ + +/** +qurt_signal_t supersedes qurt_allsignal_t. This type definition was added for backwards compatibility. */ +typedef union { + /** @cond */ + unsigned long long int raw; + struct { + unsigned int waiting; /**< */ + unsigned int signals_in; /**< */ + unsigned int queue; /**< */ + unsigned int reserved; /**< */ + }X; + /** @endcond */ +} qurt_allsignal_t; +/** @} */ /* end_addtogroup all_signal_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_init + Initializes an all-signal object.\n + The all-signal object is initially cleared. + + @datatypes + #qurt_allsignal_t + + @param[out] signal Pointer to the all-signal object to initialize. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_allsignal_init(qurt_allsignal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_destroy + Destroys the specified all-signal object.\n + @note1hang All-signal objects must be destroyed when they are no longer in use. + Failure to do this causes resource leaks in the QuRT kernel. \n + @note1cont All-signal objects must not be destroyed while they are still in use. + If this occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_allsignal_destroy(qurt_allsignal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_get + Gets signal values from the all-signal object. + + Returns the current signal values of the specified all-signal object. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to access. 
+ + @return + Bitmask with current signal values. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_allsignal_get(qurt_allsignal_t *signal) +{ return signal->X.signals_in; } + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_wait + Waits on the all-signal object.\n + Suspends the current thread until all of the specified signals are set. + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 that it is not to be waited on. + + If a signal is set in an all-signal object, and a thread is waiting on the all-signal object for + that signal, the thread is awakened. If the awakened thread has higher priority than + the current thread, a context switch can occur. + + Unlike any-signals, all-signals do not need to explicitly clear any set signals in an all-signal + object before waiting on them again -- clearing is done automatically by the wait + operation. + + @note1hang At most, one thread can wait on an all-signal object at any given time. + Because signal clearing is done by the wait operation, no clear operation is + defined for all-signals. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to wait on. + @param[in] mask Signal mask value, which identifies the individual signals in the all-signal object + to wait on. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_allsignal_wait(qurt_allsignal_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_set + Set signals in the specified all-signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit + value of 1 indicates that a signal must be set, and 0 indicates not to set the signal. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to modify. + @param[in] mask Signal mask value identifying the individual signals to + set in the all-signal object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_allsignal_set(qurt_allsignal_t *signal, unsigned int mask); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ALLSIGNAL_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_anysignal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_anysignal.h new file mode 100755 index 0000000000000..9619e2de562b4 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_anysignal.h @@ -0,0 +1,225 @@ +#ifndef QURT_ANYSIGNAL_H +#define QURT_ANYSIGNAL_H +/** + @file qurt_anysignal.h + Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + +Copyright (c) 2021 Qualcomm Technologies, Inc. +All rights reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ ======================================================================*/
+
+#include "qurt_signal.h"  /* Assumed include (name lost in extraction): this file
+                             uses qurt_signal_t and the qurt_signal_* wrappers below. */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Typedefs
+======================================================================*/
+
+/**@ingroup anysignals_types
+  qurt_signal_t supersedes qurt_anysignal_t. This type definition was added for backwards compatibility. */
+typedef qurt_signal_t qurt_anysignal_t;
+
+/*=====================================================================
+  Functions
+======================================================================*/
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_init
+  Initializes an any-signal object.\n
+  The any-signal object is initially cleared.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[out] signal Pointer to the any-signal object to initialize.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+static inline void qurt_anysignal_init(qurt_anysignal_t *signal)
+{
+    qurt_signal_init(signal);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_destroy
+  Destroys the specified any-signal object.
+
+  @note1hang Any-signal objects must be destroyed when they are no longer in use. Failure
+  to do this causes resource leaks in the QuRT kernel.\n
+  @note1cont Any-signal objects must not be destroyed while they are still in use. If this
+  occurs, the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] signal Pointer to the any-signal object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+static inline void qurt_anysignal_destroy(qurt_anysignal_t *signal)
+{
+    qurt_signal_destroy(signal);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_wait
+  Waits on the any-signal object. \n
+  Suspends the current thread until any one of the specified signals is set.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+  indicates that a signal must be waited on, and 0 indicates not to wait on the signal.
+  If a signal is set in an any-signal object, and a thread is waiting on the any-signal object for
+  that signal, the thread is awakened. If the awakened thread has higher priority than
+  the current thread, a context switch can occur.
+
+  @note1hang At most, one thread can wait on an any-signal object at any given time.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] signal Pointer to the any-signal object to wait on.
+  @param[in] mask   Signal mask value, which specifies the individual signals in the any-signal
+                    object to wait on.
+
+  @return
+  Bitmask of current signal values.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+static inline unsigned int qurt_anysignal_wait(qurt_anysignal_t *signal, unsigned int mask)
+{
+    return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_set
+  Sets signals in the specified any-signal object. \n
+  Signals are represented as bits 0 through 31 in the 32-bit mask value.
A mask bit value of 1
+  indicates that a signal must be set, and 0 indicates not to set the signal.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] signal Pointer to the any-signal object to modify.
+  @param[in] mask   Signal mask value identifying the individual signals to
+                    set in the any-signal object.
+
+  @return
+  Bitmask of old signal values (before set).
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+unsigned int qurt_anysignal_set(qurt_anysignal_t *signal, unsigned int mask);
+
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_get
+  Gets signal values from the any-signal object.\n
+  Returns the current signal values of the specified any-signal object.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] signal Pointer to the any-signal object to access.
+
+  @return
+  A bitmask with the current signal values of the specified any-signal object.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+static inline unsigned int qurt_anysignal_get(qurt_anysignal_t *signal)
+{
+    return qurt_signal_get(signal);
+}
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_clear
+  @xreflabel{sec:anysignal_clear}
+  Clears signals in the specified any-signal object.\n
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+  indicates that a signal must be cleared, and 0 indicates not to clear the signal.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] signal Pointer to the any-signal object to modify.
+  @param[in] mask   Signal mask value identifying the individual signals to
+                    clear in the any-signal object.
+
+  @return
+  Bitmask -- Old signal values (before clear).
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+unsigned int qurt_anysignal_clear(qurt_anysignal_t *signal, unsigned int mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_wait_timed
+  Waits on the any-signal object. \n
+  Suspends the current thread until any of the specified signals is set or the
+  timeout expires.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+  indicates that a signal must be waited on, and 0 indicates not to wait on the signal.
+  If a signal is set in an any-signal object, and a thread is waiting on the any-signal object for
+  that signal, the thread is awakened. If the awakened thread has higher priority than
+  the current thread, a context switch can occur.
+
+  @note1hang At most, one thread can wait on an any-signal object at any given time.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in]  signal   Pointer to the any-signal object to wait on.
+  @param[in]  mask     Signal mask value, which specifies the individual signals in the any-signal
+                       object to wait on.
+  @param[out] signals  Bitmask of current signal values.
+  @param[in]  duration Interval (in microseconds); the duration value must be between
+                       #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ETIMEDOUT -- Timeout. \n
+  #QURT_EINVALID -- Duration out of range.
+
+  @dependencies
+  None.
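+
+  A usage sketch (illustrative only; the signal bit and the 10 ms timeout are
+  arbitrary values chosen for the example):
+  @code
+  qurt_anysignal_t sig;
+  unsigned int observed;
+
+  qurt_anysignal_init(&sig);
+  // ... another thread eventually calls qurt_anysignal_set(&sig, 1U << 0) ...
+  if (qurt_anysignal_wait_timed(&sig, 1U << 0, &observed, 10000ULL) == QURT_EOK) {
+      qurt_anysignal_clear(&sig, observed);  // Acknowledge the signal.
+  }
+  qurt_anysignal_destroy(&sig);
+  @endcode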
+ */ +/* ======================================================================*/ + +int qurt_anysignal_wait_timed(qurt_anysignal_t *signal, unsigned int mask, unsigned int *signals, unsigned long long int duration); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ANYSIGNAL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_api_version.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_api_version.h new file mode 100755 index 0000000000000..dfe53ae755054 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_api_version.h @@ -0,0 +1,77 @@ +#ifndef QURT_API_VERSION_H +#define QURT_API_VERSION_H +/*============================================================================== + +qurt_api_version.h + +GENERAL DESCRIPTION + API version file + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ + +/*============================================================================== + CONSTANTS AND DEFINITIONS +==============================================================================*/ +/** + * Each field of the QURT_API_VERSION definitions is an 8-bit unsigned integer. + * Main release has first 3 fields updated - Major, Minor and Release. + * - QURT_API_VERSION = Major, Minor, Release. + * Patch releases are supported by adding the extra field. + * - QURT_API_VERSION = Major, Minor, Release, Patch. + */ +// Major version is incremented for incompatible API changes. +#define QURT_API_VER_MAJOR 1 + +// Minor version is incremented for backward-compatible enhancements in the API +// set. +#define QURT_API_VER_MINOR 4 + +// RELEASE version is incremented for each release within a `MAJOR.MINOR` +// release. +#define QURT_API_VER_RELEASE 1 + +// Patch version is incremented when new API content is introduced on older LTS +// release. +#define QURT_API_VER_PATCH 0 + +/* Update the QURT_API_VERSION function macro. */ +#define QURT_API_VERSION_ENCODE(major, minor, release, patch) \ + ((((major) & 0xFF) << 24) | (((minor) & 0xFF) << 16) | \ + (((release) & 0xFF) << 8) | ((patch) & 0xFF)) + +/* Update the QURT_API_VERSION Macro. */ +#define QURT_API_VERSION \ + QURT_API_VERSION_ENCODE(QURT_API_VER_MAJOR, QURT_API_VER_MINOR, \ + QURT_API_VER_RELEASE, QURT_API_VER_PATCH) + +/** Usage: + * + * #if QURT_API_VERSION >= QURT_API_VERSION_ENCODE(1,4,0,0) + * qurt_func_2(a,b,c); + * #else + * qurt_func(a); + * #endif + * + */ +/* + Gets the QuRT API version. + + @return + QuRT API version. + + @dependencies + None. + */ +unsigned int qurt_api_version(void); + +#endif /* QURT_API_VERSION_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_assert.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_assert.h new file mode 100755 index 0000000000000..13cc2afd2e973 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_assert.h @@ -0,0 +1,51 @@ +#ifndef QURT_ASSERT_H +#define QURT_ASSERT_H +/** + @file qurt_assert.h + @brief Prototypes of qurt_assert API + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/**@ingroup func_qurt_assert_error + Writes diagnostic information to the debug buffer, and raises an error to the QuRT kernel. + + @datatypes + None. + + @param[in] filename Pointer to the file name string. + @param[in] lineno Line number. + + @return + None. + + @dependencies + None. + */ +void qurt_assert_error(const char *filename, int lineno) __attribute__((noreturn)); + +#define qurt_assert(cond) ((cond)?(void)0:qurt_assert_error(__QURTFILENAME__,__LINE__)) + +/** @} */ /* end_ingroup func_qurt_assert */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ASSERT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_atomic_ops.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_atomic_ops.h new file mode 100755 index 0000000000000..d9b2cff7d737c --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_atomic_ops.h @@ -0,0 +1,1298 @@ +#ifndef QURT_ATOMIC_OPS_H +#define QURT_ATOMIC_OPS_H +/** + @file qurt_atomic_ops.h + @brief Prototypes of kernel atomic operations API. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021, 2022 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +/* + * Australian Public Licence B (OZPLB) + * + * Version 1-0 + * + * Copyright (c) 2007, Open Kernel Labs, Inc. + * + * All rights reserved. + * + * Developed by: Embedded, Real-time and Operating Systems Program (ERTOS) + * National ICT Australia + * http://www.ertos.nicta.com.au + * + * Permission is granted by National ICT Australia, free of charge, to + * any person obtaining a copy of this software and any associated + * documentation files (the "Software") to deal with the Software without + * restriction, including (without limitation) the rights to use, copy, + * modify, adapt, merge, publish, distribute, communicate to the public, + * sublicense, and/or sell, lend or rent out copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject + * to the following conditions: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimers in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of National ICT Australia, nor the names of its + * contributors, may be used to endorse or promote products derived + * from this Software without specific prior written permission. 
+ * + * EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT + * PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS", AND + * NATIONAL ICT AUSTRALIA AND ITS CONTRIBUTORS MAKE NO REPRESENTATIONS, + * WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS + * REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, + * THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF + * ERRORS, WHETHER OR NOT DISCOVERABLE. + * + * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL + * NATIONAL ICT AUSTRALIA OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL + * THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION OF CONTRACT, + * NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER + * LIABILITY, INCLUDING (WITHOUT LIMITATION) LOSS OF PRODUCTION OR + * OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS + * OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR + * OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT, + * CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN + * CONNECTION WITH THIS LICENCE, THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS WITH THE SOFTWARE, EVEN IF NATIONAL ICT AUSTRALIA OR ITS + * CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS, + * DAMAGES OR OTHER LIABILITY. + * + * If applicable legislation implies representations, warranties, or + * conditions, or imposes obligations or liability on National ICT + * Australia or one of its contributors in respect of the Software that + * cannot be wholly or partly excluded, restricted or modified, the + * liability of National ICT Australia or the contributor is limited, to + * the full extent permitted by the applicable legislation, at its + * option, to: + * a. in the case of goods, any one or more of the following: + * i. the replacement of the goods or the supply of equivalent goods; + * ii. the repair of the goods; + * iii. the payment of the cost of replacing the goods or of acquiring + * equivalent goods; + * iv. the payment of the cost of having the goods repaired; or + * b. in the case of services: + * i. the supplying of the services again; or + * ii. the payment of the cost of having the services supplied again. + * + * The construction, validity and performance of this licence is governed + * by the laws in force in New South Wales, Australia. + */ + +/* + * Author: Malcolm Purvis + * + * This file is only included by the main atomic_ops.h, so all of that + * file's definitions are available. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + +///* Sanity check to ensure the smp flag is set in machines.py */ +//#if defined(__ATOMIC_OPS_IN_KERNEL__) && !defined(MACHINE_SMP) && CONFIG_NUM_UNITS > 1 +//#error CONFIG_NUM_UNITS > 1 but smp not defined in machines.py. +//#endif +#define QURT_INLINE __attribute__((always_inline)) + +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_atomic_set + Sets the atomic variable with the specified value. 
+ + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] value Value to set. + + @return + Value successfuly set. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_set(unsigned int* target, unsigned int value) +{ + unsigned long tmp; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " memw_locked(%2, p0) = %3\n" + " if !p0 jump 1b\n" + : "=&r" (tmp),"+m" (*target) + : "r" (target), "r" (value) + : "p0"); + return value; +} + +/**@ingroup func_qurt_atomic_and + Bitwise AND operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise AND. + + @return + None + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_and(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = and(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target),"r" (mask) + : "p0"); +} + +/**@ingroup func_qurt_atomic_and_return + Bitwise AND operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise AND. + + @return + AND result of atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_and_return(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = and(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_or + Bitwise OR operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise OR. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_or(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); +} + +/**@ingroup func_qurt_atomic_or_return + Bitwise OR operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise OR. + + @return + Returns the OR result of the atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_or_return(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_xor + Bitwise XOR operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. 
+ + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise XOR. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_xor(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = xor(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); +} + +/**@ingroup func_qurt_atomic_xor_return + Bitwise XOR operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise XOR. + + @return + XOR result of atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_xor_return(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = xor(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_set_bit + Sets a bit in the atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to set. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_set_bit(unsigned int *target, unsigned int bit) +{ + unsigned int result; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit % ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int *wtarget= (unsigned int *)&target[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = setbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic_clear_bit + Clears a bit in the atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to clear. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_clear_bit(unsigned int *target, unsigned int bit) +{ + unsigned int result; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit % ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int *wtarget= (unsigned int *)&target[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = clrbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic_change_bit + Toggles a bit in a atomic variable at a bit position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to toggle. + + @return + None. + + @dependencies + None. 
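+
+   A usage sketch (illustrative only; the bit positions are arbitrary):
+   @code
+   static unsigned int flags = 0;
+
+   qurt_atomic_set_bit(&flags, 3U);     // Bit 3 is now 1.
+   qurt_atomic_change_bit(&flags, 3U);  // Bit 3 toggles back to 0.
+   qurt_atomic_clear_bit(&flags, 7U);   // Clearing an already-clear bit is a no-op.
+   @endcode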
+*/ +static inline QURT_INLINE void +qurt_atomic_change_bit(unsigned int *target, unsigned int bit) +{ + unsigned int result; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1fU; + unsigned int *wtarget= (unsigned int *)&target[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = togglebit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget),"r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic_add + Adds an integer to atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to add. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_add(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic_add_return + Adds an integer to atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to add. + + @return + Result of arithmetic sum. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_add_return(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_add_unless + Adds the delta value to an atomic variable unless the current value in the target + matches the unless variable. + + @note1hang The function retries until load lock and store conditional + are successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] delta Value to add to the current value. + @param[in] unless Perform the addition only when the current value is not + equal to this unless value. + @return + TRUE -- 1 - Addition was performed. \n + FALSE -- 0 - Addition was not done. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_add_unless(unsigned int* target, + unsigned int delta, + unsigned int unless) +{ + unsigned int current_val; + unsigned int new_val; + + __asm__ __volatile__( + "1: %0 = memw_locked(%3)\n" + " p0 = cmp.eq(%0, %5)\n" + " if p0 jump 2f\n" + " %1 = add(%0, %4)\n" + " memw_locked(%3, p0) = %1\n" + " if !p0 jump 1b\n" + "2:\n" + : "=&r" (current_val),"=&r" (new_val),"+m" (*target) + : "r" (target), "r" (delta), "r" (unless) + : "p0"); + + return (unsigned int)(current_val != unless); +} + +/**@ingroup func_qurt_atomic_sub + Subtracts an integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to subtract. + + @return + None. + + @dependencies + None. 
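+
+   A usage sketch (illustrative only; in_flight is a hypothetical shared counter):
+   @code
+   static unsigned int in_flight = 0;
+
+   qurt_atomic_add(&in_flight, 4U);  // Account for four queued requests.
+   qurt_atomic_sub(&in_flight, 1U);  // One request completed.
+   @endcode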
+*/ +static inline QURT_INLINE void +qurt_atomic_sub(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic_sub_return + Subtracts an integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to subtract. + + @return + Result of arithmetic subtraction. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_sub_return(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_inc + Increments an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_inc(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); +} + +/**@ingroup func_qurt_atomic_inc_return + Increments an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Incremented value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_inc_return(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_dec + Decrements an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_dec(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #-1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); +} + +/**@ingroup func_qurt_atomic_dec_return + Decrements an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Decremented value. + + @dependencies + None. 
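+
+   A reference-counting sketch (illustrative only; refcount and
+   release_object() are hypothetical):
+   @code
+   qurt_atomic_inc(&refcount);                // Take a reference.
+   if (qurt_atomic_dec_return(&refcount) == 0U) {
+       release_object();                      // Last reference dropped.
+   }
+   @endcode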
+*/ +static inline QURT_INLINE unsigned int +qurt_atomic_dec_return(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #-1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_compare_and_set + Compares the current value of the atomic variable with the + specified value and set to a new value when compare is successful. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] old_val Old value to compare. + @param[in] new_val New value to set. + + @return + FALSE -- Specified value is not equal to the current value. \n + TRUE --Specified value is equal to the current value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_compare_and_set(unsigned int* target, + unsigned int old_val, + unsigned int new_val) +{ + unsigned int current_val; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " p0 = cmp.eq(%0, %3)\n" + " if !p0 jump 2f\n" + " memw_locked(%2, p0) = %4\n" + " if !p0 jump 1b\n" + "2:\n" + : "=&r" (current_val),"+m" (*target) + : "r" (target), "r" (old_val), "r" (new_val) + : "p0"); + + return (unsigned int)(current_val == old_val); +} + +/**@ingroup func_qurt_atomic_barrier + Allows the compiler to enforce an ordering constraint on memory operation issued + before and after the function. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_barrier(void) +{ + __asm__ __volatile__ ( + "" + : + : + : + "memory"); +} + + +/**@ingroup func_qurt_atomic64_set + Sets the 64-bit atomic variable with the specified value. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] value 64-bit value to set. + + @return + Successfuly set value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_set(unsigned long long* target, unsigned long long value) +{ + unsigned long long tmp; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " memd_locked(%2, p0) = %3\n" + " if !p0 jump 1b\n" + : "=&r" (tmp),"+m" (*target) + : "r" (target), "r" (value) + : "p0"); + return value; +} + +/**@ingroup func_qurt_atomic64_and_return + Bitwise AND operation of a 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise AND. + + @return + AND result of 64-bit atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_and_return(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = and(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_or + Bitwise OR operation of a 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise OR. + + @return + None. + + @dependencies + None. 
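+
+   A usage sketch (illustrative only; the event bits are arbitrary):
+   @code
+   static unsigned long long event_mask = 0ULL;
+
+   // Publish events 0 and 40 in one atomic update; readers observe either
+   // the old mask or the new mask, never a partial update.
+   qurt_atomic64_or(&event_mask, (1ULL << 0) | (1ULL << 40));
+   @endcode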
+*/ +static inline QURT_INLINE void +qurt_atomic64_or(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_or_return + Bitwise OR operation of a 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise OR. + + @return + OR result of the atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_or_return(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_xor_return + Bitwise XOR operation of 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise XOR. + + @return + XOR result of atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_xor_return(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = xor(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_set_bit + Sets a bit in a 64-bit atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to set. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_set_bit(unsigned long long *target, unsigned int bit) +{ + unsigned int result; + unsigned int *wtarget; + unsigned int *pwtarget = (unsigned int *)target; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1FU; + wtarget = (unsigned int *)&pwtarget[aword]; + + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = setbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_clear_bit + Clears a bit in a 64-bit atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to clear. + + @return + None. + + @dependencies + None. 
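+
+   A usage sketch (illustrative only; slot 40 is an arbitrary position):
+   @code
+   static unsigned long long busy_map = ~0ULL;  // All 64 slots start busy.
+
+   qurt_atomic64_clear_bit(&busy_map, 40U);     // Mark slot 40 as free.
+   @endcode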
+*/ +static inline QURT_INLINE void +qurt_atomic64_clear_bit(unsigned long long *target, unsigned int bit) +{ + unsigned int result; + unsigned int *wtarget; + unsigned int *pwtarget = (unsigned int *)target; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1FU; + wtarget = (unsigned int *)&pwtarget[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = clrbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_change_bit + Toggles a bit in a 64-bit atomic variable at a bit position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to toggle. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_change_bit(unsigned long long *target, unsigned int bit) +{ + unsigned int result; + unsigned int *wtarget; + unsigned int *pwtarget = (unsigned int *)target; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1FU; + wtarget = (unsigned int *)&pwtarget[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = togglebit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget),"r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_add + Adds a 64-bit integer to 64-bit atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v 64-bit integer value to add. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_add(unsigned long long *target, unsigned long long v) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_add_return + Adds a 64-bit integer to 64-bit atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v 64-bit integer value to add. + + @return + Result of arithmetic sum. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_add_return(unsigned long long *target, unsigned long long v) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_sub_return + Subtracts a 64-bit integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v 64-bit integer value to subtract. + + @return + Result of arithmetic subtraction. + + @dependencies + None. 
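+
+   A usage sketch (illustrative only; the byte budget is a hypothetical value):
+   @code
+   static unsigned long long bytes_left = 1048576ULL;  // 1 MB budget.
+
+   // Reserve 4 KB and observe how much budget remains after the subtraction.
+   unsigned long long remaining = qurt_atomic64_sub_return(&bytes_left, 4096ULL);
+   @endcode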
+*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_sub_return(unsigned long long *target, unsigned long long v) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_inc + Increments a 64-bit atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_inc(unsigned long long *target) +{ + unsigned long long result; + unsigned long long inc =1; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target),"r" (inc) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_inc_return + Increments a 64-bit atomic variable by one + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Incremented value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_inc_return(unsigned long long *target) +{ + unsigned long long result; + unsigned long long inc =1; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target),"r" (inc) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_dec_return + Decrements a 64-bit atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Decremented value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_dec_return(unsigned long long *target) +{ + unsigned long long result; + long long minus1 = 0xFFFFFFFFFFFFFFFFLL; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target),"r" (minus1) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_compare_and_set + Compares the current value of an 64-bit atomic variable with + the specified value and sets to a new value when compare is successful. + + @note1hang The function keep retrying until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] old_val 64-bit old value to compare. + @param[in] new_val 64-bit new value to set. + + @return + FALSE -- Specified value is not equal to the current value. \n + TRUE -- Specified value is equal to the current value. + + @dependencies + None. 
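+
+   A lock-free update sketch (illustrative only; shared_val and transform()
+   are hypothetical, with transform() a pure function of the old value):
+   @code
+   static unsigned long long shared_val;
+   unsigned long long old_val, new_val;
+
+   do {
+       old_val = shared_val;          // Snapshot the current value.
+       new_val = transform(old_val);  // Compute the replacement.
+   } while (!qurt_atomic64_compare_and_set(&shared_val, old_val, new_val));
+   @endcode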
+*/ +static inline QURT_INLINE int +qurt_atomic64_compare_and_set(unsigned long long *target, + unsigned long long old_val, + unsigned long long new_val) +{ + unsigned long long current_val; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " p0 = cmp.eq(%0, %3)\n" + " if !p0 jump 2f\n" + " memd_locked(%2, p0) = %4\n" + " if !p0 jump 1b\n" + "2:\n" + : "=&r" (current_val),"+m" (*target) + : "r" (target), "r" (old_val), "r" (new_val) + : "p0"); + + return (int)(current_val == old_val); +} + +/**@ingroup func_qurt_atomic64_barrier + Allows compiler to enforce an ordering constraint on memory operation issued + before and after the function. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_barrier(void) +{ + /** @cond */ + __asm__ __volatile__ ( + "" + : + : + : + "memory"); + /** @endcond */ +} + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ATOMIC_OPS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_barrier.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_barrier.h new file mode 100755 index 0000000000000..7c6f787d43bc2 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_barrier.h @@ -0,0 +1,140 @@ +#ifndef QURT_BARRIER_H +#define QURT_BARRIER_H + +/** + @file qurt_barrier.h + @brief Prototypes of Kernel barrier API functions. + + EXTERNALIZED FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021 Qualcomm Technologies, Inc. All rights reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup barrier_types +@{ */ +/*===================================================================== + Constants and macros +======================================================================*/ +#define QURT_BARRIER_SERIAL_THREAD 1 /**< Serial thread. */ +#define QURT_BARRIER_OTHER 0 /**< Other. */ + +#ifndef ASM +#include + +/*===================================================================== +Typedefs +======================================================================*/ + +/** QuRT barrier type. + */ +typedef union { + /** @cond */ + struct { + unsigned short threads_left; + unsigned short count; + unsigned int threads_total; + unsigned int queue; + unsigned int reserved; + }; + unsigned long long int raw; + /** @endcond */ +} qurt_barrier_t; + +/** @} */ /* end_addtogroup barrier_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_barrier_init + Initializes a barrier object. + + @datatypes + #qurt_barrier_t + + @param[out] barrier Pointer to the barrier object to initialize. + @param[in] threads_total Total number of threads to synchronize on the barrier. + + + @return + Unused integer value. + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_barrier_init(qurt_barrier_t *barrier, unsigned int threads_total); + +/*======================================================================*/ +/**@ingroup func_qurt_barrier_destroy + Destroys the specified barrier. + + @note1hang Barriers must be destroyed when they are no longer in use. 
Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Barriers must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_barrier_t + + @param[in] barrier Pointer to the barrier object to destroy. + + @return + Unused integer value. + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_barrier_destroy(qurt_barrier_t *barrier); + +/*======================================================================*/ +/**@ingroup func_qurt_barrier_wait + Waits on the barrier.\n + Suspends the current thread on the specified barrier. \n + The function return value indicates whether the thread was the last one to + synchronize on the barrier. + When a thread waits on a barrier, it is suspended on the barrier: \n + - If the total number of threads waiting on the barrier is less than the assigned value + of the barrier, no other action occurs. \n + - If the total number of threads waiting on the barrier equals the assigned value of the + barrier, all threads currently waiting on the barrier are awakened, allowing them to + execute past the barrier. + + @note1hang After its waiting threads are awakened, a barrier is automatically reset + and can be used again in the program without the need for re-initialization. + + @datatypes + #qurt_barrier_t + + @param[in] barrier Pointer to the barrier object to wait on. + + @return + #QURT_BARRIER_OTHER -- Current thread awakened from barrier. \n + #QURT_BARRIER_SERIAL_THREAD -- Current thread is last caller of barrier. + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_barrier_wait(qurt_barrier_t *barrier); + + +#endif + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_BARRIER_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_busywait.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_busywait.h new file mode 100755 index 0000000000000..a4dab80a2520a --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_busywait.h @@ -0,0 +1,62 @@ +#ifndef QURT_BUSYWAIT_H +#define QURT_BUSYWAIT_H + +/** + @file qurt_busywait.h + @brief Implementation of the busywait() function for + hardware based blocking waits that use the QTIMER as a reference. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ============================================================================*/ +/*============================================================================= + * + * EDIT HISTORY FOR FILE + * + * This section contains comments describing changes made to the + * module. Changes are listed in reverse chronological + * order. 
+ * + * + * when who what, where, why + * ---------- --- ------------------------------------------------------- + * 2018-03-20 pg Add Header file + ============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_busywait + Pauses the execution of a thread for a specified time.\n + Use for small microsecond delays. + + @note1hang The function does not return to the caller until + the time duration has expired. + + @param[in] pause_time_us Time to pause in microseconds. + + @return + None. + + @dependencies + None. + */ +void qurt_busywait (unsigned int pause_time_us); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_BUSYWAIT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_callback.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_callback.h new file mode 100755 index 0000000000000..dc9b896c63454 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_callback.h @@ -0,0 +1,235 @@ +#ifndef QURT_CALLBACK_H +#define QURT_CALLBACK_H + +/** + @file qurt_callback.h + Definitions, macros, and prototypes for QuRT callback framework. + + QDI framework allows the development of root process drivers and services that + a user process client can interact with in a secure manner. QDI framework does + this by elevating the priviledge of user process thread, temporarily allowing + the thread execute in root context and letting it fall back to user context once + the QDI invocation is finished. + + The QuRT callback framework provides a safe mechanism for root process drivers + to execute callback functions in a user process. The framework hosts + dedicated worker threads in corresponding processes that handle the execution + of the callback function. This ensures that the callbacks occur in context of + the appropriate process thread, in result maintaining privilege boundaries. + + Prerequisites for use of this framework are: + 1. Driver is a QDI driver and client communicates with drivers using QDI + invocations. + 2. Appropriate callback configuration is specified in cust_config.xml for + the user process that intends to use this framework. + + qurt_cb_data_t is the public data structure that allows client to store all + the required information about the callback, including the callback function + and the arguments to pass to this function when it executes. + The client uses QDI interface to register this structure with root driver. + + Callback framework provides following APIs that a root driver can use to invoke callback. + These functions are described in qurt_qdi_driver.h header file. + + qurt_qdi_cb_invoke_async() triggers an asynchronous callback wherein the + invoking thread does not wait for the callback to finish executing. + + qurt_qdi_cb_invoke_sync() triggers a synchronous callback. Upon invocation + the invoking thread gets suspended till the callback function finishes execution. + + qurt_qdi_cb_invoke_sync_with_data() invokes a synchronous callback similar to + qurt_qdi_cb_invoke_sync(). It allows user to pass large data along with + the callback invocation to be utlized during the callback execution. 
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_qdi.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int qurt_cb_result_t;
+
+/* Callback framework error codes.
+   The callback framework returns a nonzero value if a callback invocation is unsuccessful.
+   The following macros highlight the cause of failure in more detail.
+*/
+#define QURT_CB_ERROR            -1 /* Callback registration failed.\n*/
+#define QURT_CB_OK                0 /* Success.\n*/
+#define QURT_CB_MALLOC_FAILED    -2 /* QuRTOS malloc failure.\n*/
+#define QURT_CB_WAIT_CANCEL      -3 /* Process exit cancelled wait operation.\n*/
+#define QURT_CB_CONFIG_NOT_FOUND -4 /* Callback configuration for process was not found.\n*/
+#define QURT_CB_QUEUE_FULL       -5 /* Callback queue is serving at maximum capacity.*/
+/** @addtogroup cb_types
+@{ */
+/** Callback registration data structure.
+    This data structure is used by a client attempting to register a callback with a QDI driver.
+    It holds the address of the callback function and the argument supplied to the callback
+    function when it executes.
+*/
+typedef struct {
+    /** @cond */
+    void* cb_func;   /*< Pointer to the callback function. */
+    unsigned cb_arg; /*< Not interpreted by the framework.*/
+    /** @endcond */
+} qurt_cb_data_t;
+
+/** @cond */
+/* Defines used as default if cust_config does not specify them. */
+#define CALLBACK_WORKER_STACK_SIZE 0x2000
+/** @endcond */
+/** @} */ /* end_addtogroup cb_types */
+/**@ingroup func_qurt_cb_data_init
+  Initializes the callback data structure.
+  An entity registering a callback with the root process driver must call this function
+  to initialize the callback registration data structure to the default values.
+
+  @datatypes
+  #qurt_cb_data_t
+
+  @param[in] cb_data Pointer to the callback data structure.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_cb_data_init (qurt_cb_data_t* cb_data){
+    cb_data->cb_func = NULL;
+    cb_data->cb_arg = 0;
+}
+
+/**@ingroup func_qurt_cb_data_set_cbfunc
+  Sets up the callback function in the callback registration data structure.
+
+  @datatypes
+  #qurt_cb_data_t
+
+  @param[in] cb_data Pointer to the callback data structure.
+  @param[in] cb_func Pointer to the callback function.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_cb_data_set_cbfunc (qurt_cb_data_t* cb_data, void* cb_func){
+    cb_data->cb_func = cb_func;
+}
+
+/**@ingroup func_qurt_cb_data_set_cbarg
+  Sets up the callback argument.
+  This function sets up the argument passed to the callback function when it executes.
+
+  @datatypes
+  #qurt_cb_data_t
+
+  @param[in] cb_data Pointer to the callback data structure.
+  @param[in] cb_arg  Argument for the callback function.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_cb_data_set_cbarg (qurt_cb_data_t* cb_data, unsigned cb_arg){
+    cb_data->cb_arg = cb_arg;
+}
+
+/** @cond */
+/**@ingroup driver_support_functions
+  Invokes an asynchronous callback for a specified process.
+  A driver that resides in the root process calls this API to launch a callback in
+  a process described by the client_handle.
+  After the callback is invoked, the framework queues the callback as per its
+  priority and subsequently executes it.
+  The caller of this function is not suspended during the callback execution period.
+  The API returns immediately with a success/failure error code.
+
+  @note1hang This function is only accessible to drivers in the root process.
+  User process invocations shall fail with a negative error code return value.
+
+  @param client_handle Obtained from the current invocation function (Section 4.3.1).
+  @param cb_data       Pointer to the callback data structure (refer to qurt_callback.h).
+  @param prio          Priority at which the callback should execute.
+                       This parameter is optional. If -1 is passed, the callback framework
+                       executes the callback at the priority of the API caller.
+  @return
+  QURT_EOK -- Callback was successfully communicated to the framework.
+  Negative error code -- Callback cannot be communicated to the framework.
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_async(int client_handle,
+                                          qurt_cb_data_t* cb_data,
+                                          int prio);
+
+
+/**@ingroup driver_support_functions
+  Invokes a synchronous callback for a specified process.
+  A driver that resides in a root process calls this API to launch a sync callback in
+  a process described by the client_handle.
+  After the callback is invoked, the framework queues the callback as per its
+  priority and subsequently executes it.
+  The caller of this function is suspended during the callback execution period.
+  If the process in which to execute the callback exits or terminates, the caller is
+  woken up with error code #QURT_CB_WAIT_CANCEL (refer to qurt_callback.h).
+
+  @note1hang This function is only accessible to drivers in the root process.
+  User process invocations shall fail with a negative error code return value.
+
+  @param client_handle Obtained from the current invocation function (Section 4.3.1).
+  @param cb_data       Pointer to the callback data structure (refer to qurt_callback.h).
+  @param prio          Priority at which the callback should execute.
+                       This parameter is optional. If -1 is passed, the callback framework
+                       executes the callback at the priority of the API caller.
+  @return
+  QURT_EOK -- Callback was successfully communicated to the framework.
+  Negative error code -- Callback cannot be communicated to the framework.
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_sync(int client_handle,
+                                         qurt_cb_data_t* cb_data,
+                                         int prio);
+
+/**@ingroup driver_support_functions
+  Invokes a synchronous callback for a specified process, passing driver data to the user PD.
+  This function is similar to qurt_qdi_cb_invoke_sync() and allows the driver to pass arbitrary data to
+  the user process as part of the callback invocation.
+
+  @param client_handle Obtained from the current invocation function (Section 4.3.1).
+  @param cb_data       Pointer to the callback data structure (refer to qurt_callback.h).
+  @param prio          Priority at which the callback should execute.
+                       This parameter is optional. If -1 is passed, the callback framework
+                       executes the callback at the priority of the API caller.
+  @param data          Arbitrary driver data to pass to the user process. Memory pointed to by data
+                       must be accessible to the user PD. The root driver can allocate such memory by
+                       using qurt_mem_mmap().
+  @param data_len      Length of the arbitrary driver data.
+
+  @return
+  QURT_EOK -- Callback was successfully communicated to the framework.
+  Negative error code -- Callback cannot be communicated to the framework.
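+
+  @par Example
+  Informative sketch of a root driver passing a buffer to a user-process callback;
+  client_handle and cb_data are assumed to come from the driver's QDI invocation
+  context, and buf/buf_len are hypothetical names for the shared data.
+
+  @code
+  // Informative sketch only; "buf" is assumed to be memory accessible to the
+  // user PD (for example, shared via qurt_mem_mmap()).
+  qurt_cb_result_t rc;
+  rc = qurt_qdi_cb_invoke_sync_with_data (client_handle,
+                                          &cb_data,
+                                          -1,        // Run at caller priority.
+                                          buf,
+                                          buf_len);
+  if (rc != QURT_CB_OK) {
+      // Callback could not be delivered; release buf if needed.
+  }
+  @endcode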
+ */ +qurt_cb_result_t qurt_qdi_cb_invoke_sync_with_data( int client_handle, + qurt_cb_data_t* cb_data, + int prio, + void *data, + unsigned data_len + ); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_clade.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_clade.h new file mode 100755 index 0000000000000..d7442cf98dd94 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_clade.h @@ -0,0 +1,62 @@ +#ifndef QURT_CLADE_H +#define QURT_CLADE_H +/** + @file qurt_clade.h + @brief Prototypes of Cache Line Accelerated Decompression Engine (CLADE) API. + CLADE is a cache line level memory compression system that is used to + decrease DRAM usage. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_clade2_get + Reads the value of the clade2 register. + + @param[in] offset Offset from the clade2 cfg base. + @param[out] *value Pointer to the register value read from the offset. + + @return + #QURT_EOK - Successfully read the value from the register at offset \n + #QURT_EINVALID - Offset passed is incorrect + + @dependencies + None. + */ +int qurt_clade2_get(unsigned short offset, unsigned int *value); + +/**@ingroup func_qurt_clade2_set + Sets the PMU register; only PMU_SEL register can be set. + + @param[in] offset Offset from the QURTK_clade2_cfg_base. + @param[in] value Value to set at offset. + + @return + #QURT_EOK -- Successfully set the value at offset. \n + #QURT_ENOTALLOWED -- Set operation performed at an offset other than CLADE2_PMU_SELECTION_REG. + + @dependencies + None. + */ +int qurt_clade2_set(unsigned short offset, unsigned int value); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_CLADE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_cond.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_cond.h new file mode 100755 index 0000000000000..6e65ed82a8393 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_cond.h @@ -0,0 +1,219 @@ +#ifndef QURT_COND_H +#define QURT_COND_H +/** + @file qurt_cond.h + @brief Prototypes of kernel condition variable object API functions. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 Qualcomm Technologies, Inc. + All rights reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup condition_variables_types +@{ */ +/*===================================================================== + Typedefs + ======================================================================*/ + +/** QuRT condition variable type. 
+ */
+typedef union {
+    /** @cond */
+    unsigned long long raw;
+    struct {
+        unsigned int count;
+        unsigned int n_waiting;
+        unsigned int queue;
+        unsigned int reserved;
+    }X;
+    /** @endcond */
+} qurt_cond_t;
+
+/** @} */ /* end_addtogroup condition_variables_types */
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_init
+  Initializes a condition variable object.
+
+  @datatypes
+  #qurt_cond_t
+
+  @param[out] cond Pointer to the initialized condition variable object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+void qurt_cond_init(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_destroy
+  Destroys the specified condition variable.
+
+  @note1hang Condition variables must be destroyed when they are no longer in use. Failure to do
+  this causes resource leaks in the QuRT kernel.\n
+  @note1cont Condition variables must not be destroyed while they are still in use. If this occurs,
+  the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_cond_t
+
+  @param[in] cond Pointer to the condition variable object to destroy.
+
+  @return
+  None.
+
+ */
+/* ======================================================================*/
+void qurt_cond_destroy(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_signal
+  Signals a waiting thread that the specified condition is true. \n
+
+  When a thread wishes to signal that a condition is true on a shared data item, it must
+  perform the following procedure: \n
+  -# Lock the mutex that controls access to the data item. \n
+  -# Perform the signal condition operation. \n
+  -# Unlock the mutex.
+
+  @note1hang Failure to properly lock and unlock a mutex of a condition variable can cause
+  the threads to never be suspended (or suspended but never awakened).
+
+  @note1cont Use condition variables only with regular mutexes -- attempting to use
+  recursive mutexes or priority inheritance mutexes results in undefined behavior.
+
+  @datatypes
+  #qurt_cond_t
+
+  @param[in] cond Pointer to the condition variable object to signal.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+void qurt_cond_signal(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_broadcast
+  Signals multiple waiting threads that the specified condition is true.\n
+  When a thread wishes to broadcast that a condition is true on a shared data item, it must
+  perform the following procedure: \n
+  -# Lock the mutex that controls access to the data item. \n
+  -# Perform the broadcast condition operation. \n
+  -# Unlock the mutex.\n
+
+  @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause
+  the threads to never be suspended (or suspended but never awakened).
+
+  @note1cont Use condition variables only with regular mutexes -- attempting to use
+  recursive mutexes or priority inheritance mutexes results in undefined behavior.
+
+  @datatypes
+  #qurt_cond_t
+
+  @param[in] cond Pointer to the condition variable object to signal.
+
+  @return
+  None.
+
+  @dependencies
+  None.
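+
+  @par Example
+  Informative sketch of the lock/update/broadcast procedure described above;
+  my_mutex, my_cond, and the shared_ready flag are illustrative names, and the
+  mutex is a regular qurt_mutex_t as the notes above require.
+
+  @code
+  // Informative sketch only.
+  qurt_mutex_lock (&my_mutex);      // 1. Lock the controlling mutex.
+  shared_ready = 1;                 //    Update the shared data item.
+  qurt_cond_broadcast (&my_cond);   // 2. Wake all threads waiting on the condition.
+  qurt_mutex_unlock (&my_mutex);    // 3. Unlock the mutex.
+  @endcode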
+ */ +/* ======================================================================*/ +void qurt_cond_broadcast(qurt_cond_t *cond); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_wait + Suspends the current thread until the specified condition is true. + When a thread wishes to wait for a specific condition on a shared data item, it must + perform the following procedure: \n + -# Lock the mutex that controls access to the data item. \n + -# If the condition is not satisfied, perform the wait condition operation on the + condition variable (suspends the thread and unlocks the mutex). + + @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause + the threads to never be suspended (or suspended but never awakened). + + @note1cont Use condition variables only with regular mutexes -- attempting to use + recursive mutexes or priority inheritance mutexes results in undefined behavior. + + @datatypes + #qurt_cond_t \n + #qurt_mutex_t + + @param[in] cond Pointer to the condition variable object to wait on. + @param[in] mutex Pointer to the mutex associated with condition variable to wait on. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_cond_wait(qurt_cond_t *cond, qurt_mutex_t *mutex); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_wait2 + Suspends the current thread until the specified condition is true. + When a thread wishes to wait for a specific condition on a shared data item, it must + perform the following procedure: \n + -# Lock the mutex that controls access to the data item. \n + -# If the condition is not satisfied, perform the wait condition operation on the + condition variable, which suspends the thread and unlocks the mutex. + + @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause + the threads to never be suspended (or suspended but never awakened). + + @note1cont Use condition variables only with regular mutexes -- attempting to use + recursive mutexes or priority inheritance mutexes results in undefined behavior. + + @note1cont This is the same API as qurt_cond_wait(), use this version + when using mutexes of type #qurt_rmutex2_t. + + @datatypes + #qurt_cond_t \n + #qurt_rmutex2_t + + @param[in] cond Pointer to the condition variable object to wait on. + @param[in] mutex Pointer to the mutex associated with the condition variable to wait on. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_cond_wait2(qurt_cond_t *cond, qurt_rmutex2_t *mutex); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_COND_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_consts.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_consts.h new file mode 100755 index 0000000000000..b1e35998e73b6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_consts.h @@ -0,0 +1,315 @@ +#ifndef QURT_CONSTS_H +#define QURT_CONSTS_H + +/** + @file qurt_consts.h + @brief QuRT constants and definitions + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2013-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Constants and macros
+ ======================================================================*/
+
+/* Definitions of system events. System events suspend
+   a thread and put it into suspending_list.
+   The system event number is saved in the CONTEXT::error::cause field
+   of the suspended thread. An event handler thread such as the
+   page fault handler or system error handler can wake up the suspended
+   thread.
+ */
+#define QURT_EVENT_PAGEFAULT    0x1 /* Page fault event. */
+#define QURT_EVENT_SYSTEM_ERR   0x2 /* System error event. */
+#define QURT_EVENT_SUSPEND      0x3
+#define QURT_EVENT_PROCESS_EXIT 0x4 /* Process termination event.*/
+
+#define QURT_SYSENV_MAX_THREADS_TYPE         1  /* Maximum threads object. */
+#define QURT_SYSENV_PROCNAME_TYPE            2  /* Process name object. */
+#define QURT_SYSENV_MAX_PI_PRIO_TYPE         3  /* Maximum pi priority object. */
+#define QURT_SYSENV_ARCH_REV_TYPE            4  /* Architecture version object. */
+#define QURT_SYSENV_APP_HEAP_TYPE            5  /* Application heap object. */
+#define QURT_SYSENV_REGION_ATTR_DEFAULT      7  /* Default region attributes. */
+#define QURT_SYSENV_STACK_PROFILE_COUNT_TYPE 8  /* Stack profile count type. */
+#define QURT_SYSENV_ISLAND_CONFIG_TYPE       9  /* Island configuration check. */
+#define QURT_SYSENV_HTHREADS_TYPE            10 /* Active threads object. */
+#define QURT_SYSENV_CONFIG_IMAGE_START_LO    11 /* Config image start address for DTB parsing. */
+#define QURT_SYSENV_CONFIG_IMAGE_START_HI    12 /* Config image start address for DTB parsing. */
+#define QURT_SYSENV_CHIPPARAMS_LO            13 /* ChipParams for DTB parsing. */
+#define QURT_SYSENV_CHIPPARAMS_HI            14 /* ChipParams for DTB parsing. */
+#define QURT_SYSENV_PLATPARAMS               15 /* PlatformParams for DTB parsing. */
+#define QURT_SYSENV_CONFIG_IMAGE_SIZE        16 /* Config image size for DTB parsing. */
+#define QURT_SYSENV_L2_CACHE_LINE_SIZE       17 /* L2 cache line size. */
+
+/* Get Q6 registers. */
+#define QURT_GET_SSR     1
+#define QURT_GET_CCR     2
+#define QURT_GET_CFGBASE 3
+#define QURT_GET_SYSCFG  4
+#define QURT_GET_REV     5
+
+
+/** @cond rest_reg_dist */
+/** @addtogroup performance_monitor_macros
+@{ */
+
+/* PMU */
+#define QURT_PMUCNT0   0 /**< */
+#define QURT_PMUCNT1   1 /**< */
+#define QURT_PMUCNT2   2 /**< */
+#define QURT_PMUCNT3   3 /**< */
+#define QURT_PMUCFG    4 /**< */
+#define QURT_PMUEVTCFG 5 /**< */
+
+/* New since V55. */
+#define QURT_PMUCNT4    6  /**< */
+#define QURT_PMUCNT5    7  /**< */
+#define QURT_PMUCNT6    8  /**< */
+#define QURT_PMUCNT7    9  /**< */
+#define QURT_PMUEVTCFG1 10 /**< */
+
+/* New since V61. */
+#define QURT_PMUSTID0 11 /**< */
+#define QURT_PMUSTID1 12 /**< */
+
+#define QURT_PMUCNTSTID0 13 /**< */
+#define QURT_PMUCNTSTID1 14 /**< */
+#define QURT_PMUCNTSTID2 15 /**< */
+#define QURT_PMUCNTSTID3 16 /**< */
+#define QURT_PMUCNTSTID4 17 /**< */
+#define QURT_PMUCNTSTID5 18 /**< */
+#define QURT_PMUCNTSTID6 19 /**< */
+#define QURT_PMUCNTSTID7 20 /**< */
+
+/** @} */ /* end_addtogroup performance_monitor_macros */
+/** @endcond */
+
+/*
+   Power collapse operations.
+*/
+#define QURT_POWER_SHUTDOWN                 0 /**< */
+#define QURT_TCXO_SHUTDOWN                  1 /**< */
+#define QURT_POWER_CMD_PREPARE              0 /**< */
+#define QURT_POWER_CMD_PERFORM              1 /**< */
+#define QURT_POWER_CMD_EXIT                 2 /**< */
+#define QURT_POWER_CMD_FAIL_EXIT            3 /**< */
+#define QURT_POWER_CMD_PERFORM_L2_RETENTION 4 /**< */
+#define QURT_POWER_CMD_PERFORM_SAVE_TCM     5 /**< */
+#define QURT_POWER_CMD_DEEP_SLEEP           6 /**< */
+
+
+/**
@addtogroup thread_macros +@{ */ +#define QURT_MAX_HTHREAD_LIMIT 8U /**< Limit on the maximum number of hardware threads supported by QuRT for any + Hexagon version. Use this definition to define arrays, and so on, in + target independent code. */ +/** @} */ /* end_addtogroup thread_macros */ + +/** @cond internal_only */ +/** @addtogroup power_management_macros +@{ */ +/** + L2 cache retention mode +*/ +#define QURT_POWER_SHUTDOWN_TYPE_L2NORET QURT_POWER_CMD_PERFORM /**< */ +#define QURT_POWER_SHUTDOWN_TYPE_L2RET QURT_POWER_CMD_PERFORM_L2_RETENTION /**< */ +#define QURT_POWER_SHUTDOWN_TYPE_SAVETCM QURT_POWER_CMD_PERFORM_SAVE_TCM /**< */ +/** @} */ /* end_addtogroup power_management_macros */ +/** @endcond */ + +/* + QURT_system_state + Use for debugging the shutdown/startup process. + + State transition for cold boot: + QURT_BOOT_SETUP_ISDB --> QURT_CBOOT_BSP_INIT --> + QURT_CBOOT_END_CLEAN_INIT --> QURT_CBOOT_END_OS_INIT --> + QURT_CBOOT_KERNEL_INIT_DONE --> QURT_CBOOT_PLAT_CONFIG_DONE --> + QURT_CBOOT_ROOT_TASK_STARTED + + State transition for power collapse: + QURT_PREPARE_SINGLE_MODE --> QURT_PERFORM_IPEND --> + QURT_PERFORM_SAVE_TLB --> QURT_PERFORM_SWITCH_PC --> + cache flush states (dependent on L2 retention config) + + State transition for warm boot: + QURT_BOOT_SETUP_ISDB --> QURT_WBOOT_INIT_TLB --> + QURT_WBOOT_SET_1TO1_MAP --> QURT_WBOOT_REMOVE_1TO1_MAP --> + QURT_CBOOT_END_CLEAN_INIT --> QURT_CBOOT_END_OS_INIT +*/ +#define QURT_PREPARE_SINGLE_MODE 1 /**< */ +#define QURT_PREPARE_END 2 /**< */ +#define QURT_PERFORM_IPEND 3 /**< */ +#define QURT_PERFORM_SAVE_ISDP 4 /**< */ +#define QURT_PERFORM_SAVE_PMU 5 /**< */ +#define QURT_PERFORM_SAVE_TLB 6 /**< */ +#define QURT_PERFORM_SWITCH_PC 7 /**< */ +#define QURT_PERFORM_EXIT 8 /**< */ +#define QURT_FLUSH_L1CACHE 9 /**< */ +#define QURT_FLUSH_L2CACHE 0xA /**< */ +#define QURT_FLUSH_CACHE_DONE 0xB /**< */ +#define QURT_SWITCH_PC_DONE 0xC /**< */ +#define QURT_BOOT_SETUP_ISDB 0xD /**< */ +#define QURT_WBOOT_INIT_TLB 0xE /**< */ +#define QURT_WBOOT_SET_1TO1_MAP 0xF /**< */ +#define QURT_WBOOT_CFG_ADV_SYSCFG 0x10 /**< */ +#define QURT_WBOOT_REMOVE_1TO1_MAP 0x11 /**< */ +#define QURT_CBOOT_BSP_INIT 0x12 /**< */ +#define QURT_CBOOT_END_CLEAN_L1CACHE 0x13 /**< */ +#define QURT_CBOOT_END_CLEAN_INIT 0x14 /**< */ +#define QURT_CBOOT_END_OS_INIT 0x15 /**< */ +#define QURT_CBOOT_TLB_DUMP_LOAD 0x16 /**< */ +#define QURT_CBOOT_TLB_STATIC_LOAD 0x17 /**< */ +#define QURT_CBOOT_KERNEL_INIT_DONE 0x18 /**< */ +#define QURT_CBOOT_PLAT_CONFIG_DONE 0x19 /**< */ +#define QURT_CBOOT_ROOT_TASK_STARTED 0x1A /**< */ +#define QURT_IMPRECISE_EXCEPTION 0x1B /**< */ +#define QURT_WBOOT_DEBUG_L2_START 0x1C /**< */ +#define QURT_WBOOT_DEBUG_L2_END 0x1D /**< */ +#define QURT_NMI_SAVE_L2VIC_COMPLETE 0x1E /**< */ +#define QURT_NMI_HANDLER_COMPLETE 0x1F /**< */ +#define QURT_NMI_AFTER_SAVE_GLOBAL 0x20 /**< */ +#define QURT_WBOOT_START 0x21 /**< */ +#define QURT_ENTER_ISLAND 0x22 /**< */ +#define QURT_EXIT_ISLAND 0x23 /**< */ +#define QURT_LOAD_NOTIFIER_TCB 0x24 /**< */ +#define QURT_ABNORMAL_RESET 0x25 /**< */ +/* + Thread attributes +*/ + +#define QURT_THREAD_ATTR_GP 0x00000002 /*< */ +#define QURT_THREAD_ATTR_UGP 0x00000003 /*< User general pointer (UGP)*/ +#define QURT_THREAD_ATTR_PREFETCH 0x00000004 /*< */ +#define QURT_THREAD_ATTR_TID 0x00000005 /*< */ +#define QURT_THREAD_ATTR_CACHE_PART 0x00000007 /*< */ +#define QURT_THREAD_ATTR_COPROCESSOR 0x00000008 /*< */ +#define QURT_THREAD_ATTR_GET_L2CACHE_PART 0x00000009 /*< */ +#define QURT_THREAD_ATTR_SET_FRML 
0x0000000A /*< */ +#define QURT_THREAD_ATTR_STID_GET 0x0000000B /*< */ +#define QURT_THREAD_ATTR_STID_SET 0x0000000C /*< */ +#define QURT_THREAD_ATTR_AUTOSTACK 0x0000000D /*< */ +#define QURT_THREAD_ATTR_SYSTEM_THREAD 0x0000000E /*< */ +#define QURT_THREAD_ATTR_STID_SET2 0x0000000F /*< */ +#define QURT_THREAD_ATTR_STID_SET2_ACKNOWLEDGE 0x00000010 /*< */ +#define QURT_THREAD_ATTR_STID_GET2 0x00000011 /*< */ + +/** Cache operations*/ +#define QURT_DCCLEAN 0U /* Clean Dcache. */ +#define QURT_DCINV 1U /* Invalidate Dcache. */ +#define QURT_DCCLEANINV 2U /* Clean and invalidate Dcache. */ +#define QURT_ICINV 3U /* Invalidate Icache. */ +#define QURT_DUMP_DCTAGS 4U /* For testing purpose. */ +#define QURT_FLUSH_ALL 5U /* Flush entire L1 and L2 cache. */ +#define QURT_TABLE_FLUSH 6U /* Flush based on table of physical pages */ +#define QURT_CLEAN_INVALIDATE_ALL 7U /* Flush and invalidate entire L1 and L2 cache. */ +#define QURT_L2CACHE_LOCK_LINES 8U /* l2 cache lock lines */ +#define QURT_L2CACHE_UNLOCK_LINES 9U /* l2 cache unlock lines */ +#define QURT_CLEAN 10U /* Flush L1 and L2 cache */ +#define QURT_CLEAN_INVALIDATE 11U /* Flush and invalidate L1 and L2 cache. */ +#define QURT_CLEAN_INVALIDATE_L2 12U /* Flush and invalidate entire L2 cache. */ + +/**@ingroup chapter_prefined_symbols */ +/**@xreflabel{hdr:QURT_API_VERSION}*/ + + +/* Process state. */ +#define QURT_UPDATE_PROCESS_STATE 0 /**< */ +#define QURT_MP_INIT 1 /*< */ +#define QURT_MP_RUNNING 2 /*< */ +#define QURT_MP_STOPPED 3 /*< */ + +/* QuRT reset reason. */ +#define QURT_NORMAL_BOOT 0 /* Normal boot. */ +#define QURT_WARM_BOOT 1 /* Power collapse warm boot. */ +#define QURT_WARM_BOOT_L2_RETENTION 2 /* Power collapse with L2 retention warm boot. */ +#define QURT_WARM_BOOT_SAVE_TCM 3 /* Power collapse with saving TCM. */ +#define QURT_QUICK_BOOT 4 /* Deep sleep. */ + +/* QuRT Wait for Idle command */ +#define QURT_WAIT_FOR_IDLE_DISABLE 0 /*< */ +#define QURT_WAIT_FOR_IDLE_ENABLE 1 /*< */ +#define QURT_WAIT_FOR_IDLE 2 /*< */ +#define QURT_WAIT_FOR_IDLE_CANCEL 3 /*< */ + +/*QuRT island exit stages */ +#define QURT_ISLAND_EXIT_STAGE1 1 /*< */ +#define QURT_ISLAND_EXIT_STAGE2 2 /*< */ + +#define QURT_MAX_NAME_LEN 64 /*< */ + +#define MAX_POOL_RANGES 16 /*< */ + +/* key definitions for debug thread info */ +//#define MAX_TCB_KEY 40 //whatever is a good number or makes debug thread structure be 1K +#define KEY_SCHDULER_STATE 1 /*< */ +#define KEY_PRIORITY 2 /*< */ +#define KEY_PRIORITY_ORIG 3 /*< */ +#define KEY_STACK_BOTTOM 4 // Currently not populated +#define KEY_STACK_TOP 5 // Currently not populated +#define KEY_HVX_STATE 6 /*< */ +#define KEY_FUTEX_OBJECT 7 /*< */ +#define KEY_THREAD_ID 8 /*< */ +#define KEY_PROFILE_CYCLE_LO 9 // Currently not populated +#define KEY_PROFILE_CYCLE_HI 10 // Currently not populated +#define KEY_ERROR_ADDRESS 11 // This holds the BADVA +#define KEY_ERROR_CAUSE 12 // This is the same as QURT_error_info.cause +#define KEY_ERROR_CAUSE2 13 // This is the same as QURT_error_info.cause2 +#define KEY_ERROR_SSR 14 /*< Holds the SSR value */ +#define QURT_RESERVED -1 + +/* VTLB method IDs. 
+ */
+#define QURT_VTLB_ENTRY_CREATE          0U
+#define QURT_VTLB_ENTRY_DELETE          1U
+#define QURT_VTLB_ENTRY_READ            2U
+#define QURT_VTLB_ENTRY_WRITE           3U
+#define QURT_VTLB_ENTRY_PROBE           4U
+#define QURT_VTLB_ENTRY_SPLIT           5U
+#define QURT_VTLB_ENTRY_MERGE           6U
+#define QURT_VTLB_ENTRY_STATISTICS      7U
+#define QURT_VTLB_ENTRY_SET_SPECIAL     8U
+#define QURT_VTLB_QUEUE_PPAGE           9U
+#define QURT_VTLB_RECLAIM_STACK_PAGES   10U
+#define QURT_VTLB_ASID_SET_STATE_FAST   11U
+#define QURT_VTLB_ASID_SET_STATE        12U
+#define QURT_VTLB_ENTRY_SET_EXTENSION   13U
+#define QURT_VTLB_ENTRY_CLEAR_EXTENSION 14U
+
+/* VTCM window access control HWIO programming. */
+#define QURT_VTCM_WINDOW_ENABLE            1U
+#define QURT_VTCM_WINDOW_DISABLE           0U
+#define QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT 0xFFFU
+#define QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT 0U
+
+/** @cond */
+/* ETM source - PC or data access. */
+#define QURT_ETM_SOURCE_PC   0U /**< Memory source of SAC* is PC. */
+#define QURT_ETM_SOURCE_DATA 1U /**< Memory source of SAC* is data. */
+
+/* ETM PID status flags. */
+#define QURT_ETM_NO_PID 0xFFFFFFFF /**< No PID is selected. */
+/** @endcond */
+
+/* Execution context. */
+#define QURT_CTX_USER  1
+#define QURT_CTX_GUEST 2
+
+/* Profiling STID. */
+#define QURT_STID_DEFAULT 0U
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_CONSTS_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_cycles.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_cycles.h
new file mode 100755
index 0000000000000..b599493f5d563
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_cycles.h
@@ -0,0 +1,301 @@
+
+#ifndef QURT_CYCLES_H
+#define QURT_CYCLES_H 1
+/**
+  @file qurt_cycles.h
+  Prototypes of kernel pcycle API functions.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /*=====================================================================
+ Functions
+ ======================================================================*/
+
+/*======================================================================*/
+
+/**@ingroup func_qurt_profile_reset_idle_pcycles
+  @xreflabel{hdr:qurt_profile_reset_idle_pcycles}
+  Sets the per-hardware-thread idle cycle counts to zero.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void qurt_profile_reset_idle_pcycles (void);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_thread_pcycles
+  @xreflabel{hdr:qurt_profile_get_thread_pcycles}
+  Gets the count of the running processor cycles for the current thread.\n
+  Returns the current running processor cycle count for the current QuRT thread.
+
+  @note1hang Profiling must be enabled first to start cycle counting.
+             Cycles accumulate once profiling is enabled, and reset on
+             #qurt_profile_reset_threadid_pcycles.
+
+  @return
+  Integer -- Running processor cycle count for the current thread.
+
+  @dependencies
+  None.
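+
+  @par Example
+  Informative sketch of measuring a code section from the current thread;
+  do_work() is an illustrative placeholder.
+
+  @code
+  // Informative sketch only.
+  qurt_profile_enable (1);                                  // Start counting.
+  unsigned long long t0 = qurt_profile_get_thread_pcycles ();
+  do_work ();                                               // Section to measure.
+  unsigned long long cycles = qurt_profile_get_thread_pcycles () - t0;
+  qurt_profile_enable (0);                                  // Stop counting.
+  @endcode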
+*/
+/* ======================================================================*/
+unsigned long long int qurt_profile_get_thread_pcycles(void);
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_core_pcycles
+  @xreflabel{hdr:qurt_get_core_pcycles}
+  Gets the count of core processor cycles executed.\n
+  Returns the current number of running processor cycles executed since the Hexagon
+  processor was last reset.
+
+  This value is based on the hardware core clock, which varies in speed according to the
+  processor clock frequency.
+
+  @note1hang Because the hardware core clock stops running when the processor shuts
+  down (due to all of the hardware threads being idle), treat the cycle values returned
+  by this operation as relative rather than absolute.
+
+  @note1cont Thread cycle counts are valid only in the V4 Hexagon processor version.
+
+  @return
+  Integer -- Current count of core processor cycles.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+unsigned long long int qurt_get_core_pcycles(void);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_idle_pcycles
+
+  @deprecated Use #qurt_profile_get_idle_pcycles2 instead.
+
+  Gets the current idle processor cycle counts for a maximum of 6 hardware threads. Use
+  #qurt_profile_get_idle_pcycles2 to read pcycles without a limit on the number of hardware threads.
+
+  This operation accepts a pointer to a user-defined array, and writes to the array the current
+  idle cycle count for each hardware thread.
+
+  Each count value represents the number of processor cycles that have elapsed on the
+  corresponding hardware thread while that thread has been in Wait mode.\n
+
+
+  @note1hang This operation does not return the idle cycles that occur when the Hexagon
+             processor shuts down (due to all of the hardware threads being idle).
+             Idle cycle counts accumulate irrespective of whether profiling is enabled,
+             and reset on #qurt_profile_reset_idle_pcycles.
+
+  @param[out] pcycles User array where the function stores the current idle cycle count values.
+                      The array size should be at least the number of hardware threads intended.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void qurt_profile_get_idle_pcycles (unsigned long long *pcycles);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_idle_pcycles2
+  Gets the current idle processor cycle counts for the maximum available hardware threads.
+
+  This operation accepts a pointer to a user-defined array with its length in bytes, and writes
+  to the array the current idle cycle count for each hardware thread.
+
+  Each count value represents the number of processor cycles that have elapsed on the
+  corresponding hardware thread while that thread has been in Wait mode.\n
+
+  @note1hang This operation does not return the idle cycles that occur when the Hexagon
+             processor shuts down (due to all of the hardware threads being idle).
+             Idle cycle counts accumulate irrespective of the profiling enable status,
+             and reset on #qurt_profile_reset_idle_pcycles.
+
+  @param[out] pcycles User array where the function stores the current idle cycle count values.
+                      The array size should be equivalent to the number of hardware threads intended.
+                      Call #qurt_sysenv_get_max_hw_threads to determine the array size required.
+
+  @param[in] length_in_bytes Length of the pcycles array in bytes. If the array is smaller
+                             than required for the maximum available hardware threads,
+                             an error code is returned.
+
+  @return
+  #QURT_EOK -- Successful operation. All data was stored to the destination array. \n
+  #QURT_EFAILED -- Operation failed because the #pcycles array is too small.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+int qurt_profile_get_idle_pcycles2 (unsigned long long *pcycles, unsigned int length_in_bytes);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_threadid_pcycles
+
+  @deprecated Use #qurt_profile_get_threadid_pcycles2 instead.
+
+  Gets the current per-hardware-thread running cycle counts for the specified QuRT
+  thread for a maximum of 6 hardware threads.
+
+  Each count value represents the number of processor cycles that have elapsed on the
+  corresponding hardware thread while that thread has been scheduled for the specified
+  QuRT thread.
+
+  @note1hang Profiling must be enabled first to start cycle counting.
+             Cycles accumulate once profiling is enabled, and reset on
+             #qurt_profile_reset_threadid_pcycles.
+
+  @param[in]  thread_id Valid thread identifier.
+  @param[out] pcycles   Pointer to a user array where the function stores the current running
+                        cycle count values. The array size should be at least the number of
+                        hardware threads intended.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void qurt_profile_get_threadid_pcycles (int thread_id, unsigned long long *pcycles);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_threadid_pcycles2
+
+  Gets the current per-hardware-thread running cycle counts for the specified QuRT
+  thread for the maximum available hardware threads.
+
+  Each count value represents the number of processor cycles that have elapsed on the
+  corresponding hardware thread while that thread has been scheduled for the specified
+  QuRT thread.
+
+  @note1hang Profiling must be enabled first to start cycle counting.
+             Cycles accumulate once profiling is enabled, and reset on
+             #qurt_profile_reset_threadid_pcycles.
+
+  @param[in]  thread_id       Thread identifier.
+  @param[out] pcycles         Pointer to a user array where the function stores the current running
+                              cycle count values. The array size should be equivalent to the number of
+                              hardware threads intended.
+                              Call #qurt_sysenv_get_max_hw_threads to determine the array size required.
+  @param[in]  length_in_bytes Length of the pcycles array in bytes. If the array is smaller
+                              than required for the maximum available hardware threads,
+                              an error code is returned.
+
+  @return
+  #QURT_EOK -- Successful operation. All data was stored to the destination array. \n
+  #QURT_EFAILED -- Operation failed because the #pcycles array is too small. \n
+  #QURT_ENOTHREAD -- Operation failed due to an invalid #thread_id.
+
+  @dependencies
+  None.
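+
+  @par Example
+  Informative sketch using a worst-case array sized with #QURT_MAX_HTHREAD_LIMIT;
+  tid is an illustrative, already-existing thread identifier.
+
+  @code
+  // Informative sketch only.
+  unsigned long long pcycles[QURT_MAX_HTHREAD_LIMIT] = {0};
+  int rc = qurt_profile_get_threadid_pcycles2 (tid, pcycles,
+                                               sizeof (pcycles));
+  if (rc == QURT_EOK) {
+      // pcycles[i] holds the cycles thread tid ran on hardware thread i.
+  }
+  @endcode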
+*/
+/* ======================================================================*/
+int qurt_profile_get_threadid_pcycles2 (int thread_id, unsigned long long *pcycles, unsigned int length_in_bytes);
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_reset_threadid_pcycles
+  @xreflabel{hdr:qurt_profile_reset_threadid_pcycles}
+  Sets the per-hardware-thread running cycle counts to zero for the specified QuRT thread.
+
+  @param[in] thread_id Thread identifier.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void qurt_profile_reset_threadid_pcycles (int thread_id);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_enable
+  @xreflabel{hdr:qurt_profile_enable}
+  Enables profiling.\n
+  Enables or disables cycle counting of the running and idle processor cycles.
+  Profiling is disabled by default. \n
+
+  @note1hang Enabling profiling does not automatically reset the cycle counts -- this must be
+             done explicitly by calling the reset operations before starting cycle counting.
+             Cycle counting starts from the instant profiling is enabled using this API, and
+             halts when profiling is disabled.
+
+  @param[in] enable Profiling. Values: \n
+                    - 0 -- Disable profiling \n
+                    - 1 -- Enable profiling @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void qurt_profile_enable (int enable);
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_hthread_pcycles
+  @xreflabel{hdr:qurt_get_hthread_pcycles}
+  Reads the GCYCLE_nT register to allow performance measurement when n threads are in run mode.\n
+
+  @note1hang Returns 0 when the architecture is earlier than v67 or for an invalid hardware thread ID.
+
+  @param[in] n Threads in run mode. Valid values are 1 through the number of hardware threads.
+
+  @return
+  Value read from the GCYCLE_nT register. This value indicates the total number of pcycles executed
+  from reset to the current point of execution when n threads are in run mode.
+
+  @dependencies
+  PMU must be enabled.
+*/
+/* ======================================================================*/
+unsigned int qurt_get_hthread_pcycles(int n);
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_hthread_commits
+  @xreflabel{hdr:qurt_get_hthread_commits}
+  Reads the GCOMMIT_nT register to allow performance measurement when n threads are in run mode.\n
+
+  @note1hang Returns 0 when the architecture is earlier than v67 or for an invalid hardware thread ID.
+
+  @param[in] n Threads in run mode. Valid values: 1 through the number of hardware threads.
+
+  @return
+  Value read from the GCOMMIT_nT register. This value indicates the total number of packets
+  committed from reset to the current point of execution when n threads are in run mode.
+
+  @dependencies
+  PMU must be enabled.
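+
+  @par Example
+  Informative sketch combining this function with qurt_get_hthread_pcycles()
+  to estimate packets committed per cycle while n threads are in run mode;
+  the choice n = 2 is arbitrary.
+
+  @code
+  // Informative sketch only; assumes the PMU is enabled.
+  unsigned int pcyc = qurt_get_hthread_pcycles (2);
+  unsigned int pkts = qurt_get_hthread_commits (2);
+  double ppc = (pcyc != 0U) ? ((double) pkts / (double) pcyc) : 0.0;
+  @endcode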
+*/
+/* ======================================================================*/
+unsigned int qurt_get_hthread_commits(int n);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_devtree.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_devtree.h
new file mode 100755
index 0000000000000..4adee45bb44a2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_devtree.h
@@ -0,0 +1,161 @@
+#ifndef QURT_DEVTREE_H
+#define QURT_DEVTREE_H
+/**
+  @file qurt_devtree.h
+  @brief Prototypes and structures for device tree aware QuRT library functions.
+
+Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+*/
+/* qurt_callback is included by qurt_qdi_driver.h and depends on NULL being defined.
+   The callback is not used here, so define NULL here to avoid including the world. */
+#ifndef NULL
+#define NULL ((void *) 0)
+#endif
+
+#include "libfdt.h"
+#include "DTBExtnLib.h"
+#include "qurt_qdi_ext.h"
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define INVALID_BLOB_ID (-1)
+#define DEFAULT_BLOB_ID 0
+
+/** QuRT device tree mapping macros. */
+#define QURT_DT_MAPPING_FAILED (-1)
+#define QURT_DT_FLAG_ISLAND    0x1
+#define QURT_DT_FLAG_PHYSADDR  0x2
+
+/** Device tree type for the root PD device tree.
+    The root PD device tree typically describes the hardware in the subsystem.
+    This is the /soc portion of the device tree. */
+#define QURT_DT_BLOB_TYPE_ROOT 0
+
+/** Device tree type for the local device tree.
+    The local device tree typically contains the software settings.
+    This is the /sw portion of the device tree. */
+#define QURT_DT_BLOB_TYPE_LOCAL 1
+
+int qurt_devtree_init(void);
+
+/**@ingroup func_qurt_dt_mapping_create
+  Creates a memory mapping from the specified property of the specified device
+  tree node. Returns virtual addresses and sizes.
+
+  @param[in]  devtreeNode Device tree node handle.
+  @param[in]  flags       Flags to configure memory. Overloaded as the property
+                          index if regionName is NULL.
+  @param[in]  regionName  Identifies the property to use for mapping, should
+                          resemble a region.
+  @param[in]  regionIdx   Index of the range to use within the property.
+  @param[out] vaddr       Return pointer for the virtual region address.
+  @param[out] size        Return pointer for the virtual region size.
+
+  @return
+  Result code indicating success or failure. \n
+*/
+int qurt_dt_mapping_create(fdt_node_handle *devtreeNode, int flags, char *regionName, int regionIdx,
+                           unsigned long long *vaddr, unsigned long long *size);
+
+/**@ingroup func_qurt_dt_mapping_create2
+
+  Creates a memory mapping from the specified property of the specified device
+  tree node.
+
+  Returns virtual addresses and sizes according to the architecture (that is, either 32-bit or 64-bit).
+
+  @param[in] devtreeNode  Device tree node.
+
+  @param[in] dt_map_flags Flags to configure the memory mapping; reserved for future use.
+                          (0) - Default value; assumes the details from the DT node are a
+                          physical address and size.
+                          QURT_DT_FLAG_ISLAND
+
+                          NOTE: The PA needs to be added to the corresponding island spec
+                          to create an island mapping.
+
+  @param[in] regionName   NULL, or the name of the index in the range to return; should
+                          resemble a region. For example, reg-names = "base", "rx", "tx";
+
+  @param[in] regionIdx    Index of the range to return. For example, reg = <0x1000 0x20>, <0x10000 0x100>, <0x18000 0x100 >;
+
+                          NOTE: If the client specifies both regionName and regionIdx,
+                          regionName takes precedence and the region index is ignored.
+
+  @param[in] dt_map_perm  Mapping access permissions (R/W):
+                          QURT_PERM_READ
+                          QURT_PERM_WRITE
+
+  @param[in] cache_attr   QuRT cache mode types:
+                          QURT_MEM_CACHE_DEVICE
+                          QURT_MEM_CACHE_WRITEBACK
+                          Other required cache type enums in qurt_types.h can also be passed.
+
+                          NOTE: There is no default value for the cache and permission attributes.
+                          The client must always pass one of the defined flags.
+
+  @param[out] vaddr Return pointer to the variable that holds the virtual address.
+  @param[out] size  Return pointer for the virtual region size.
+
+  @return
+  #QURT_EOK -- Success; the mapping was created properly.
+  #QURT_DT_MAPPING_FAILED -- Failed to create the mapping.
+  #QURT_EINVALID -- Mismatch in the architecture.
+
+  Otherwise, an FdtLib or third-party error code.
+
+*/
+int qurt_dt_mapping_create2(fdt_node_handle *devtreeNode, unsigned int dt_map_flags,
+                            char *regionName, int regionIdx, unsigned int dt_map_perm, int cache_attr, void **vaddr, size_t *size);
+
+/**@ingroup func_qurt_dt_isr_register
+  Device tree aware registration of an interrupt service routine (ISR) to an ISR thread.
+  The interrupt defined in the specified device tree node is enabled when this function returns success.
+
+  @datatypes
+  #qurt_thread_t \n
+  #fdt_node_handle
+
+  @param[in] dt_node       Device tree node that specifies the interrupt property.
+  @param[in] dt_int_index  Index of the specific interrupt to use within the device tree node structure.
+                           Specify either this or dt_int_name; use -1 if the name is used.
+  @param[in] dt_int_name   Name of the specific interrupt to use within the device tree node structure.
+                           Specify either this or dt_int_index; use NULL if the index is used.
+  @param[in] isr_thread_id ISR thread ID, returned from qurt_isr_create(), defined by qurt_isr_register2().
+  @param[in] prio          Priority of the ISR, defined by qurt_isr_register2().
+  @param[in] flags         Defines the ACK type. Values: \n
+                           #QURT_INT_NON_DELAYED_ACK - ISR is acknowledged by the interrupt handle routine
+                           in the kernel.
+                           #QURT_INT_DELAYED_ACK - Client chooses to acknowledge.
+                           Defined by qurt_isr_register2().
+  @param[in] isr           ISR with prototype void isr (void *arg, int int_num), defined by qurt_isr_register2().
+  @param[in] arg           First argument of the ISR when it is called to service the interrupt, defined by qurt_isr_register2().
+
+  @return
+  #QURT_EOK -- Successfully registered the ISR for the interrupt \n
+  #QURT_EINT -- Interrupt not configured \n
+  #QURT_EINVALID -- Invalid thread ID \n
+  #QURT_EDISABLED -- The feature is disabled \n
+  #QURT_EDUPLICATE -- Interrupt is already registered
+
+  @dependencies
+  Create the ISR thread with qurt_isr_create().
+  Complete the ISR registration with qurt_isr_register2().
+ */
+int qurt_dt_isr_register(fdt_node_handle *dt_node, int dt_int_index, char * dt_int_name, qurt_thread_t isr_thread_id,
+                         unsigned short prio, unsigned short flags, void (*isr) (void *, int), void *arg);
+
+/**@ingroup func_qurt_dt_blob_id_get
+  Returns the blob ID for the blob type passed.
+  The value returned from this API can be passed as the blob ID parameter to DTBExtnLib APIs.
+
+  @param[in] blob_type Blob type to look up.
+  @return Blob ID for the passed blob type.
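+
+  @par Example
+  Informative sketch of looking up the local (software) blob; what is then done
+  with the ID depends on the DTBExtnLib APIs and is only hinted at here.
+
+  @code
+  // Informative sketch only.
+  int blob_id = qurt_dt_blob_id_get (QURT_DT_BLOB_TYPE_LOCAL);
+  if (blob_id != INVALID_BLOB_ID) {
+      // Pass blob_id as the blob ID parameter to DTBExtnLib lookups.
+  }
+  @endcode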
+*/ +int qurt_dt_blob_id_get(unsigned int blob_type); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_ecc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_ecc.h new file mode 100755 index 0000000000000..09312684e99af --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_ecc.h @@ -0,0 +1,168 @@ +#ifndef QURT_ECC_H +#define QURT_ECC_H + + +/*===================================================================== + + @file qurt_ecc.h + @brief Prototypes of QuRT memory ECC API functions + + Copyright (c) 2018, 2020-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup exception_handling_types +@{ */ +// ECC memory definition +typedef enum { + QURT_ECC_MEM_L1_ICACHE = 0, /**< ECC memory L1 ICache. */ + QURT_ECC_MEM_L1_DCACHE = 1, /**< ECC memory L1 DCache.*/ + QURT_ECC_MEM_L2_CACHE = 2, /**< ECC memory L2 Cache.*/ + QURT_ECC_MEM_VTCM = 3 /**< ECC memory VTCM.*/ +} qurt_ecc_memory_t; +/** @} */ /* end_addtogroup exception_handling_types */ + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/** @addtogroup exception_handling_macros +@{ */ + +#define QURT_ECC_ERR_DETECTED_STATUS 0 /**< ECC error detected. */ +#define QURT_ECC_ERR_TYPE 1 /**< ECC error type.*/ +// ECC status type + +#define QURT_ECC_CORRECTABLE_COUNT (1<<0) /**< ECC correctable count.*/ +#define QURT_ECC_UNCORRECTABLE_COUNT (1<<1) /**< ECC uncorrectable count.*/ +#define QURT_ECC_REGION_LOGGING (1<<2) /**< ECC region logging.*/ +// ECC enable/disable definition + +#define QURT_ECC_PROTECTION_DISABLE (0<<0) /**< Bit 0. */ +#define QURT_ECC_PROTECTION_ENABLE (1<<0) /**< Bit 0. */ +/** @} */ /* end_addtogroup exception_handling_macros */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + + +/**@ingroup func_qurt_ecc_enable + Enables or disables ECC protection on a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values: + - #QURT_ECC_MEM_L1_ICACHE + - #QURT_ECC_MEM_L1_DCACHE + - #QURT_ECC_MEM_L2_CACHE + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] enable Set to one of the following values: + - #QURT_ECC_PROTECTION_ENABLE + - #QURT_ECC_PROTECTION_DISABLE @tablebulletend + + @return + - #QURT_EOK -- ECC enabling or disabling setup is performed successfully + - Others -- Failure + + @dependencies + None. + */ +int qurt_ecc_enable( qurt_ecc_memory_t memory, unsigned int enable ); + + +/**@ingroup func_qurt_ecc_get_error_status + Gets ECC error status for a specified memory. 
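+
+  @par Example
+  Informative sketch of enabling VTCM ECC protection and then polling the
+  detection status with this function.
+
+  @code
+  // Informative sketch only.
+  if (qurt_ecc_enable (QURT_ECC_MEM_VTCM, QURT_ECC_PROTECTION_ENABLE)
+          == QURT_EOK) {
+      int detected = qurt_ecc_get_error_status (QURT_ECC_MEM_VTCM,
+                                                QURT_ECC_ERR_DETECTED_STATUS);
+      // detected: 0 = no error, 1 = at least one error detected.
+  }
+  @endcode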
+
+  @datatypes
+  #qurt_ecc_memory_t
+
+  @param[in] memory Set to one of the following:
+                    - #QURT_ECC_MEM_L1_ICACHE
+                    - #QURT_ECC_MEM_L1_DCACHE
+                    - #QURT_ECC_MEM_L2_CACHE
+                    - #QURT_ECC_MEM_VTCM @tablebulletend
+
+  @param[in] type Set to one of the following:
+                  - #QURT_ECC_ERR_DETECTED_STATUS
+                  - #QURT_ECC_ERR_TYPE @tablebulletend
+
+  @return
+  Returns the following when the type is #QURT_ECC_ERR_DETECTED_STATUS:
+  - 0 -- No error detected \n
+  - 1 -- At least one error detected \n
+  Returns the following when the type is #QURT_ECC_ERR_TYPE: \n
+  - 0 through 1 -- Correctable error \n
+  - 2 -- Uncorrectable error
+
+  @dependencies
+  None.
+ */
+int qurt_ecc_get_error_status( qurt_ecc_memory_t memory, unsigned int type );
+
+
+/**@ingroup func_qurt_ecc_get_error_count
+  Gets the ECC error count for a specified memory.
+
+  @datatypes
+  #qurt_ecc_memory_t
+
+  @param[in] memory Set to one of the following values:\n
+                    - #QURT_ECC_MEM_L1_ICACHE \n
+                    - #QURT_ECC_MEM_L1_DCACHE \n
+                    - #QURT_ECC_MEM_L2_CACHE \n
+                    - #QURT_ECC_MEM_VTCM @tablebulletend
+
+  @param[in] type Set to one of the following values: \n
+                  - #QURT_ECC_CORRECTABLE_COUNT \n
+                  - #QURT_ECC_UNCORRECTABLE_COUNT @tablebulletend
+
+  @return
+  Error count for the specified error type.
+
+  @dependencies
+  None.
+ */
+int qurt_ecc_get_error_count( qurt_ecc_memory_t memory, unsigned int type );
+
+
+/**@ingroup func_qurt_ecc_clear_error_count
+  Clears the ECC error count or region logging for a specified memory.
+
+  @datatypes
+  #qurt_ecc_memory_t
+
+  @param[in] memory Set to one of the following values: \n
+                    - #QURT_ECC_MEM_L1_ICACHE \n
+                    - #QURT_ECC_MEM_L1_DCACHE \n
+                    - #QURT_ECC_MEM_L2_CACHE \n
+                    - #QURT_ECC_MEM_VTCM @tablebulletend
+
+  @param[in] type Set to one of the following values, or multiple values OR'ed together: \n
+                  - #QURT_ECC_CORRECTABLE_COUNT \n
+                  - #QURT_ECC_UNCORRECTABLE_COUNT \n
+                  - #QURT_ECC_REGION_LOGGING @tablebulletend
+
+  @return
+  #QURT_EOK -- Error count successfully cleared \n
+  Others -- Failure to clear the error count
+
+  @dependencies
+  None.
+ */
+int qurt_ecc_clear_error_count( qurt_ecc_memory_t memory, unsigned int type );
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ECC_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_error.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_error.h
new file mode 100755
index 0000000000000..f4666b396c378
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_error.h
@@ -0,0 +1,149 @@
+#ifndef QURT_ERROR_H
+#define QURT_ERROR_H
+
+/**
+  @file qurt_error.h
+  Error results -- QuRT defines a set of standard symbols for error result values. This file lists the
+  symbols and their corresponding values.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021-2022, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_except.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup chapter_error
+@{ */
+
+/*=====================================================================
+Constants and macros
+======================================================================*/
+#define QURT_EOK  0 /**< Operation successfully performed. */
+#define QURT_EVAL 1 /**< Wrong values for the parameters. The specified page does not exist.
*/ +#define QURT_EMEM 2 /**< Not enough memory to perform the operation.*/ + +#define QURT_EINVALID 4 /**< Invalid argument value; invalid key. */ +/** @cond */ +#define QURT_EUNKNOWN 6 /**< Defined but never used in QuRT. */ +#define QURT_ENOMSGS 7 /**< Message queue is empty. */ +#define QURT_EBADF 9 /**< Bad message queue descriptor. */ +/** @endcond */ +#define QURT_EFAILED 12 /**< Operation failed. */ + +#define QURT_ENOTALLOWED 13 /**< Operation not allowed. */ + +/** @cond */ +#define QURT_EDUPCLSID 14 /*< Duplicate class ID. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_ENOREGISTERED 20 /**< No registered interrupts.*/ +/** @endcond */ + + +/** @cond */ +#define QURT_EISDB 21 /*< Power collapse failed due to ISDB being enabled. */ +#define QURT_ESTM 22 /*< Power collapse failed in a Single-threaded mode check. */ +/** @endcond */ + + +/** @cond rest_reg_dist */ +#define QURT_ETLSAVAIL 23 /**< No free TLS key is available. */ +#define QURT_ETLSENTRY 24 /**< TLS key is not already free. */ +/** @endcond */ + +#define QURT_EINT 26 /**< Invalid interrupt number (not registered). */ +/** @cond rest_reg_dist */ +#define QURT_ESIG 27 /**< Invalid signal bitmask (cannot set more than one signal at a time). */ +/** @endcond */ + +/** @cond */ +#define QURT_EHEAP 28 /**< No heap space is available. */ +#define QURT_ENOSPC 28 /**< No space to create another queue in the system. */ +#define QURT_EMEMMAP 29 /**< Physical address layout is not supported by the kernel. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_ENOTHREAD 30 /**< Thread no longer exists. */ +/** @endcond */ +/** @cond */ +#define QURT_EL2CACHE 31 /**< L2cachable is not supported in kernel invalidate/cleaninv. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_EALIGN 32 /**< Not aligned. */ +#define QURT_EDEREGISTERED 33 /**< Interrupt is already deregistered.*/ +/** @endcond */ + +/** @cond internal_only */ + +#define QURT_ETLBCREATESIZE 34 /**< TLB create error -- Incorrect size.*/ +#define QURT_ETLBCREATEUNALIGNED 35 /**< TLB create error -- Unaligned address.*/ +/** @endcond */ +/** @cond rest_reg_dist*/ +#define QURT_EEXISTS 35 /**< File or message queue already exists. */ +#define QURT_ENAMETOOLONG 36 /**< Name too long for message queue creation. */ +#define QURT_EPRIVILEGE 36 /**< Caller does not have privilege for this operation.*/ + +#define QURT_ECANCEL 37 /**< A cancellable request was canceled because the associated process was asked to exit.*/ +/** @endcond */ + +/** @cond */ +#define QURT_EISLANDTRAP 38 /*< Unsupported TRAP is called in Island mode.*/ + +#define QURT_ERMUTEXUNLOCKNONHOLDER 39 /*< Rmutex unlock by a non-holder.*/ +#define QURT_ERMUTEXUNLOCKFATAL 40 /*< Rmutex unlock error, all except the non-holder error.*/ +#define QURT_EMUTEXUNLOCKNONHOLDER 41 /*< Mutex unlock by a non-holder.*/ +#define QURT_EMUTEXUNLOCKFATAL 42 /*< Mutex unlock error, all except the non-holder error.*/ +#define QURT_EINVALIDPOWERCOLLAPSE 43 /*< Invalid power collapse mode requested. */ +/** @endcond */ +#define QURT_EISLANDUSEREXIT 44 /**< User call has resulted in island exit.*/ +#define QURT_ENOISLANDENTRY 45 /**< Island mode had not yet been entered.*/ +#define QURT_EISLANDINVALIDINT 46 /**< Exited Island mode due to an invalid island interrupt.*/ +/** @cond rest_reg_dist */ +#define QURT_ETIMEDOUT 47 /**< Operation timed-out. */ +#define QURT_EALREADY 48 /**< Operation already in progress. */ +/** @endcond */ + +#define QURT_ERETRY 49 /*< Retry the operation. 
*/ +#define QURT_EDISABLED 50 /*< Resource disabled. */ +#define QURT_EDUPLICATE 51 /*< Duplicate resource. */ +#define QURT_EBADR 53 /*< Invalid request descriptor. */ +#define QURT_ETLB 54 /*< Exceeded maximum allowed TLBs. */ +#define QURT_ENOTSUPPORTED 55 /*< Operation not supported. */ +/** @cond rest_reg_dist */ +#define QURT_ENORESOURCE 56 /**< No resource. */ +/** @endcond */ + +#define QURT_EDTINIT 57 /**< Problem with device tree intialization. */ +#define QURT_EBUFLOCK 58 /*< Buffer lock failed because it was already locked many times. */ +#define QURT_ELOCKED 59 /**< Current operation failed as the buffer is locked. */ +#define QURT_EMSGSIZE 90 /*< Message queue msg_len is greater than mq_msgsize attribute of the message queue. */ + + +#define QURT_ENOTCONFIGURED 91 /*< Interrupt is NOT configured. */ + +#define QURT_EBANDWIDTHLIMIT 92 /*< Message queue send exceed the bandwidth limit. */ + +#define QURT_ECFIVIOLATION 93 /*< CFI violation detected. */ + +#define QURT_EDESTROY 94 /**< A destroy request was made to waiting threads.*/ + +#define QURT_EHMXNOTAVAIL 95 /**< HMX is not available to target thread.*/ +#define QURT_EHMXNOTDETACHABLE 96 /**< HMX is not detachable from target thread.*/ + +#define QURT_EFATAL -1 /**< Fatal error. */ + +/** @} */ /* end_addtogroup chapter_error */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ERROR_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_event.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_event.h new file mode 100755 index 0000000000000..987f0fe79f227 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_event.h @@ -0,0 +1,452 @@ +#ifndef QURT_EVENT_H +#define QURT_EVENT_H +/** + @file qurt_event.h + @brief Prototypes of kernel event API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2018-2021, 2023 Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include "qurt_consts.h" +#include "qurt_thread.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * System environment object type. + */ +/**@addtogroup sys_env_types +@{ */ +/** QuRT swap pool information type. */ +typedef struct qurt_sysenv_swap_pools { + /** @cond */ + unsigned int spoolsize; /* Swap pool size.*/ + unsigned int spooladdr; /* Swap pool start address.*/ + /** @endcond */ +}qurt_sysenv_swap_pools_t; + +/**QuRT application heap information type. */ +typedef struct qurt_sysenv_app_heap { + /** @cond */ + unsigned int heap_base; /* Heap base address.*/ + unsigned int heap_limit; /* Heap end address.*/ + /** @endcond */ +} qurt_sysenv_app_heap_t ; + +/** QuRT architecture version information type. */ +typedef struct qurt_sysenv_arch_version { + /** @cond */ + unsigned int arch_version; /*Architecture version.*/ + /** @endcond */ +}qurt_arch_version_t; + +/** QuRT maximum hardware threads information type. */ +typedef struct qurt_sysenv_max_hthreads { + /** @cond */ + unsigned int max_hthreads; /*Maximum number of hardware threads.*/ + /** @endcond */ +}qurt_sysenv_max_hthreads_t; + +/** QuRT active hardware threads information type. 
*/ +typedef struct qurt_sysenv_hthreads { + /** @cond */ + unsigned int hthreads; /*Maximum number of hardware threads.*/ + /** @endcond */ +}qurt_sysenv_hthreads_t; + +/** QuRT maximum pi priority information type. */ +typedef struct qurt_sysenv_max_pi_prio { + /** @cond */ + unsigned int max_pi_prio; /*Maximum pi priority.*/ + /** @endcond */ +}qurt_sysenv_max_pi_prio_t; + +/** QuRT process name information type. */ +typedef struct qurt_sysenv_procname { + /** @cond */ + union { + unsigned int asid; /*Address space ID.*/ + unsigned int pid; /*Process ID.*/ + }; + char name[QURT_MAX_NAME_LEN]; /* Process name.*/ + /** @endcond */ +}qurt_sysenv_procname_t; + +/** QuRT stack profile count information type. */ +typedef struct qurt_sysenv_stack_profile_count { + /** @cond */ + unsigned int count; /*Stack profile count for usage.*/ + unsigned int count_watermark; /*Stack profile count for watermark.*/ + /** @endcond */ +}qurt_sysenv_stack_profile_count_t; + +/** + QuRT system error event type. + */ +typedef struct _qurt_sysevent_error_t +{ + unsigned int thread_id; /**< Thread ID. */ + unsigned int fault_pc; /**< Fault PC. */ + unsigned int sp; /**< Stack pointer. */ + unsigned int badva; /**< Virtual data address where the exception occurred. */ + unsigned int cause; /**< QuRT error result. */ + unsigned int ssr; /**< Supervisor status register. */ + unsigned int fp; /**< Frame pointer. */ + unsigned int lr; /**< Link register. */ + unsigned int pid; /**< PID of the process to which this thread belongs.*/ + } qurt_sysevent_error_t ; + +typedef struct _qurt_sysevent_error_1_t +{ + unsigned int thread_id; /**< Thread ID. */ + unsigned int fault_pc; /**< Fault PC. */ + unsigned int sp; /**< Stack pointer. */ + unsigned int badva; /**< Virtual data address where the exception occurred. */ + unsigned int cause; /**< QuRT error result. */ + unsigned int ssr; /**< Supervisor status register. */ + unsigned int fp; /**< Frame pointer. */ + unsigned int lr; /**< Link register. */ + unsigned int pid; /**< PID of the process to which this thread belongs.*/ + unsigned int fkey; /**< Framekey.*/ + unsigned int reserved1; /**< Reserved.*/ + unsigned int reserved2; /**< Reserved.*/ + unsigned int reserved3; /**< Reserved.*/ + } qurt_sysevent_error_1_t ; + +/** QuRT page fault error event information type. */ +typedef struct qurt_sysevent_pagefault { + qurt_thread_t thread_id; /**< Thread ID of the page fault thread. */ + unsigned int fault_addr; /**< Accessed address that caused the page fault. */ + unsigned int ssr_cause; /**< SSR cause code for the page fault. */ +} qurt_sysevent_pagefault_t ; +/** @} */ /* @endaddtogroup sys_env_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/*======================================================================*/ +/** + Gets the environment swap pool 0 information from the kernel. + + @datatypes + #qurt_sysenv_swap_pools_t + + @param[out] pools Pointer to the pools information. + + @return + #QURT_EOK -- Success. + + @dependencies + None. +*/ +int qurt_sysenv_get_swap_spool0 (qurt_sysenv_swap_pools_t *pools ); + +/* + Gets the environment swap pool 1 information from the kernel. + + @datatypes + #qurt_sysenv_swap_pools_t + + @param[out] pools Pointer to the pools information. + + @return + #QURT_EOK -- Success. + + @dependencies + None. 
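+
+   A minimal calling sketch (illustrative only -- the error-handling
+   pattern is an assumption, not mandated by this header):
+
+   @code
+   qurt_sysenv_swap_pools_t pools;
+   if (qurt_sysenv_get_swap_spool1(&pools) == QURT_EOK) {
+       // pools.spooladdr and pools.spoolsize describe swap pool 1.
+   }
+   @endcode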
+*/ +int qurt_sysenv_get_swap_spool1(qurt_sysenv_swap_pools_t *pools ); + +/**@ingroup func_qurt_sysenv_get_app_heap + Gets information on the program heap from the kernel. + + @datatypes + #qurt_sysenv_app_heap_t + + @param[out] aheap Pointer to information on the program heap. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_app_heap(qurt_sysenv_app_heap_t *aheap ); + +/**@ingroup func_qurt_sysenv_get_arch_version + Gets the Hexagon processor architecture version from the kernel. + + @datatypes + #qurt_arch_version_t + + @param[out] vers Pointer to the Hexagon processor architecture version. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter + + @dependencies + None. +*/ +int qurt_sysenv_get_arch_version(qurt_arch_version_t *vers); + +/**@ingroup func_qurt_sysenv_get_max_hw_threads + Gets the maximum number of hardware threads supported in the Hexagon processor. + The API includes the disabled hardware threads to reflect the maximum + hardware thread count. + For example, if the image is configured for four hardware threads and hthread_mask is set to 0x5 in + cust_config.xml, only HW0 and HW2 are initialized by QuRT. + HW1 and HW3 are not used at all. Under such a scenario, + qurt_sysenv_get_max_hw_threads() still returns four. + + @datatypes + #qurt_sysenv_max_hthreads_t + + @param[out] mhwt Pointer to the maximum number of hardware threads supported in the Hexagon processor. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_max_hw_threads(qurt_sysenv_max_hthreads_t *mhwt ); + +/**@ingroup func_qurt_sysenv_get_hw_threads + Gets the number of hardware threads initialized by QuRT in Hexagon processor. + For example, if the image is configured for four hardware threads and hthread_mask is set to 0x5 in + cust_config.xml, QuRT only initializes HW0 and HW2. + HW1 and HW3 are not used. In this scenario, qurt_sysenv_get_hw_threads() returns 2. + + @datatypes + #qurt_sysenv_hthreads_t + + @param[out] mhwt Pointer to the number of hardware threads active in the Hexagon processor. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_hw_threads(qurt_sysenv_hthreads_t *mhwt ); + +/**@ingroup func_qurt_sysenv_get_max_pi_prio + Gets the maximum priority inheritance mutex priority from the kernel. + + @datatypes + #qurt_sysenv_max_pi_prio_t + + @param[out] mpip Pointer to the maximum priority inheritance mutex priority. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_max_pi_prio(qurt_sysenv_max_pi_prio_t *mpip ); + +/**@ingroup func_qurt_sysenv_get_process_name2 + Gets information on the system environment process names based on the client_handle argument. + + @datatypes + #qurt_sysenv_procname_t + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[out] pname Pointer to information on the process names in the system. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_process_name2(int client_handle, qurt_sysenv_procname_t *pname ); + +/**@ingroup func_qurt_sysenv_get_process_name + Gets information on the system environment process names from the kernel. 
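+
+   A hedged usage sketch (the logging call is illustrative and assumes a
+   printf-style facility is available):
+
+   @code
+   qurt_sysenv_procname_t pn;
+   if (qurt_sysenv_get_process_name(&pn) == QURT_EOK) {
+       printf("process %s (pid %u)\n", pn.name, pn.pid);
+   }
+   @endcode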
+ + @datatypes + #qurt_sysenv_procname_t + + @param[out] pname Pointer to information on the process names in the system. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_process_name(qurt_sysenv_procname_t *pname ); + +/**@ingroup func_qurt_sysenv_get_stack_profile_count + Gets information on the stack profile count from the kernel. + + @datatypes + #qurt_sysenv_stack_profile_count_t + + @param[out] count Pointer to information on the stack profile count. + + @return + #QURT_EOK -- Success. + + @dependencies + None. +*/ +int qurt_sysenv_get_stack_profile_count(qurt_sysenv_stack_profile_count_t *count ); + +/**@ingroup func_qurt_exception_wait + Registers the program exception handler. + This function assigns the current thread as the QuRT program exception handler and suspends the + thread until a program exception occurs. + + When a program exception occurs, the thread is awakened with error information + assigned to the parameters of this operation. + + @note1hang If no program exception handler is registered, or if the registered handler + calls exit, QuRT raises a kernel exception. + If a thread runs in Supervisor mode, any errors are treated as kernel + exceptions. + + @param[out] ip Pointer to the instruction memory address where the exception occurred. + @param[out] sp Stack pointer. + @param[out] badva Pointer to the virtual data address where the exception occurred. + @param[out] cause Pointer to the QuRT error result code. + + @return + Registry status: \n + Thread identifier -- Handler successfully registered. \n + #QURT_EFATAL -- Registration failed. + + @dependencies + None. +*/ +unsigned int qurt_exception_wait (unsigned int *ip, unsigned int *sp, + unsigned int *badva, unsigned int *cause); + +unsigned int qurt_exception_wait_ext (qurt_sysevent_error_t * sys_err); + +/**@ingroup func_qurt_exception_wait3 + Registers the current thread as the QuRT program exception handler, and suspends the thread until a + program exception occurs. + When a program exception occurs, the thread is awakened with error information assigned to the specified + error event record. + If a program exception is raised when no handler is registered (or when a handler is registered, but it calls + exit), the exception is treated as fatal.\n + @note1hang If a thread runs in Monitor mode, all exceptions are treated as kernel exceptions.\n + @note1cont This function differs from qurt_exception_wait() by returning the error information in a data + structure rather than as individual variables. It also returns additional information (for example, SSR, FP, and LR). + + @param[out] sys_err Pointer to the qurt_sysevent_error_1_t type structure. + @param[in] sys_err_size Size of the qurt_sysevent_error_1_t structure. + + @return + Registry status: \n + - #QURT_EFATAL -- Failure. \n + - Thread ID -- Success. + + @dependencies + None. +*/ + +unsigned int qurt_exception_wait3(void * sys_err, unsigned int sys_err_size); + +/**@ingroup func_qurt_exception_raise_nonfatal + Raises a nonfatal program exception in the QuRT program system. + + For more information on program exceptions, see Section @xref{dox:exception_handling}. + + This operation never returns -- the program exception handler is assumed to perform all + exception handling before terminating or reloading the QuRT program system. + + @note1hang The C library function abort() calls this operation to indicate software + errors. 
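+
+   A minimal sketch (software_fault_detected() is a hypothetical helper;
+   the error value is an arbitrary illustration):
+
+   @code
+   if (software_fault_detected()) {
+       qurt_exception_raise_nonfatal(QURT_EFAILED);  // never returns
+   }
+   @endcode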
+ + @param[in] error QuRT error result code (Section @xref{dox:error_results}). + + @return + Integer -- Unused. + + @dependencies + None. +*/ +int qurt_exception_raise_nonfatal (int error) __attribute__((noreturn)); + + +/**@ingroup func_qurt_exception_raise_fatal + Raises a fatal program exception in the QuRT system. + + Fatal program exceptions terminate the execution of the QuRT system without invoking + the program exception handler. + + For more information on fatal program exceptions, see Section @xref{dox:exception_handling}. + + This operation always returns, so the calling program can perform the necessary shutdown + operations (data logging, on so on). + + @note1hang Context switches do not work after this operation has been called. + + @return + None. + + @dependencies + None. +*/ +void qurt_exception_raise_fatal (void); + +unsigned int qurt_enable_floating_point_exception(unsigned int mask); + +/**@ingroup func_qurt_exception_enable_fp_exceptions + Enables the specified floating point exceptions as QuRT program exceptions. + + The exceptions are enabled by setting the corresponding bits in the Hexagon + control user status register (USR). + + The mask argument specifies a mask value identifying the individual floating + point exceptions to set. The exceptions are represented as defined symbols + that map into bits 0 through 31 of the 32-bit flag value. + Multiple floating point exceptions are specified by OR'ing together the individual + exception symbols.\n + @note1hang This function must be called before performing any floating point operations. + + @param[in] mask Floating point exception types. Values: \n + - #QURT_FP_EXCEPTION_ALL \n + - #QURT_FP_EXCEPTION_INEXACT \n + - #QURT_FP_EXCEPTION_UNDERFLOW \n + - #QURT_FP_EXCEPTION_OVERFLOW \n + - #QURT_FP_EXCEPTION_DIVIDE0 \n + - #QURT_FP_EXCEPTION_INVALID @tablebulletend + + @return + Updated contents of the USR. + + @dependencies + None. +*/ + +static inline unsigned int qurt_exception_enable_fp_exceptions(unsigned int mask) +{ + return qurt_enable_floating_point_exception(mask); +} + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_EVENT_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_except.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_except.h new file mode 100755 index 0000000000000..e1684c80e3d50 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_except.h @@ -0,0 +1,185 @@ +#ifndef QURT_EXCEPT_H +#define QURT_EXCEPT_H + +/** + @file qurt_except.h + @brief Defines Cause and Cause2 codes for error-handling. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021-2022 by Qualcomm Technologies, Inc. All Rights Reserved. + + Confidential and Proprietary - Qualcomm Technologies, Inc.. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + QuRT supports error handling to handle CPU detected exceptions and software errors. + QuRT treats all errors as either fatal errors or nonfatal errors. + + @section sec1 Fatal errors + All supervisor mode exceptions are treated as fatal errors. + If a registered exception handler calls qurt_exit(), it is treated as a fatal error. + Fatal errors result in saving the context of primary hardware thread to QURT_error_info and the rest of the thread contexts to the corresponding TCBs. 
+ All hardware threads are eventually stopped and the cache is flushed.
+ The NMI exception is treated a little differently from other fatal errors: QuRT saves the contexts of all the hardware threads into QURT_error_info.\n
+
+ @subsection subsection1 Debugging fatal errors
+ - QURT_error_info.status.status -- Indicates that an error occurred.
+ - QURT_error_info.status.cause -- Cause code for the fatal error; Cause and Cause2 details are listed below.
+ - QURT_error_info.status.cause2 -- Cause2 code for the fatal error; Cause and Cause2 details are listed below.
+ - QURT_error_info.status.fatal -- Indicates whether a fatal error occurred. A user error can result in a fatal error if no exception handler is registered.
+ - QURT_error_info.status.hw_tnum -- Indicates the index of QURT_error_info.locregs[], where the context is saved when the error is a fatal error.
+ - QURT_error_info.global_regs -- Contains the values of the global registers of Q6.
+ - QURT_error_info.local_regs[QURT_error_info.status.hw_tnum] -- Provides the CPU context when the error is a supervisor error.
+
+ @subsection subsection2 Debugging nonfatal errors
+ - QURT_error_info.user_errors -- All user errors are logged here.
+ - QURT_error_info.user_errors.counter -- Index of the last logged error.
+ - QURT_error_info.user_errors.entry[0...counter] -- Structure for a logged error.
+ - QURT_error_info.user_errors.entry[0...counter].error_tcb -- TCB for the user error.
+ - QURT_error_info.user_errors.entry[0...counter].error_tcb.error -- Information about the error: Cause, Cause2, Badva, and hardware thread ID.
+ - QURT_error_info.user_errors.entry[0...counter].error_code -- ((cause2 << 8) OR (cause)); Cause and Cause2 details are listed below.
+ - QURT_error_info.user_errors.entry[0...counter].hw_thread -- Hardware thread ID for the error.
+ - QURT_error_info.user_errors.entry[0...counter].pcycle -- Pcycle count for the error.
+
+@note
+ Important usage note:
+ Cause and Cause2 are error codes that distinguish multiple errors.
+ SSR and BADVA are inconclusive without the vector number.
+ Cause and Cause2 values can each range from 1 to 255, so every cause can have up to 255 error codes,
+ and the system can have up to 255 * 255 unique error codes.
+ The combination is represented as ((cause2 << 8) OR (cause)).
+ Some Cause2 codes are statically defined, whereas others are obtained from the SSR[7:0] cause codes, depending on the cause code.
+ SSR cause codes are defined in the Hexagon reference manual.
+ All possible combinations are listed below.
+*/
+/** @addtogroup chapter_error
+@{ */
+/* cause - error type - 8 bits */
+#define QURT_EXCEPT_PRECISE             0x01U /**< Precise exception occurred. For this cause code, Cause2 is SSR[7:0].*/
+#define QURT_EXCEPT_NMI                 0x02U /**< NMI occurred; Cause2 is not defined. */
+#define QURT_EXCEPT_TLBMISS             0x03U /**< TLBMISS RW occurred; for this cause code, Cause2 is SSR[7:0]. */
+#define QURT_EXCEPT_RSVD_VECTOR         0x04U /**< Interrupt raised on a reserved vector, which must never occur. Cause2 is not defined. */
+#define QURT_EXCEPT_ASSERT              0x05U /**< Kernel assert. The Cause2 QURT_ABORT_* values are listed below. */
+#define QURT_EXCEPT_BADTRAP             0x06U /**< trap0(num) called with an unsupported num. Cause2 is 0. */
+#define QURT_EXCEPT_UNDEF_TRAP1         0x07U /**< Trap1 is not supported. Using Trap1 causes this error. Cause2 is not defined. */
+#define QURT_EXCEPT_EXIT                0x08U /**< Application called qurt_exit() or qurt_exception_raise_nonfatal(). Can be called from the C library.
Cause2 is "[Argument passed to qurt_exception_raise_nonfatal() & 0xFF]". */ +#define QURT_EXCEPT_TLBMISS_X 0x0AU /**< TLBMISS X (execution) occurred. Cause2 is not defined. */ +#define QURT_EXCEPT_STOPPED 0x0BU /**< Running thread stopped due to fatal error on other hardware thread. Cause2 is not defined. */ +#define QURT_EXCEPT_FATAL_EXIT 0x0CU /**< Application called qurt_fatal_exit(). Cause2 is not defined. */ +#define QURT_EXCEPT_INVALID_INT 0x0DU /**< Kernel received an invalid L1 interrupt. Cause2 is not defined. */ +#define QURT_EXCEPT_FLOATING_POINT 0x0EU /**< Kernel received an floating point error. Cause2 is not defined. */ +#define QURT_EXCEPT_DBG_SINGLE_STEP 0x0FU /**< Cause2 is not defined. */ +#define QURT_EXCEPT_TLBMISS_RW_ISLAND 0x10U /**< Read write miss in Island mode. Cause2 QURT_TLB_MISS_RW_MEM* are listed below. */ +#define QURT_EXCEPT_TLBMISS_X_ISLAND 0x11U /**< Execute miss in Island mode. For this cause code, Cause2 is SSR[7:0]. */ +#define QURT_EXCEPT_SYNTHETIC_FAULT 0x12U /**< Synthetic fault with user request that kernel detected. Cause2 QURT_SYNTH_* are listed below. */ +#define QURT_EXCEPT_INVALID_ISLAND_TRAP 0x13U /**< Invalid trap in Island mode. Cause2 is trap number. */ +#define QURT_EXCEPT_UNDEF_TRAP0 0x14U /**< trap0(num) was called with unsupported num. Cause2 is trap number. */ +#define QURT_EXCEPT_PRECISE_DMA_ERROR 0x28U /**< Precise DMA error. Cause2 is DM4[15:8]. Badva is DM5 register. */ + +#define QURT_ECODE_UPPER_LIBC (0U << 16) /**< Upper 16 bits is 0 for libc. */ +#define QURT_ECODE_UPPER_QURT (0U << 16) /**< Upper 16 bits is 0 for QuRT. */ +#define QURT_ECODE_UPPER_ERR_SERVICES (2U << 16) /**< Upper 16 bits is 2 for error service. */ +/** @cond */ +#define QURT_ECODE_ISLAND_INVALID_QDI 3U /**< Passing invalid QDI method in island. */ +/** @endcond */ + +/* Cause2 for QURT_EXCEPT_SYNTHETIC_FAULT cause- 8bits */ +#define QURT_SYNTH_ERR 0x01U /**< */ +#define QURT_SYNTH_INVALID_OP 0x02U /**< */ +#define QURT_SYNTH_DATA_ALIGNMENT_FAULT 0x03U /**< */ +#define QURT_SYNTH_FUTEX_INUSE 0x04U /**< */ +#define QURT_SYNTH_FUTEX_BOGUS 0x05U /**< */ +#define QURT_SYNTH_FUTEX_ISLAND 0x06U /**< */ +#define QURT_SYNTH_FUTEX_DESTROYED 0x07U /**< */ +#define QURT_SYNTH_PRIVILEGE_ERR 0x08U /**< */ + +/* Cause2 - Abort cause reason - 8 bits */ +/* ERR_ASSERT cause */ +#define QURT_ABORT_FUTEX_WAKE_MULTIPLE 0x01U /**< Abort cause - futex wake multiple. */ +#define QURT_ABORT_WAIT_WAKEUP_SINGLE_MODE 0x02U /**< Abort cause - thread waiting to wake up in Single Threaded mode. */ +#define QURT_ABORT_TCXO_SHUTDOWN_NOEXIT 0x03U /**< Abort cause - call TCXO shutdown without exit. */ +#define QURT_ABORT_FUTEX_ALLOC_QUEUE_FAIL 0x04U /**< Abort cause - futex allocation queue failure - QURTK_futexhash_lifo empty. */ +#define QURT_ABORT_INVALID_CALL_QURTK_WARM_INIT 0x05U /**< Abort cause - invalid call QURTK_warm_init() in NONE CONFIG_POWER_MGMT mode. */ +#define QURT_ABORT_THREAD_SCHEDULE_SANITY 0x06U /**< Abort cause - sanity schedule thread is not supposed to run on the current hardware thread. */ +#define QURT_ABORT_REMAP 0x07U /**< Remap in the page table; the correct behavior must remove mapping if necessary. */ +#define QURT_ABORT_NOMAP 0x08U /**< No mapping in page table when removing a user mapping. */ +#define QURT_ABORT_OUT_OF_SPACES 0x09U +#define QURT_ABORT_INVALID_MEM_MAPPING_TYPE 0x0AU /**< Invalid memory mapping type when creating qmemory. */ +#define QURT_ABORT_NOPOOL 0x0BU /**< No pool available to attach. 
*/ +#define QURT_ABORT_LIFO_REMOVE_NON_EXIST_ITEM 0x0CU /**< Cannot allocate more futex waiting queue. */ +#define QURT_ABORT_ARG_ERROR 0x0DU +#define QURT_ABORT_ASSERT 0x0EU /**< Assert abort. */ +#define QURT_ABORT_FATAL 0x0FU /**< Fatal error; must never occur. */ +#define QURT_ABORT_FUTEX_RESUME_INVALID_QUEUE 0x10U /**< Abort cause - invalid queue ID in futex resume. */ +#define QURT_ABORT_FUTEX_WAIT_INVALID_QUEUE 0x11U /**< Abort cause - invalid queue ID in futex wait. */ +#define QURT_ABORT_FUTEX_RESUME_INVALID_FUTEX 0x12U /**< Abort cause - invalid futex object in hashtable. */ +#define QURT_ABORT_NO_ERHNDLR 0x13U /**< No registered error handler. */ +#define QURT_ABORT_ERR_REAPER 0x14U /**< Exception in the reaper thread. */ +#define QURT_ABORT_FREEZE_UNKNOWN_CAUSE 0x15U /**< Abort in thread freeze operation. */ +#define QURT_ABORT_FUTEX_WAIT_WRITE_FAILURE 0x16U /**< During futex wait processing, could not perform a necessary write operation to userland data; most likely due to a DLPager eviction. */ +#define QURT_ABORT_ERR_ISLAND_EXP_HANDLER 0x17U /**< Exception in Island exception handler task. */ +#define QURT_ABORT_L2_TAG_DATA_CHECK_FAIL 0x18U /**< Detected error in L2 tag/data during warm boot. The L2 tag/data check is done when CONFIG_DEBUG_L2_POWER_COLLAPSE is enabled. */ +#define QURT_ABORT_ERR_SECURE_PROCESS 0x19U /**< Abort error in secure process. */ +#define QURT_ABORT_ERR_EXP_HANDLER 0x20U /**< No exception handler, or the handler caused an exception. */ +#define QURT_ABORT_ERR_NO_PCB 0x21U /**< PCB of the thread context failed initialization, PCB was NULL. */ +#define QURT_ABORT_NO_PHYS_ADDR 0x22U /**< Unable to find the physical address for the virtual address. */ +#define QURT_ABORT_OUT_OF_FASTINT_CONTEXTS 0x23U /**< Fast interrupt contexts exhausted. */ +#define QURT_ABORT_CLADE_ERR 0x24U /**< Fatal error seen with CLADE interrupt. */ +#define QURT_ABORT_ETM_ERR 0x25U /**< Fatal error seen with ETM interrupt. */ +#define QURT_ABORT_ECC_DED_ASSERT 0x26U /**< ECC two-bit DED error. */ +#define QURT_ABORT_VTLB_ERR 0x27U /**< Fatal error in the VTLB layer. */ +#define QURT_ABORT_TLB_ENCODE_DECODE_FAILURE 0x28U /**< Failure during the TLB encode or decode operation. */ +#define QURT_ABORT_VTLB_WALKOBJS_BOUND_FAILURE 0x29U /**< Failure to lookup entry in the page table. */ +#define QURT_ABORT_PHY_MEMORY_OWNERSHIP_FAILURE 0x30U /**< Failure to claim phy memory ownership. */ +#define QURT_ABORT_JTLB_SIZE_CHECK_FAIL 0x31U /**< JTLB size configured is more than actual size in hardware */ +#define QURT_ABORT_AUTOSTACK_ASSERT 0x32U /**< Error while handling stack flimit exception. */ + +/* Cause2 - TLB-miss_X - 8bits */ +#define QURT_TLB_MISS_X_FETCH_PC_PAGE 0x60U /**< */ +#define QURT_TLB_MISS_X_2ND_PAGE 0x61U /**< */ +#define QURT_TLB_MISS_X_ICINVA 0x62U /**< */ + +/* Cause2 - TLB-miss_RW - 8bits */ +#define QURT_TLB_MISS_RW_MEM_READ 0x70U /**< */ +#define QURT_TLB_MISS_RW_MEM_WRITE 0x71U /**< */ + +/** @cond rest_reg_dist */ +/* Cause2 - Floating point exception - 8 bits */ +#define QURT_FLOATING_POINT_EXEC_ERR 0xBFU /**< Execute floating-point. 
*/ +/** @endcond */ + +/** Cause2 - autostackv2 - 8 bits */ +#define QURT_AUTOSTACKV2_CANARY_NOT_MATCH 0xC1U +#define QURT_AUTOSTACKV2_POOL_IDX_OFF_RANGE 0xC2U + +/** Cause2 - CFI violation - 8 bits */ +#define QURT_CFI_VIOLATION 0xC3U + +/** @cond rest_reg_dist*/ +/* Enable floating point exceptions */ +#define QURT_FP_EXCEPTION_ALL 0x1FU << 25 /**< */ +#define QURT_FP_EXCEPTION_INEXACT 0x1U << 29 /**< */ +#define QURT_FP_EXCEPTION_UNDERFLOW 0x1U << 28 /**< */ +#define QURT_FP_EXCEPTION_OVERFLOW 0x1U << 27 /**< */ +#define QURT_FP_EXCEPTION_DIVIDE0 0x1U << 26 /**< */ +#define QURT_FP_EXCEPTION_INVALID 0x1U << 25 /**< */ + +/** @endcond */ +/** @} */ /* end_addtogroup chapter_error */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_EXCEPT_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_fastint.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_fastint.h new file mode 100755 index 0000000000000..ea65dc0917fc0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_fastint.h @@ -0,0 +1,71 @@ +#ifndef QURT_FASTINT_H +#define QURT_FASTINT_H + +/** + @file qurt_fastint.h + @brief QuRT fast interrupt functions + + Copyright (c) 2013-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + + ======================================================================*/ + +/*======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_fastint_register + Register fast interrupt callback function + + Fast interrupt callback should be designed to perform the minimal necessary + actions for the interrupt, and/or perform some operations, such as signaling + another regular software thread to start any additional processing. + The callback should be a fast and short function. When a fast interrupt callback + is running, the corresponding interrupt cannot be re-enabled until the callback + returns. + + The fast interrupt callback must not use any system blocking calls, such as + mutex lock or signal wait. Otherwise, it results in errors. + + The fast interrupt callback function has a single integer argument and the + function ends with no return. The argument value passed in is the interrupt + number, and therefore a single callback function can handle + multiple fast interrupts. + + @param[in] intno Interrupt number to register. + @param[in] fn Interrupt callback function. + + @return + #QURT_EOK -- Fast interrupt registration is successful. \n + #QURT_EINVALID -- Interrupt is already registered. \n + #QURT_EINT -- Invalid interrupt number. +*/ +/* ======================================================================*/ +unsigned int qurt_fastint_register(int intno, void (*fn)(int)); + + +/*======================================================================*/ +/**@ingroup func_qurt_fastint_deregister + Deregisters the fast interrupt callback function. + + @param[in] intno Level-one interrupt number to deregister. Valid range is 1 and 10 through 31 + (simulator only). + + @return + #QURT_EOK -- Interrupt deregistration is successful. \n + #QURT_EINT -- Invalid interrupt number (not registered). \n + #QURT_EINVALID -- Invalid interrupt number (already deregistered). + + @dependencies + None. 
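+
+   A minimal register/deregister sketch (the interrupt number 23 and the
+   empty callback body are assumptions for illustration):
+
+   @code
+   static void my_fastint_cb(int intno)
+   {
+       // Keep this short, e.g., signal a regular IST here.
+       (void)intno;
+   }
+
+   // At init time:
+   qurt_fastint_register(23, my_fastint_cb);
+   // At teardown:
+   qurt_fastint_deregister(23);
+   @endcode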
+*/
+/* ======================================================================*/
+unsigned int qurt_fastint_deregister(int intno);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FASTINT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_fs_hub.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_fs_hub.h
new file mode 100755
index 0000000000000..aaa050a6c838b
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_fs_hub.h
@@ -0,0 +1,58 @@
+#ifndef QURT_FS_HUB_H
+#define QURT_FS_HUB_H
+
+/**
+  @file qurt_fs_hub.h
+  @brief Definitions, macros, and prototypes used when writing a
+  QDI driver that provides file-system functionality.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+  This structure tracks a file designator for an FS-hub QDI driver.
+  A file system's QDI interface should use this object to encapsulate
+  the true file descriptor and return a QDI handle. The file-system hub
+  then uses this QDI handle as the file descriptor.
+ */
+typedef struct qurt_qdi_fs_obj
+{
+    qurt_qdi_obj_t qdi_obj;
+    int client_handle;
+    int fd;
+} qurt_qdi_fs_obj_t;
+
+
+/**@ingroup fs_hub_support_functions
+  Allows a file system to register its QDI interface with the file-system hub.
+  Once registered, all file open operations for any filename containing the
+  mount point are forwarded to the QDI interface.
+
+  The mount point string must be enclosed in forward slashes, for example, "/mountpoint/".
+
+  @param mtpoint  Mount point for the file system being registered.
+  @param opener   Opener structure for the QDI driver interface.
+
+  @return
+  #QURT_EOK -- Successfully registered the QDI driver with the file-system hub. \n
+  Negative error code -- Failed to register with the file-system hub.
+ */
+int qurt_fs_hub_mtpoint_register(const char *mtpoint, qurt_qdi_obj_t *opener);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FS_HUB_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_futex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_futex.h
new file mode 100755
index 0000000000000..1fdcc79a43f01
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_futex.h
@@ -0,0 +1,82 @@
+#ifndef QURT_FUTEX_H
+#define QURT_FUTEX_H
+/**
+  @file qurt_futex.h
+
+  @brief Prototypes of QuRT futex API functions
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2020-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+
+/**@ingroup func_qurt_futex_wait
+  Moves the caller thread into the waiting state when a memory object
+  address contains a value equal to the specified value.
+
+  @param[in]  lock  Pointer to the object memory.
+  @param[in]  val   Value to check against the object content.
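+
+  A hedged wait/wake sketch (the flag protocol is an assumption; real
+  synchronization primitives add atomic updates and retry loops):
+
+  @code
+  static int flag = 0;
+
+  // Waiter: blocks while flag still reads 0.
+  qurt_futex_wait(&flag, 0);
+
+  // Waker (another thread): publish the change, then wake one waiter.
+  flag = 1;
+  qurt_futex_wake(&flag, 1);
+  @endcode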
+
+  @return
+  #QURT_EOK -- Success \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wait(void *lock, int val);
+
+
+/**@ingroup func_qurt_futex_wait_cancellable
+  If a memory object address contains a value that is the same as the
+  specified value, moves the caller thread into the waiting state.
+  The kernel can cancel the waiting state when necessary.
+
+  @param[in]  lock  Pointer to the object memory.
+  @param[in]  val   Value to check against the object content.
+
+  @return
+  #QURT_EOK -- Success \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wait_cancellable(void *lock, int val);
+
+
+/**@ingroup func_qurt_futex_wake
+  Wakes up a specified number of threads that have been waiting
+  for the object to change via qurt_futex_wait().
+
+  @param[in]  lock        Pointer to the object memory.
+  @param[in]  n_to_wake   Maximum number of threads to wake up.
+
+  @return
+  Number of threads woken up by this function.
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wake(void *lock, int n_to_wake);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FUTEX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_hmx.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_hmx.h
new file mode 100755
index 0000000000000..e4037dbeae514
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_hmx.h
@@ -0,0 +1,226 @@
+#ifndef QURT_HMX_H
+#define QURT_HMX_H
+/**
+  @file qurt_hmx.h
+  @brief Prototypes of the QuRT HMX API.
+
+Copyright (c) 2019-2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ TYPEDEFS
+=============================================================================*/
+
+
+/** @addtogroup hmx_types
+@{ */
+/* HMX locking type */
+#define QURT_HMX_NON_SHARED_LOCK    0U  /**< HMX locking type.*/
+#define QURT_HMX_SHARED_LOCK        1U  /**< HMX locking type.*/
+
+/* HMX unlocking type */
+#define QURT_HMX_NON_SHARED_UNLOCK  0U  /**< HMX unlocking type.*/
+#define QURT_HMX_SHARED_UNLOCK      1U  /**< HMX unlocking type.*/
+
+/* HMX hardware context */
+#define QURT_HMX_UNIT_0             0U  /**< HMX hardware context #0 */
+#define QURT_HMX_UNIT_1             1U  /**< HMX hardware context #1 */
+/** @} */ /* end_addtogroup hmx_types */
+
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+
+/**@ingroup func_qurt_hmx_lock2
+  Locks a HMX unit with the specified locking type.
+
+  #QURT_HMX_NON_SHARED_LOCK:
+  - If a HMX unit is available, lock the unit and return success of #QURT_EOK.
+  - If the HMX unit is already locked by another thread, the caller thread is suspended
+    until the HMX is available and gets locked by this function.
+  - If no HMX hardware is supported, return #QURT_EVAL.
+
+  #QURT_HMX_SHARED_LOCK:
+  - If a HMX unit is available, enable HMX access for the caller thread, and return
+    success of #QURT_EOK.
+  - If the HMX is already enabled on the caller thread, return #QURT_EFAILED.
+ - If the HMX is locked by another thread in the same user process of the caller + thread with locking type of #QURT_HMX_SHARED_LOCK, enable HMX access for the caller + thread, and return success of #QURT_EOK. + - If the HMX is locked by another thread in the same user process of the caller + thread with locking type of #QURT_HMX_NON_SHARED_LOCK, return #QURT_EFAILED. + - If the HMX is locked by a thread from another user process different from the + user process of the caller thread, return #QURT_EFAILED. + - If there is no HMX hardware supported, return #QURT_EVAL. + + @param[in] type Locking type. + + @return + #QURT_EOK -- HMX lock successful.\n + #QURT_EFAILED -- Failure due to wrong locking condition.\n + #QURT_EVAL -- Failure because no HMX hardware is supported. + + @dependencies + None. + + */ +int qurt_hmx_lock2(unsigned int type); + + +/**@ingroup func_qurt_hmx_unlock2 + Unlocks a HMX unit with the unlocking type. + + #QURT_HMX_NON_SHARED_UNLOCK: + - If there is a HMX unit locked by the caller thread, unlock the HMX unit and clear the + HMX accumulators (assuming a fixed point type). + - If there is no HMX unit locked by the caller thread, return #QURT_EFAILED. + - If there is no HMX hardware supported, return #QURT_EVAL. + + #QURT_HMX_SHARED_UNLOCK: + - If the caller thread has locked HMX with type #QURT_HMX_SHARED_LOCK, disable the + HMX access on the caller thread, and return success of #QURT_EOK. + Note: If the caller thread is the last thread that unlocks for #QURT_HMX_SHARED_LOCK + in its user process, the unlock function clears the HMX accumulators. + - If the caller thread has locked HMX with type #QURT_HMX_NON_SHARED_LOCK, return + failure of #QURT_EFAILED. + - If the caller thread has not locked HMX, return failure of #QURT_EFAILED. + - If there is no HMX hardware supported, returns #QURT_EVAL. + + @param[in] type Locking type. + + @return + #QURT_EOK -- HMX is unlocked successful. \n + #QURT_EFAILED -- Failure due to wrong unlocking condition. \n + #QURT_EVAL -- Failure because no HMX hardware is supported. + + @dependencies + None. + + */ +int qurt_hmx_unlock2(unsigned int type); + + +/**@ingroup func_qurt_hmx_lock + Locks a HMX unit. + If a HMX unit is available, this function locks the unit and returns right away. + If there is no HMX unit available, the caller is blocked until a HMX is available + and is locked by the function. + + @return + #QURT_EOK -- HMX lock successful. \n + #QURT_EFAILED -- Failure due to wrong locking condition. \n + #QURT_EVAL -- Failure because no HMX hardware is supported. + + @dependencies + None. + */ +int qurt_hmx_lock(void); + + +/**@ingroup func_qurt_hmx_unlock + Unlocks a HMX unit. + If a HMX unit is locked by the caller thread, unlock the HMX unit and clear its + accumulators(assuming fixed point type). + If there is no HMX unit locked by the caller thread, return failure. + + @return + #QURT_EOK -- HMX unlock successful. \n + #QURT_EFAILED -- Failure due to wrong unlocking condition. \n + #QURT_EVAL -- Failure because no HMX hardware is supported. + + @dependencies + None. + */ +int qurt_hmx_unlock(void); + + +/**@ingroup func_qurt_hmx_try_lock + Tries to lock a HMX unit. + If a HMX unit is available, this function locks the unit and returns right away; + if there is no HMX unit available, the function returns failure without blocking the caller. + + @return + #QURT_EOK -- HMX lock successful \n + #QURT_EFAILED -- Failure due to wrong locking condition.\n + #QURT_EVAL -- Failure because no HMX hardware is supported. 
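+
+  A minimal non-blocking sketch (the fallback path is an assumption):
+
+  @code
+  if (qurt_hmx_try_lock() == QURT_EOK) {
+      // ... issue HMX work here ...
+      qurt_hmx_unlock();
+  } else {
+      // HMX busy or absent; fall back to a non-HMX path.
+  }
+  @endcode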
+ + @dependencies + None. + */ +int qurt_hmx_try_lock(void); + + +/**@ingroup func_qurt_hmx_assign + Assign a HMX unit to a target thread specified by its thread identifier. + The HMX unit (HMX hardware context) is specified by hmx_unit. + The caller of this function is limited to the SRM process. + If the requested hmx_unit is already assigned to another thread with QURT_HMX_NON_SHARED_LOCK, + kernel will detach it from the thread, and re-assign it to the target thread. + If the target thread has HVX enabled, it cannot have HMX enabled. + + Locking type + #QURT_HMX_NON_SHARED_LOCK: + - If the HMX unit is available, lock the HMX unit and return success of #QURT_EOK. + - If the HMX unit is already enabled on the target thread, return #QURT_EOK. + - If the HMX unit is already locked by another thread, detach the HMX from the thread. + Re-assign the HMX unit to the target thread, and return #QURT_EOK. + + @param[in] thread_id Thread identifier + @param[in] type Locking type + #QURT_HMX_NON_SHARED_LOCK -- non-shared lock + @param[in] hmx_unit HMX hardware context number + #QURT_HMX_UNIT_0 + #QURT_HMX_UNIT_1 + + @return + #QURT_EOK -- The HMX is assigned successfully. This includes the case that \n + the target thread already has HMX assigned. \n + #QURT_EFAILED -- Failure due to wrong assigning conditions. \n + #QURT_EINVALID -- Failure because no HMX hardware is supported. + + @dependencies + None. + */ +int qurt_hmx_assign ( unsigned int thread_id, unsigned int type, unsigned int hmx_unit ); + + +/**@ingroup func_qurt_hmx_release + Release a HMX unit from a target thread specified by its thread identifier. + The HMX unit (HMX hardware context) is specified by hmx_unit. + The caller of this function is limited to the SRM process. + + Qurt detaches the specified HMX unit from the target thread, and return success of + #QURT_EOK. If the HMX unit is already released from the target thread, return #QURT_EOK. + + @param[in] thread_id Thread identifier + @param[in] hmx_unit HMX hardware context number + #QURT_HMX_UNIT_0 + #QURT_HMX_UNIT_1 + + @return + #QURT_EOK -- The HMX is released successfully. This includes the case that \n + the target thread already has the HMX released. \n + #QURT_EFAILED -- Failure due to wrong assigning condition. \n + #QURT_EINVALID -- Failure because no HMX hardware is supported. + + @dependencies + None. + */ +int qurt_hmx_release ( unsigned int thread_id, unsigned int hmx_unit ); + + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_HMX_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_hvx.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_hvx.h new file mode 100755 index 0000000000000..13c213d49ac84 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_hvx.h @@ -0,0 +1,421 @@ +#ifndef QURT_HVX_H +#define QURT_HVX_H +/** + @file qurt_hvx.h + @brief Prototypes of QuRT HVX API. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021-2022 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @cond */ + +typedef enum { + QURT_HVX_MODE_64B = 0, /**< HVX mode of 64 bytes */ + QURT_HVX_MODE_128B = 1 /**< HVX mode of 128 bytes */ +} qurt_hvx_mode_t; +/** @endcond */ +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/** @cond internal_only*/ +/** @addtogroup hvx_macros +@{ */ +#define QURT_HVX_HW_UNITS_2X128B_4X64B 0x00000204 /**< Bits 15 through 8 are for the number of 128B units. */ + /**< Bits 7 through 0 are for the number of 64B units. */ +#define QURT_HVX_HW_UNITS_4X128B_0X64B 0x00000400 +#define QURT_HVX_HW_UNITS_6X128B_0X64B 0x00000600 + +/* HVX locking status */ + +#define QURT_HVX_UNLOCKED (0) /* Has not locked HVX unit */ +#define QURT_HVX_LOCKED (1) /* Has locked HVX unit */ +#define QURT_HVX_ERROR (-1) /* Error, no HVX support */ + +/* Input value for HVX reservation */ + +#define QURT_HVX_RESERVE_ALL (4) /* All the HVX units in terms of 64B_MODE are requested to be reserved */ +#define QURT_HVX_RESERVE_ALL_AVAILABLE (0xff) /* All remaining unlocked HVX units in terms of 64B_MODE are requested to be reserved */ + +/* Return values for HVX reservation */ + +#define QURT_HVX_RESERVE_NOT_SUPPORTED (-1) /* There is no HVX hardware, or less units in the hardware than requested */ +#define QURT_HVX_RESERVE_NOT_SUCCESSFUL (-2) /* Some HVX units are already locked/reserved by other PD, thus not enough units left for the reservation. */ +#define QURT_HVX_RESERVE_ALREADY_MADE (-3) /* There is already a HVX reservation made. */ +#define QURT_HVX_RESERVE_CANCEL_ERR (-4) /* The action of cancling the reservation fails because this protection domain has no reservation made before. */ + +// HVX set requests + +#define QURT_HVX_64B 0 /**< */ +#define QURT_HVX_128B 1 /**< */ +#define QURT_HVX_NO_USE 2 /**< */ +#define QURT_HVX_RELEASE_CONTEXT 3 /**< */ +#define QURT_HVX_IMMEDIATE_USE 4 /**< */ + +// HVX set masks + +#define QURT_HVX_64B_PREFERRED (1<<(QURT_HVX_64B + 8))/**< */ +#define QURT_HVX_128B_PREFERRED (1<<(QURT_HVX_128B + 8))/**< */ +#define QURT_HVX_64B_ACCEPTABLE (1<<(QURT_HVX_64B + 12))/**< */ +#define QURT_HVX_128B_ACCEPTABLE (1<<(QURT_HVX_128B + 12))/**< */ + +// HVX set return "result" + +#define QURT_EOK 0 /**< */ +#define QURT_HVX_SET_ERROR 0xFF /**< */ + +// hvx_mode_assigned for QURT_HVX_IMMEDIATE_USE +#define QURT_HVX_64B_ASSIGNED (1<<(QURT_HVX_64B + 8)) /**< */ +#define QURT_HVX_128B_ASSIGNED (1<<(QURT_HVX_128B + 8)) /**< */ + +// Sizes of HVX dump buffer + +#define QURT_HVX_V65_64B_VSIZE 2084U /**< 64 x 32 + 8 x 4 + 4 (version). */ +#define QURT_HVX_V65_128B_VSIZE 4164U /**< 128 x 32 + 16 x 4 + 4 (version). */ +#define QURT_HVX_V66_128B_VSIZE 4420U /**< 128 x (32 +2) + 16 x 4 + 4 (version). */ +#define QURT_HVX_V68_128B_VSIZE 4164U /**< 128 x 32 + 16 x 4 + 4 (version). */ +#define QURT_HVX_V79_128B_VSIZE 4740U /**< 128 x (32+4+1) + 4 (version). 
*/ +#define QURT_HVX_VREG_BUF_SIZE QURT_HVX_V79_128B_VSIZE /**< */ + +// HVX dump versions + +#define QURT_HVX_DUMP_V65_64B 1U /**< */ +#define QURT_HVX_DUMP_V65_128B 2U /**< */ +#define QURT_HVX_DUMP_V66_128B 3U /**< */ +#define QURT_HVX_DUMP_V68_128B 4U /**< */ +#define QURT_HVX_DUMP_V79_128B 5U /**< */ +/** @} */ /* end_addtogroup hvx_macros */ +/** @endcond */ +/** @cond */ +// Qurt data struct for hvx_set input +typedef struct qurt_hvx_set_struct_ { + unsigned char set_req; // LSB + struct { + unsigned char preferred_mask:4; + unsigned char acceptable_mask:4; + }; + unsigned short resvd; // MSB +} qurt_hvx_set_struct_t; // 4 bytes + + +// Qurt data struct for hvx_set return +typedef struct qurt_hvx_set_return_str_ { + unsigned char result; // LSB + unsigned char hvx_mode_assigned; + unsigned short resvd; // MSB +} qurt_hvx_set_return_struct_t; // 4 bytes +/** @endcond */ + + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_hvx_lock + Locks one HVX unit specified by the HVX mode. + + @note1hang Input variable can be 128B_MODE or 64B_MODE. If an HVX unit in this mode + is available, this function locks the unit and returns right away. + If the current HVX mode is different from the requested mode, the current + thread is blocked. When all HVX units become idle, QuRT changes + the mode, locks the HVX unit, and returns. + + Starting from Q6v65 with HVX context switch support, qurt_hvx_lock() is + mapped as qurt_hvx_set(64_BYTE or 128_BYTE). + + @datatypes + #qurt_mode_t + + @param[in] lock_mode #QURT_HVX_MODE_64B or #QURT_HVX_MODE_128B. + + @return + #QURT_EOK -- Success \n + Other value -- Failure + + @dependencies + None. + + */ +int qurt_hvx_lock(qurt_hvx_mode_t lock_mode); + +/**@ingroup func_qurt_hvx_unlock + Unlocks the HVX unit held by this software thread. + + @note1hang Starting from Q6v65 with HVX context switch support, qurt_hvx_unlock() + maps as qurt_hvx_set(QURT_HVX_RELEASE_CONTEXT). + + @return + #QURT_EOK -- Successful return \n + Other values -- Failure + + @dependencies + None. + + */ +int qurt_hvx_unlock(void); + +/**@ingroup func_qurt_hvx_try_lock + Tries to lock one HVX unit specified by the HVX mode. + + @note1hang Input variable can be 128B_MODE or 64B_MODE. If an HVX unit in this mode + is available, this function locks the unit and returns #QURT_EOK; Otherwise, + the function returns a failure, but does not block the current software + thread to wait for the HVX unit. + Starting from Q6v65 with HVX context switch support, qurt_hvx_try_lock() + maps to qurt_hvx_set(FOR_IMMEDIATE_USE| preferred_mask | acceptable_mask); + + @datatypes + #qurt_mode_t + + @return + #QURT_EOK -- Successful return \n + Other values -- Failure + + @dependencies + None. + + */ +int qurt_hvx_try_lock(qurt_hvx_mode_t lock_mode); + +/**@ingroup func_qurt_hvx_get_mode + Gets the current HVX mode configured by QuRT. + + @note1hang Returns #QURT_HVX_MODE_128B or #QURT_HVX_MODE_64B, based on + the current HVX configuration. + + @param[out] + None. + + @return + #QURT_HVX_MODE_128B \n + #QURT_HVX_MODE_64B \n + -1 -- Not available. + + @dependencies + None. + */ +int qurt_hvx_get_mode(void); + + +/**@ingroup func_qurt_hvx_get_units + Gets the HVX hardware configuration that the chipset supports. + + @note1hang The function returns the HVX hardware configuration supported by the chipset. 
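+
+  A hedged decoding sketch, assuming the bit layout noted at
+  #QURT_HVX_HW_UNITS_2X128B_4X64B (bits 15:8 are 128-byte units,
+  bits 7:0 are 64-byte units):
+
+  @code
+  int units = qurt_hvx_get_units();
+  unsigned num_128b = ((unsigned)units >> 8) & 0xFFU;
+  unsigned num_64b  = (unsigned)units & 0xFFU;
+  @endcode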
+ + @return + Bitmask of the units: 1X64, 2X64, 4X64, 1X128, 2X128, and so on.\n + - QURT_HVX_HW_UNITS_2X126B_4X64B -- V60, V62, or V65 HVX \n + - QURT_HVX_HW_UNITS_4X128B_0X64B -- V66 CDSP or newer \n + - 0 -- not available + + @dependencies + None. + + */ +int qurt_hvx_get_units(void); + + +/**@ingroup func_qurt_hvx_reserve + Reserves HVX units in terms of 64-byte mode for the protection domain (PD) of the caller. + + @note1hang Only one HVX reservation in the system is supported. + If one HVX unit is already locked by the application in the same PD, the unit is + added to the returned count as one reserved unit for the PD. + Starting from Q6v65 with HVX context switch support, qurt_hvx_reserve() + only does basic sanity checks on HVX units. + + @datatypes + None. + + @param[in] num_units Number of HVX units in terms of 64B_MODE to reserve for the PD. + QURT_HVX_RESERVE_ALL to reserve all the HVX units. + QURT_HVX_RESERVE_ALL_AVAILABLE to reserve the remaining unlocked units. + + @return + Number of units successfully reserved, including the units already locked in the same PD. \n + #QURT_HVX_RESERVE_NOT_SUPPORTED \n + #QURT_HVX_RESERVE_NOT_SUCCESSFUL \n + #QURT_HVX_RESERVE_ALREADY_MADE + + + @dependencies + None. + + */ +int qurt_hvx_reserve(int num_units); + + +/**@ingroup func_qurt_hvx_cancel_reserve + Cancels the HVX reservation in the protection domain (PD) of the caller. + + @note1hang Only one HVX reservation in the system is supported. + + @return + 0 -- Success \n + #QURT_HVX_RESERVE_CANCEL_ERR -- Failure + + @dependencies + None. + + */ +int qurt_hvx_cancel_reserve(void); + + +/**@ingroup func_qurt_hvx_get_lock_val + Gets the HVX locking status value of the thread of the caller. + + @note1hang Returns the status of whether the thread of the caller already locks a HVX unit or not. + + @datatypes + None. + + @return + #QURT_HVX_UNLOCKED \n + #QURT_HVX_LOCKED \n + #QURT_HVX_ERROR + + @dependencies + None. + */ +int qurt_hvx_get_lock_val(void); + +/** @cond internal_only*/ +/**@ingroup func_qurt_hvx_set + Sets the HVX configuration for the software thread of the caller. + + @datatypes + None. + + @param[in] input_arg Composed of set_request | hvx_preferred_mode_mask + | hvx_acceptable_mode_mask where set_request can be set to: \n + - #QURT_HVX_64B \n + - #QURT_HVX_128B \n + - #QURT_HVX_NO_USE \n + - #QURT_HVX_RELEASE_CONTEXT \n + - #QURT_HVX_IMMEDIATE_USE \n + When set_request is QURT_HVX_IMMEDIATE_USE, + hvx_preferred_mode_mask can be set to: \n + - #QURT_HVX_64B_PREFERRED \n + - #QURT_HVX_128B_PREFERRED + When set_request is QURT_HVX_IMMEDIATE_USE, + hvx_acceptable_mode_mask can be set to: \n + - #QURT_HVX_64B_ACCEPTABLE \n + - #QURT_HVX_128B_ACCEPTABLE @tablebulletend + + @return + Result of the HVX setting in the least significant 8 bits of the returned data. \n + #QURT_EOK -- 0 \n + #QURT_HVX_SET_ERROR -- 0xFF \n + When #QURT_HVX_IMMEDIATE_USE has a result of #QURT_EOK, + bit 8 to bit 15 of the returned data contain hvx_mode_assigned:\n + - #QURT_HVX_64B_ASSIGNED \n + - #QURT_HVX_128B_ASSIGNED + + @dependencies + None. + */ +unsigned int qurt_hvx_set(unsigned int input_arg); + + +/**@ingroup func_qurt_system_hvx_regs_get_maxsize + Returns the maximum buffer size for saving HVX registers. + + @datatypes + None. + + @return + 0 -- No HVX supported in the target. \n + #QURT_HVX_VREG_BUF_SIZE -- Maximum buffer size for saving HVX registers. + + @dependencies + None. 
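+
+  A sizing sketch (the early-out is an assumption; see
+  qurt_system_hvx_regs_get() below for the buffer alignment requirement):
+
+  @code
+  unsigned int max_size = qurt_system_hvx_regs_get_maxsize();
+  if (max_size == 0) {
+      // No HVX on this target; skip the register dump.
+  }
+  @endcode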
+ */
+unsigned int qurt_system_hvx_regs_get_maxsize(void);
+
+
+/**@ingroup func_qurt_system_hvx_regs_get_size
+  Returns the buffer size for saving HVX registers for a specified thread.
+
+  @param[in] thread_id  Thread ID of the target thread.
+
+  @return
+  0 -- No HVX assigned to the thread. \n
+  size -- Size of the buffer in bytes for saving HVX registers for the specified thread: \n
+  - #QURT_HVX_V65_64B_VSIZE -- 64 x 32 + 8 x 4 + 4 (version) \n
+  - #QURT_HVX_V65_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  - #QURT_HVX_V66_128B_VSIZE -- 128 x (32 + 2) + 16 x 4 + 4 (version) \n
+  - #QURT_HVX_V68_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  - #QURT_HVX_V79_128B_VSIZE -- 128 x (32 + 4 + 1) + 4 (version)
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_system_hvx_regs_get_size(unsigned int thread_id);
+
+
+/**@ingroup func_qurt_system_hvx_regs_get
+  Saves the HVX registers into the specified buffer and
+  returns the size of the data saved into the buffer.
+  After this function is called for the first time on a specified thread_id, the QuRT kernel
+  removes the internal HVX saving buffer from the specified thread; calling the function on
+  the same thread_id a second time returns 0.
+
+  @param[in] thread_id  Thread ID of the target thread.
+  @param[in] pBuf   Pointer to the buffer for HVX register saving.
+                    The first four bytes of the buffer are for saving the HVX version. HVX registers are saved from
+                    the fifth byte of the buffer. The address of the fifth byte should be 256-byte aligned.
+                    For example, a buffer can first be declared as: \n
+                    unsigned char vbuf[QURT_HVX_VREG_BUF_SIZE+256]; \n
+                    unsigned char *pBuf; \n
+                    and the buffer pointer then aligned with: \n
+                    pBuf = vbuf; \n
+                    pBuf += (256 - 4 - (unsigned)pBuf%256);
+  @param[in] size   Size of the buffer provided, pointed to by pBuf. The buffer size must not be smaller than the size
+                    returned from qurt_system_hvx_regs_get_size(), and pBuf must be aligned as described above.
+  @param[out] pBuf  Buffer returned with the saved HVX registers (unsigned char hvx_regs[];), which are saved from the fifth
+                    byte of the buffer, and the HVX version (unsigned int hvx_version;), whose first four bytes
+                    contain one of the HVX dump versions: \n
+                    - #QURT_HVX_DUMP_V65_64B \n
+                    - #QURT_HVX_DUMP_V65_128B \n
+                    - #QURT_HVX_DUMP_V66_128B \n
+                    - #QURT_HVX_DUMP_V68_128B \n
+                    - #QURT_HVX_DUMP_V79_128B \n
+                    @tablebulletend
+
+  @return
+  Total bytes of the data saved in the provided buffer. \n
+  0 -- No HVX assigned to the thread \n
+  #QURT_HVX_V65_64B_VSIZE -- 64 x 32 + 8 x 4 + 4 (version) \n
+  #QURT_HVX_V65_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V66_128B_VSIZE -- 128 x (32 + 2) + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V68_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V79_128B_VSIZE -- 128 x (32 + 4 + 1) + 4 (version)
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_system_hvx_regs_get(unsigned int thread_id, void *pBuf, size_t size);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_HVX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_int.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_int.h
new file mode 100755
index 0000000000000..386aeda1051eb
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_int.h
@@ -0,0 +1,509 @@
+#ifndef QURT_INT_H
+#define QURT_INT_H
+/**
+  @file qurt_int.h
+  @brief QuRT interrupt functions.
+ + + + Copyright (c) 2013-2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + + +/** @cond rest_reg_dist */ +/** @addtogroup interrupts_constants +@{ */ +#define SIG_INT_ABORT 0x80000000 /**< */ +#define QURT_INT_NON_DELAYED_ACK 0 +#define QURT_INT_DELAYED_ACK 1 +#define QURT_INT_ACK_DEFAULT QURT_INT_NON_DELAYED_ACK +#define QURT_INT_DRV_DEFAULT 0 +#define QURT_INT_PRIORITY_DEFAULT 0xFF + +/** QuRT interrupt property. */ +#define QURT_INT_CONFIGID_POLARITY 0x1U /**< */ +#define QURT_INT_CONFIGID_LOCK 0x2U /**< */ + +/** QuRT interrupt lock.*/ +#define QURT_INT_LOCK_DEFAULT 0x0 /**< Default. */ +#define QURT_INT_LOCK_DISABLE 0x0 /**< Interrupt can be enabled or disabled or deregistered. */ +#define QURT_INT_LOCK_ENABLE 0x1 /**< Interrupt is locked and cannot be enabled, disabled, or deregistered.*/ +/** @} */ /* end_addtogroup interrupts_constants */ + +/** @addtogroup Qurt_interrupt_type +@{ */ +/** Trigger type bit fields for a PDC interrupt:\n + @verbatim + Polarity Edge Output\n + 0 00 Level sensitive active low + 0 01 Rising edge sensitive + 0 10 Falling edge sensitive + 0 11 Dual edge sensitive + 1 00 Level sensitive active high + 1 01 Falling edge sensitive + 1 10 Rising edge sensitive + 1 11 Dual edge sensitive + @endverbatim +*/ +#define QURT_INT_TRIGGER_TYPE_SET(pol, edge) ((((pol) & 0x01U) << 2) | ((edge) & 0x03U)) /**< */ + +#define QURT_INT_TRIGGER_LEVEL_LOW QURT_INT_TRIGGER_TYPE_SET(0U, 0x00U) /**< */ +#define QURT_INT_TRIGGER_LEVEL_HIGH QURT_INT_TRIGGER_TYPE_SET(1U, 0x00U) /**< */ +#define QURT_INT_TRIGGER_RISING_EDGE QURT_INT_TRIGGER_TYPE_SET(1U, 0x02U) /**< */ +#define QURT_INT_TRIGGER_FALLING_EDGE QURT_INT_TRIGGER_TYPE_SET(0U, 0x02U) /**< */ +#define QURT_INT_TRIGGER_DUAL_EDGE QURT_INT_TRIGGER_TYPE_SET(0U, 0x03U) /**< */ +#define QURT_INT_TRIGGER_USE_DEFAULT 0xffU /**< */ +/** @} */ /* end_addtogroup Qurt_interrupt_type */ + +/*===================================================================== + Functions +======================================================================*/ + +/**@ingroup func_qurt_interrupt_register + @xreflabel{sec:interrupt_register} + Registers the interrupt.\n + Enables the specified interrupt and associates it with the specified QuRT signal object and + signal mask. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 indicates not to wait. + + When the interrupt occurs, the signal specified in the signal mask is set in the signal + object. An IST conventionally waits on that signal to + handle the interrupt. The thread that registers the interrupt is set as the IST. + + Up to 31 separate interrupts can be registered to a single signal object, as determined by + the number of individual signals the object can store. QuRT reserves signal 31. Thus a + single IST can handle several different interrupts. + + QuRT reserves some interrupts for internal use -- the remainder are available for use by + applications, and thus are valid interrupt numbers. If the specified interrupt number is + outside the valid range, the register operation returns the status value QURT_EINT. 
+
+   Only one thread can be registered at a time to a specific interrupt. Attempting to register
+   an already-registered interrupt returns the status value QURT_EVAL.
+
+   Only one signal bit in a signal object can be registered at a time to a specific interrupt.
+   Attempting to register multiple signal bits to an interrupt returns the status value
+   QURT_ESIG.
+
+   When a signal object is registered to an interrupt, QuRT can only set its signal bits
+   when receiving the interrupt. The QuRT signal API from another
+   software thread cannot set the signal even for unused signal bits.
+
+   @note1hang The valid range for an interrupt number can differ on target execution
+              environments other than the simulator. For more information, see the
+              appropriate hardware document.
+
+   @datatypes
+   #qurt_anysignal_t
+
+   @param[in] int_num      L2VIC interrupt to register; valid range is 0 to 1023.
+   @param[in] int_signal   Any-signal object to wait on (Section @xref{dox:any_signals}).
+   @param[in] signal_mask  Signal mask value indicating signal to receive the interrupt.
+
+   @return
+   #QURT_EOK -- Interrupt successfully registered.\n
+   #QURT_EINT -- Invalid interrupt number. \n
+   #QURT_ESIG -- Invalid signal bitmask (cannot set more than one
+                 signal at a time). \n
+   #QURT_EVAL -- Interrupt already registered.
+
+   @dependencies
+   None.
+*/
+ unsigned int qurt_interrupt_register(int int_num, qurt_anysignal_t *int_signal, int signal_mask);
+
+/**@ingroup func_qurt_interrupt_register2
+   @xreflabel{sec:interrupt_register2}
+   Registers the interrupt.\n
+   Enables the specified interrupt, associates it with the specified QuRT signal object and
+   signal mask, and sets interrupt flags.
+
+   Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+   indicates that a signal must be waited on, and 0 indicates not to wait.
+
+   When the interrupt occurs, the signal specified in the signal mask is set in the signal
+   object. An IST conventionally waits on that signal to
+   handle the interrupt. The thread that registers the interrupt is set as the IST.
+
+   Up to 31 separate interrupts can be registered to a single signal object, as determined by
+   the number of individual signals that the object can store. QuRT reserves signal 31. Thus a
+   single IST can handle several different interrupts.
+
+   QuRT reserves some interrupts for internal use -- the remainder are available for use by
+   applications, and thus are valid interrupt numbers. If the specified interrupt number is
+   outside the valid range, the register operation returns the status value #QURT_EINT.
+
+   Only one thread can be registered at a time to a specific interrupt. Attempting to register
+   an already-registered interrupt returns the status value #QURT_EVAL.
+
+   Only one signal bit in a signal object can be registered at a time to a specific interrupt.
+   Attempting to register multiple signal bits to an interrupt returns the status value
+   #QURT_ESIG.
+
+   When a signal object is registered to an interrupt, QuRT can only set its signal bits
+   when receiving the interrupt. The QuRT signal API from another
+   software thread cannot set the signal even for unused signal bits.
+
+   @note1hang The valid range for an interrupt number can differ on target execution
+              environments other than the simulator. For more information, see the
+              appropriate hardware document.
+
+   @datatypes
+   #qurt_anysignal_t
+
+   @param[in] int_num      L2VIC interrupt to register; valid range is 0 to 1023.
+   @param[in] int_signal   Any-signal object to wait on (Section @xref{dox:any_signals}).
+   @param[in] signal_mask  Signal mask value indicating signal to receive the interrupt.
+   @param[in] flags        Defines the interrupt property; the supported property is interrupt lock enable/disable.
+                           Possible values for flags: \n
+                           - #QURT_INT_LOCK_ENABLE
+                           - #QURT_INT_LOCK_DISABLE @tablebulletend
+
+   @return
+   #QURT_EOK -- Interrupt successfully registered.\n
+   #QURT_EINT -- Invalid interrupt number. \n
+   #QURT_ESIG -- Invalid signal bitmask (cannot set more than one
+                 signal at a time). \n
+   #QURT_EVAL -- Interrupt already registered.
+
+   @dependencies
+   None.
+*/
+ unsigned int qurt_interrupt_register2(int int_num, qurt_anysignal_t *int_signal, int signal_mask, unsigned int flags);
+/*
+ * Waits for registered interrupt signal
+
+ * Suspends the current thread until one of its registered interrupts occurs. The second input mask
+ * contains the interrupt signals the IST expects to receive. The interrupt signals are registered
+ * with interrupts via the qurt_interrupt_register() API.
+ *
+ * The signals returned in the signal variable indicate which interrupts occurred. Use function
+ * qurt_anysignal_get to read the signals. The IST must locally maintain a table that maps a signal to
+ * a specific interrupt. The IST also checks whether signal #SIG_INT_ABORT is received. If so, the IST
+ * must quit the interrupt-receiving loop.
+ *
+ * For detailed information on this API, see QuRT User Manual Section 4.2.5
+ *
+ * Prototype
+ *
+ * unsigned int qurt_anysignal_wait(qurt_anysignal_t *int_signal, unsigned int mask)
+ */
+
+/**@ingroup func_qurt_interrupt_acknowledge
+   Acknowledges an interrupt after it has been processed.\n
+   Re-enables an interrupt and clears its pending status. This is done after an interrupt is
+   processed by an IST.
+
+   Interrupts are automatically disabled after they occur. To re-enable an interrupt, an IST
+   performs the acknowledge operation after it has finished processing the interrupt and
+   just before suspending itself (such as by waiting on the interrupt signal).
+
+   @note1hang To prevent losing or reprocessing subsequent occurrences of the interrupt,
+              an IST must clear the interrupt signal (Section @xref{sec:anysignal_clear}) before
+              acknowledging the interrupt.
+
+   @param[in] int_num  Interrupt that is being re-enabled.
+
+   @return
+   #QURT_EOK -- Interrupt acknowledge was successful. \n
+   #QURT_EDEREGISTERED -- Interrupt is already de-registered.
+
+   @dependencies
+   None.
+*/
+int qurt_interrupt_acknowledge(int int_num);
+
+/**@ingroup func_qurt_interrupt_deregister
+   Disables the specified interrupt and disassociates it from a QuRT signal object.
+   If the specified interrupt was never registered (Section @xref{sec:interrupt_register}), the deregister operation
+   returns the status value #QURT_EINT.
+
+   @note1hang If an interrupt is deregistered while an IST waits
+              to receive it, the IST might wait indefinitely for the interrupt to occur. To avoid
+              this problem, the QuRT kernel sends the signal #SIG_INT_ABORT to awaken an
+              IST after determining that it has no interrupts registered.
+
+   @param[in] int_num  L2VIC interrupt to deregister; valid range is 0 to 1023.
+
+   @return
+   #QURT_EOK -- Success.\n
+   #QURT_EINT -- Invalid interrupt number (not registered).
+
+   @dependencies
+   None.
+
+*/
+unsigned int qurt_interrupt_deregister(int int_num);
+/** @endcond */
+
+/**@ingroup func_qurt_interrupt_disable
+   Disables an interrupt with its interrupt number.\n
+   The interrupt must be registered prior to calling this function.
+   After qurt_interrupt_disable() returns, the Hexagon subsystem
+   can no longer send the corresponding interrupt to the Hexagon
+   core, until qurt_interrupt_enable() is called
+   for the same interrupt.
+
+   Avoid calling qurt_interrupt_disable() and qurt_interrupt_enable() frequently within
+   a short period of time.\n
+   - A pending interrupt can already be in the Hexagon core when qurt_interrupt_disable()
+     is called. Therefore, some time later, the pending interrupt is received on a Hexagon
+     hardware thread.\n
+   - After the Hexagon subsystem sends an interrupt to the Hexagon core, the Hexagon
+     hardware automatically disables the interrupt until kernel software re-enables the interrupt
+     at the interrupt acknowledgement stage. If qurt_interrupt_enable() is called from a certain
+     thread at an earlier time, the interrupt is re-enabled earlier and can trigger
+     sending a new interrupt to the Hexagon core while kernel software is still processing
+     the previous interrupt.
+
+   @param[in] int_num  Interrupt number.
+
+   @return
+   #QURT_EOK -- Interrupt successfully disabled.\n
+   #QURT_EINT -- Invalid interrupt number.\n
+   #QURT_ENOTALLOWED -- Interrupt is locked. \n
+   #QURT_EVAL -- Interrupt is not registered.
+
+   @dependencies
+   None.
+*/
+ unsigned int qurt_interrupt_disable(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_enable
+   Enables an interrupt with its interrupt number.\n
+   The interrupt must be registered prior to calling this function.
+
+   @param[in] int_num  Interrupt number.
+
+   @return
+   #QURT_EOK -- Interrupt successfully enabled.\n
+   #QURT_EINT -- Invalid interrupt number.\n
+   #QURT_ENOTALLOWED -- Interrupt is locked. \n
+   #QURT_EVAL -- Interrupt is not registered.
+
+   @dependencies
+   None.
+
+*/
+ unsigned int qurt_interrupt_enable(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_status
+   Returns a value that indicates the pending status of the specified interrupt.
+
+   @param[in] int_num  Interrupt number that is being checked.
+   @param[out] status  Interrupt status; 1 indicates that an interrupt is
+                       pending, 0 indicates that an interrupt is not pending.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EINT -- Failure; invalid interrupt number.
+
+   @dependencies
+   None.
+ */
+unsigned int qurt_interrupt_status(int int_num, int *status);
+
+
+/**@ingroup func_qurt_interrupt_get_status
+   Gets the status of the specified interrupt in L2VIC.
+
+   @param[in] int_num      Interrupt number that is being checked.
+   @param[in] status_type  0 -- interrupt pending status \n
+                           1 -- interrupt enabling status
+   @param[out] status      0 -- OFF \n
+                           1 -- ON
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EINT -- Failure; invalid interrupt number.
+
+   @dependencies
+   None.
+ */
+unsigned int qurt_interrupt_get_status(int int_num, int status_type, int *status);
+
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_interrupt_clear
+   Clears the pending status of the specified interrupt.
+
+   @note1hang This operation is intended for system-level use, and must be used with care.
+
+   @param[in] int_num  Interrupt whose pending status is cleared.
+
+   @return
+   #QURT_EOK -- Success.\n
+   #QURT_EINT -- Invalid interrupt number.
+
+   @dependencies
+   None.
+ */
+unsigned int qurt_interrupt_clear(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_get_config
+   Gets the L2VIC interrupt configuration. \n
+   This function returns the type and polarity of the specified L2VIC interrupt.
+
+   @param[in] int_num    L2VIC interrupt that is being queried.
+   @param[out] int_type  Pointer to an interrupt type.
\n
+                         0 -- Level-triggered interrupt \n
+                         1 -- Edge-triggered interrupt
+   @param[out] int_polarity  Pointer to interrupt polarity.\n
+                             0 -- Active-high interrupt \n
+                             1 -- Active-low interrupt.
+
+   @return
+   #QURT_EOK -- Configuration successfully returned.\n
+   #QURT_EINT -- Invalid interrupt number.
+
+   @dependencies
+   None.
+ */
+unsigned int qurt_interrupt_get_config(unsigned int int_num, unsigned int *int_type, unsigned int *int_polarity);
+
+/**@ingroup func_qurt_interrupt_set_config
+   Sets the type and polarity of the specified L2VIC interrupt.
+
+   @note1hang Deregister L2VIC interrupts before reconfiguring them.
+
+   @param[in] int_num       L2VIC interrupt that is being configured.
+   @param[in] int_type      Interrupt type. \n
+                            0 -- Level-triggered interrupt\n
+                            1 -- Edge-triggered interrupt
+   @param[in] int_polarity  Interrupt polarity. \n
+                            0 -- Active-high interrupt \n
+                            1 -- Active-low interrupt
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_ENOTALLOWED -- Not allowed; the interrupt is being registered.\n
+   #QURT_EINT -- Invalid interrupt number.
+
+   @dependencies
+   None.
+ */
+unsigned int qurt_interrupt_set_config(unsigned int int_num, unsigned int int_type, unsigned int int_polarity);
+
+/**@ingroup func_qurt_interrupt_set_config2
+   Sets the type and polarity of the specified L2VIC interrupt.
+
+   @note1hang L2VIC interrupts must be deregistered before they can be reconfigured.
+
+   @param[in] int_num   L2VIC interrupt that is being configured.
+   @param[in] int_type  Notified to the hardware configuration callback function and used to
+                        modify the L2VIC type. Possible values: \n
+                        - #QURT_INT_TRIGGER_USE_DEFAULT \n
+                        - #QURT_INT_TRIGGER_LEVEL_HIGH \n
+                        - #QURT_INT_TRIGGER_LEVEL_LOW \n
+                        - #QURT_INT_TRIGGER_RISING_EDGE \n
+                        - #QURT_INT_TRIGGER_FALLING_EDGE \n
+                        - #QURT_INT_TRIGGER_DUAL_EDGE @tablebulletend
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_ENOTALLOWED -- Not allowed; the interrupt is being registered.\n
+   #QURT_EINT -- Invalid interrupt number.
+
+   @dependencies
+   None.
+ */
+unsigned int qurt_interrupt_set_config2(unsigned int int_num, unsigned int int_type);
+
+/**@ingroup func_qurt_interrupt_set_config3
+   Sets the specified configuration value for the specified property of the specified L2VIC interrupt.
+
+   @note1hang L2VIC interrupts must be deregistered before they can be reconfigured for polarity.
+
+   @param[in] int_num     L2VIC interrupt to configure.
+   @param[in] config_id   Property to configure: \n
+                          - #QURT_INT_CONFIGID_POLARITY \n
+                          - #QURT_INT_CONFIGID_LOCK @tablebulletend
+   @param[in] config_val  Dependent on the second argument config_id, specifies the value to set. \n
+                          Values for #QURT_INT_CONFIGID_POLARITY: \n
+                          - #QURT_INT_TRIGGER_USE_DEFAULT \n
+                          - #QURT_INT_TRIGGER_LEVEL_HIGH \n
+                          - #QURT_INT_TRIGGER_LEVEL_LOW \n
+                          - #QURT_INT_TRIGGER_RISING_EDGE \n
+                          - #QURT_INT_TRIGGER_FALLING_EDGE \n
+                          - #QURT_INT_TRIGGER_DUAL_EDGE \n
+
+                          Values for #QURT_INT_CONFIGID_LOCK: \n
+                          - #QURT_INT_LOCK_ENABLE\n
+                          - #QURT_INT_LOCK_DISABLE @tablebulletend
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_ENOTALLOWED -- Not allowed; the interrupt is being registered or is locked for enable/disable.\n
+   #QURT_EINT -- Invalid interrupt number.
+
+   @dependencies
+   None.
+*/
+unsigned int qurt_interrupt_set_config3(unsigned int int_num, unsigned int config_id, unsigned int config_val);
+
+
+/**@ingroup func_qurt_interrupt_raise
+   Raises the interrupt. \n
+   This function triggers a level-triggered L2VIC
+   interrupt, and accepts interrupt numbers in the range of 0 to 1023.
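+
+   An illustrative IST sketch follows (not normative; the umbrella include
+   "qurt.h", the interrupt number 100, and the signal bit are assumptions):
+   @code
+   #include "qurt.h"    // assumed umbrella header for the QuRT API
+
+   #define MY_INT  100        // arbitrary L2VIC interrupt number
+   #define MY_SIG  (1u << 0)  // one signal bit per registered interrupt
+
+   static qurt_anysignal_t int_sig;
+
+   static void ist_loop(void)
+   {
+       qurt_anysignal_init(&int_sig);
+       if (qurt_interrupt_register(MY_INT, &int_sig, MY_SIG) != QURT_EOK) {
+           return;
+       }
+       for (;;) {
+           // Wait for the interrupt signal or the kernel abort signal.
+           unsigned int sigs = qurt_anysignal_wait(&int_sig, MY_SIG | SIG_INT_ABORT);
+           if (sigs & SIG_INT_ABORT) {
+               break;  // interrupt was deregistered; leave the loop
+           }
+           // ... service the interrupt ...
+           qurt_anysignal_clear(&int_sig, MY_SIG);   // clear before acknowledging
+           qurt_interrupt_acknowledge(MY_INT);
+       }
+   }
+   @endcode
+   Another thread can then call qurt_interrupt_raise(MY_INT) to trigger the
+   loop, for example when testing on the simulator.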
+
+   @param[in] interrupt_num  Interrupt number.
+
+   @return
+   #QURT_EOK -- Success \n
+   -1 -- Failure; the interrupt is not supported.
+
+   @dependencies
+   None.
+ */
+int qurt_interrupt_raise(unsigned int interrupt_num);
+
+/**@ingroup func_qurt_interrupt_raise2
+   Raises the interrupt and returns the current pcycle value.
+
+   @param[in] interrupt_num  Interrupt number.
+
+   @return
+   0xFFFFFFFFFFFFFFFF -- Failure; the interrupt is not supported.\n
+   Other value -- pcycle count at the time the interrupt is raised.
+
+   @dependencies
+   None.
+ */
+unsigned long long qurt_interrupt_raise2(unsigned int interrupt_num);
+/** @endcond */
+
+/** @cond internal_only */
+/**@ingroup func_qurt_isr_subcall
+   Indicates whether the current function is called from a callback procedure (either short or long).
+
+   @return
+   #QURT_EOK -- TRUE \n
+   #QURT_EVAL -- FALSE.
+
+   @dependencies
+   None.
+ */
+int qurt_isr_subcall(void);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_INT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_island.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_island.h
new file mode 100755
index 0000000000000..f0c8ee27cf8b0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_island.h
@@ -0,0 +1,122 @@
+#ifndef QURT_ISLAND_H
+#define QURT_ISLAND_H
+
+/**
+  @file qurt_island.h
+  @brief Prototypes of the power API.
+  The APIs allow entering and exiting island mode, where memory
+  accesses are limited to local memory.
+
+  EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018-2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+=============================================================================*/
+
+#include
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_island_get_status
+   Gets Island mode status.
+
+   Returns a value that indicates whether the QuRT system executes in Island mode.
+
+   @return
+   0 - Normal mode. \n
+   1 - Island mode.
+
+   @dependencies
+   None.
+*/
+unsigned int qurt_island_get_status (void);
+
+/**@ingroup func_qurt_island_get_status2
+   Gets Island mode status, in particular status that differentiates between island partial exit and complete exit.
+
+   Returns a value that indicates the current state.
+
+   @note1hang Transition from NORMAL mode to ISLAND mode happens in single-threaded
+              mode, whereas transitions from ISLAND mode to other modes
+              happen in multi-threaded mode. So, a thread that gets island mode
+              status as NORMAL can assume the same status while it continues to
+              run. A thread that gets island mode status as ISLAND should
+              assume that the status may change to EXITING or NORMAL while it
+              runs. A thread that gets island mode status as EXITING should
+              assume that the status may change to NORMAL while it runs. If
+              the thread goes into a wait state after reading the status, it should get
+              the island mode state again and not assume the previous state.
+   @note2hang This API returns more intrinsic states than qurt_island_get_status();
+              when qurt_island_get_status() returns 0, this API can return
+              QURT_ISLAND_MODE_EXITING or QURT_ISLAND_MODE_ISLAND.
+
+   @param[in,out] data  Reserved for future use. If a NULL pointer is passed,
+                        the field is ignored. If a valid pointer is passed,
+                        QuRT returns a bitmask that can be interpreted as follows:
+                        data[31] - Valid bit.
Set to 1 to indicate data[30:0] are valid.
+                        Otherwise set to 0.
+                        data[30:0] – Reserved for future definition.
+
+   @return
+   QURT_ISLAND_MODE_NORMAL  - Normal mode \n
+   QURT_ISLAND_MODE_ISLAND  - Island mode \n
+   QURT_ISLAND_MODE_EXITING - Exiting Island mode \n
+
+   @dependencies
+   None.
+*/
+unsigned int qurt_island_get_status2 (unsigned int *data);
+
+
+
+/**@ingroup func_qurt_island_get_exit_status
+   Gets the reason for the last Island mode exit status.
+
+   @param[out] cause_code  Pointer that returns the cause code of the last
+                           island exit reason. \n
+                           - #QURT_EISLANDUSEREXIT -- Island exit due to user call for island exit.\n
+                           - #QURT_ENOISLANDENTRY -- API called before exiting island. \n
+                           - #QURT_EISLANDINVALIDINT -- Island exit due to an invalid interrupt in Island mode. @tablebulletend
+
+   @param[out] int_num     Pointer that holds the invalid interrupt number that caused
+                           island exit when the cause code is #QURT_EISLANDINVALIDINT.
+                           For other cases, it is -1.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_island_get_exit_status(unsigned int *cause_code, int *int_num);
+
+/**@ingroup func_qurt_island_get_enter_timestamp
+   Gets the recent timestamp when the system exits STM during island enter.
+
+   @param[out] island_enter_timestamp  Returns a pointer to the recent timestamp
+               recorded after the system exits STM during island enter. If the system never
+               attempts to enter island, the island_enter_timestamp return pointer holds a value
+               of zero.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_island_get_enter_timestamp(unsigned long long *island_enter_timestamp);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ISLAND_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_isr.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_isr.h
new file mode 100755
index 0000000000000..db29ea2f265d7
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_isr.h
@@ -0,0 +1,177 @@
+#ifndef QURT_ISR_H
+#define QURT_ISR_H
+
+/*=====================================================================
+
+ @file  qurt_isr.h
+
+ @brief  Prototypes of QuRT ISR API functions
+
+ EXTERNALIZED FUNCTIONS
+ none
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ none
+
+ Copyright (c) 2017, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        Functions
+=============================================================================*/
+
+
+/**@ingroup func_qurt_isr_set_hw_config_callback
+   Sets the callback function for the configuration related to interrupt hardware.
+   In a process, the callback function can only be set once.
+
+   @param[in] cb_addr  Address of the callback function.
+
+   @return
+   #QURT_EOK -- The callback function is set successfully. \n
+   #QURT_EFAILED -- Failure. The callback function has been set before.
+
+   @dependencies
+   None.
+ */
+int qurt_isr_set_hw_config_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_set_hw_enable_callback
+   Sets the callback function for enabling the configuration related to interrupt hardware.
+   In a process, the callback function can only be set once.
+
+   @param[in] cb_addr  Address of the callback function.
+
+   @return
+   #QURT_EOK -- The callback function is set successfully. \n
+   #QURT_EFAILED -- Failure. The callback function has been set before.
+
+   @dependencies
+   None.
+ */
+int qurt_isr_set_hw_enable_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_set_hw_disable_callback
+   Sets the callback function for disabling the configuration related to interrupt hardware.
+   In a process, the callback function can only be set once.
+
+   @param[in] cb_addr  Address of the callback function.
+
+   @return
+   #QURT_EOK -- The callback function is set successfully. \n
+   #QURT_EFAILED -- Failure. The callback function has been set before.
+
+   @dependencies
+   None.
+ */
+int qurt_isr_set_hw_disable_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_create
+   Creates an ISR thread with the specified attributes, and makes it executable.
+
+   @datatypes
+   #qurt_thread_t \n
+   #qurt_thread_attr_t
+
+   @param[out] thread_id  Returns a pointer to the thread identifier if the thread was
+                          successfully created.
+   @param[in]  pAttr      Pointer to the initialized thread attribute structure that specifies
+                          the attributes of the created thread.
+
+   @return
+   #QURT_EVAL -- Invalid arguments. \n
+   #QURT_EOK -- Thread created. \n
+   #QURT_EFAILED -- Thread not created.
+
+   @dependencies
+   None.
+ */
+int qurt_isr_create (qurt_thread_t *thread_id, qurt_thread_attr_t *pAttr);
+
+/**@ingroup func_qurt_isr_register2
+   Registers an interrupt service routine (ISR) callback to an ISR thread with the specified attributes.
+   The interrupt is enabled when this function returns success.
+
+   @datatypes
+   qurt_thread_t
+
+   @param[in] isr_thread_id  ISR thread ID, returned from qurt_isr_create().
+   @param[in] int_num        The interrupt number.
+   @param[in] prio           Priority of the ISR.
+   @param[in] flags          Defines the ACK type. Values: \n
+                             QURT_INT_NON_DELAYED_ACK - ISR is acknowledged by the interrupt handle routine
+                             in the Kernel.
+                             QURT_INT_DELAYED_ACK - Client chooses to acknowledge.
+   @param[in] int_type       Interrupt trigger type; notified to the registered hardware
+                             configuration callback. Values: \n
+                             - QURT_INT_TRIGGER_USE_DEFAULT
+                             - QURT_INT_TRIGGER_LEVEL_HIGH
+                             - QURT_INT_TRIGGER_LEVEL_LOW
+                             - QURT_INT_TRIGGER_RISING_EDGE
+                             - QURT_INT_TRIGGER_FALLING_EDGE
+                             - QURT_INT_TRIGGER_DUAL_EDGE
+   @param[in] isr            Interrupt service routine with prototype void isr (void *arg, int int_num).
+   @param[in] arg            First argument of the ISR when it is called to service the interrupt.
+
+   @return
+   QURT_EOK -- Successfully registered the ISR for the interrupt \n
+   QURT_EINT -- Interrupt not configured \n
+   QURT_EINVALID -- Invalid thread ID \n
+   QURT_EDISABLED -- The feature is disabled \n
+   QURT_EDUPLICATE -- Interrupt is already registered
+
+   @dependencies
+   The thread ID should be created using qurt_isr_create().
+ */
+int qurt_isr_register2 (qurt_thread_t isr_thread_id, int int_num, unsigned short prio, unsigned short flags, unsigned int int_type, void (*isr) (void *, int), void *arg);
+
+/**@ingroup func_qurt_isr_deregister2
+   Deregisters the ISR for the specified interrupt.
+   The interrupt is disabled when this function returns success.
+
+   @param[in] int_num  The interrupt number.
+
+   @return
+   QURT_EOK -- ISR deregistered successfully \n
+   QURT_ENOREGISTERED -- Interrupt with int_num is not registered
+
+   @dependencies
+   None.
+ */
+int qurt_isr_deregister2 (int int_num);
+
+/**@ingroup func_qurt_isr_delete
+   Causes the ISR thread to exit and releases its kernel resources.
+
+   @note1hang The ISR thread shouldn't be actively processing interrupts,
+              otherwise the call will fail and return an error.
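+
+   An illustrative lifecycle sketch follows (not normative; the umbrella
+   include "qurt.h", the interrupt number, priority, and stack size are
+   assumptions):
+   @code
+   #include "qurt.h"   // assumed umbrella header for the QuRT API
+
+   #define MY_INT 100  // arbitrary L2VIC interrupt number
+
+   static void my_isr(void *arg, int int_num)
+   {
+       // ... service interrupt int_num ...
+   }
+
+   static char isr_stack[4096];  // assumed stack size
+
+   static void isr_lifecycle(void)
+   {
+       qurt_thread_t tid;
+       qurt_thread_attr_t attr;
+
+       qurt_thread_attr_init(&attr);
+       qurt_thread_attr_set_name(&attr, "my_ist");
+       qurt_thread_attr_set_stack_addr(&attr, isr_stack);
+       qurt_thread_attr_set_stack_size(&attr, sizeof(isr_stack));
+
+       if (qurt_isr_create(&tid, &attr) != QURT_EOK) { return; }
+       if (qurt_isr_register2(tid, MY_INT, 100 /* prio */, QURT_INT_NON_DELAYED_ACK,
+                              QURT_INT_TRIGGER_USE_DEFAULT, my_isr, NULL) != QURT_EOK) {
+           (void)qurt_isr_delete(tid);
+           return;
+       }
+       // ... interrupts are serviced by my_isr ...
+       (void)qurt_isr_deregister2(MY_INT);  // disable and detach the ISR first
+       (void)qurt_isr_delete(tid);          // then retire the ISR thread
+   }
+   @endcode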
+
+   @param[in] isr_tid  Thread ID of the ISR thread to delete.
+
+   @return
+   QURT_ENOTALLOWED -- ISR thread is processing an interrupt \n
+   QURT_EINVALID -- Invalid ISR thread ID \n
+   QURT_EOK -- Success
+
+   @dependencies
+   The thread ID should be created using qurt_isr_create().
+ */
+int qurt_isr_delete (qurt_thread_t isr_tid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ISR_H */
+
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_l2cfg.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_l2cfg.h
new file mode 100755
index 0000000000000..7e26b30a580d9
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_l2cfg.h
@@ -0,0 +1,98 @@
+#ifndef QURT_L2CFG_H
+#define QURT_L2CFG_H
+/**
+  @file qurt_l2cfg.h
+  @brief QuRT APIs for L2 configuration and system configuration
+
+EXTERNAL FUNCTIONS
+   qurt_l2cfg_set
+   qurt_l2cfg_get
+   qurt_system_config_get
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        CONSTANTS AND MACROS
+=============================================================================*/
+
+/* Definition for system configuration */
+/** @addtogroup l2cfg_macros
+@{ */
+#define QURT_CORE_CFG_HMX_INT8_SPATIAL  0x78  /**< HMX fixed-point spatial size */
+#define QURT_CORE_CFG_HMX_INT8_DEPTH    0x7C  /**< HMX fixed-point output depth */
+/** @} */ /* end_addtogroup l2cfg_macros */
+/*=============================================================================
+        FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_l2cfg_set
+   Sets the value of an L2 configuration register. A register can be set *IFF* its
+   initial value is configured.
+
+   @param[in] offset  Offset of the L2 configuration register; must be a multiple of 4.
+   @param[in] value   Value to set the register to.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EFAILED -- Internal mapping that covers L2CFG register file absent; likely
+                    a configuration problem. \n
+   #QURT_EINVALID -- Argument error. \n
+   #QURT_ENOTALLOWED -- Setting this register is prohibited.
+
+   @dependencies
+   None.
+ */
+int qurt_l2cfg_set (unsigned short offset, unsigned int value);
+
+/**@ingroup func_qurt_l2cfg_get
+   Gets the value of an L2 configuration register.
+
+   @param[in]  offset  Offset of the L2 configuration register; must be a multiple of 4.
+   @param[out] value   Pointer to the value of the register.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EFAILED -- Internal mapping that covers L2CFG register file absent;
+                    likely a configuration problem. \n
+   #QURT_EINVALID -- Argument error.
+
+   @dependencies
+   None.
+
+ */
+int qurt_l2cfg_get (unsigned short offset, unsigned int * value);
+
+
+/**@ingroup func_qurt_system_config_get
+   Gets the system configuration information.
+
+   @param[in]  index  Index to the system configuration. Values:\n
+                      - #QURT_CORE_CFG_HMX_INT8_SPATIAL \n
+                      - #QURT_CORE_CFG_HMX_INT8_DEPTH @tablebulletend
+
+   @param[out] data   Pointer to a word for the returned data.
+
+   @return
+   #QURT_EOK -- Configuration data successfully returned. \n
+   Other values -- Failure (no such configuration available).
+
+   @dependencies
+   None.
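+
+   An illustrative usage sketch follows (not normative; the umbrella include
+   "qurt.h" is an assumption):
+   @code
+   #include "qurt.h"   // assumed umbrella header for the QuRT API
+
+   static void query_hmx_geometry(void)
+   {
+       unsigned int spatial = 0, depth = 0;
+
+       // Read both HMX fixed-point configuration words; each call can fail
+       // independently if the configuration is not available on this target.
+       if (qurt_system_config_get(QURT_CORE_CFG_HMX_INT8_SPATIAL, &spatial) == QURT_EOK &&
+           qurt_system_config_get(QURT_CORE_CFG_HMX_INT8_DEPTH, &depth) == QURT_EOK) {
+           // spatial and depth now hold the HMX geometry
+       }
+   }
+   @endcode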
+
+ */
+int qurt_system_config_get(int index, unsigned int *data);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_L2CFG_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_lifo.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_lifo.h
new file mode 100755
index 0000000000000..dc399fccc5f0f
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_lifo.h
@@ -0,0 +1,71 @@
+#ifndef QURT_LIFO_H
+#define QURT_LIFO_H
+/**
+  @file qurt_lifo.h
+
+  @brief
+  Provides a lock-free last-in first-out (LIFO) algorithm, which can be used in a
+  variety of situations to allocate and free fixed-size buffers.
+  This implementation touches the first word of your FREED buffer. Even
+  though it does not matter how you use the buffer while it is allocated, be
+  careful not to put your MAGIC number in the first field, because it will not
+  hold the magic value once the buffer is freed.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /*=====================================================================
+  Functions
+ ======================================================================*/
+
+/*======================================================================*/
+/**
+  Pops an element out of the LIFO.
+
+  @param[in] freelist  Pointer to the head of your list.
+
+  @return
+  Top object from the list.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void * qurt_lifo_pop(void *freelist);
+
+
+/*======================================================================*/
+/**
+  Pushes an element into the LIFO.
+
+  @param[in] freelist  Pointer to the head of your list.
+  @param[in] buf       Pointer to your buffer to push into the list.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void qurt_lifo_push(void *freelist, void *buf);
+
+void qurt_lifo_remove(void *freelist, void *buf);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_LIFO_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mailbox.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mailbox.h
new file mode 100755
index 0000000000000..a6cd91c611782
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mailbox.h
@@ -0,0 +1,176 @@
+#ifndef QURT_MAILBOX_H
+#define QURT_MAILBOX_H
+
+/**
+  @file qurt_mailbox.h
+  @brief Definitions, macros, and prototypes used for QuRT mailbox
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2015, 2021-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/* Definitions on typedef and return values */ + +#define QURT_MAILBOX_ID_NULL 0 +#define QURT_MAILBOX_ERROR -1 +#define QURT_MAILBOX_ID_ERROR -2 +#define QURT_MAILBOX_NON_VALID_DATA -3 +#define QURT_MAILBOX_FULL -4 +#define QURT_MAILBOX_DELETED -5 +#define QURT_MAILBOX_RECEIVE_HALTED -6 +#define QURT_MAILBOX_BANDWIDTH_LIMIT -7 + + +/*============================================================================= + FORWARD DECLARATIONS & TYPEDEFS +=============================================================================*/ + +#define QURT_MAILBOX_AT_QURTOS 0U // Receiver is QurtOS +#define QURT_MAILBOX_AT_ROOTPD 1U // Receiver is RootPD (ASID=0) +#define QURT_MAILBOX_AT_USERPD 2U // Receiver is User PD (ASID!=0) +#define QURT_MAILBOX_AT_SECUREPD 3U // Receiver is Secure PD + +typedef unsigned char qurt_mailbox_receiver_cfg_t; + +#define QURT_MAILBOX_SEND_OVERWRITE 0U // When there is already valid content, overwrite it +#define QURT_MAILBOX_SEND_NON_OVERWRITE 1U // When there is already valid content, return failure + +typedef unsigned char qurt_mailbox_send_option_t; + + +#define QURT_MAILBOX_RECV_WAITING 0U // When there is no valid content, wait for it +#define QURT_MAILBOX_RECV_NON_WAITING 1U // When there is no valid content, return failure immediately +#define QURT_MAILBOX_RECV_PEEK_NON_WAITING 2U // Read the content, but doesn't remove it from the mailbox. No waiting. + +typedef unsigned char qurt_mailbox_recv_option_t; + + +/*============================================================================= + EXTERNS & FUNCTIONS +=============================================================================*/ +/* Function prototype */ + +/**@ingroup qurt_mailbox_create + Creates a QuRT mailbox. + + @param name Mailbox name up to 8 characters. + @param recv_opt Configuration on the receiver process. + + @return + Mailbox ID -- Mailbox Identifier \n + #QURT_MAILBOX_ID_NULL -- NULL, failure at creating mailbox + + @dependencies + None. +*/ +unsigned long long qurt_mailbox_create(char *name, qurt_mailbox_receiver_cfg_t recv_opt); + + +/**@ingroup qurt_mailbox_get_id + Gets a QuRT mailbox identifier. + + @param name Mailbox name up to 8 characters. + + @return + Mailbox ID -- Mailbox identifier \n + #QURT_MAILBOX_ID_NULL -- NULL, failure at getting mailbox ID + + @dependencies + None. +*/ +unsigned long long qurt_mailbox_get_id(char *name); + + +/**@ingroup qurt_mailbox_send + Sends data to a QuRT mailbox. + + @param mailbox_id Mailbox identifier. + @param send_opt Option for mailbox send. + @param data Data to send. + + + @return + #QURT_EOK Success \n + #QURT_MAILBOX_ID_ERROR Mailbox ID error.\n + #QURT_MAILBOX_ERROR Other errors.\n + #QURT_MAILBOX_FULL Valid data already exists, non-overwriting.\n + #QURT_MAILBOX_BANDWIDTH_LIMIT Reached the bandwidth limitation. + + @dependencies + None. 
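+
+   An illustrative usage sketch follows (not normative; the umbrella include
+   "qurt.h" and the mailbox name "statmbx" are assumptions):
+   @code
+   #include "qurt.h"   // assumed umbrella header for the QuRT API
+
+   static int publish_status(unsigned long long value)
+   {
+       // Look up a mailbox created elsewhere with qurt_mailbox_create().
+       unsigned long long id = qurt_mailbox_get_id("statmbx");
+       if (id == QURT_MAILBOX_ID_NULL) {
+           return QURT_MAILBOX_ID_ERROR;
+       }
+       // Overwrite any stale content rather than failing when full.
+       return qurt_mailbox_send(id, QURT_MAILBOX_SEND_OVERWRITE, value);
+   }
+   @endcode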
+*/
+int qurt_mailbox_send(unsigned long long mailbox_id, qurt_mailbox_send_option_t send_opt, unsigned long long data);
+
+
+/**@ingroup qurt_mailbox_receive
+   Receives data from a QuRT mailbox.
+
+   @param mailbox_id  Mailbox identifier.
+   @param recv_opt    Option for mailbox receiving.
+   @param data        Pointer to the data buffer for receiving.
+
+   @return
+   #QURT_EOK                     Success \n
+   #QURT_MAILBOX_ID_ERROR        Mailbox ID error. \n
+   #QURT_MAILBOX_ERROR           Other errors. \n
+   #QURT_MAILBOX_NON_VALID_DATA  No currently valid data; the previous content is placed in the buffer. \n
+   #QURT_MAILBOX_RECEIVE_HALTED  Receiving halted; the waiting thread is woken up. \n
+   #QURT_MAILBOX_DELETED         Mailbox is deleted, and the waiting thread is woken up.
+
+   @dependencies
+   None.
+*/
+int qurt_mailbox_receive(unsigned long long mailbox_id, qurt_mailbox_recv_option_t recv_opt, unsigned long long *data);
+
+
+/**@ingroup qurt_mailbox_delete
+   Deletes a QuRT mailbox.
+
+   A mailbox can only be deleted from the process that created the mailbox.
+
+   @param mailbox_id  Mailbox identifier.
+
+   @return
+   #QURT_EOK               Success. \n
+   #QURT_MAILBOX_ID_ERROR  Mailbox ID error. \n
+   #QURT_MAILBOX_ERROR     Other errors.
+
+   @dependencies
+   None.
+*/
+int qurt_mailbox_delete(unsigned long long mailbox_id);
+
+
+/**@ingroup qurt_mailbox_receive_halt
+   Halts receiving on a QuRT mailbox and wakes up waiting threads.
+
+   @param mailbox_id  Mailbox identifier.
+
+   @return
+   #QURT_EOK               Success. \n
+   #QURT_MAILBOX_ID_ERROR  Mailbox ID error.\n
+   #QURT_MAILBOX_ERROR     Other errors.
+
+   @dependencies
+   None.
+*/
+int qurt_mailbox_receive_halt(unsigned long long mailbox_id);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif // QURT_MAILBOX_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_memory.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_memory.h
new file mode 100755
index 0000000000000..90ce2586fec50
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_memory.h
@@ -0,0 +1,1487 @@
+#ifndef QURT_MEMORY_H
+#define QURT_MEMORY_H
+/**
+  @file qurt_memory.h
+  @brief Prototypes of kernel memory API functions.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) Qualcomm Technologies, Inc.
+  All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+
+#include
+#include
+//#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup memory_management_macros
+@{ */
+#define QURT_SYSTEM_ALLOC_VIRTUAL 1   /**< Allocates available virtual memory in the address space of all
+                                           processes.*/
+/** @} */ /* end_addtogroup memory_management_macros */
+/**@cond rest_reg_dist */
+/** @addtogroup memory_management_types
+@{ */
+/** @xreflabel{hdr:qurt_mem_default_pool} */
+extern qurt_mem_pool_t qurt_mem_default_pool __attribute__((section(".data"))); /**< Memory pool object.*/
+/** @} */ /* end_addtogroup memory_management_types */
+
+/** @cond rest_reg_dist */
+/** Mapping attribute information*/
+typedef struct{
+    qurt_paddr_64_t paddr;
+    qurt_size_t size ;
+    qurt_mem_cache_mode_t cache_mode;
+    qurt_perm_t perms ;
+}qurt_mapping_attr_t;
+/** @endcond */
+/** @} */ /* end_addtogroup mapping_attribute_types*/
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_mem_cache_clean
+   Performs a cache clean operation on the data stored in the specified memory area.
+   Performs a syncht on all the data cache operations when the Hexagon processor version is V60 or greater.
+
+   @note1hang Perform the flush all operation only on the data cache.
+
+   @note1cont This operation flushes and invalidates the contents of all cache lines from the start address
+              to the end address (start address + size). The contents of the adjoining buffer can be
+              flushed and invalidated if they fall in any of the cache lines.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_size_t \n
+   #qurt_mem_cache_op_t \n
+   #qurt_mem_cache_type_t
+
+   @param[in] addr    Address of data to flush.
+   @param[in] size    Size (in bytes) of data to flush.
+   @param[in] opcode  Type of cache clean operation. Values:
+                      - #QURT_MEM_CACHE_FLUSH
+                      - #QURT_MEM_CACHE_INVALIDATE
+                      - #QURT_MEM_CACHE_FLUSH_INVALIDATE
+                      - #QURT_MEM_CACHE_FLUSH_ALL\n
+                      @note1 #QURT_MEM_CACHE_FLUSH_ALL is valid only when the type is #QURT_MEM_DCACHE @tablebulletend
+   @param[in] type    Cache type. Values:
+                      - #QURT_MEM_ICACHE
+                      - #QURT_MEM_DCACHE @tablebulletend
+
+   @return
+   #QURT_EOK -- Cache operation performed successfully.\n
+   #QURT_EVAL -- Invalid cache type.\n
+
+   @dependencies
+   None.
+*/
+int qurt_mem_cache_clean(qurt_addr_t addr, qurt_size_t size, qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type);
+
+/**@ingroup func_qurt_mem_cache_clean2
+   Performs a data cache clean operation on the data stored in the specified memory area.
+
+   This API only performs the following data cache operations:\n
+   - #QURT_MEM_CACHE_FLUSH\n
+   - #QURT_MEM_CACHE_INVALIDATE\n
+   - #QURT_MEM_CACHE_FLUSH_INVALIDATE -- flushes/invalidates the contents of all cache lines from the start address
+     to the end address (start address + size). The contents of the adjoining buffer can be
+     flushed/invalidated if they fall in any of the cache lines.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_size_t \n
+   #qurt_mem_cache_op_t \n
+   #qurt_mem_cache_type_t
+
+   @param[in] addr    Address of data to flush.
+   @param[in] size    Size (in bytes) of data to flush.
+   @param[in] opcode  Type of cache clean operation. Values:\n #QURT_MEM_CACHE_FLUSH\n #QURT_MEM_CACHE_INVALIDATE\n
+                      #QURT_MEM_CACHE_FLUSH_INVALIDATE
+   @param[in] type    Cache type. Values: \n #QURT_MEM_DCACHE
+
+   @return
+   #QURT_EOK -- Cache operation performed successfully.\n
+   #QURT_EVAL -- Invalid cache type.
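+
+   An illustrative usage sketch follows (not normative; the umbrella include
+   "qurt.h" is an assumption, and the pointer cast assumes 32-bit addresses):
+   @code
+   #include "qurt.h"   // assumed umbrella header for the QuRT API
+
+   static void flush_for_dma(void *buf, unsigned int len)
+   {
+       // Write back dirty lines so a device sees up-to-date memory.
+       // qurt_addr_t is a 32-bit address type on Hexagon targets.
+       (void)qurt_mem_cache_clean2((qurt_addr_t)buf, (qurt_size_t)len,
+                                   QURT_MEM_CACHE_FLUSH, QURT_MEM_DCACHE);
+   }
+   @endcode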
+
+   @dependencies
+   None.
+*/
+int qurt_mem_cache_clean2(qurt_addr_t addr, qurt_size_t size, qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type);
+
+/**@ingroup func_qurt_mem_cache_phys_clean
+   Performs a cache clean operation on the data stored in the specified memory area based on an address match and mask.
+   Operates on a cache line when (LINE.PhysicalPageNumber & mask) == addrmatch.
+
+   @note1hang The addrmatch value should be the upper 24-bit physical address to match against.
+
+   @datatypes
+   #qurt_mem_cache_op_t \n
+
+   @param[in] mask       24-bit address mask.
+   @param[in] addrmatch  Physical page number (24 bits) of memory to use as an address match.
+   @param[in] opcode     Type of cache clean operation. Values:
+                         - #QURT_MEM_CACHE_FLUSH
+                         - #QURT_MEM_CACHE_INVALIDATE @tablebulletend
+
+   @return
+   #QURT_EOK -- Cache operation performed successfully.\n
+   #QURT_EVAL -- Invalid operation
+
+   @dependencies
+   None.
+*/
+
+int qurt_mem_cache_phys_clean(unsigned int mask, unsigned int addrmatch, qurt_mem_cache_op_t opcode);
+
+/**@ingroup func_qurt_mem_l2cache_line_lock
+   Performs an L2 cache line locking operation. This function locks selective lines in the L2 cache memory.
+
+   @note1hang Perform the line lock operation only on the 32-byte aligned size and address.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_size_t
+
+   @param[in] addr  Address of the L2 cache memory line to lock; the address must be 32-byte aligned.
+   @param[in] size  Size (in bytes) of L2 cache memory to line lock; size must be a multiple of 32 bytes.
+
+   @return
+   #QURT_EOK -- Success.\n
+   #QURT_EALIGN -- Data alignment or address failure. \n
+   #QURT_EINVALID -- Improper addr and size passed (for example, integer overflow due to addr + size). \n
+   #QURT_EFAILED -- Failed to lock the cache line because all the ways were locked for the corresponding set of an address
+                    in the range of addr and addr+size, or the address range is not L2 cacheable.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_l2cache_line_lock(qurt_addr_t addr, qurt_size_t size);
+
+/**@ingroup func_qurt_mem_l2cache_line_unlock
+   Performs an L2 cache line unlocking operation. This function unlocks selective lines in the L2 cache memory.
+
+   @note1hang Perform the line unlock operation only on a 32-byte aligned size and address.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_size_t
+
+   @param[in] addr  Address of the L2 cache memory line to unlock; the address must be 32-byte aligned.
+   @param[in] size  Size (in bytes) of the L2 cache memory line to unlock; size must be a multiple of 32 bytes.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EALIGN -- Aligning data or address failure. \n
+   #QURT_EFAILED -- Operation failed, cannot find the matching tag.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_l2cache_line_unlock(qurt_addr_t addr, qurt_size_t size);
+
+/**@ingroup func_qurt_mem_region_attr_init
+   @xreflabel{sec:qurt_mem_region_attr_init}
+   Initializes the specified memory region attribute structure with default attribute values: \n
+   - Mapping -- #QURT_MEM_MAPPING_VIRTUAL \n
+   - Cache mode -- #QURT_MEM_CACHE_WRITEBACK \n
+   - Physical address -- -1 \n
+   - Virtual address -- -1 \n
+   - Memory type -- #QURT_MEM_REGION_LOCAL \n
+   - Size -- -1
+
+   @note1hang The memory physical address attribute must be explicitly set by calling the
+              qurt_mem_region_attr_set_physaddr() function. The size and pool attributes are set directly
+              as parameters in the memory region create operation.
+
+   @datatypes
+   #qurt_mem_region_attr_t
+
+   @param[in,out] attr  Pointer to the destination structure for the memory region attributes.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_mem_region_attr_init(qurt_mem_region_attr_t *attr);
+
+/**@ingroup func_qurt_mem_pool_attach
+   Initializes a memory pool object to attach to a pool predefined in the system
+   configuration file.
+
+   Memory pool objects assign memory regions to physical memory in different
+   Hexagon memory units. They are specified in memory region create operations
+   (Section @xref{sec:mem_region_create}).
+
+   @note1hang QuRT predefines the memory pool object #qurt_mem_default_pool
+              (Section @xref{dox:mem_management}) for allocating memory regions in SMI memory. The pool attach
+              operation is necessary only when allocating memory regions in nonstandard
+              memory units such as TCM.
+
+   @datatypes
+   #qurt_mem_pool_t
+
+   @param[in]  name  Pointer to the memory pool name.
+   @param[out] pool  Pointer to the memory pool object.
+
+   @return
+   #QURT_EOK -- Attach operation successful.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_pool_attach(char *name, qurt_mem_pool_t *pool);
+
+/**@ingroup func_qurt_mem_pool_attach2
+   Gets the identifier that corresponds to a pool object created specifically for a client, for example, HLOS_PHYSPOOL.
+   The client_handle is used to look up the client-specific pool.
+
+   Memory pool objects assign memory regions to physical memory in different
+   Hexagon memory units. Memory pool objects are specified during mapping creation operations
+   (qurt_mem_mmap() and qurt_mem_region_create()).
+
+   @note1hang QuRT predefines the memory pool object #qurt_mem_default_pool
+              (Section @xref{dox:mem_management}) for allocating memory regions in SMI memory. The pool_attach2
+              operation is necessary only when allocating memory regions in memory units specific to the client.
+
+   @datatypes
+   #qurt_mem_pool_t
+
+   @param[in]  client_handle  Client identifier that the OS uses to look up the identifier
+                              for the client-specific pool.
+   @param[in]  name           Pointer to the memory pool name.
+   @param[out] pool           Pointer to the memory pool object.
+
+   @return
+   #QURT_EOK -- Attach operation successful.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_pool_attach2(int client_handle, char *name, qurt_mem_pool_t *pool);
+
+/**@ingroup func_qurt_mem_pool_create
+   @xreflabel{hdr:qurt_mem_pool_create}
+   Dynamically creates a memory pool object from a physical address range.
+
+   The pool is assigned a single memory region with the specified base address and size.
+
+   The base address and size values passed to this function must be aligned to 4K byte
+   boundaries, and must be expressed as the actual base address and size values divided by 4K.
+
+   For example, the function call:
+   @code
+   qurt_mem_pool_create ("TCM_PHYSPOOL", 0xd8020, 0x20, &pool)
+   @endcode
+   ... is equivalent to the following static pool definition in the QuRT system configuration file:
+   @code
+
+
+
+   @endcode
+
+   @cond rest_dist For more information on the system configuration file, see @xhyperref{80VB41979,80-VB419-79}. @endcond
+
+   @note1hang Dynamically created pools are not identical to static pools. In particular,
+              qurt_mem_pool_attr_get() is not valid with dynamically created pools.
+
+   @note1cont Dynamic pool creation permanently consumes system resources, and cannot be undone.
+
+   @datatypes
+   #qurt_mem_pool_t
+
+   @param[in]  name  Pointer to the memory pool name.
+   @param[in]  base  Base address of the memory region (divided by 4K).
+   @param[in]  size  Size (in bytes) of the memory region (divided by 4K).
+   @param[out] pool  Pointer to the memory pool object.
+
+   @return
+   #QURT_EOK -- Success.
+ + @dependencies + None. +*/ +int qurt_mem_pool_create(char *name, unsigned base, unsigned size, qurt_mem_pool_t *pool); + +/**@ingroup func_qurt_mem_pool_add_pages + Adds a physical address range to the specified memory pool object.\n + + @note1hang Call this operation only with root privileges (guest OS mode). + + @datatypes + #qurt_mem_pool_t + + @param[in] pool Memory pool object. + @param[in] first_pageno First page number of the physical address range (equivalent to address >> 12) + @param[in] size_in_pages Number of pages in the physical address range (equivalent to size >> 12) + + @return + #QURT_EOK -- Pages successfully added. + + @dependencies + None. +*/ +int qurt_mem_pool_add_pages(qurt_mem_pool_t pool, + unsigned first_pageno, + unsigned size_in_pages); + +/**@ingroup func_qurt_mem_pool_remove_pages + Removes a physical address range from the specified memory pool object. + + If any part of the address range is in use, this operation returns an + error without changing the state. + + @note1hang Call this operation only with root privileges (guest-OS mode). + + @note1cont In the future, this operation will support (via the flags parameter) the + removal of a physical address range when part of the range is in use. + + @datatypes + #qurt_mem_pool_t + + @param[in] pool Memory pool object. + @param[in] first_pageno First page number of the physical address range (equivalent to address >> 12) + @param[in] size_in_pages Number of pages in the physical address range (equivalent to size >> 12) + @param[in] flags Remove options. Values: \n + - 0 -- Skip holes in the range that are not part of the pool (default) \n + - #QURT_POOL_REMOVE_ALL_OR_NONE -- Pages are removed only if the specified + physical address range is entirely contained (with no holes) in the + pool free space. @tablebulletend + @param[in] callback Callback procedure called when pages were successfully removed. + Not called if the operation failed. Passing 0 as the parameter + value causes the callback to not be called. + @param[in] arg Value passed as an argument to the callback procedure. + + @return + #QURT_EOK -- Pages successfully removed. + + @dependencies + None. +*/ +int qurt_mem_pool_remove_pages(qurt_mem_pool_t pool, + unsigned first_pageno, + unsigned size_in_pages, + unsigned flags, + void (*callback)(void *), + void *arg); +/**@ingroup memory_management_types*/ +#define QURT_POOL_REMOVE_ALL_OR_NONE 1 /**< */ + +/**@ingroup func_qurt_mem_pool_attr_get + Gets the memory pool attributes. \n + Retrieves pool configurations based on the pool handle, and fills in + the attribute structure with configuration values. + + @datatypes + #qurt_mem_pool_t \n + #qurt_mem_pool_attr_t + + @param[in] pool Pool handle obtained from qurt_mem_pool_attach(). + @param[out] attr Pointer to the memory region attribute structure. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Corrupt handle; pool handle is invalid. +*/ +int qurt_mem_pool_attr_get (qurt_mem_pool_t pool, qurt_mem_pool_attr_t *attr); + +/**@ingroup func_qurt_mem_pool_attr_get_size + Gets the size of the specified memory pool range. + + @datatypes + #qurt_mem_pool_attr_t \n + #qurt_size_t + + @param[in] attr Pointer to the memory pool attribute structure. + @param[in] range_id Memory pool range key. + @param[out] size Pointer to the destination variable for the range size. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Range is invalid. + + @dependencies + None. 
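+
+   An illustrative usage sketch follows (not normative; the umbrella include
+   "qurt.h" is an assumption):
+   @code
+   #include "qurt.h"   // assumed umbrella header for the QuRT API
+
+   static void dump_pool_ranges(qurt_mem_pool_t pool)
+   {
+       qurt_mem_pool_attr_t attr;
+       int i;
+
+       if (qurt_mem_pool_attr_get(pool, &attr) != 0) {
+           return;  // invalid pool handle
+       }
+       for (i = 0; i < MAX_POOL_RANGES; i++) {
+           qurt_size_t size = 0;
+           // Empty or out-of-range entries report a size of zero.
+           if (qurt_mem_pool_attr_get_size(&attr, i, &size) == QURT_EOK && size != 0) {
+               // range i spans size bytes
+           }
+       }
+   }
+   @endcode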
+*/
+static inline int qurt_mem_pool_attr_get_size (qurt_mem_pool_attr_t *attr, int range_id, qurt_size_t *size){
+    if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+        (*size) = 0;
+        return QURT_EINVALID;
+    }
+    else {
+        (*size) = attr->ranges[range_id].size;
+    }
+    return QURT_EOK;
+}
+
+/**@ingroup func_qurt_mem_pool_attr_get_addr
+   Gets the start address of the specified memory pool range.
+
+   @datatypes
+   #qurt_mem_pool_attr_t \n
+   #qurt_addr_t
+
+   @param[in]  attr      Pointer to the memory pool attribute structure.
+   @param[in]  range_id  Memory pool range key.
+   @param[out] addr      Pointer to the destination variable for the range start address.
+
+   @return
+   0 -- Success. \n
+   #QURT_EINVALID -- Range is invalid.
+
+   @dependencies
+   None.
+*/
+static inline int qurt_mem_pool_attr_get_addr (qurt_mem_pool_attr_t *attr, int range_id, qurt_addr_t *addr){
+    if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+        (*addr) = 0;
+        return QURT_EINVALID;
+    }
+    else {
+        (*addr) = (attr->ranges[range_id].start)<<12;
+    }
+    return QURT_EOK;
+}
+
+/**@ingroup func_qurt_mem_pool_attr_get_addr_64
+   Gets the 64-bit start address of the specified memory pool range.
+
+   @datatypes
+   #qurt_mem_pool_attr_t \n
+   #qurt_addr_64_t
+
+   @param[in]  attr      Pointer to the memory pool attribute structure.
+   @param[in]  range_id  Memory pool range key.
+   @param[out] addr      Pointer to the destination variable for the range start address.
+
+   @return
+   0 -- Success. \n
+   #QURT_EINVALID -- Range is invalid.
+
+   @dependencies
+   None.
+*/
+static inline int qurt_mem_pool_attr_get_addr_64 (qurt_mem_pool_attr_t *attr, int range_id, qurt_addr_64_t *addr){
+    if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+        (*addr) = 0;
+        return QURT_EINVALID;
+    }
+    else {
+        (*addr) = ((qurt_addr_64_t)attr->ranges[range_id].start)<<12;
+    }
+    return QURT_EOK;
+}
+
+
+/**@ingroup func_qurt_mem_pool_status_get
+   Gets the memory pool status. \n
+   Based on the pool handle, retrieves the largest contiguous free memory,
+   total free memory, and total memory declared for the pool in bytes. Fills in
+   the memory status structure with the values.
+
+   @datatypes
+   #qurt_mem_pool_t \n
+   #qurt_mem_pool_status_t
+
+   @param[in]  pool    Pool handle.
+   @param[out] status  Pointer to the memory pool status structure.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EINVALID -- Corrupt handle; pool handle is invalid.
+*/
+int qurt_mem_pool_status_get (qurt_mem_pool_t pool, qurt_mem_pool_status_t *status);
+
+
+/**@ingroup func_qurt_mem_pool_is_available
+   Checks whether the number of pages that the page_count argument indicates
+   can be allocated from the specified pool.
+
+   @datatypes
+   #qurt_mem_pool_t \n
+   #qurt_mem_mapping_t \n
+
+   @param[in] pool          Pool handle obtained from qurt_mem_pool_attach().
+   @param[in] page_count    Number of 4K pages.
+   @param[in] mapping_type  Variable of type qurt_mem_mapping_t.
+
+   @return
+   0 -- Success. \n
+   #QURT_EINVALID -- Mapping_type is invalid. \n
+   #QURT_EMEM -- Specified pages cannot be allocated from the pool.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_pool_is_available(qurt_mem_pool_t pool, int page_count, qurt_mem_mapping_t mapping_type);
+
+
+/**@ingroup func_qurt_mem_region_create
+   @xreflabel{sec:mem_region_create}
+   Creates a memory region with the specified attributes.
+
+   The application initializes the memory region attribute structure with
+   qurt_mem_region_attr_init() and qurt_mem_region_attr_set_bus_attr().
+
+   If the virtual address attribute is set to its default value
+   (Section @xref{sec:qurt_mem_region_attr_init}), the virtual address of the memory region is
+   automatically assigned any available virtual address value.
+
+   If the memory mapping attribute is set to virtual mapping, the physical address of the memory region
+   is also automatically assigned.\n
+
+   @note1hang The physical address attribute is explicitly set in the attribute structure only
+              for memory regions with physical-contiguous-mapped mapping.
+
+   Memory regions are always assigned to memory pools. The pool value specifies the memory pool
+   that the memory region is assigned to.
+
+   @note1hang If attr is specified as NULL, the memory region is created with default
+              attribute values (Section @xref{sec:qurt_mem_region_attr_init}).
+              QuRT predefines the memory pool object #qurt_mem_default_pool
+              (Section @xref{dox:mem_management}), which allocates memory regions in SMI memory.
+
+   @datatypes
+   #qurt_mem_region_t \n
+   #qurt_size_t \n
+   #qurt_mem_pool_t \n
+   #qurt_mem_region_attr_t
+
+   @param[out] region Pointer to the memory region object.
+   @param[in] size Memory region size (in bytes). If size is not an integral multiple of 4K,
+                   it is rounded up to a 4K boundary.
+   @param[in] pool Memory pool of the region.
+   @param[in] attr Pointer to the memory region attribute structure.
+
+   @return
+   #QURT_EOK -- Memory region successfully created.\n
+   #QURT_EMEM -- Not enough memory to create region. \n
+   #QURT_EINVALID -- Invalid cache attributes / permissions provided in attribute.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_region_create(qurt_mem_region_t *region, qurt_size_t size, qurt_mem_pool_t pool, qurt_mem_region_attr_t *attr);
+
+/**@ingroup func_qurt_mem_region_delete
+   Deletes the specified memory region.
+
+   If the caller application created the memory region, it is removed and the system reclaims its
+   assigned memory.
+
+   If a different application created the memory region (and it is shared with the caller
+   application), only the local memory mapping to the region is removed; the system does
+   not reclaim the memory.
+
+   @datatypes
+   #qurt_mem_region_t
+
+   @param[in] region Memory region object.
+
+   @return
+   #QURT_EOK -- Region successfully deleted. \n
+   #QURT_ELOCKED -- Buffer is locked. Mapping delete failed.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_region_delete(qurt_mem_region_t region);
+
+
+/**@ingroup func_qurt_mem_region_attr_get
+   @xreflabel{sec:mem_region_attr_get}
+   Gets the memory attributes of the specified memory region.
+   After a memory region is created, its attributes cannot be changed.
+
+   @datatypes
+   #qurt_mem_region_t \n
+   #qurt_mem_region_attr_t
+
+   @param[in] region Memory region object.
+   @param[out] attr Pointer to the destination structure for memory region attributes.
+
+   @return
+   #QURT_EOK -- Operation successfully performed. \n
+   Error code -- Failure.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_region_attr_get(qurt_mem_region_t region, qurt_mem_region_attr_t *attr);
+
+
+/**@ingroup func_qurt_mem_region_attr_set_type
+   Sets the memory type in the specified memory region attribute structure.
+
+   The type indicates whether the memory region is local to an application or shared between
+   applications.
+   @cond rest_dist For more information, see @xhyperref{80VB41992,80-VB419-92}. @endcond
+
+   @datatypes
+   #qurt_mem_region_attr_t \n
+   #qurt_mem_region_type_t
+
+   @param[in,out] attr Pointer to memory region attribute structure.
+   @param[in] type Memory type.
Values: \n + - #QURT_MEM_REGION_LOCAL \n + - #QURT_MEM_REGION_SHARED @tablebulletend + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_type(qurt_mem_region_attr_t *attr, qurt_mem_region_type_t type){ + attr->type = type; +} + +/**@ingroup func_qurt_mem_region_attr_get_size + Gets the memory region size from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_size_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] size Pointer to the destination variable for memory region size. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_size(qurt_mem_region_attr_t *attr, qurt_size_t *size){ + (*size) = attr->size; +} + +/**@ingroup func_qurt_mem_region_attr_get_type + Gets the memory type from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_region_type_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] type Pointer to the destination variable for the memory type. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_type(qurt_mem_region_attr_t *attr, qurt_mem_region_type_t *type){ + (*type) = attr->type; +} + +/**@ingroup func_qurt_mem_region_attr_set_physaddr + Sets the memory region 32-bit physical address in the specified memory attribute structure. + + @note1hang The physical address attribute is explicitly set only for memory regions with + physical contiguous mapping. Otherwise QuRT automatically sets it + when the memory region is created. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_paddr_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr Memory region physical address. + + @return + None. + */ +static inline void qurt_mem_region_attr_set_physaddr(qurt_mem_region_attr_t *attr, qurt_paddr_t addr){ + attr->ppn = (unsigned)(((unsigned)(addr))>>12); +} + +/**@ingroup func_qurt_mem_region_attr_get_physaddr + Gets the memory region physical address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr Pointer to the destination variable for memory region physical address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_physaddr(qurt_mem_region_attr_t *attr, unsigned int *addr){ + (*addr) = (unsigned)(((unsigned) (attr->ppn))<<12); +} + +/**@ingroup func_qurt_mem_region_attr_set_virtaddr + Sets the memory region virtual address in the specified memory attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_addr_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr Memory region virtual address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_virtaddr(qurt_mem_region_attr_t *attr, qurt_addr_t addr){ + attr->virtaddr = addr; +} + +/**@ingroup func_qurt_mem_region_attr_get_virtaddr + Gets the memory region virtual address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr Pointer to the destination variable for the memory region virtual address. + + @return + None. + + @dependencies + None. 
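+
+   @par Example
+   Illustrative sketch (not part of the original header; region is assumed
+   to have been created earlier with qurt_mem_region_create()):
+   @code
+   qurt_mem_region_attr_t attr;
+   unsigned int vaddr;
+
+   // Read back the region attributes, then extract the assigned
+   // virtual address.
+   if (qurt_mem_region_attr_get(region, &attr) == QURT_EOK) {
+       qurt_mem_region_attr_get_virtaddr(&attr, &vaddr);
+   }
+   @endcode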
+ */ +static inline void qurt_mem_region_attr_get_virtaddr(qurt_mem_region_attr_t *attr, unsigned int *addr){ + (*addr) = (unsigned int)(attr->virtaddr); +} + +/**@ingroup func_qurt_mem_region_attr_set_mapping + Sets the memory mapping in the specified memory region attribute structure. + + The mapping value indicates how the memory region is mapped in virtual memory. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_mapping_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] mapping Mapping. Values: + - #QURT_MEM_MAPPING_VIRTUAL + - #QURT_MEM_MAPPING_PHYS_CONTIGUOUS + - #QURT_MEM_MAPPING_IDEMPOTENT + - #QURT_MEM_MAPPING_VIRTUAL_FIXED + - #QURT_MEM_MAPPING_NONE + - #QURT_MEM_MAPPING_VIRTUAL_RANDOM + - #QURT_MEM_MAPPING_INVALID @tablebulletend + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_mapping(qurt_mem_region_attr_t *attr, qurt_mem_mapping_t mapping){ + attr->mapping_type = mapping; +} + +/**@ingroup func_qurt_mem_region_attr_get_mapping + Gets the memory mapping from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_mapping_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] mapping Pointer to the destination variable for memory mapping. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_mapping(qurt_mem_region_attr_t *attr, qurt_mem_mapping_t *mapping){ + (*mapping) = attr->mapping_type; +} + +/**@ingroup func_qurt_mem_region_attr_set_cache_mode + Sets the cache operation mode in the specified memory region attribute structure. + + @cond rest_dist For more information on the cache, see @xhyperref{80VB41992,80-VB419-92}.@endcond + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_cache_mode_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] mode Cache mode. Values: \n + - #QURT_MEM_CACHE_WRITEBACK \n + - #QURT_MEM_CACHE_WRITETHROUGH\n + - #QURT_MEM_CACHE_WRITEBACK_NONL2CACHEABLE\n + - #QURT_MEM_CACHE_WRITETHROUGH_NONL2CACHEABLE\n + - #QURT_MEM_CACHE_WRITEBACK_L2CACHEABLE\n + - #QURT_MEM_CACHE_WRITETHROUGH_L2CACHEABLE\n + - #QURT_MEM_CACHE_NONE @tablebulletend + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_cache_mode(qurt_mem_region_attr_t *attr, qurt_mem_cache_mode_t mode){ + QURT_PGATTR_C_SET(attr->pga, (unsigned)mode); +} + +/**@ingroup func_qurt_mem_region_attr_get_cache_mode + Gets the cache operation mode from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_cache_mode_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] mode Pointer to the destination variable for cache mode. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_cache_mode(qurt_mem_region_attr_t *attr, qurt_mem_cache_mode_t *mode){ + unsigned int mode_temp = QURT_PGATTR_C_GET(attr->pga); + (*mode) = (qurt_mem_cache_mode_t)mode_temp; +} + +/**@ingroup func_qurt_mem_region_attr_set_bus_attr + Sets the (A1, A0) bus attribute bits in the specified memory region attribute structure. + + @cond rest_dist For more information on the bus attribute bits, see the @xhyperref{80VB41992,80-VB419-92}. @endcond + + @datatypes + #qurt_mem_region_attr_t + + @param[in,out] attr Pointer to the memory region attribute structure. 
+ @param[in] abits The (A1, A0) bits to use with the memory region, expressed as a 2-bit binary number. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_bus_attr(qurt_mem_region_attr_t *attr, unsigned abits){ + QURT_PGATTR_A_SET(attr->pga, abits); +} + +/**@ingroup func_qurt_mem_region_attr_get_bus_attr + Gets the (A1, A0) bus attribute bits from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] pbits Pointer to an unsigned integer that is filled in with + the (A1, A0) bits from the memory region attribute structure, expressed as a 2-bit binary number. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_bus_attr(qurt_mem_region_attr_t *attr, unsigned *pbits){ + (*pbits) = QURT_PGATTR_A_GET(attr->pga); +} + +void qurt_mem_region_attr_set_owner(qurt_mem_region_attr_t *attr, int handle); +void qurt_mem_region_attr_get_owner(qurt_mem_region_attr_t *attr, int *p_handle); +void qurt_mem_region_attr_set_perms(qurt_mem_region_attr_t *attr, unsigned perms); +void qurt_mem_region_attr_get_perms(qurt_mem_region_attr_t *attr, unsigned *p_perms); + +/**@ingroup func_qurt_mem_map_static_query + Determines whether a memory page is statically mapped. + Pages are specified by the following attributes: physical address, page size, cache mode, + and memory permissions. \n + - If the specified page is statically mapped, vaddr returns the virtual + address of the page. \n + - If the page is not statically mapped (or if it does not exist as specified), vaddr + returns -1 as the virtual address value.\n + The system configuration file defines QuRT memory maps. + + @datatypes + #qurt_addr_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[out] vaddr Virtual address corresponding to paddr. + @param[in] paddr Physical address. + @param[in] page_size Size of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Specified page is statically mapped, vaddr returns the virtual address. \n + #QURT_EMEM -- Specified page is not statically mapped, vaddr returns -1. \n + #QURT_EVAL -- Specified page does not exist. + + @dependencies + None. + */ +int qurt_mem_map_static_query(qurt_addr_t *vaddr, qurt_addr_t paddr, unsigned int page_size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + + +/**@ingroup func_qurt_mem_region_query + Queries a memory region. \n + This function determines whether a dynamically-created memory region (Section @xref{sec:mem_region_create}) exists for the + specified virtual or physical address. + When a memory region has been determined to exist, its attributes are + accessible (Section @xref{sec:mem_region_attr_get}). + + @note1hang This function returns #QURT_EFATAL if #QURT_EINVALID is passed to both + vaddr and paddr (or to neither). + + @datatypes + #qurt_mem_region_t \n + #qurt_paddr_t + + @param[out] region_handle Pointer to the memory region object (if it exists). + @param[in] vaddr Virtual address to query; if vaddr is specified, paddr must be set to + the value #QURT_EINVALID. + @param[in] paddr Physical address to query; if paddr is specified, vaddr must be set to + the value #QURT_EINVALID. + + @return + #QURT_EOK -- Query successfully performed. \n + #QURT_EMEM -- Region not found for the specified address. \n + #QURT_EFATAL -- Invalid input parameters. 
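+
+   @par Example
+   An illustrative lookup by virtual address (not part of the original header;
+   vaddr is assumed to be a mapped address in the caller's process):
+   @code
+   qurt_mem_region_t region;
+
+   // paddr must be passed as QURT_EINVALID when querying by vaddr.
+   if (qurt_mem_region_query(&region, vaddr, QURT_EINVALID) == QURT_EOK) {
+       // Region exists; read its attributes with qurt_mem_region_attr_get().
+   }
+   @endcode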
+ + @dependencies + None. + */ +int qurt_mem_region_query(qurt_mem_region_t *region_handle, qurt_addr_t vaddr, qurt_paddr_t paddr); + + +/**@ingroup func_qurt_mapping_create + @xreflabel{hdr:qurt_mapping_create} + Creates a memory mapping in the page table. + Not supported if called from a user process, always returns QURT_EMEM. + + @datatypes + #qurt_addr_t \n + #qurt_size_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[in] vaddr Virtual address. + @param[in] paddr Physical address. + @param[in] size Size (4K-aligned) of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Mapping created. \n + #QURT_EMEM -- Failed to create mapping. + #QURT_EINVALID -- Invalid cache attributes / permissions provided. + + @dependencies + None. +*/ +int qurt_mapping_create(qurt_addr_t vaddr, qurt_addr_t paddr, qurt_size_t size, + qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + +/**@ingroup func_qurt_mapping_remove + @xreflabel{hdr:qurt_mapping_remove} + Deletes the specified memory mapping from the page table. + + @datatypes + #qurt_addr_t \n + #qurt_size_t + + @param[in] vaddr Virtual address. + @param[in] paddr Physical address. + @param[in] size Size of the mapped memory page (4K-aligned). + + @return + #QURT_EOK -- Mapping created. + #QURT_ELOCKED -- Buffer is locked. Mapping delete failed. + + @dependencies + None. + + */ +int qurt_mapping_remove(qurt_addr_t vaddr, qurt_addr_t paddr, qurt_size_t size); + +/**@ingroup func_qurt_lookup_physaddr + Translates a virtual memory address to the physical memory address to which it maps. \n + The lookup occurs in the process of the caller. Use qurt_lookup_physaddr2() to lookup the + physical address of another process. + + + @datatypes + #qurt_addr_t \n + #qurt_paddr_t + + @param[in] vaddr Virtual address. + + @return + Nonzero -- Physical address to which the virtual address is mapped.\n + 0 -- Virtual address not mapped. + + @dependencies + None. +*/ +qurt_paddr_t qurt_lookup_physaddr (qurt_addr_t vaddr); + +/**@ingroup func_qurt_mem_region_attr_set_physaddr_64 + Sets the memory region 64-bit physical address in the specified memory attribute structure. + + @note1hang The physical address attribute is explicitly set only for memory regions with + physical contiguous mapping. Otherwise it is automatically set by + QuRT when the memory region is created. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_paddr_64_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr_64 Memory region 64-bit physical address. + + @return + None. + */ +static inline void qurt_mem_region_attr_set_physaddr_64(qurt_mem_region_attr_t *attr, qurt_paddr_64_t addr_64){ + attr->ppn = (unsigned)(((unsigned long long)(addr_64))>>12); +} + +/**@ingroup func_qurt_mem_region_attr_get_physaddr_64 + Gets the memory region 64-bit physical address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_paddr_64_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr_64 Pointer to the destination variable for the memory region 64-bit physical address. + + @return + None. + + @dependencies + None. 
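+
+   @par Example
+   Illustrative sketch (not part of the original header; region is assumed
+   to be a region created with physical-contiguous mapping):
+   @code
+   qurt_mem_region_attr_t attr;
+   qurt_paddr_64_t paddr;
+
+   // Read back the attributes, then the 64-bit physical address.
+   if (qurt_mem_region_attr_get(region, &attr) == QURT_EOK) {
+       qurt_mem_region_attr_get_physaddr_64(&attr, &paddr);
+   }
+   @endcode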
+ */ +static inline void qurt_mem_region_attr_get_physaddr_64(qurt_mem_region_attr_t *attr, qurt_paddr_64_t *addr_64){ + (*addr_64) = (unsigned long long)(((unsigned long long)(attr->ppn))<<12); +} + +/**@ingroup func_qurt_mem_map_static_query_64 + Determines if a memory page is statically mapped. + The following attributes specify pages: 64-bit physical address, page size, cache mode, + and memory permissions. \n + If the specified page is statically mapped, vaddr returns the virtual + address of the page. + If the page is not statically mapped (or if it does not exist as specified), vaddr + returns -1 as the virtual address value.\n + QuRT memory maps are defined in the system configuration file. + + @datatypes + #qurt_addr_t \n + #qurt_paddr_64_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[out] vaddr Virtual address corresponding to paddr. + @param[in] paddr_64 64-bit physical address. + @param[in] page_size Size of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Specified page is statically mapped; a virtual address is returned in vaddr. \n + #QURT_EMEM -- Specified page is not statically mapped; -1 is returned in vaddr. \n + #QURT_EVAL -- Specified page does not exist. + + @dependencies + None. + */ +int qurt_mem_map_static_query_64(qurt_addr_t *vaddr, qurt_paddr_64_t paddr_64, unsigned int page_size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + +/**@ingroup func_qurt_mem_region_query_64 + Determines whether a dynamically created memory region (Section @xref{sec:mem_region_create}) exists for the + specified virtual or physical address. When a memory region has been determined to exist, its attributes are + accessible (Section @xref{sec:mem_region_attr_get}). + + @note1hang This function returns QURT_EFATAL if #QURT_EINVALID is passed to both + vaddr and paddr (or to neither). + + @datatypes + #qurt_mem_region_t \n + #qurt_addr_t \n + #qurt_paddr_64_t + + @param[out] region_handle Pointer to the memory region object (if it exists). + @param[in] vaddr Virtual address to query; if vaddr is specified, paddr must be set to + the value #QURT_EINVALID. + @param[in] paddr_64 64-bit physical address to query; if paddr is specified, vaddr must be set to + the value #QURT_EINVALID. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Region not found for the specified address. \n + #QURT_EFATAL -- Invalid input parameters. + + @dependencies + None. + */ +int qurt_mem_region_query_64(qurt_mem_region_t *region_handle, qurt_addr_t vaddr, qurt_paddr_64_t paddr_64); + +/**@ingroup func_qurt_mapping_create_64 + @xreflabel{hdr:qurt_mapping_create_64} + Creates a memory mapping in the page table. + Not supported if called from a user process, always returns QURT_EMEM. + + @datatypes + #qurt_addr_t \n + #qurt_paddr_64_t \n + #qurt_size_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[in] vaddr Virtual address. + @param[in] paddr_64 64-bit physical address. + @param[in] size Size (4K-aligned) of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Failure. + #QURT_EINVALID -- Invalid cache attributes / permissions provided. + + @dependencies + None. 
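+
+   @par Example
+   A minimal create/remove sketch (illustrative only; assumes 4K-aligned
+   vaddr and paddr_64 values and a caller with sufficient privilege, since
+   this call is not supported from a user process):
+   @code
+   if (qurt_mapping_create_64(vaddr, paddr_64, 0x1000,
+                              QURT_MEM_CACHE_WRITEBACK,
+                              QURT_PERM_READ | QURT_PERM_WRITE) == QURT_EOK) {
+       // ... use the mapping ...
+       qurt_mapping_remove_64(vaddr, paddr_64, 0x1000);
+   }
+   @endcode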
+*/ +int qurt_mapping_create_64(qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size, + qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + +/**@ingroup func_qurt_mapping_remove_64 + @xreflabel{hdr:qurt_mapping_remove_64} + Deletes the specified memory mapping from the page table. + + @datatypes + #qurt_addr_t \n + #qurt_paddr_64_t \n + #qurt_size_t + + @param[in] vaddr Virtual address. + @param[in] paddr_64 64-bit physical address. + @param[in] size Size of the mapped memory page (4K-aligned). + + @return + #QURT_EOK -- Success. + #QURT_ELOCKED -- Buffer is locked. Mapping delete failed. + + @dependencies + None. + + */ +int qurt_mapping_remove_64(qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size); + +/**@ingroup func_qurt_lookup_physaddr_64 + Translates a virtual memory address to the 64-bit physical memory address it is mapped to. \n + The lookup occurs in the process of the caller. Use qurt_lookup_physaddr2() to lookup the physical + address of another process. + + @datatypes + #qurt_paddr_64_t \n + #qurt_addr_t + + @param[in] vaddr Virtual address. + + @return + Nonzero -- 64-bit physical address to which the virtual address is mapped. \n + 0 -- Virtual address has not been mapped. + + @dependencies + None. +*/ +qurt_paddr_64_t qurt_lookup_physaddr_64 (qurt_addr_t vaddr); +/** @endcond */ + +/** @cond internal_only */ +/**@ingroup func_qurt_mapping_reclaim + Deallocates all QuRT resources associated with the specified virtual + memory area, making it available for user memory management:\n + - The associated physical memory areas are freed and added to the + specified physical pool.\n + - The associated TLB entries are deleted and made available for TLB + management.\n + - The virtual memory area is not freed -- it is left in + place as allocated, but unmapped virtual memory. Access to this + memory area generates an exception.\n + + The virtual memory area must be statically allocated. + If no pool is specified, the freed physical memory is not added to any pool. + + @note1hang The virtual memory area is restricted to being filled with locked + TLB entries that are contiguous within the memory area, and contained by it. + + @datatypes + #qurt_addr_t \n + #qurt_size_t \n + #qurt_mem_pool_t + + @param[in] vaddr Virtual address of the memory area to free. + @param[in] vsize Size (in bytes) of the memory area to free. + @param[in] pool Handle to the physical pool where freed physical memory is added. + If set to 0, freed physical memory is not added to any pool. + + @return + 0 -- Success. \n + Nonzero -- Failure that indicates a partial success, or that the request was malformed. \n @note1hang The expected behavior is that + QuRT logs messages related to the failure, and callers are free to ignore the return value. + + @dependencies + None. +*/ +int qurt_mapping_reclaim(qurt_addr_t vaddr, qurt_size_t vsize, qurt_mem_pool_t pool); +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_mem_configure_cache_partition + Configures the Hexagon cache partition at the system level. + + A partition size value of #SEVEN_EIGHTHS_SIZE is applicable only to the L2 cache. + + The L1 cache partition is not supported in Hexagon processor version V60 or greater. + + @note1hang Call this operation only with QuRT OS privilege. + + @datatypes + #qurt_cache_type_t \n + #qurt_cache_partition_size_t + + @param[in] cache_type Cache type for partition configuration. 
Values: \n + - #HEXAGON_L1_I_CACHE \n + - #HEXAGON_L1_D_CACHE \n + - #HEXAGON_L2_CACHE @tablebulletend + + @param[in] partition_size Cache partition size. Values: \n + - #FULL_SIZE \n + - #HALF_SIZE \n + - #THREE_QUARTER_SIZE \n + - #SEVEN_EIGHTHS_SIZE @tablebulletend + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Error. + + @dependencies + None. + */ +int qurt_mem_configure_cache_partition(qurt_cache_type_t cache_type, qurt_cache_partition_size_t partition_size); + + +/**@ingroup func_qurt_mem_syncht + @xreflabel{hdr:qurt_mem_syncht} + Performs heavy-weight synchronization of memory transactions. + + This operation does not return until all previous memory transactions (cached and uncached load/store, + mem_locked, and so on) that originated from the current thread are complete and globally observable. + + @note1hang This operation is implemented as a wrapper for the Hexagon syncht instruction. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_syncht(void){ + #ifdef __HEXAGON_ARCH__ + __asm__ __volatile__ (" SYNCHT \n"); + #endif +} + +/**@ingroup func_qurt_mem_barrier + @xreflabel{hdr:qurt_mem_barrier} + Creates a barrier for memory transactions. + + This operation ensures that all previous memory transactions are globally observable before any + future memory transactions are globally observable. + + @note1hang This operation is implemented as a wrapper for the Hexagon barrier instruction. + @return + None + + @dependencies + None. + */ +static inline void qurt_mem_barrier(void){ + #ifdef __HEXAGON_ARCH__ + __asm__ __volatile__ (" BARRIER \n"); + #endif +} +/** @endcond */ + +/** @cond internal_only */ +/**@ingroup func_qurt_system_mem_alloc + Requests that the kernel allocates memory from the kernel-owned pool. + + @param[in] size Size in bytes (aligned to 4K) to allocate. + @param[in] align Any alignment that must be considered for the allocation. + @param[in] flags Supports the #QURT_SYSTEM_ALLOC_VIRTUAL flag; allocates + available virtual memory in the address space of all processes. + + @return + #QURT_EFATAL -- Allocation failed \n + Start address of the successful allocation. + + @dependencies + None. +*/ +unsigned qurt_system_mem_alloc(unsigned size, unsigned align, unsigned flags); +/** @endcond */ +/** @cond rest_reg_dist*/ +/**@ingroup func_qurt_lookup_physaddr2 + Translates the virtual memory address of the specified process to the 64-bit + physical memory address to which it is mapped. + + @datatypes + #qurt_addr_t \n + #qurt_paddr_64_t + + @param[in] vaddr Virtual address. + @param[in] pid PID. + + @return + Nonzero -- 64-bit physical address to which the virtual address is mapped. \n + 0 -- Virtual address is not mapped. + + @dependencies + None. +*/ +qurt_paddr_64_t qurt_lookup_physaddr2(qurt_addr_t vaddr, unsigned int pid); +/** @endcond */ + +/**@ingroup func_qurt_mapping_attr_get + Gets the mapping attributes for a given virtual address and PID + + @datatypes + #qurt_addr_t \n + #qurt_mapping_attr_t + + @param[in] vaddr virtual address for which the attributes are required. + @param[in] pid process id for the target process + @param[out] attr Pointer to the mapping attribute structure. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Incorrect virtual address or pid +*/ +int qurt_mapping_attr_get(qurt_addr_t vaddr, unsigned int pid, qurt_mapping_attr_t *attr); + + +/**@ingroup func_qurt_mapping_attr_get_cache_mode + Gets the cache operation mode in the specified memory mapping attribute structure. 
+
+
+  @datatypes
+  #qurt_mapping_attr_t \n
+  #qurt_mem_cache_mode_t
+
+  @param[in] attr Pointer to the memory mapping attribute structure.
+  @param[out] cache_mode Pointer to the destination variable for cache mode.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_mapping_attr_get_cache_mode(qurt_mapping_attr_t *attr, qurt_mem_cache_mode_t *cache_mode)
+{
+    (*cache_mode) = attr->cache_mode;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_physaddr
+  Gets the physical memory address in the specified memory mapping attribute structure.
+
+
+  @datatypes
+  #qurt_mapping_attr_t \n
+  #qurt_paddr_64_t
+
+  @param[in] attr Pointer to the memory mapping attribute structure.
+  @param[out] physaddr Pointer to the destination variable for physical address.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_mapping_attr_get_physaddr(qurt_mapping_attr_t *attr, qurt_paddr_64_t *physaddr)
+{
+    (*physaddr) = attr->paddr;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_perms
+  Gets the permissions in the specified memory mapping attribute structure.
+
+
+  @datatypes
+  #qurt_mapping_attr_t \n
+  #qurt_perm_t
+
+  @param[in] attr Pointer to the memory mapping attribute structure.
+  @param[out] perms Pointer to the destination variable for permissions.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_mapping_attr_get_perms(qurt_mapping_attr_t *attr, qurt_perm_t *perms)
+{
+    (*perms) = attr->perms;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_size
+  Gets the size in the specified memory mapping attribute structure. This represents the size of the
+  TLB entry that covers the virtual address.
+
+
+  @datatypes
+  #qurt_mapping_attr_t \n
+  #unsigned int
+
+  @param[in] attr Pointer to the memory mapping attribute structure.
+  @param[out] size Pointer to the destination variable for size.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_mapping_attr_get_size(qurt_mapping_attr_t *attr, unsigned int *size)
+{
+    (*size) = attr->size;
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_MEMORY_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mmap.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mmap.h
new file mode 100755
index 0000000000000..c3bd875910af7
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mmap.h
@@ -0,0 +1,359 @@
+#ifndef QURT_MMAP_H
+#define QURT_MMAP_H
+/**
+  @file qurt_mmap.h
+  @brief Prototypes of memory mapping/unmapping APIs.
+  The APIs allow the user to map, un-map, and change permissions
+  on memory regions.
+
+  EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018-2021, 2022, 2023 Qualcomm Technologies, Inc.
+All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_mem_mmap
+  Creates a memory mapping with the specified attributes.
+  This API allows a root process caller to create a mapping on behalf of a user
+  process. If the client_handle belongs to a valid user process, the resulting
+  mapping is created for that process.
+  If -1 is passed in place of client_handle, the API creates the mapping
+  for the underlying process of the caller.
+
+  @note1hang If the specified attributes are not valid, an error result is returned.
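+
+  @par Example
+  A minimal sketch of an anonymous allocation from the default pool
+  (illustrative only; whether 0 selects the default pool is an assumption):
+  @code
+  void *va = qurt_mem_mmap(-1,        /* map for the caller's own process */
+                           0,         /* default pool (assumed)           */
+                           NULL,      /* pRegion is unused                */
+                           NULL,      /* let QuRT choose the VA           */
+                           0x1000,
+                           QURT_PROT_READ | QURT_PROT_WRITE,
+                           QURT_MAP_ANON,
+                           -1, 0);
+  if (va != QURT_MAP_FAILED) {
+      /* ... use the memory ... */
+      qurt_mem_munmap(va, 0x1000);
+  }
+  @endcode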
+ + @param[out] client_handle Client handle to use for this mapping (optional). + @param[in] pool Optional argument that specifies a pool handle + if the user wants to allocate memory from a specific pool. + The default value for this argument is NULL. + @param[in] pRegion Map region. This argument is unused, and the default value is NULL. + @param[in] addr Virtual memory address. + @param[in] length Size of mapping in bytes. + @param[in] prot Mapping access permissions (R/W/X). + @param[in] flags Mapping modes.\n + - #QURT_MAP_NAMED_MEMSECTION + - #QURT_MAP_FIXED \n + - #QURT_MAP_NONPROCESS_VPOOL \n + - #QURT_MAP_TRYFIXED \n + - #QURT_MAP_ANON \n + - #QURT_MAP_PHYSADDR \n + - #QURT_MAP_VA_ONLY @tablebulletend + @param[in] fd File designator. + @param[in] offset Offset in file. + + @return + Valid virtual address -- Success.\n + #QURT_MAP_FAILED -- Mapping creation failed. + */ +void *qurt_mem_mmap(int client_handle, + qurt_mem_pool_t pool, + qurt_mem_region_t *pRegion, + void *addr, + size_t length, + int prot, + int flags, + int fd, + unsigned long long offset); + +/**@ingroup func_qurt_mem_mmap2 + Creates a memory mapping with the specified attributes. Returns a more descriptive + error code in case of failure. + This API allows the root process caller to create mapping on behalf of a user + process. If the client_handle belongs to a valid user process, the resulting + mapping is created for the process. + If -1 is passed in place of client_handle, the API creates mapping + for the underlying process of the caller. + + @note1hang If the specified attributes are not valid, an error result is returned. + + @param[out] client_handle Client handle to use for this mapping (optional). + @param[in] pool Optional argument that allows the user to specify a pool handle + when the user wants to allocate memory from a specific pool. + Default value for this argument is NULL. + @param[in] pRegion Map region (unused argument); default value is NULL. + @param[in] addr Virtual memory address. + @param[in] length Size of mapping in bytes. + @param[in] prot Mapping access permissions (R/W/X). + Cache attributes, bus attributes, User mode. + @param[in] flags Mapping modes; + Shared, Private, or Anonymous. + @param[in] fd File designator. + @param[in] offset Offset in file. + + @return + Valid virtual address -- Success.\n + #QURT_EMEM -- Physical address is not available. \n + #QURT_EFAILED -- VA is not available or mapping failed.\n + #QURT_EINVALID -- Invalid argument was passed (for example, an unaligned VA/PA). + */ +void *qurt_mem_mmap2(int client_handle, + qurt_mem_pool_t pool, + qurt_mem_region_t *pRegion, + void *addr, + size_t length, + int prot, + int flags, + int fd, + unsigned long long offset); + +/**@ingroup func_qurt_mem_mmap_by_name + Creates a memory mapping for a named-memsection using the specified attributes. + The named memsection should be specified in cust_config.xml. + + @note1hang If the specified attributes are not valid or the named memsection is not found, + an error result is returned. + + @param[in] name Name of the memsection in cust_config.xml that specifies + this mapping. Should be less than 25 characters. + @param[in] addr Virtual memory address. + @param[in] length Size of mapping in bytes. + @param[in] prot Mapping access permissions (R/W/X). + Cache attributes, bus attributes, User mode + @param[in] flags Mapping modes, such as + Shared, Private, or Anonymous. + @param[in] offset Offset relative to the physical address range specified in memsection. 
+ If offset + length exceeds size of memsection, failure is + returned. + @return + Valid virtual address -- Success.\n + #QURT_MAP_FAILED -- Mapping creation failed. + */ +void *qurt_mem_mmap_by_name(const char* name, + void *addr, + size_t length, + int prot, + int flags, + unsigned long long offset); + +/**@ingroup func_qurt_mem_mprotect2 + Changes access permissions and attributes on an existing mapping based on the client_handle argument. + + @note1hang If the specified virtual address is not found or invalid attributes are passed, + an error code is returned. + + @note2 When error is returned, it is possible that attributes/permissions are changed for some part of the + mapping, while for the remaining it is unchanged. Clients should not use these mappings further. + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[in] addr Virtual memory address. + @param[in] length Size of mapping in bytes. + @param[in] prot Mapping access permissions (R/W/X). + Cache attributes, Bus attributes, User mode. + @return + #QURT_EOK -- Successfully changes permissions on the mapping.\n + #QURT_EFATAL -- Failed to change permissions on the mapping. \n + #QURT_EINVALID -- Attributes / permissions requested are invalid. + */ +int qurt_mem_mprotect2(int client_handle, const void *addr, + size_t length, + int prot); + +/**@ingroup func_qurt_mem_mprotect + Changes access permissions and attributes on an existing mapping. + + @note1hang If the specified virtual address is not found or invalid attributes are passed, + an error code is returned.\n + + @note2 When error is returned, it is possible that attributes/permissions are changed for some part of the + mapping, while for the remaining it is unchanged. Clients should not use these mappings further. + + @param[in] addr Virtual memory address. + @param[in] length Size of mapping in bytes. + @param[in] prot Mapping access permissions (R/W/X). + Cache attributes, Bus attributes, User mode. + @return + #QURT_EOK -- Successfully changes permissions on the mapping. \n + #QURT_EFATAL -- Failed to change permissions on the mapping. \n + #QURT_EINVALID -- Attributes / permissions requested are invalid. + */ +int qurt_mem_mprotect(const void *addr, + size_t length, + int prot); + +/**@ingroup func_qurt_mem_munmap + Removes an existing mapping. + + @note1hang If the specified mapping is not found in the context of the caller process + or invalid attributes are passed, an error code is returned. + + @param[in] addr Virtual memory address. + @param[in] length Size of mapping in bytes. + + @return + #QURT_EOK -- Successfully changes permissions on the mapping. \n + #QURT_EFATAL -- Failed to change permissions on the mapping. + #QURT_ELOCKED - Buffer is locked. Mapping delete failed. + */ +int qurt_mem_munmap(void *addr, + size_t length); + +/**@ingroup func_qurt_mem_munmap2 + Removes an existing mapping for a specified process. + + @note1hang This API allows a root process entity, such as a driver, to remove mapping + that was created for a user process. If the specified mapping is not found in the context + of client handle or invalid attributes are passed, an error code is returned. + + @param[out] client_handle Client handle of the user process that owns this mapping. + @param[in] addr Virtual memory address. + @param[in] length Size of mapping in bytes. + + @return + #QURT_EOK -- Successfully changes permissions on the mapping. \n + #QURT_EFATAL -- Failed to change permissions on the mapping. 
+ #QURT_ELOCKED - Buffer is locked. Mapping delete failed. + */ +int qurt_mem_munmap2(int client_handle, + void *addr, + size_t length); + +/**@ingroup func_qurt_mem_munmap3 + Removes an existing mapping or reservation for a specified process. + + @param[in] client_handle Client handle of the user process that owns this mapping. + @param[in] addr Pointer to a virtual memory address. + @param[in] length Size of mapping in bytes. + @param[in] flags Specifies the flag. + + @return + #QURT_EOK -- Successfully changes permissions on the mapping. \n + #QURT_EFATAL -- Failed to change permissions on the mapping. + #QURT_ELOCKED - Buffer is locked. Mapping delete failed. + */ +int qurt_mem_munmap3(int client_handle, + void *addr, + size_t length, + int flags); + +/* +|| The macros here follow the style of the standard mmap() macros, but with +|| QURT_ prepended to avoid name conflicts, and to avoid having a dependency +|| on sys/mman.h. +|| +|| Wherever possible, any values here that are also present in sys/mman.h +|| should have the same value in both places so that we can accept "mmap" +|| calls without having to remap parameters to new values. +|| +|| In the future, it would be desirable to have a regression test that +|| checks, for instance, that these macros match. Example: +|| +|| assert(QURT_MAP_FAILED == MAP_FAILED); +|| ... repeat as needed ... +*/ + +/** @addtogroup memory_mapping_macros +@{ */ +/** @cond */ +#define QURT_PROT_NONE 0x00U /**< */ +#define QURT_PROT_READ 0x01U /**< */ +#define QURT_PROT_WRITE 0x02U /**< */ +#define QURT_PROT_EXEC 0x04U /**< */ +#define QURT_PROT_NODUMP 0x08U /**< Skip dumping the mapping. During PD dump, must skip + some mappings on host memory to avoid a race condition + where the memory is removed from the host and the DSP process + crashes before the mapping is removed.*/ +#define QURT_PROT_ISLAND 0x10U /**< Island mapping. */ + +#define QURT_MAP_SHARED 0x0001U /**< Shared. */ +#define QURT_MAP_PRIVATE 0x0002U /**< Private. */ +/** @endcond */ +#define QURT_MAP_NAMED_MEMSECTION 0x0004U /**< Named memsection. */ +#define QURT_MAP_FIXED 0x0010U /**< Fixed virtual address. */ +#define QURT_MAP_RENAME 0x0020U /**< Rename. */ +#define QURT_MAP_NORESERVE 0x0040U /**< No reserve. */ +#define QURT_MAP_INHERIT 0x0080U /**< Inherit. */ +#define QURT_MAP_NONPROCESS_VPOOL 0x0100U /**< Use a virtual address outside of the default range of the + processes. This option is only supported in the root process + and only when virtual memory split is enabled in the XML. + The root process can use this flag to create mapping for a + user process, for example, if the virtual address is configured + for a 3G/1G split, the root process can use this flag to create + mapping in the top 1 GB area for the user process or the + lower 3 GB area for the root process. This is useful for + shared buffer use cases. */ +#define QURT_MAP_HASSEMAPHORE 0x0200U /**< Has semaphore. */ +#define QURT_MAP_TRYFIXED 0x0400U /**< Try to create a mapping for a virtual address that was passed. + If the passed virtual address fails, use a random virtual address. */ +#define QURT_MAP_WIRED 0x0800U /**< Wired. */ +#define QURT_MAP_FILE 0x0000U /**< File. */ +#define QURT_MAP_ANON 0x1000U /**< Allocate physical memory from the pool that was passed. + By default, memory is allocated from the default physpool. */ +#define QURT_MAP_VA_ONLY 0X2000U /**< Reserve a virtual address without + mapping it. 
*/ + +/** @cond */ +#define QURT_MAP_ALIGNED(n) ((n) << QURT_MAP_ALIGNMENT_SHIFT) +#define QURT_MAP_ALIGNMENT_SHIFT 24 + + +#define QURT_MAP_ALIGNMENT_MASK QURT_MAP_ALIGNED(0xff) /**< */ +#define QURT_MAP_ALIGNMENT_64KB QURT_MAP_ALIGNED(16) /**< */ +#define QURT_MAP_ALIGNMENT_16MB QURT_MAP_ALIGNED(24) /**< */ +#define QURT_MAP_ALIGNMENT_4GB QURT_MAP_ALIGNED(32) /**< */ +#define QURT_MAP_ALIGNMENT_1TB QURT_MAP_ALIGNED(40) /**< */ +#define QURT_MAP_ALIGNMENT_256TB QURT_MAP_ALIGNED(48) /**< */ +#define QURT_MAP_ALIGNMENT_64PB QURT_MAP_ALIGNED(56) /**< */ +/** @endcond */ +#define QURT_MAP_FAILED ((void *) -1) /**< Mapping creation failed. */ + +/* +|| The macros below are extensions beyond the standard mmap flags, but follow +|| the style of the mmap flags. +*/ +/** @cond */ +// Describe bitfields in (prot) +#define QURT_PROT_CACHE_BOUNDS 16U,19U,7U /**< Bits 16 through 19 are cache attribute, default is 0. */ +#define QURT_PROT_BUS_BOUNDS 20U,21U,0U /**< Bits 20 through 21 are bus attributes, default is 0. */ +#define QURT_PROT_USER_BOUNDS 22U,23U,3U /**< Bits 22 through 23 are user mode, default is 3; + default of 3 means to derive user mode setting from the + default mode of the client. */ + +// Describe bitfields in (flags) +#define QURT_MAP_PHYSADDR_BOUNDS 15U,15U,0U /**< Bits 15 through 15 are physaddr, default is 0. */ +#define QURT_MAP_TYPE_BOUNDS 16U,19U,0U /**< Bits 16 through 19 are mapping type, default is 0. */ +#define QURT_MAP_REGION_BOUNDS 20U,23U,0U /**< Bits 20 through 23 are region type, default is 0. */ +/** @endcond */ + +// These macros get OR'ed into (prot) +#define QURT_PROT_CACHE_MODE(n) QURT_MMAP_BUILD(QURT_PROT_CACHE_BOUNDS,(n)) /**< */ +#define QURT_PROT_BUS_ATTR(n) QURT_MMAP_BUILD(QURT_PROT_BUS_BOUNDS,(n)) /**< */ +#define QURT_PROT_USER_MODE(n) QURT_MMAP_BUILD(QURT_PROT_USER_BOUNDS,(n)) /**< */ +// These macros get OR'ed into (flags) + +#define QURT_MAP_PHYSADDR QURT_MMAP_BUILD(QURT_MAP_PHYSADDR_BOUNDS,1U) /**< Use the physical address that was passed in offset field. + This is allowed only for root process. */ +#define QURT_MAP_TYPE(n) QURT_MMAP_BUILD(QURT_MAP_TYPE_BOUNDS,(n)) /**< */ +#define QURT_MAP_REGION(n) QURT_MMAP_BUILD(QURT_MAP_REGION_BOUNDS,(n)) /**< */ +/** @} */ /* end_addtogroup memory_mapping_macros */ +/** @cond */ +// These macros extract fields from (prot) +#define QURT_PROT_GET_CACHE_MODE(n) QURT_MMAP_EXTRACT(QURT_PROT_CACHE_BOUNDS,(n)) /**< */ +#define QURT_PROT_GET_BUS_ATTR(n) QURT_MMAP_EXTRACT(QURT_PROT_BUS_BOUNDS,(n)) /**< */ +#define QURT_PROT_GET_USER_MODE(n) QURT_MMAP_EXTRACT(QURT_PROT_USER_BOUNDS,(n)) /**< */ + +// These macros extract fields from (flags) +#define QURT_MAP_GET_TYPE(n) QURT_MMAP_EXTRACT(QURT_MAP_TYPE_BOUNDS,(n)) /**< */ +#define QURT_MAP_GET_REGION(n) QURT_MMAP_EXTRACT(QURT_MAP_REGION_BOUNDS,(n)) /**< */ + +// Macros for bitfield insertion and extraction +#define QURT_MMAP_MASK(lo,hi) (~((~0u) << ((hi)-(lo)+1U))) /**< Mask of same size as [lo..hi]. 
*/
+#define QURT_MMAP_BUILD_(lo,hi,def,n) ((((n)^(def))&QURT_MMAP_MASK((lo),(hi)))<<(lo)) /**< */
+#define QURT_MMAP_EXTRACT_(lo,hi,def,n) ((((n)>>(lo))&QURT_MMAP_MASK((lo),(hi)))^(def)) /**< */
+#define QURT_MMAP_BUILD(a,b) QURT_MMAP_BUILD_(a,b) /**< */
+#define QURT_MMAP_EXTRACT(a,b) QURT_MMAP_EXTRACT_(a,b) /**< */
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mq.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mq.h
new file mode 100755
index 0000000000000..580c83d3de41a
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mq.h
@@ -0,0 +1,458 @@
+#ifndef QURT_MQ_H
+#define QURT_MQ_H
+/**
+  @file qurt_mq.h
+
+  @brief Prototypes of secure message queues API functions.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2019-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+======================================================================*/
+#include
+#include
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+#define QURT_MQ_NAME_MAXLEN 16U /**< Maximum name length. */
+
+
+/*=============================================================================
+ FORWARD DECLARATIONS & TYPEDEFS
+=============================================================================*/
+/* This enum must be generated in accordance with the process class numbers.
+   For now it is made to match the generated version; do not change this unless
+   there is a corresponding change in process_class.py. Indices start from 0,
+   basically: QURT_MQ_SECURITY_SCOPE_ = (1 << QURTK_process_class_index_)
+*/
+typedef enum {
+    QURT_MQ_SECURITY_SCOPE_KERNEL = ( 1U << 0 ),
+    QURT_MQ_SECURITY_SCOPE_SRM = ( 1U << 1 ),
+    QURT_MQ_SECURITY_SCOPE_SECURE = ( 1U << 2 ),
+    QURT_MQ_SECURITY_SCOPE_CPZ = ( 1U << 3 ),
+    QURT_MQ_SECURITY_SCOPE_ROOT = ( 1U << 4 ),
+    QURT_MQ_SECURITY_SCOPE_SIGNED = ( 1U << 5 ),
+    QURT_MQ_SECURITY_SCOPE_UNSIGNED = ( 1U << 6 ),
+    QURT_MQ_SECURITY_SCOPE_SECURE_ROOT = ( 1U << 7 )
+} qurt_mq_security_scope_t;
+
+typedef enum {
+    QURT_MQ_CARDINALITY_PTP = (1U << 0),
+    QURT_MQ_CARDINALITY_MTO = (1U << 1)
+} qurt_mq_cardinality_t;
+
+typedef unsigned int qurt_mqd_t;
+
+typedef union {
+    struct {
+        unsigned int perms:2;
+        unsigned int cardinality:1;
+        unsigned int blocking:1;
+
+        qurt_mq_security_scope_t creator_scope: 8;
+        qurt_mq_security_scope_t allowed_scope: 8; // can be a bitmask in case of MTO
+        unsigned int queue_closed: 1;
+        unsigned int reserved: 11;
+    }; // anonymous struct
+    unsigned int raw;
+} qurt_mq_flags_t;
+
+
+/* Permissions are from qurt_types.h, block X though */
+#if 0
+/** Memory access permission. */
+typedef enum {
+    QURT_PERM_READ=0x1U, /**< */
+    QURT_PERM_WRITE=0x2U, /**< */
+    QURT_PERM_EXECUTE=0x4U, /**< */
+    QURT_PERM_FULL=QURT_PERM_READ|QURT_PERM_WRITE|QURT_PERM_EXECUTE, /**< */
+} qurt_perm_t;
+#endif
+
+struct qurt_mq_attr {
+    unsigned flags; /**< Configured flags. Only meaningful with get_attr(), only used for qurt_mq_flags_t.perms. */
+    unsigned mq_maxmsg; /**< Maximum number of messages. Used with create() and get_attr.
*/
+    unsigned short mq_send_msgsize; /**< Maximum size (bytes) of a message in the receiver-facing queue,
+                                         from sender to receiver. */
+    unsigned short mq_recv_msgsize; /**< Maximum size (bytes) of a message in the sender-facing queue,
+                                         from receiver to sender. */
+    unsigned client_pid; /**< Process ID of the client that is allowed to open the message queue
+                              that was created using qurt_mq_create(). */
+    qurt_mq_cardinality_t cardinality; /**< Cardinality of the message queue connection, see below. */
+    qurt_mq_security_scope_t scope; /**< Security scope of the senders to the queue. */
+};
+
+
+/*=============================================================================
+ EXTERNS & FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_mq_attr_init
+  Initializes attributes to the default values used for creating a queue.
+
+  The initialize operation sets the following default attribute values: \n
+  - flag - QURT_PERM_READ | QURT_PERM_WRITE \n
+  - maxmsg - 1 \n
+  - mq_send_msgsize - 8 \n
+  - mq_recv_msgsize - 8 \n
+  - client_pid - -1 \n
+  - cardinality - QURT_MQ_CARDINALITY_PTP \n
+  - scope - QURT_MQ_SECURITY_SCOPE_SIGNED \n
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr Pointer to the initialized message queue object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_init(struct qurt_mq_attr * attr);
+
+/**@ingroup qurt_mq_attr_set_send_msgsize
+  Sets the message size in bytes that the sender can send.
+  The maximum message length is configurable using the XML configuration; however, it is
+  limited to a maximum value of 62 bytes.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr Pointer to the message queue object.
+  @param[in] len Length of message in bytes.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_send_msgsize (struct qurt_mq_attr *attr, size_t len);
+
+/**@ingroup qurt_mq_attr_set_recv_msgsize
+  Sets the message size in bytes that the receiver can read.
+  The maximum message length is configurable using the XML configuration; however, it is
+  limited to a maximum value of 62 bytes.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr Pointer to the message queue object.
+  @param[in] len Length of message in bytes.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_recv_msgsize (struct qurt_mq_attr *attr, size_t len);
+
+/**@ingroup qurt_mq_attr_set_maxmsg
+  Sets the maximum number of messages that can be queued in the message queue.
+  The message depth is configurable using the XML configuration.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr Pointer to the message queue object.
+  @param[in] depth Maximum number of messages that can be queued.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_maxmsg (struct qurt_mq_attr *attr, unsigned int depth);
+
+/**@ingroup qurt_mq_attr_set_scope
+  Sets the scope of the message queue. A message queue created with a security
+  scope allows only a process class of that scope to open the message queue.
+
+  @datatypes
+  #qurt_mq_attr \n
+  #qurt_mq_security_scope_t
+
+  @param[in,out] attr Pointer to the message queue object.
+  @param[in] scope Scope of the message queue: \n
+             #QURT_MQ_SECURITY_SCOPE_KERNEL \n
+             #QURT_MQ_SECURITY_SCOPE_SRM \n
+             #QURT_MQ_SECURITY_SCOPE_SECURE \n
+             #QURT_MQ_SECURITY_SCOPE_CPZ \n
+             #QURT_MQ_SECURITY_SCOPE_ROOT \n
+             #QURT_MQ_SECURITY_SCOPE_SIGNED \n
+             #QURT_MQ_SECURITY_SCOPE_UNSIGNED
+
+  @return
+  None.
+
+  @dependencies
+  None.
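+
+  @par Example
+  Illustrative setup sketch (not part of the original header; values are
+  arbitrary):
+  @code
+  struct qurt_mq_attr attr;
+  qurt_mqd_t mqd;
+
+  qurt_mq_attr_init(&attr);
+  qurt_mq_attr_set_maxmsg(&attr, 4);            // queue depth
+  qurt_mq_attr_set_send_msgsize(&attr, 32);     // bytes, max 62
+  qurt_mq_attr_set_recv_msgsize(&attr, 32);
+  qurt_mq_attr_set_scope(&attr, QURT_MQ_SECURITY_SCOPE_SIGNED);
+
+  // The name must fit in QURT_MQ_NAME_MAXLEN characters (incl. terminator).
+  if (qurt_mq_create(&mqd, "my_mq", &attr) == QURT_EOK) {
+      // Queue ready; a client can now qurt_mq_open() it.
+  }
+  @endcode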
+*/
+void qurt_mq_attr_set_scope (struct qurt_mq_attr *attr, qurt_mq_security_scope_t scope);
+
+
+/**@ingroup qurt_mq_attr_set_client_pid
+  Sets the client_pid that can open this message queue.
+  If client_pid is set, the allowed scope is not considered when opening the message queue.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr Pointer to the message queue object.
+  @param[in] client_pid Valid PID for the client process.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_client_pid (struct qurt_mq_attr *attr, unsigned client_pid);
+
+/**@ingroup qurt_mq_attr_set_flags
+  Sets the properties of the message queue.
+  The current implementation only uses the flag attribute to set the permissions of the message queue.
+  The default is #QURT_PERM_READ | #QURT_PERM_WRITE; explicit permission is not implemented.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr Pointer to the message queue object.
+  @param[in] flags Permission for message queue.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_flags (struct qurt_mq_attr *attr, unsigned int flags);
+
+/**@ingroup qurt_mq_create
+  Creates a message queue with the provided name and attributes.
+  The calling process becomes the owner of the queue.
+  The name of the message queue is limited to 16 characters, including the NULL terminator.
+
+  @datatypes
+  #qurt_mq_attr \n
+  #qurt_mqd_t
+
+  @param[out] mqd Returns a pointer to the message queue identifier if
+                  the message queue was successfully created.
+  @param[in] name String identifier of the message queue.
+  @param[in] attr Pointer to the initialized message queue attribute
+                  structure that specifies the attributes of the created message queue.
+
+  @return
+  #QURT_EOK -- Message queue created. \n
+  #QURT_EINVALID -- Invalid arguments. \n
+  #QURT_ENOSPC -- Maximum number of queues in the system is exceeded.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_create(qurt_mqd_t *mqd, const char *name, struct qurt_mq_attr * attr);
+
+/**@ingroup qurt_mq_open
+  Opens a message queue connection between a process and a created message queue.
+
+  @datatypes
+  #qurt_mq_attr \n
+  #qurt_mqd_t
+
+  @param[out] mqd Returns a pointer to the message queue
+                  identifier if the message queue was successfully opened.
+  @param[in] name String identifier of the message queue.
+  @param[in] flags Flag that contains the properties that define the behavior of the message queue connection.
+             Permissions:\n
+             #QURT_PERM_READ \n
+             #QURT_PERM_WRITE \n
+             #QURT_PERM_READ | QURT_PERM_WRITE @tablebulletend
+             Default is QURT_PERM_READ | QURT_PERM_WRITE; explicit permission is not implemented. \n
+             Cardinality: \n
+             #QURT_MQ_CARDINALITY_PTP (default) \n
+             #QURT_MQ_CARDINALITY_MTO (not implemented) \n
+             Blocking: suspend the thread until the message queue with the specified name is created. \n
+             Scope: security boundary to which the message queue and its users are constrained.
+             It is coupled with the process privilege level/scope.\n
+             #QURT_MQ_SECURITY_SCOPE_KERNEL \n
+             #QURT_MQ_SECURITY_SCOPE_SRM \n
+             #QURT_MQ_SECURITY_SCOPE_SECURE \n
+             #QURT_MQ_SECURITY_SCOPE_CPZ \n
+             #QURT_MQ_SECURITY_SCOPE_ROOT \n
+             #QURT_MQ_SECURITY_SCOPE_SIGNED \n
+             #QURT_MQ_SECURITY_SCOPE_UNSIGNED @tablebulletend
+
+  @return
+  #QURT_EOK -- Message queue connection successfully opened. \n
+  #QURT_EFAILED -- Message queue connection failed, if non-blocking message queue. \n
+  #QURT_ENOTALLOWED -- Open failed due to security scope mismatch.
+
+  @dependencies
+  None.
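+
+  @par Example
+  Illustrative client-side sketch (not part of the original header; the
+  flag composition via the qurt_mq_flags_t bitfields is an assumption):
+  @code
+  qurt_mqd_t mqd;
+  qurt_mq_flags_t flags;
+
+  flags.raw = 0;                                   // clear all fields
+  flags.perms = QURT_PERM_READ | QURT_PERM_WRITE;  // default permissions
+
+  if (qurt_mq_open(&mqd, "my_mq", flags) == QURT_EOK) {
+      const char msg[] = "ping";
+      qurt_mq_send(mqd, msg, sizeof(msg));
+      qurt_mq_close(mqd);
+  }
+  @endcode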
+*/
+int qurt_mq_open (qurt_mqd_t *mqd, const char *name, qurt_mq_flags_t flags);
+
+/**@ingroup qurt_mq_send
+  Sends a message over the message queue.\n
+  - If the message queue is full, the calling thread shall be
+    suspended until space becomes available to enqueue the message. \n
+  - If there exists a thread suspended on an empty queue
+    to receive a message, qurt_mq_send shall resume that thread.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd Message queue identifier.
+  @param[in] msg_ptr Pointer to the message buffer.
+  @param[in] msg_len Length of the message buffer in bytes.
+
+  @return
+  #QURT_EOK -- Message queue send was successful.\n
+  #QURT_EMSGSIZE -- Message size in msg_len field is greater than max_message_len specified during queue creation.\n
+  #QURT_ENOTALLOWED -- Send failed due to security scope mismatch.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_send(qurt_mqd_t mqd, const char *msg_ptr, size_t msg_len);
+
+/**@ingroup qurt_mq_send_timed
+  Sends a message over the message queue.\n
+  - If the message queue is full, the calling thread shall be
+    suspended until space becomes available to enqueue the message or until the timeout is reached. \n
+  - If there exists a thread suspended on an empty queue
+    to receive a message, qurt_mq_send_timed shall resume that thread.\n
+  - If the timeout is reached, qurt_mq_send_timed shall return #QURT_ETIMEDOUT.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd Message queue identifier.
+  @param[in] msg_ptr Pointer to the message buffer.
+  @param[in] duration Interval (in microseconds); the duration value must be
+                      between #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+  @param[in] msg_len Length of the message buffer in bytes.
+
+  @return
+  #QURT_EOK -- Message queue send was successful. \n
+  #QURT_EMSGSIZE -- Message size in msg_len field is greater than max_message_len specified during queue creation.\n
+  #QURT_ENOTALLOWED -- Send failed due to security scope mismatch. \n
+  #QURT_ETIMEDOUT -- Timeout.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_send_timed(qurt_mqd_t mqd, const char *msg_ptr, unsigned long long int duration, size_t msg_len);
+
+/**@ingroup qurt_mq_recv
+  Receives a message from the message queue. \n
+  - If the message queue is empty, the calling thread shall be
+    suspended until a message is enqueued in the message queue. \n
+  - If there exists a thread suspended on a full queue to
+    send a message, qurt_mq_recv shall resume that thread.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd Message queue identifier.
+  @param[in] msg_ptr Pointer to the message buffer.
+  @param[in,out] msg_len Pointer to the length of the message buffer.
+
+  @return
+  #QURT_EOK -- Message successfully received.\n
+  #QURT_EINVALID -- Message pointer or msg_len pointer is NULL. \n
+  #QURT_EBADR -- Message queue descriptor (mqd) is invalid. \n
+  #QURT_EBADF -- Sender closed the message queue.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_recv(qurt_mqd_t mqd, unsigned char *msg_ptr, size_t *msg_len);
+
+/**@ingroup qurt_mq_recv_timed
+  Receives a message from the message queue. \n
+  - If the message queue is empty, the calling thread shall be
+    suspended until a message is enqueued in the message queue or until the timeout is reached.\n
+  - If there exists a thread suspended on a full queue to
+    send a message, qurt_mq_recv_timed shall resume that thread.\n
+  - If the timeout is reached, qurt_mq_recv_timed shall return #QURT_ETIMEDOUT.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd Message queue identifier.
+  @param[in] msg_ptr      Pointer to the message buffer.
+  @param[in] duration     Duration (in microseconds) to wait. The duration value must be
+                          between #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+  @param[in,out] msg_len  Pointer to the length of the message buffer.
+
+  @return
+  #QURT_EOK -- Message received successfully.\n
+  #QURT_EINVALID -- Message pointer or msg_len pointer is NULL. \n
+  #QURT_EBADR -- Message queue descriptor (mqd) is invalid.\n
+  #QURT_EBADF -- Sender closed the message queue. \n
+  #QURT_ETIMEDOUT -- Timeout.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_recv_timed(qurt_mqd_t mqd, unsigned char *msg_ptr, unsigned long long int duration, size_t *msg_len);
+
+ /**@ingroup qurt_mq_close
+  Closes the message queue and disassociates the calling process (client) from the message queue
+  under this descriptor. Marks the queue as closed for the receiver.
+  This function is expected to be called from the client side. If called
+  from the server side, the function reduces to a no-op and returns success.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd  Message queue identifier.
+
+  @return
+  #QURT_EOK -- Message queue closed successfully.\n
+  #QURT_EBADR -- Invalid descriptor.\n
+  #QURT_ENOTALLOWED -- Message queue close was not called from the client side.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_close(qurt_mqd_t mqd);
+
+ /**@ingroup qurt_mq_destroy
+  Destroys the message queue. This function must be
+  called from the process that called qurt_mq_create().
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd  Message queue identifier.
+
+  @return
+  #QURT_EOK -- Message queue destroyed successfully.\n
+  #QURT_EBADR -- Invalid descriptor.\n
+  #QURT_ENOTALLOWED -- Message queue destroy was not called from the process that created the queue.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_destroy(qurt_mqd_t mqd);
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+#endif //QURT_MQ_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mutex.h
new file mode 100755
index 0000000000000..4ad6b270cdde6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_mutex.h
@@ -0,0 +1,211 @@
+#ifndef QURT_MUTEX_H
+#define QURT_MUTEX_H
+/**
+  @file qurt_mutex.h
+  @brief Prototypes of mutex API.
+  This is mostly a user space mutex, but it calls the
+  kernel to block if the mutex is taken.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup mutex_types
+@{ */
+/*=============================================================================
+  TYPEDEFS
+=============================================================================*/
+
+/** QuRT mutex type.
+
+  Both non-recursive mutex lock and unlock, and recursive
+  mutex lock and unlock can be applied to this type.
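+
+  A minimal usage sketch (illustrative only; the initializer macro
+  #QURT_MUTEX_INIT is defined below):
+  @code
+  static qurt_mutex_t demo_lock = QURT_MUTEX_INIT;
+
+  void demo_update(void)
+  {
+      qurt_mutex_lock(&demo_lock);    // suspends if another thread holds it
+      // ... touch the shared state protected by demo_lock ...
+      qurt_mutex_unlock(&demo_lock);
+  }
+  @endcode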
+ */ +typedef union qurt_mutex_aligned8{ + /** @cond */ + struct { + unsigned int holder; + unsigned int count; + unsigned int queue; + unsigned int wait_count; + }; + unsigned long long int raw; + /** @endcond */ +} qurt_mutex_t; +/** @} */ /* end_addtogroup mutex_types */ +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/* @addtogroup mutex_const_macros +@{ */ +#define MUTEX_MAGIC 0xfe /**< */ +#define QURTK_FUTEX_FREE_MAGIC 0x1F // 11111 /**< */ +#define QURT_MUTEX_INIT {{MUTEX_MAGIC, 0, QURTK_FUTEX_FREE_MAGIC,0}} /**< Suitable as an initializer for a + variable of type qurt_mutex_t. */ +/* @} */ /* end_addtogroup mutex_const_macros */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_mutex_init + Initializes a mutex object. + The mutex is initially unlocked. + + @note1hang Each mutex-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_mutex_destroy() + when this object is not used anymore + @datatypes + #qurt_mutex_t + + @param[out] lock Pointer to the mutex object. Returns the initialized object. + + @return + None. + + @dependencies + None. + + */ +void qurt_mutex_init(qurt_mutex_t *lock); + +/**@ingroup func_qurt_mutex_destroy + Destroys the specified mutex. + + @note1hang Mutexes must be destroyed when they are no longer in use. Failure to do this + causes resource leaks in the QuRT kernel.\n + @note1cont Mutexes must not be destroyed while they are still in use. If this occurs, the + behavior of QuRT is undefined. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object to destroy. + + @return + None. + + @dependencies + None. + + */ +void qurt_mutex_destroy(qurt_mutex_t *lock); + +/**@ingroup func_qurt_mutex_lock + Locks the specified mutex. + If a thread performs a lock operation on a mutex that is not in use, the thread gains + access to the shared resource that is protected by the mutex, and continues executing. + + If a thread performs a lock operation on a mutex that is already in use by another + thread, the thread is suspended. When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared + resource. + + @note1hang A thread is suspended indefinitely if it locks a mutex that it has already + locked. Avoid this by using recursive mutexes (Section @xref{dox:recursive_mutexes}). + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object. Specifies the mutex to lock. + + @return + None. + + @dependencies + None. + */ +void qurt_mutex_lock(qurt_mutex_t *lock); /* blocking */ + +/**@ingroup func_qurt_mutex_lock_timed + Locks the specified mutex. + When a thread performs a lock operation on a mutex that is not in use, the thread gains + access to the shared resource that is protected by the mutex, and continues executing. + + When a thread performs a lock operation on a mutex that is already in use by another + thread, the thread is suspended. When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared + resource. If the duration of suspension exceeds the timeout duration, wait is + terminated and no access to mutex is granted. 
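+
+  A timed-lock sketch (illustrative only; the 1000-microsecond duration is a
+  hypothetical value within the documented range):
+  @code
+  if (qurt_mutex_lock_timed(&demo_lock, 1000uLL) == QURT_EOK) {
+      // ... the lock was acquired within the timeout ...
+      qurt_mutex_unlock(&demo_lock);
+  } // else QURT_ETIMEDOUT: the wait expired without acquiring the lock
+  @endcode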
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock      Pointer to the mutex object; specifies the mutex to lock.
+  @param[in] duration  Duration (in microseconds) to wait. The duration value must be between
+                       #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_ETIMEDOUT -- Timeout
+
+  @dependencies
+  None.
+ */
+int qurt_mutex_lock_timed (qurt_mutex_t * lock, unsigned long long int duration);
+
+/**@ingroup func_qurt_mutex_unlock
+  Unlocks the specified mutex. \n
+  More than one thread can be suspended on a mutex. When the mutex is unlocked, only the
+  highest-priority thread waiting on the mutex is awakened. If the awakened thread has
+  higher priority than the current thread, a context switch occurs.
+
+  @note1hang The behavior of QuRT is undefined if a thread unlocks a mutex it did not first
+             lock.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock  Pointer to the mutex object. Specifies the mutex to unlock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_mutex_unlock(qurt_mutex_t *lock); /* unlock */
+
+/**@ingroup func_qurt_mutex_try_lock
+  @xreflabel{hdr:qurt_mutex_try_lock}
+  Attempts to lock the specified mutex.
+  If a thread performs a try_lock operation on a mutex that is not in use, the thread gains
+  access to the shared resource that is protected by the mutex, and continues executing.
+
+  @note1hang If a thread performs a try_lock operation on a mutex that it has already locked
+             or that is in use by another thread, qurt_mutex_try_lock immediately returns with a
+             nonzero result value.
+
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock  Pointer to the mutex object. Specifies the mutex to lock.
+
+  @return
+  0 -- Success. \n
+  Nonzero -- Failure.
+
+  @dependencies
+  None.
+ */
+int qurt_mutex_try_lock(qurt_mutex_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_MUTEX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_os_services.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_os_services.h
new file mode 100755
index 0000000000000..cbc4c239e9620
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_os_services.h
@@ -0,0 +1,24 @@
+/*=============================================================================
+
+                 qurt_os_services.h
+
+GENERAL DESCRIPTION
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+=============================================================================*/
+
+#define QURT_OS_SERVICE_THREAD      "/os/thread"      /**< Thread service */
+#define QURT_OS_SERVICE_FS_HUB      "/os/fs_hub"      /**< File-system hub */
+#define QURT_OS_SERVICE_CALLBACK    "/os/callback"    /**< QDI callback service */
+#define QURT_OS_SERVICE_INTERRUPTS  "/os/interrupt"   /**< Interrupt service */
+#define QURT_OS_SERVICE_PROXY       "/os/proxy"       /**< QDI proxy service */
+#define QURT_OS_SERVICE_MEMORY      "/os/memory"      /**< Memory management service */
+#define QURT_OS_SERVICE_MEMPOOL     "/os/mempool"     /**< Pool management service */
+#define QURT_OS_SERVICE_PROCESS     "/os/process"     /**< Process management service */
+#define QURT_OS_SERVICE_MMAP        "/os/mem_mapper"  /**< mmapper service */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pimutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pimutex.h
new file mode 100755
index 0000000000000..61aee5cba7ce8
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pimutex.h
@@ -0,0 +1,200 @@
+#ifndef QURT_PIMUTEX_H
+#define QURT_PIMUTEX_H 1
+/**
+  @file qurt_pimutex.h
+  @brief Prototypes of qurt_pimutex API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+  FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_pimutex_init
+  Initializes a priority inheritance mutex object.
+  The priority inheritance mutex is initially unlocked.
+
+  This function works the same as qurt_mutex_init().
+
+  @note1hang Each pimutex-based object has one or more kernel resources associated with it;
+             to prevent resource leaks, call qurt_pimutex_destroy()
+             when this object is no longer used.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[out] lock  Pointer to the priority inheritance mutex object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_destroy
+  Destroys the specified priority inheritance mutex.
+
+  @note1hang Priority inheritance mutexes must be destroyed when they are no longer in
+             use. Failure to do this causes resource leaks in the QuRT kernel.\n
+  @note1cont Priority inheritance mutexes must not be destroyed while they are still in use.
+             If this occurs, the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock  Pointer to the priority inheritance mutex object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_lock
+  Requests access to a shared resource. If a thread performs a lock operation on a mutex
+  that is not in use, the thread gains access to the shared resource that the mutex protects,
+  and continues executing.
+
+  If a thread performs a lock operation on a mutex that is already in use by another
+  thread, the thread is suspended. When the mutex becomes available again (because the
+  other thread has unlocked it), the thread is awakened and given access to the shared resource.
+
+  If a thread is suspended on a priority inheritance mutex, and the priority of the suspended
+  thread is higher than the priority of the thread that has locked the mutex, the thread
+  with the mutex acquires the higher priority of the suspended thread. The suspended thread
+  blocks until the lock is available.
+
+  @note1hang A thread is not suspended if it locks a priority inheritance mutex that it has
+             already locked. However, the mutex does not become available to other
+             threads until the thread performs a balanced number of unlocks on the mutex.\n
+  @note1cont When multiple threads compete for a mutex, the lock operation for a priority
+             inheritance mutex is slower than it is for a recursive mutex.
+             In particular, it is about 10 times slower when the mutex is available for locking,
+             and slower (with greatly varying times) when the mutex is already locked.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock  Pointer to the priority inheritance mutex object to lock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex_lock(qurt_mutex_t *lock);
+
+
+/**@ingroup func_qurt_pimutex_lock_timed
+  Locks a priority inheritance mutex with a timeout.
+
+  A thread can lock a priority inheritance mutex multiple times. The mutex is not
+  available to other threads until the thread performs the same number of mutex unlock
+  operations.
+
+  If a thread performs a lock operation on a mutex that is already locked by another thread,
+  the thread is moved to the waiting state. When the mutex becomes available again (because the
+  other thread has unlocked the mutex), the thread is awakened and tries to lock the mutex.
+
+  If a thread is waiting on a priority inheritance mutex, and the priority of the waiting thread
+  is higher than the priority of the thread that has locked the mutex, the priority of the thread
+  that has locked the mutex is raised to the same priority as the waiting thread.
+
+  If the duration of waiting exceeds the timeout duration, the waiting is terminated, and
+  the function returns #QURT_ETIMEDOUT as a failure of the mutex lock.
+
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock      Pointer to the mutex object to lock.
+  @param[in] duration  Duration (in microseconds) to wait. The duration value must be between
+                       #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_ETIMEDOUT -- Timeout \n
+  #QURT_EINVALID -- Duration is out of range
+
+  @dependencies
+  None.
+
+ */
+int qurt_pimutex_lock_timed(qurt_mutex_t *lock, unsigned long long int duration);
+
+
+/**@ingroup func_qurt_pimutex_unlock
+  Releases access to a shared resource; unlocks the specified priority inheritance mutex. \n
+  More than one thread can be suspended on a priority inheritance mutex. When the mutex
+  is unlocked, only the highest-priority thread waiting on the mutex is awakened. If the
+  awakened thread has higher priority than the current thread, a context switch occurs.
+
+  When a thread unlocks a priority inheritance mutex, its thread priority is restored to its
+  original value from any higher priority value that it acquired from another thread
+  suspended on the mutex.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock  Pointer to the priority inheritance mutex object to unlock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex_unlock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_try_lock
+  Requests access to a shared resource (without suspend).
Attempts to lock the specified priority inheritance mutex.\n
+  If a thread performs a try_lock operation on a priority inheritance mutex that is not in
+  use, the thread gains access to the shared resource that is protected by the mutex, and
+  continues executing.
+  If a thread performs a try_lock operation on a priority inheritance mutex that is already
+  in use by another thread, qurt_pimutex_try_lock immediately returns with a
+  nonzero result value.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock  Pointer to the priority inheritance mutex object to lock.
+
+  @return
+  0 -- Success. \n
+  Nonzero -- Failure.
+
+  @dependencies
+  None.
+ */
+int qurt_pimutex_try_lock(qurt_mutex_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PIMUTEX_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pimutex2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pimutex2.h
new file mode 100755
index 0000000000000..b809f163cbfd2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pimutex2.h
@@ -0,0 +1,162 @@
+#ifndef QURT_PIMUTEX2_H
+#define QURT_PIMUTEX2_H
+/**
+  @file qurt_pimutex2.h
+  @brief Prototypes of pimutex2 API
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+#include
+#include
+
+/*=============================================================================
+  FUNCTIONS
+=============================================================================*/
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_pimutex2_init
+  Initializes a recursive mutex object.
+
+  @deprecated Use #qurt_pimutex_init instead.
+
+  The recursive mutex is initially unlocked.
+
+  Objects of type pimutex2 solve a potential race condition between
+  unlock() and destroy() operations.
+
+  @datatypes
+  #qurt_rmutex2_t
+
+  @param[out] lock  Pointer to the recursive mutex object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex2_init(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_pimutex2_destroy
+
+  @deprecated Use #qurt_pimutex_destroy instead.
+
+  Destroys the specified recursive mutex. \n
+  @note1cont Recursive mutexes must not be destroyed while they are still in use. If this
+             occurs, the behavior of QuRT is undefined.
+  @note1cont In general, application code should destroy a pimutex2 object prior to
+             deallocating it; calling qurt_pimutex2_destroy() before deallocating it ensures
+             that all qurt_pimutex2_unlock() calls complete.
+
+  @datatypes
+  #qurt_rmutex2_t
+
+  @param[in] lock  Pointer to the recursive mutex object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex2_destroy(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_pimutex2_lock
+
+  @deprecated Use #qurt_pimutex_lock instead.
+
+  Locks the specified recursive mutex. \n
+
+  If a thread performs a lock operation on a recursive mutex that is not being used, the
+  thread gains access to the shared resource that is protected by the mutex, and continues
+  executing.
+
+  If a thread performs a lock operation on a recursive mutex that is already being used by
+  another thread, the thread is suspended.
When the mutex becomes available again
+  (because the other thread has unlocked it), the thread is awakened and given access to the
+  shared resource.
+
+  @note1hang A thread is not suspended if it locks a recursive mutex that it has already
+             locked, but the mutex does not become available until the thread performs a
+             balanced number of unlocks on the mutex.
+
+  @datatypes
+  #qurt_rmutex2_t
+
+  @param[in] lock  Pointer to the recursive mutex object to lock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex2_lock(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_pimutex2_unlock
+
+  @deprecated Use #qurt_pimutex_unlock instead.
+
+  Unlocks the specified recursive mutex. \n
+  More than one thread can be suspended on a recursive mutex. When the mutex is
+  unlocked, only the highest-priority thread waiting on the mutex is awakened. If the
+  awakened thread has higher priority than the current thread, a context switch occurs.
+
+  @datatypes
+  #qurt_rmutex2_t
+
+  @param[in] lock  Pointer to the recursive mutex object to unlock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex2_unlock(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_pimutex2_try_lock
+
+  @deprecated Use #qurt_pimutex_try_lock instead.
+
+  Attempts to lock the specified recursive mutex.\n
+
+  Non-blocking version of qurt_pimutex2_lock(). If a call to qurt_pimutex2_lock() would
+  succeed immediately, this function behaves similarly, and returns 0 for success.
+  If a call to qurt_pimutex2_lock() would not succeed immediately, this function has
+  no effect and returns nonzero for failure.
+
+  @datatypes
+  #qurt_rmutex2_t
+
+  @param[in] lock  Pointer to the recursive mutex object to lock.
+
+  @return
+  0 -- Success. \n
+  Nonzero -- Failure.
+
+ */
+int qurt_pimutex2_try_lock(qurt_rmutex2_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PIMUTEX2_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pipe.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pipe.h
new file mode 100755
index 0000000000000..6bdaa044f8640
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pipe.h
@@ -0,0 +1,479 @@
+#ifndef QURT_PIPE_H
+#define QURT_PIPE_H
+/**
+  @file qurt_pipe.h
+  @brief Prototypes of the pipe interface API.
+  This is a pipe or message queue.
+  It blocks when too full (send) or empty (receive).
+  Unless using a nonblocking option, all datagrams are 64 bits.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup pipe_types
+@{ */
+/*=============================================================================
+  CONSTANTS AND MACROS
+=============================================================================*/
+#define QURT_PIPE_MAGIC                  0xF1FEF1FE /**< Magic. */
+#define QURT_PIPE_ATTR_MEM_PARTITION_RAM 0          /**< RAM. */
+#define QURT_PIPE_ATTR_MEM_PARTITION_TCM 1          /**< TCM. */
+
+/*=============================================================================
+  TYPEDEFS
+=============================================================================*/
+/** QuRT pipe data values type.
*/ +typedef unsigned long long int qurt_pipe_data_t; + +/** QuRT pipe type.*/ +typedef struct { + /** @cond */ + qurt_mutex_t pipe_lock; + qurt_sem_t senders; + qurt_sem_t receiver; + unsigned int size; + unsigned int sendidx; + unsigned int recvidx; + void (*lock_func)(qurt_mutex_t *); + void (*unlock_func)(qurt_mutex_t *); + int (*try_lock_func)(qurt_mutex_t *); + void (*destroy_lock_func)(qurt_mutex_t *); + unsigned int magic; + qurt_pipe_data_t *data; + /** @endcond */ +} qurt_pipe_t; + +/** QuRT pipe attributes type. */ +typedef struct { + /** @cond */ + qurt_pipe_data_t *buffer; + unsigned int elements; + unsigned char mem_partition; + /** @endcond */ +} qurt_pipe_attr_t; + +/** @} */ /* end_addtogroup pipe_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_pipe_attr_init + @xreflabel{hdr:qurt_pipe_attr_init} + Initializes the structure that sets the pipe attributes when a pipe is created. + + After an attribute structure is initialized, the individual attributes in the structure are + explicitly set using the pipe attribute operations. + + The attribute structure is assigned the following default values: \n + - buffer -- 0 \n + - elements -- 0 \n + - mem_partition -- #QURT_PIPE_ATTR_MEM_PARTITION_RAM + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_init(qurt_pipe_attr_t *attr) +{ + attr->buffer = NULL; + attr->elements = 0; + attr->mem_partition = QURT_PIPE_ATTR_MEM_PARTITION_RAM; +} + +/**@ingroup func_qurt_pipe_attr_set_buffer + @xreflabel{sec:qurt_pipe_attr_set_buffer} + Sets the pipe buffer address attribute.\n + Specifies the base address of the memory area to use for the data buffer of a pipe. + + The base address and size (Section @xref{sec:qurt_pipe_attr_set_elements}) specify the + memory area used as a pipe data buffer. The user is responsible for allocating the + memory area used for the buffer. + + @datatypes + #qurt_pipe_attr_t \n + #qurt_pipe_data_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] buffer Pointer to the buffer base address. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_buffer(qurt_pipe_attr_t *attr, qurt_pipe_data_t *buffer) +{ + attr->buffer = buffer; +} + +/**@ingroup func_qurt_pipe_attr_set_elements + @xreflabel{sec:qurt_pipe_attr_set_elements} + Specifies the length of the memory area to use for the data buffer of a pipe. + + The length is expressed in terms of the number of 64-bit data elements that + can be stored in the buffer. + + The base address (Section @xref{sec:qurt_pipe_attr_set_buffer}) and size specify + the memory area used as a pipe data buffer. The user is responsible for + allocating the memory area used for the buffer. + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] elements Pipe length (64-bit elements). + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_elements(qurt_pipe_attr_t *attr, unsigned int elements) +{ + attr->elements = elements; +} + +/**@ingroup func_qurt_pipe_attr_set_buffer_partition + @xreflabel{sec:qurt_pipe_attr_set_buffer_partition} + Specifies the memory type where a pipe's buffer is allocated. + Allocate pipes in RAM or TCM/LPM. 
+
+  @note1hang If a pipe is specified as allocated in TCM/LPM, it must be created
+             with the qurt_pipe_init() operation. The qurt_pipe_create() operation results in an error.
+
+  @datatypes
+  #qurt_pipe_attr_t
+
+  @param[in,out] attr           Pointer to the pipe attribute structure.
+  @param[in]     mem_partition  Pipe memory partition. Values: \n
+                 - #QURT_PIPE_ATTR_MEM_PARTITION_RAM -- Pipe resides in RAM \n
+                 - #QURT_PIPE_ATTR_MEM_PARTITION_TCM -- Pipe resides in TCM/LPM @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_pipe_attr_set_buffer_partition(qurt_pipe_attr_t *attr, unsigned char mem_partition)
+{
+  attr->mem_partition = mem_partition;
+}
+
+/**@ingroup func_qurt_pipe_create
+  Creates a pipe.\n
+  Allocates a pipe object and its associated data buffer, and initializes the pipe object.
+
+  @note1hang The buffer address and size stored in the attribute structure specify how the
+             pipe data buffer is allocated.
+
+  @note1cont If a pipe is specified as allocated in TCM/LPM, it must be created
+             using the qurt_pipe_init() operation. The qurt_pipe_create() operation results in an error.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_attr_t
+
+  @param[out] pipe  Pointer to the created pipe object.
+  @param[in]  attr  Pointer to the attribute structure used to create the pipe.
+
+  @return
+  #QURT_EOK -- Pipe created. \n
+  #QURT_EFAILED -- Pipe not created. \n
+  #QURT_ENOTALLOWED -- Pipe cannot be created in TCM/LPM.
+
+  @dependencies
+  None.
+ */
+int qurt_pipe_create(qurt_pipe_t **pipe, qurt_pipe_attr_t *attr);
+
+/**@ingroup func_qurt_pipe_init
+  Initializes a pipe object using an existing data buffer.
+
+  @note1hang The buffer address and size stored in the attribute structure must
+             specify a data buffer that the user has already allocated.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_attr_t
+
+  @param[out] pipe  Pointer to the pipe object to initialize.
+  @param[in]  attr  Pointer to the pipe attribute structure used to initialize the pipe.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EFAILED -- Failure.
+
+  @dependencies
+  None.
+ */
+int qurt_pipe_init(qurt_pipe_t *pipe, qurt_pipe_attr_t *attr);
+
+/**@ingroup func_qurt_pipe_destroy
+  @xreflabel{sec:qurt_pipe_destroy}
+  Destroys the specified pipe.
+
+  @note1hang Pipes must be destroyed when they are no longer in use. Failure
+             to do this causes resource leaks in the QuRT kernel.
+             Pipes must not be destroyed while they are still in use. If this
+             occurs, the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in] pipe  Pointer to the pipe object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_pipe_destroy(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_delete
+  Deletes the pipe.\n
+  Destroys the specified pipe (Section @xref{sec:qurt_pipe_destroy}) and deallocates the pipe object and its
+  associated data buffer.
+
+  @note1hang Delete pipes only if they were created using qurt_pipe_create
+             (and not qurt_pipe_init). Otherwise the behavior of QuRT is undefined. \n
+  @note1cont Pipes must be deleted when they are no longer in use. Failure to do this
+             causes resource leaks in the QuRT kernel.\n
+  @note1cont Pipes must not be deleted while they are still in use. If this occurs, the
+             behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in] pipe  Pointer to the pipe object to delete.
+
+  @return
+  None.
+
+  @dependencies
+  None.
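+
+  @par Example
+  An illustrative create/use/delete sketch (the element count is hypothetical,
+  and error handling is elided):
+  @code
+  qurt_pipe_t *p;
+  qurt_pipe_attr_t attr;
+  qurt_pipe_attr_init(&attr);
+  qurt_pipe_attr_set_elements(&attr, 8);      // depth: eight 64-bit items
+  if (qurt_pipe_create(&p, &attr) == QURT_EOK) {
+      qurt_pipe_send(p, 0x1234ULL);           // blocks if the pipe is full
+      qurt_pipe_data_t v = qurt_pipe_receive(p);
+      (void)v;
+      qurt_pipe_delete(p);                    // created with qurt_pipe_create()
+  }
+  @endcode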
+ */
+void qurt_pipe_delete(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_send
+  Writes a data item to the specified pipe. \n
+  If a thread writes to a full pipe, it is suspended on the pipe. When another thread reads
+  from the pipe, the suspended thread is awakened and can then write data to the pipe.
+
+  Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+             pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_data_t
+
+  @param[in] pipe  Pointer to the pipe object to write to.
+  @param[in] data  Data item to write.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_pipe_send(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_receive
+  Reads a data item from the specified pipe.
+
+  If a thread reads from an empty pipe, it is suspended on the pipe. When another thread
+  writes to the pipe, the suspended thread is awakened and can then read data from the pipe.
+  Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+             pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in] pipe  Pointer to the pipe object to read from.
+
+  @return
+  Integer containing the 64-bit data item from the pipe.
+
+  @dependencies
+  None.
+*/
+qurt_pipe_data_t qurt_pipe_receive(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_try_send
+  Writes a data item to the specified pipe (without suspending the thread if the pipe is full).\n
+
+  If a thread writes to a full pipe, the operation returns immediately with a result of -1.
+  Otherwise, it returns 0 to indicate a successful write operation.
+
+  Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+             pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_data_t
+
+  @param[in] pipe  Pointer to the pipe object to write to.
+  @param[in] data  Data item to write.
+
+  @return
+  0 -- Success. \n
+  -1 -- Failure (pipe full).
+
+  @dependencies
+  None.
+*/
+int qurt_pipe_try_send(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_try_receive
+  Reads a data item from the specified pipe (without suspending the thread if the pipe is
+  empty).\n
+  If a thread reads from an empty pipe, the operation returns immediately with success set
+  to -1. Otherwise, success is set to 0 to indicate a successful read operation.\n
+
+  Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+             pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in]  pipe     Pointer to the pipe object to read from.
+  @param[out] success  Pointer to the operation status result.
+
+  @return
+  Integer containing a 64-bit data item from the pipe.
+
+  @dependencies
+  None.
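+
+  @par Example
+  An illustrative non-blocking poll (assumes a pipe p created as shown for
+  qurt_pipe_create()):
+  @code
+  int status;
+  qurt_pipe_data_t v = qurt_pipe_try_receive(p, &status);
+  if (status == 0) {
+      // v holds a valid 64-bit item
+  } else {
+      // pipe was empty (status is -1); do other work instead of blocking
+  }
+  @endcode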
+*/
+qurt_pipe_data_t qurt_pipe_try_receive(qurt_pipe_t *pipe, int *success);
+
+/**@ingroup func_qurt_pipe_receive_cancellable
+  Reads a data item from the specified pipe (with suspend), cancellable.
+
+  If a thread reads from an empty pipe, it is suspended on the pipe. When another thread
+  writes to the pipe, the suspended thread is awakened and can then read data from the pipe.
+  The operation is canceled if the user process of the calling thread is killed,
+  or if the calling thread must finish its current QDI invocation and return to user space.
+  A root PD thread can use this API to wait on the pipe for receiving; it is resumed with
+  #QURT_EDESTROY if the pipe is destroyed.
+  Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+             pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_data_t
+
+  @param[in]  pipe    Pointer to the pipe object to read from.
+  @param[out] result  Pointer to the integer containing the 64-bit data item from the pipe.
+
+  @return
+  #QURT_EOK -- Receive completed. \n
+  #QURT_ECANCEL -- Receive canceled. \n
+  #QURT_EDESTROY -- Pipe was destroyed. \n
+  #QURT_ENOTALLOWED -- Pipe is not initialized.
+
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+int qurt_pipe_receive_cancellable(qurt_pipe_t *pipe, qurt_pipe_data_t *result);
+
+/**@ingroup func_qurt_pipe_send_cancellable
+  @xreflabel{hdr:qurt_pipe_send_cancellable}
+  Writes a data item to the specified pipe (with suspend), cancellable. \n
+  If a thread writes to a full pipe, it is suspended on the pipe. When another thread reads
+  from the pipe, the suspended thread is awakened and can then write data to the pipe.
+  The operation is canceled if the user process of the calling thread is killed, or if the
+  calling thread must finish its current QDI invocation and return to user space.
+  A root PD thread can use this API to wait on the pipe for sending; it is resumed with
+  #QURT_EDESTROY if the pipe is destroyed.
+
+  Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+             pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_data_t
+
+  @param[in] pipe  Pointer to the pipe object to write to.
+  @param[in] data  Data item to write.
+
+  @return
+  #QURT_EOK -- Send completed. \n
+  #QURT_ECANCEL -- Send canceled. \n
+  #QURT_EDESTROY -- Pipe was destroyed. \n
+  #QURT_ENOTALLOWED -- Pipe is not initialized.
+
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+int qurt_pipe_send_cancellable(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_is_empty
+  Returns a value indicating whether the specified pipe contains any data.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in] pipe  Pointer to the pipe object to check.
+
+  @return
+  1 -- Pipe contains no data. \n
+  0 -- Pipe contains data.
+
+  @dependencies
+  None.
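+
+  @par Example
+  A sketch of draining a pipe without blocking; correct only when this thread
+  is the sole reader, since another reader could empty the pipe between the
+  check and the receive:
+  @code
+  while (!qurt_pipe_is_empty(p)) {
+      qurt_pipe_data_t v = qurt_pipe_receive(p);  // will not block here
+      (void)v;
+  }
+  @endcode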
+*/
+int qurt_pipe_is_empty(qurt_pipe_t *pipe);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PIPE_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pmem_manager.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pmem_manager.h
new file mode 100755
index 0000000000000..8c8da985228b9
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pmem_manager.h
@@ -0,0 +1,82 @@
+#ifndef QURT_PMEM_MANAGER_H
+#define QURT_PMEM_MANAGER_H
+/**
+  @file qurt_pmem_manager.h
+  Prototypes of kernel physical memory manager APIs
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Constants and macros
+ ======================================================================*/
+
+/* Physical memory API return error codes */
+#define QURT_PMEM_SUCCESS             0
+#define QURT_PMEM_NO_PRIV             1
+#define QURT_PMEM_RETRY               2
+#define QURT_PMEM_OVERLAP             3
+#define QURT_PMEM_NOT_EXIST           4
+#define QURT_PMEM_INIT_FAILURE        5
+#define QURT_PMEM_OUTSTANDING_MAPPING 6
+#define QURT_PMEM_GENERIC_FAILURE     7
+#define QURT_PMEM_ENTRY_FOUND         8
+#define QURT_PMEM_REACH_END           9
+#define QURT_PMEM_UNCLAIMED           10
+#define QURT_PMEM_ALREADY_CLAIMED     11
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_pmem_acquire
+  Acquires the ownership of a specific physical memory region.
+
+  @note1hang The caller becomes the owner.
+
+  @param[in] ppage  Starting physical page number.
+  @param[in] pnum   Number of physical pages.
+
+  @return
+  #QURT_PMEM_NO_PRIV -- Caller has no privilege to claim the ownership. \n
+  #QURT_PMEM_OVERLAP -- All or part of the range is already owned. \n
+  #QURT_PMEM_SUCCESS -- Ownership successfully claimed.
+
+  @dependencies
+  None.
+*/
+int qurt_pmem_acquire(unsigned int ppage, unsigned int pnum);
+
+/**@ingroup func_qurt_pmem_release
+  Releases the ownership of a specific physical memory region.
+
+  @param[in] ppage  Starting physical page number.
+  @param[in] pnum   Number of physical pages.
+
+  @return
+  #QURT_PMEM_NO_PRIV -- Caller has no privilege to release the ownership. \n
+  #QURT_PMEM_NOT_EXIST -- The physical memory range is not usable. \n
+  #QURT_PMEM_OUTSTANDING_MAPPING -- There is an outstanding mapping in this range. \n
+  #QURT_PMEM_SUCCESS -- Ownership successfully released.
+
+  @dependencies
+  None.
+ */
+int qurt_pmem_release(unsigned int ppage, unsigned int pnum);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PMEM_MANAGER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pmu.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pmu.h
new file mode 100755
index 0000000000000..73ea8eba04abf
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_pmu.h
@@ -0,0 +1,121 @@
+#ifndef QURT_PMU_H
+#define QURT_PMU_H
+/**
+  @file qurt_pmu.h
+  Prototypes of the PMU (performance monitoring unit) API.
+
+  EXTERNAL FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021 Qualcomm Technologies, Inc.
+  All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+  FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_pmu_set
+  Sets the value of the specified PMU register.
+
+  @note1hang Setting PMUEVTCFG automatically clears the PMU registers PMUCNT0
+             through PMUCNT3.
+
+  @param[in] reg_id  PMU register. Values:
+                     - #QURT_PMUCNT0
+                     - #QURT_PMUCNT1
+                     - #QURT_PMUCNT2
+                     - #QURT_PMUCNT3
+                     - #QURT_PMUCFG
+                     - #QURT_PMUEVTCFG
+                     - #QURT_PMUCNT4
+                     - #QURT_PMUCNT5
+                     - #QURT_PMUCNT6
+                     - #QURT_PMUCNT7
+                     - #QURT_PMUEVTCFG1 @tablebulletend
+
+  @param[in] reg_value  Register value.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_pmu_set (int reg_id, unsigned int reg_value);
+
+/**@ingroup func_qurt_pmu_get
+  Gets the PMU register.\n
+  Returns the current value of the specified PMU register.
+
+  @param[in] reg_id  PMU register. Values:
+                     - #QURT_PMUCNT0
+                     - #QURT_PMUCNT1
+                     - #QURT_PMUCNT2
+                     - #QURT_PMUCNT3
+                     - #QURT_PMUCFG
+                     - #QURT_PMUEVTCFG
+                     - #QURT_PMUCNT4
+                     - #QURT_PMUCNT5
+                     - #QURT_PMUCNT6
+                     - #QURT_PMUCNT7
+                     - #QURT_PMUEVTCFG1 @tablebulletend
+
+  @return
+  Integer -- Current value of the specified PMU register.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_pmu_get (int reg_id);
+
+/**@ingroup func_qurt_pmu_enable
+  Enables or disables the Hexagon processor PMU.
+  Profiling is disabled by default.
+
+  @note1hang Enabling profiling does not automatically reset the count registers -- this must
+             be done explicitly before starting event counting.
+
+  @param[in] enable  Performance monitor. Values: \n
+                     - 0 -- Disable performance monitor \n
+                     - 1 -- Enable performance monitor @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_pmu_enable (int enable);
+
+/**@ingroup func_qurt_pmu_get_pmucnt
+  Reads the PMU counters in a single trap.
+
+  @param[out] buf  Pointer to a buffer to save the values read from the PMU counters.
+                   The buffer size should be at least 32 bytes to read all eight PMU counters.
+
+  @return
+  #QURT_EOK -- Successful read.\n
+  #QURT_EFATAL -- Failure.
+
+  @dependencies
+  None.
+ */
+int qurt_pmu_get_pmucnt (void * buf);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PMU_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_power.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_power.h
new file mode 100755
index 0000000000000..2ee4d29a73976
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_power.h
@@ -0,0 +1,140 @@
+#ifndef QURT_POWER_H
+#define QURT_POWER_H
+/**
+  @file qurt_power.h
+  @brief Prototypes of power API
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+/*=============================================================================
+
+                        EDIT HISTORY FOR MODULE
+
+ This section contains comments describing changes made to the module.
+ Notice that changes are listed in reverse chronological order.
+
+
+when       who     what, where, why
+--------   ---     ------------------------------------------------------------
+03/03/11   op      Add header file
+12/12/12   cm      (Tech Pubs) Edited/added Doxygen comments and markup.
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @cond */
+/**@ingroup func_qurt_power_shutdown_fail_exit
+  Returns from Power Collapse mode when power collapse cannot proceed.
+
+  This function unmasks the global interrupt. This operation is used only when the thread is
+  recovering from a failed power collapse operation (Section @xref{sec:powerShutdownEnter}).
+
+  @return
+  #QURT_EOK -- Operation was successfully performed.
+
+  @dependencies
+  None.
+ */
+#define qurt_power_shutdown_fail_exit qurt_power_exit
+
+/**@ingroup func_qurt_power_shutdown_exit
+  Undoes state changes made while preparing for power collapse.\n
+  This function unmasks the global interrupts.
+
+  @return
+  #QURT_EOK -- Operation was successfully performed.
+
+  @dependencies
+  None.
+ */
+#define qurt_power_shutdown_exit qurt_power_exit
+/**@endcond */
+
+/**@ingroup func_qurt_system_ipend_get
+  Gets the IPEND register.\n
+
+  @note1hang Returns the current value of the Hexagon processor IPEND register. The return value
+             is a mask value that identifies the individual interrupts that are pending. \n
+
+  @note1hang The bit order of the mask value is identical to the order defined for the IPEND register. A
+             mask bit value of 1 indicates that the corresponding interrupt is pending, and 0 indicates that the
+             corresponding interrupt is not pending. \n
+
+  @return
+  Returns the IPEND register value.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_system_ipend_get (void);
+
+
+/**@ingroup func_qurt_system_vid_get
+  Gets the VID register. \n
+
+  @note1hang Returns the current value of the Hexagon processor VID register. The return value is
+             the vector number of a second-level interrupt that has been accepted by the Hexagon
+             processor core.\n
+
+  @return
+  Returns the VID register value, that is, the L2 VIC interrupt number accepted by the processor.
+  Valid range is 0 to 1023.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_system_vid_get(void);
+
+/**@ingroup func_qurt_power_shutdown_get_pcycles
+  Gets the number of power collapses and the processor cycles for entering and exiting the most recent
+  power collapse.
+
+  @note1hang If no power collapse has occurred yet, the processor cycle numbers are zero.
+
+  @param[out] enter_pcycles  Number of processor cycles for entering the most
+                             recent power collapse.
+  @param[out] exit_pcycles   Number of processor cycles for exiting the most
+                             recent power collapse.
+  @return
+  Zero -- No power collapses have occurred. \n
+  Nonzero -- Number of power collapses that have occurred since
+             the processor was reset.
+
+  @dependencies
+  None.
+ */
+int qurt_power_shutdown_get_pcycles( unsigned long long *enter_pcycles, unsigned long long *exit_pcycles );
+
+/**@ingroup func_qurt_system_tcm_set_size
+  Sets the size of TCM to save during full power collapse.
+
+  @note1hang The size aligns to 32 bytes. If the size passed is greater than the maximum size defined in
+             XML, the size is truncated to the size defined in XML.
+ + @param[in] new_size Size of TCM to save. + + @return + Zero -- Size successfully set \n + -1 -- Size of 0 passed + + @dependencies + None. + */ +int qurt_system_tcm_set_size(unsigned int new_size); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_POWER_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_printf.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_printf.h new file mode 100755 index 0000000000000..a775d8a815918 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_printf.h @@ -0,0 +1,44 @@ +#ifndef QURT_PRINTF_H +#define QURT_PRINTF_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + @file qurt_printf.h + Prototypes of printf API. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/** @addtogroup chapter_function_tracing +@{ */ + +int qurt_printf(const char* format, ...); + +int qurt_vprintf(const char* format, va_list args); + +/** @} */ /* end_addtogroup chapter_function_tracing */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_PRINTF_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_process.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_process.h new file mode 100755 index 0000000000000..0df9ddc2d4a70 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_process.h @@ -0,0 +1,995 @@ +#ifndef QURT_PROCESS_H +#define QURT_PROCESS_H +/** + @file qurt_process.h + @brief Prototypes of QuRT process control APIs. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2009-2013, 2021-2023 Qualcomm Technologies, Inc. + All rights reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ +#include "qurt_callback.h" +#include "qurt_consts.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup process_types +@{ */ +#define QURT_PROCESS_ATTR_NAME_MAXLEN QURT_MAX_NAME_LEN /**< Maximum length of the process name. */ +#define QURT_PROCESS_ATTR_BIN_PATH_MAXLEN 128 /**< Maximum length of the path of binary/ELF for this process. */ +#define QURT_PROCESS_ATTR_CAP_MAXLEN 128 /**< Maximum length for a resource name. */ + +/** QuRT process capability wildcard strings */ +#define QURT_PROCESS_ATTR_CAP_ALLOW_ALL "ALLOW_ALL" /**< Capability wild-card for full access */ +#define QURT_PROCESS_ATTR_CAP_ALLOW_NONE "ALLOW_NONE" /**< Capability wild-card for no access */ + +/** QuRT process capability states */ +#define QURT_PROCESS_ATTR_CAP_ENABLED 0x1 /**< Capability enabled*/ +#define QURT_PROCESS_ATTR_CAP_DISABLED 0x0 /**< Capability disabled*/ + +/* QuRT process thread attributes. */ +#define QURT_PROCESS_DEFAULT_CEILING_PRIO 0 /**< Default ceiling priority of the threads in the new process. */ +#define QURT_PROCESS_DEFAULT_MAX_THREADS -1 /**< Default number of threads in the new process. 
+                                                   -1 indicates that the limit is set to the maximum supported by the system. */
+
+/* QuRT process flags. */
+#define QURT_PROCESS_SUSPEND_ON_STARTUP  (1U)      /**< Suspends the new process just before calling main(). */
+#define QURT_PROCESS_NON_SYSTEM_CRITICAL (1u << 1) /**< Starts the new process as non system-critical. */
+#define QURT_PROCESS_ISLAND_RESIDENT     (1u << 2) /**< Process is island resident. */
+#define QURT_PROCESS_RESTARTABLE         (1u << 3) /**< Indicates that the process is restartable. */
+#define QURT_PROCESS_UNTRUSTED           (1u << 7) /**< Starts the new process as an unsigned process. */
+
+/* QuRT process debugging session status.*/
+#define QURT_DEBUG_NOT_START 0 /**< Debug is not started. */
+#define QURT_DEBUG_START     1 /**< Debug has started. */
+
+/** Process suspend options */
+#define QURT_PROCESS_SUSPEND_DEFAULT 0
+
+/** Process resume options */
+#define QURT_PROCESS_RESUME_DEFAULT 0
+
+
+/* QuRT process types. */
+typedef enum {
+    QURT_PROCESS_TYPE_RESERVED,  /**< Process type is reserved. \n */
+    QURT_PROCESS_TYPE_KERNEL,    /**< Kernel process. \n*/
+    QURT_PROCESS_TYPE_SRM,       /**< SRM process. \n*/
+    QURT_PROCESS_TYPE_SECURE,    /**< Secure process. \n*/
+    QURT_PROCESS_TYPE_ROOT,      /**< Root process. \n*/
+    QURT_PROCESS_TYPE_USER,      /**< User process. */
+}qurt_process_type_t;
+
+/** QuRT process callback types. */
+typedef enum {
+    QURT_PROCESS_DUMP_CB_ROOT,   /**< Register the callback that executes in the
+                                      root process context. \n */
+    QURT_PROCESS_DUMP_CB_ERROR,  /**< Register the user process callback that is
+                                      called after threads in the process are frozen. \n */
+    QURT_PROCESS_DUMP_CB_PRESTM, /**< Register the user process callback that is
+                                      called before threads in the process are frozen. \n*/
+    QURT_PROCESS_DUMP_CB_MAX     /**< Reserved for error checking. */
+}qurt_process_dump_cb_type_t;
+
+/** QuRT process dump attributes. */
+typedef struct _qurt_pd_dump_attr{
+    /** @cond */
+    unsigned int enabled;  /**< Process dump is enabled. */
+    const char *path;      /**< Process dump path. */
+    unsigned int path_len; /**< Length of the process dump path. */
+    /** @endcond */
+}qurt_pd_dump_attr_t;
+
+/** QuRT process capability resource type */
+enum qurt_process_cap_type_t {
+    QURT_PROCESS_CAP_TYPE_NUM_ENTRIES=0, /**< Number of entries in the capability structure*/
+    QURT_PROCESS_CAP_TYPE_DRIVER=1,      /**< Driver resource */
+    QURT_PROCESS_CAP_TYPE_MAX            /**< Maximum identifier */
+};
+
+/** QuRT process capability structure */
+typedef struct _qurt_capability {
+    enum qurt_process_cap_type_t type;       /**< Resource type */
+    char name[QURT_PROCESS_ATTR_CAP_MAXLEN]; /**< Resource name*/
+    unsigned long long cap;                  /**< Capabilities allowed for this resource */
+}qurt_capability_t;
+
+/** QuRT process attributes. */
+typedef struct _qurt_process_attr {
+    /** @cond */
+    char name[QURT_PROCESS_ATTR_NAME_MAXLEN];         /**< Name of the new process. */
+    char path[QURT_PROCESS_ATTR_BIN_PATH_MAXLEN];     /**< Path of the binary for the new process. */
+    char dtb_path[QURT_PROCESS_ATTR_BIN_PATH_MAXLEN]; /**< Path of the DTB ELF for the new process. */
+    int flags;                                        /**< Flags as indicated by QuRT process flags. */
+    unsigned int sw_id;                               /**< Software ID of the process to be loaded. */
+    unsigned sid;                                     /**< Stream ID of the process being spawned. */
+    unsigned max_threads;                             /**< Maximum number of threads that the new process can create. */
+    unsigned short ceiling_prio;                      /**< Maximum priority at which threads can be
+                                                           created by the new process. */
+    qurt_process_type_t type;                         /**< Process type as indicated by
+                                                           #qurt_process_type_t.
*/
+    qurt_pd_dump_attr_t dump_attr;                    /**< Process dump attributes for the new process
+                                                           as indicated by #qurt_pd_dump_attr_t. */
+    qurt_capability_t *capabilities;                  /**< Pointer to an array of structures of type
+                                                           #qurt_capability_t. */
+    /** @endcond */
+} qurt_process_attr_t;
+
+/** @} */ /* end_addtogroup process_types */
+
+/*=============================================================================
+FUNCTIONS
+=============================================================================*/
+ /** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_create
+  Creates a process with the specified attributes, and starts the process.
+
+  The process executes the code in the specified executable ELF file.
+
+  @datatypes
+  #qurt_process_attr_t
+
+  @param[in] attr  Accepts an initialized process attribute structure, which specifies
+                   the attributes of the created process.
+
+  @return
+  A positive return value indicates the process ID.
+  A negative return value indicates one of the following errors: \n
+  #-QURT_EPRIVILEGE -- Caller does not have privilege for this operation \n
+  #-QURT_EMEM -- Not enough memory to perform the operation \n
+  #-QURT_EFAILED -- Operation failed \n
+  #-QURT_ENOTALLOWED -- Operation not allowed \n
+  #-QURT_ENOREGISTERED -- Not registered \n
+  #-QURT_ENORESOURCE -- Resource exhaustion \n
+  #-QURT_EINVALID -- Invalid argument value \n
+  #QURT_EFATAL -- attr is NULL
+
+  @dependencies
+  None.
+*/
+int qurt_process_create (qurt_process_attr_t *attr);
+
+/**@ingroup func_qurt_process_get_id
+  Returns the process identifier for the current thread.
+
+  @return
+  Process identifier for the current thread.
+
+  @dependencies
+  None.
+*/
+int qurt_process_get_id (void);
+/** @endcond */
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_get_uid
+  Returns the user identifier for the current thread.
+
+  @return
+  User identifier for the current thread.
+
+  @dependencies
+  None.
+*/
+int qurt_process_get_uid (void);
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_attr_init
+  Initializes the structure that sets the process attributes when a process is created.
+
+  After an attribute structure is initialized, the individual attributes in the structure can
+  be explicitly set using the process attribute operations.
+
+  Table @xref{tbl:processAttrDefaults} lists the default attribute values set by the initialize
+  operation.
+
+  @inputov{table_process_attribute_defaults}
+
+  @datatypes
+  #qurt_process_attr_t
+
+  @param[out] attr  Pointer to the structure to initialize.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_process_attr_init (qurt_process_attr_t *attr)
+{
+    attr->name[0] = '\0';
+    attr->path[0] = '\0';
+    attr->dtb_path[0] = '\0';
+    attr->flags = 0;
+    attr->sw_id = 0;
+    attr->sid = 0;
+    attr->max_threads = (unsigned)QURT_PROCESS_DEFAULT_MAX_THREADS;
+    attr->ceiling_prio = QURT_PROCESS_DEFAULT_CEILING_PRIO;
+    attr->type = QURT_PROCESS_TYPE_RESERVED;
+    attr->dump_attr.enabled = 0;
+    attr->dump_attr.path = NULL;
+    attr->dump_attr.path_len = 0;
+    attr->capabilities = NULL;
+}
+
+/**@ingroup func_qurt_process_attr_set_executable
+  Sets the process name in the specified process attribute structure.
+
+  Process names identify process objects that are already
+  loaded in memory as part of the QuRT system.
+
+  @note1hang Process objects are incorporated into the QuRT system at build time.
+
+  @note1hang The maximum length of the name string is limited to QURT_PROCESS_ATTR_NAME_MAXLEN - 1.
+
+ @datatypes
+ #qurt_process_attr_t
+
+ @param[in] attr Pointer to the process attribute structure.
+ @param[in] name Pointer to the process name.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_process_attr_set_executable (qurt_process_attr_t *attr, const char *name);
+
+/**@ingroup func_qurt_process_attr_set_binary_path
+ Sets the binary path for process loading in the specified process attribute structure.
+
+ Path specifies the binary to load for this process.
+
+ @note1hang The maximum length of the path string is limited to QURT_PROCESS_ATTR_BIN_PATH_MAXLEN - 1.
+
+ @datatypes
+ #qurt_process_attr_t
+
+ @param[in] attr Pointer to the process attribute structure.
+ @param[in] path Pointer to the binary path.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_process_attr_set_binary_path(qurt_process_attr_t *attr, char *path);
+
+/**@ingroup func_qurt_process_attr_set_dtb_path
+ Sets the DTB binary path for process loading in the specified process attribute structure.
+
+ Path specifies the DTB binary to load for this process.
+
+ @note1hang The maximum length of the path string is limited to QURT_PROCESS_ATTR_BIN_PATH_MAXLEN - 1.
+
+ @datatypes
+ #qurt_process_attr_t
+
+ @param[in] attr Pointer to the process attribute structure.
+ @param[in] path Pointer to the binary path.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_process_attr_set_dtb_path(qurt_process_attr_t *attr, char *path);
+
+/**@ingroup func_qurt_process_attr_set_flags
+Sets the process properties in the specified process attribute structure.
+Process properties are represented as defined symbols that map into bits
+0 through 31 of the 32-bit flag value. Multiple properties are specified by OR'ing
+together the individual property symbols.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr  Pointer to the process attribute structure.
+@param[in] flags QURT_PROCESS_NON_SYSTEM_CRITICAL -- Process is considered non system-critical.
+                 Error services use this attribute to decide whether to kill
+                 the user PD or the whole subsystem.
+                 QURT_PROCESS_ISLAND_RESIDENT -- Process is marked as island resident.
+                 QURT_PROCESS_RESTARTABLE -- Process is marked as restartable.
+                 QURT_PROCESS_UNTRUSTED -- Process is marked as an unsigned process.
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_flags (qurt_process_attr_t *attr, int flags)
+{
+    attr->flags = flags;
+}
+/** @endcond */
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_attr_set_sid
+Sets the process stream ID in the specified process attribute structure.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr Pointer to the process attribute structure.
+@param[in] sid  Stream ID to set for this process.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_sid (qurt_process_attr_t *attr, unsigned sid)
+{
+    attr->sid = sid;
+}
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_attr_set_max_threads
+Sets the maximum number of threads allowed in the specified process attribute structure.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr        Pointer to the process attribute structure.
+@param[in] max_threads Maximum number of threads allowed for this process.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_max_threads (qurt_process_attr_t *attr, unsigned max_threads)
+{
+    attr->max_threads = max_threads;
+}
+
+/**@ingroup func_qurt_process_attr_set_sw_id
+Sets the software ID of the process to load in the specified process attribute structure.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr  Pointer to the process attribute structure.
+@param[in] sw_id Software ID of the process, used in authentication.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_sw_id(qurt_process_attr_t *attr, unsigned int sw_id)
+{
+    attr->sw_id = sw_id;
+}
+
+/**@ingroup func_qurt_process_attr_set_ceiling_prio
+Sets the highest thread priority allowed in the specified process attribute structure.
+Refer to qurt_thread.h for priority ranges.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr Pointer to the process attribute structure.
+@param[in] prio Priority.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_ceiling_prio (qurt_process_attr_t *attr, unsigned short prio)
+{
+    attr->ceiling_prio = prio;
+}
+/** @endcond */
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_attr_set_dump_status
+Sets the process domain dump-enabled field in the process domain dump attributes.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr    Pointer to the process attribute structure.
+@param[in] enabled 1 -- Process domain dump is collected \n
+                   0 -- Process domain dump is not collected
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_dump_status(qurt_process_attr_t *attr, unsigned int enabled)
+{
+    attr->dump_attr.enabled = enabled;
+}
+
+/**@ingroup func_qurt_process_attr_set_dump_path
+Sets the process domain dump path and type.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr     Pointer to the process attribute structure.
+@param[in] path     Path where the process domain dumps must be saved.
+@param[in] path_len Length of the path string.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_dump_path(qurt_process_attr_t *attr, const char *path, int path_len)
+{
+    attr->dump_attr.path = path;
+    attr->dump_attr.path_len = (unsigned int)path_len;
+}
+
+/**@ingroup func_qurt_process_attr_set_capabilities
+Sets the list of capabilities available to this process.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr         Pointer to the process attribute structure.
+@param[in] capabilities Pointer to an array of structures of type #qurt_capability_t defining
+                        resources and capabilities.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_capabilities(qurt_process_attr_t *attr, qurt_capability_t *capabilities)
+{
+    attr->capabilities = capabilities;
+}
+
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_cmdline_get
+Gets the command line string associated with the current process.
+This function retrieves the Hexagon simulator command line arguments, as long
+as the call is made from the process of the QuRT installation and the program
+runs in a simulation environment.
+
+If the function modifies the provided buffer, it zero-terminates
+the string. The function might not modify the provided buffer at all,
+so the caller must set buf[0] to a NUL byte before making the call.
+A truncated command line is returned when the command line is longer
+than the provided buffer.
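+
+A minimal calling sketch (the buffer size here is an arbitrary example value):
+
+@code
+char cmdline[128];
+cmdline[0] = '\0';  // the function might not modify the buffer
+qurt_process_cmdline_get(cmdline, sizeof(cmdline));
+@endcode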
+
+@param[in] buf     Pointer to a character buffer that must be filled in.
+@param[in] buf_siz Size (in bytes) of the buffer pointed to by the buf argument.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+void qurt_process_cmdline_get(char *buf, unsigned buf_siz);
+
+/**@ingroup func_qurt_process_get_thread_count
+Gets the number of threads present in the process indicated by the PID.
+
+@param[in] pid PID of the process for which the information is required.
+
+@return
+Positive value -- Number of threads in the process indicated by the PID. \n
+Negative error code on failure: \n
+ QURT_EFATAL -- Invalid PID \n
+ -QURT_ENOTALLOWED -- Current process does not have access to the target process indicated by the PID
+
+@dependencies
+None.
+*/
+int qurt_process_get_thread_count(unsigned int pid);
+
+/**@ingroup func_qurt_process_get_thread_ids
+Gets the thread IDs for a process indicated by the PID.
+
+@param[in] pid        PID of the process for which the information is required.
+@param[in] ptr        Pointer to a user-passed buffer that must be filled in with thread IDs.
+@param[in] thread_num Number of thread IDs requested.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EFATAL -- Failed, ptr is NULL
+
+@dependencies
+None.
+ */
+int qurt_process_get_thread_ids(unsigned int pid, unsigned int *ptr, unsigned thread_num);
+/** @endcond */
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_dump_get_mem_mappings_count
+Gets the number of mappings present in the process indicated by the PID.
+
+@param[in] pid PID of the process for which the information is required.
+
+@return
+Number of mappings for the process indicated by the PID.
+
+@dependencies
+None.
+*/
+int qurt_process_dump_get_mem_mappings_count(unsigned int pid);
+
+/**@ingroup func_qurt_process_dump_get_mappings
+Gets the mappings for a specified PID.
+
+@note1hang This API skips device type mappings or mappings created by setting the #QURT_PERM_NODUMP attribute.
+
+@param[in] pid   PID of the process for which the information is required.
+@param[in] ptr   Pointer to a buffer that must be filled in with mappings.
+@param[in] count Count of mappings requested.
+
+@return
+Number of mappings filled in the buffer passed by the user.
+
+@dependencies
+None.
+*/
+int qurt_process_dump_get_mappings(unsigned int pid, unsigned int *ptr, unsigned count);
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_attr_get
+Gets the attributes with which the process indicated by the PID was created.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in]     pid  PID of the process for which the information is required.
+@param[in,out] attr Pointer to the user-allocated attribute structure.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EINVALID -- Invalid PID \n
+#QURT_EFATAL -- attr is NULL
+
+@dependencies
+None.
+*/
+int qurt_process_attr_get(unsigned int pid, qurt_process_attr_t *attr);
+
+/**@ingroup func_qurt_process_dump_register_cb
+Registers the process domain dump callback.
+
+@datatypes
+#qurt_cb_data_t \n
+#qurt_process_dump_cb_type_t
+
+@param[in] cb_data  Pointer to the callback information.
+@param[in] type     Callback type; these callbacks are called in the context of the user process domain: \n
+                    #QURT_PROCESS_DUMP_CB_PRESTM -- Before threads of the exiting process are frozen. \n
+                    #QURT_PROCESS_DUMP_CB_ERROR -- After threads are frozen and captured. \n
+                    #QURT_PROCESS_DUMP_CB_ROOT -- After threads are frozen and captured, and CB_ERROR type of callbacks
+                    are called.
+@param[in] priority Priority.
+
+@return
+#QURT_EOK -- Success \n
+Other values -- Failure: \n
+ QURT_EFATAL -- cb_data is NULL \n
+ QURT_EINVALID -- Invalid cb_type \n
+ QURT_EFAILED -- Invalid cb_data
+
+@dependencies
+None.
+*/
+int qurt_process_dump_register_cb(qurt_cb_data_t *cb_data, qurt_process_dump_cb_type_t type, unsigned short priority);
+
+/**@ingroup func_qurt_process_dump_deregister_cb
+Deregisters the process domain dump callback.
+
+@datatypes
+#qurt_cb_data_t \n
+#qurt_process_dump_cb_type_t
+
+@param[in] cb_data Pointer to the callback information to deregister.
+@param[in] type    Callback type.
+
+@return
+#QURT_EOK -- Success. \n
+Other values -- Failure: \n
+ QURT_EFATAL -- cb_data is NULL \n
+ QURT_EINVALID -- Invalid cb_type \n
+ QURT_EFAILED -- Invalid cb_data
+
+@dependencies
+None.
+*/
+int qurt_process_dump_deregister_cb(qurt_cb_data_t *cb_data, qurt_process_dump_cb_type_t type);
+
+/** @endcond */
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_set_rtld_debug
+Sets rtld_debug for a process.
+
+@param[in] pid     PID of the process for which rtld_debug must be set.
+@param[in] address rtld_debug address.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EINVALID -- Invalid PID \n
+#QURT_EFATAL -- Invalid address
+
+@dependencies
+None.
+*/
+int qurt_process_set_rtld_debug(unsigned int pid, unsigned int address);
+
+/**@ingroup func_qurt_process_get_rtld_debug
+Gets rtld_debug for a process.
+
+@param[in]     pid     PID of the process for which rtld_debug must be returned.
+@param[in,out] address Pointer to the user-passed address in which the rtld_debug address must be returned.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EINVALID -- Invalid PID \n
+#QURT_EFATAL -- Invalid address
+
+@dependencies
+None.
+*/
+int qurt_process_get_rtld_debug(unsigned int pid, unsigned int *address);
+/** @endcond */
+/**@ingroup func_qurt_process_exit
+Exits the current user process with an exit code.
+
+@param[in] exitcode Exit code.
+
+@return
+#QURT_EFATAL -- No client found with the specified PID value \n
+#QURT_EINVALID -- Invalid client \n
+#QURT_ENOTALLOWED -- User does not have permission to perform this operation \n
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_process_exit(int exitcode);
+
+/**@ingroup func_qurt_process_kill
+Kills the process represented by the PID with the exit code.
+
+@param[in] pid      PID of the process to kill.
+@param[in] exitcode Exit code.
+
+@return
+#QURT_EFATAL -- No client found with the specified PID value \n
+#QURT_EINVALID -- Invalid client \n
+#QURT_ENOTALLOWED -- User does not have permission to perform this operation \n
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_process_kill(int pid, int exitcode);
+
+
+/**@ingroup func_qurt_debugger_register_process
+Registers the process indicated by the PID with the debug monitor.
+
+@param[in] pid PID of the process.
+@param[in] adr Address.
+
+@return
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_debugger_register_process(int pid, unsigned int adr);
+
+
+/**@ingroup func_qurt_debugger_deregister_process
+Deregisters the process indicated by the PID with the debug monitor.
+
+@param[in] pid PID of the process.
+
+@return
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_debugger_deregister_process(int pid);
+
+/**@ingroup func_qurt_process_exec_callback
+Executes callbacks in the user process as indicated by the client_handle argument.
+
+@param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1).
+@param[in] callback_fn   Callback function to execute.
+@param[in] stack_base    Stack address to use.
+@param[in] stack_size    Stack size.
+
+@return
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_process_exec_callback(int client_handle,
+                               unsigned callback_fn,
+                               unsigned stack_base,
+                               unsigned stack_size);
+
+/**@ingroup func_qurt_process_get_pid
+Gets the process ID of the process that the client_handle argument represents.
+
+@note1hang This API is not supported for unsigned PDs. For unsigned PDs, use qurt_process_get_id().
+
+@param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1).
+@param[in] pid           Pointer to the address to store the PID.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EFATAL -- pid pointer passed as NULL
+
+@dependencies
+None.
+*/
+int qurt_process_get_pid(int client_handle, int * pid);
+
+/**@ingroup func_qurt_process_get_dm_status
+Gets the debugging session status on the process represented by the pid argument.
+
+@param[in]     pid    Process ID.
+@param[in,out] status Address to store the status: \n
+                      #QURT_DEBUG_NOT_START \n
+                      #QURT_DEBUG_START
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EINVALID -- Error
+
+@dependencies
+None.
+*/
+int qurt_process_get_dm_status(unsigned int pid, unsigned int *status);
+
+
+/**@ingroup func_qurt_process_suspend_threads
+ Suspends user threads in a user process identified by its process identifier.
+ The target user process can be a signed user process or an unsigned user process.
+ The caller is a thread in the GuestOS/root process.
+ After the user threads in the target user process are suspended, they cannot be scheduled to run by the kernel
+ until they resume later.
+
+ This function has one optional argument with one default option.
+ #QURT_PROCESS_SUSPEND_DEFAULT suspends user threads in the target user process.
+
+ This function call is synchronous; the function returns after the relevant threads are
+ completely suspended.
+
+ If some user threads in the target user process are set as non-suspendable, this function call does
+ not suspend these threads.
+
+ If the target user process is already suspended, this function call returns success as the
+ confirmation that the user process is suspended.
+
+ QuRT debugger monitor threads in the target user process are non-suspendable; this function call does
+ not suspend those threads.
+
+ If the target user process is a secure user process or a CPZ process, this function call returns an error
+ without suspending the target user process.
+
+ If a user thread in the target user process runs in the guest OS/root process via a QDI call, this function call
+ does not suspend the thread in the guest OS, but instead marks the thread as pending-suspend. The thread is suspended
+ when it exits the guest OS, before executing the first instruction in the user process.
+ In this case, the function returns success while the user thread can be running in the GuestOS, and is suspended
+ when exiting the guest OS.
+
+ @param[in] process_id Process identifier.
+ @param[in] option     Default option #QURT_PROCESS_SUSPEND_DEFAULT suspends user threads in the target user process.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of invalid process_id input \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, on a secure process/CPZ process.
+
+ @dependencies
+ None.
+ */
+int qurt_process_suspend_threads (unsigned int process_id, unsigned int option);
+
+
+/**@ingroup func_qurt_process_resume_threads
+ Resumes a user process identified by its process identifier.
+ The target user process can be a signed user process or an unsigned user process.
+ The caller is a thread in the guest OS/root process.
+ After the user threads in the target user process resume, the kernel scheduler
+ can schedule the user threads to run based on their thread priorities.
+
+ This function has an optional argument, #QURT_PROCESS_RESUME_DEFAULT, which
+ resumes user threads in the target user process.
+
+ This is an asynchronous function; it returns after the kernel moves the user threads from
+ the suspended state to the runnable state. The threads are scheduled to run based on their thread priorities.
+
+ This function call does not resume threads in the target user process that have been set as non-resumable.
+
+ If the target user process has already resumed, this function call confirms that the user process resumed
+ by returning success.
+
+ If the target user process is a secure user process or a CPZ process, this function call returns an error without
+ resuming the target user process.
+
+ If user threads in the target user process run in the guest OS/root process via a QDI call, this function
+ call clears the suspend-pending mark on these threads, so that the threads are not suspended when they exit
+ the guest OS.
+
+ @param[in] process_id Process identifier.
+ @param[in] option     Default option #QURT_PROCESS_RESUME_DEFAULT resumes user threads in the target user process.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of invalid process_id input. \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, on a secure process/CPZ process.
+
+ @dependencies
+ None.
+ */
+int qurt_process_resume_threads (unsigned int process_id, unsigned int option);
+
+/**@ingroup func_qurt_process_vtcm_window_set
+ Sets a VTCM access window for a process.
+ The caller thread must be in the SRM process.
+
+ This is a synchronous function; it ensures that all running threads of the process have the requested
+ window in effect. The requested view for all non-running threads takes effect when they are
+ scheduled.
+
+ @param[in] pid         Process identifier.
+ @param[in] enable      QURT_VTCM_WINDOW_ENABLE enforces the VTCM access window defined by the high and low offsets.
+                        QURT_VTCM_WINDOW_DISABLE ignores the high and low offsets; VTCM access is fully
+                        disabled for the process.
+ @param[in] high_offset Specifies the high window offset, in 4K increments, from the base address of the VTCM.
+                        QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT restores the high offset to its reset value.
+ @param[in] low_offset  Specifies the low window offset, in 4K increments, from the base address of the VTCM.
+                        QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT restores the low offset to its reset value.
+
+ @note1hang
+ When high_offset is set to QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT and low_offset is set to
+ QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT, the full VTCM range is accessible. Access to VTCM is controlled
+ via the MMU mapping for the process.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+ #QURT_ENOTSUPPORTED -- Failure because the operation is not supported due to limitations in HW capabilities.
+
+ @dependencies
+ None.
+ */
+int qurt_process_vtcm_window_set(int pid, unsigned int enable, unsigned int high_offset, unsigned int low_offset);
+
+/**@ingroup func_qurt_process_vtcm_window_get
+ Gets the VTCM window for a process.
+ The caller thread must be in the SRM process.
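+
+ A hedged usage sketch (the pid value is hypothetical; assumes the caller
+ runs in the SRM process):
+
+ @code
+ unsigned int enable, hi, lo;
+ if (qurt_process_vtcm_window_get(pid, &enable, &hi, &lo) == QURT_EOK) {
+     if (enable == QURT_VTCM_WINDOW_ENABLE) {
+         // hi and lo describe the current window, in 4K increments
+     }
+ }
+ @endcode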
+
+
+ @param[in]  pid         Process identifier.
+ @param[out] enable      Address to store the enable status, if set.
+ @param[out] high_offset Address to return the high window offset, in 4K increments, from the base address of the VTCM.
+ @param[out] low_offset  Address to return the low window offset, in 4K increments, from the base address of the VTCM.
+
+ @note1hang
+ The user must first check the returned value of enable before checking the high and low offsets.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+ #QURT_ENOTSUPPORTED -- Failure because the operation is not supported due to limitations in HW capabilities.
+
+ @dependencies
+ None.
+ */
+int qurt_process_vtcm_window_get(int pid, unsigned int *enable, unsigned int *high_offset, unsigned int *low_offset);
+
+/**@ingroup func_qurt_process_set_group_config
+ Enables thread groups in the process with the ceiling priorities set up.
+
+ @param[in] process_id         Process identifier.
+ @param[in] group_bitmask      64-bit mask of active thread groups.
+ @param[in] ceiling_priorities Array of ceiling priorities for the thread groups.
+
+ @note1hang
+ This API can only be called by the root PD and can only be called once for each process; otherwise it is
+ rejected. Group 0 must be enabled in group_bitmask, otherwise QuRT returns an error. After this API, all
+ existing threads are moved to group 0, and if any thread's priority is higher than the ceiling
+ priority of group 0, it is lowered to the ceiling value.
+ Example 1:
+   group_bitmask = 0xD7; //'b11010111
+   ceiling_priorities[] = {100, 128, 200, 0, 196, 0, 240, 20}; // 0 - does not care
+ Example 2:
+   group_bitmask = 0x5; //'b101
+   ceiling_priorities[] = {240, 0, 20}; // 0 - does not care
+
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_ENOTALLOWED -- The group has been configured already.
+
+ @dependencies
+ None.
+ */
+int qurt_process_set_group_config(unsigned int process_id, unsigned long long group_bitmask,
+                                  unsigned char *ceiling_priorities);
+
+
+/**@ingroup func_qurt_process_stid_set
+ Sets the specified stid for a process or for a thread group within a process.
+
+ @param[in] pid      Process identifier.
+ @param[in] group_id Group identifier.
+ @param[in] stid     stid to set.
+
+ @note1hang
+ The user can pass the default group_id (QURT_THREAD_DEFAULT_GROUP_ID) if the stid must be set at the process level.
+ All threads within a process that have the default stid (QURT_STID_DEFAULT) inherit the stid set for the process.
+ When a non-default group_id is specified, the stid is set only for a thread group.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EFATAL -- Invalid PID \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+
+ @dependencies
+ None.
+ */
+int qurt_process_stid_set(unsigned int pid, unsigned int group_id , unsigned int stid);
+
+/**@ingroup func_qurt_process_stid_get
+ Gets the stid for a process or for a thread group within a process.
+
+ @param[in]  pid      Process identifier.
+ @param[in]  group_id Group identifier.
+ @param[out] stid     Pointer to a variable to return the stid.
+
+ @note1hang
+ The user can pass the default group_id (QURT_THREAD_DEFAULT_GROUP_ID) to return the process-level stid.
+ When a non-default group_id is specified, the stid is returned only for a thread group.
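+
+ A minimal sketch of a process-level query (the pid value is hypothetical):
+
+ @code
+ unsigned int stid;
+ int ret = qurt_process_stid_get(pid, QURT_THREAD_DEFAULT_GROUP_ID, &stid);
+ @endcode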
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EFATAL -- Invalid PID \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+
+ @dependencies
+ None.
+ */
+int qurt_process_stid_get(unsigned int pid, unsigned int group_id , unsigned int *stid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_profile.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_profile.h
new file mode 100755
index 0000000000000..2a50c461440f6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_profile.h
@@ -0,0 +1,98 @@
+#ifndef QURT_PROFILE_H
+#define QURT_PROFILE_H
+/**
+  @file qurt_profile.h
+  QuRT profiling support.
+
+EXTERNAL FUNCTIONS
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup profiling_macros
+@{ */
+#define QURT_PROFILE_DISABLE 0 /**< Disable profiling. */
+#define QURT_PROFILE_ENABLE  1 /**< Enable profiling. */
+
+typedef unsigned int qurt_profile_param_t;
+
+#define QURT_PROFILE_PARAM_THREAD_READY_TIME 0U /**< Profile thread ready time. */
+
+/** @} */ /* end_addtogroup profiling_macros */
+
+/** @addtogroup profiling_types
+ @{ */
+/** Profiling results. */
+typedef union
+{
+    /** Result associated with #QURT_PROFILE_PARAM_THREAD_READY_TIME. */
+    struct
+    {
+        unsigned int ticks; /**< Cumulative ticks the thread was ready. */
+    } thread_ready_time;
+
+} qurt_profile_result_t;
+/** @} */ /* end_addtogroup profiling_types */
+
+/**@ingroup func_qurt_profile_enable2
+ * Starts profiling of a specific parameter on a specific thread (as applicable).
+ *
+ * @param[in] param     Profiling parameter.
+ * @param[in] thread_id ID of the thread (if applicable) for which the specified
+ *                      parameter must be profiled.
+ * @param[in] enable    #QURT_PROFILE_DISABLE -- disable \n #QURT_PROFILE_ENABLE --
+ *                      enable
+ *
+ * @return
+ * #QURT_EOK -- Success \n
+ * #QURT_EALREADY -- Measurement already in progress or already stopped \n
+ * #QURT_ENOTHREAD -- Thread does not exist \n
+ * #QURT_EINVALID -- Invalid profiling parameter \n
+ *
+ * @dependencies
+ * None.
+ */
+extern int qurt_profile_enable2 (
+    qurt_profile_param_t param,
+    qurt_thread_t        thread_id,
+    int                  enable
+);
+
+/**@ingroup func_qurt_profile_get
+ * Gets the value of the profiling parameter that was previously enabled.
+ *
+ * @param[in]  param     Profiling parameter.
+ * @param[in]  thread_id ID of the thread (if applicable) for which the specified
+ *                       profiling parameter must be retrieved.
+ * @param[out] result    Profiling result associated with the parameter for the specified
+ *                       thread (if applicable).
+ *
+ * @return
+ * #QURT_EOK -- Success \n
+ * #QURT_EFAILED -- Operation failed; profiling was not enabled \n
+ * #QURT_ENOTHREAD -- Thread does not exist \n
+ * #QURT_EINVALID -- Invalid profiling parameter \n
+ *
+ * @dependencies
+ * None.
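+ *
+ * A minimal measurement sketch (illustrative; assumes the calling thread
+ * profiles itself via qurt_thread_get_id() from qurt_thread.h):
+ *
+ * @code
+ * qurt_profile_result_t result;
+ * qurt_thread_t tid = qurt_thread_get_id();
+ * qurt_profile_enable2(QURT_PROFILE_PARAM_THREAD_READY_TIME, tid, QURT_PROFILE_ENABLE);
+ * // ... run the workload to measure ...
+ * qurt_profile_get(QURT_PROFILE_PARAM_THREAD_READY_TIME, tid, &result);
+ * qurt_profile_enable2(QURT_PROFILE_PARAM_THREAD_READY_TIME, tid, QURT_PROFILE_DISABLE);
+ * // result.thread_ready_time.ticks now holds the cumulative ready time
+ * @endcode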
+ */ +extern int qurt_profile_get ( + qurt_profile_param_t param, + qurt_thread_t thread_id, + qurt_profile_result_t * result +); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_ptrace.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_ptrace.h new file mode 100755 index 0000000000000..622304dd92865 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_ptrace.h @@ -0,0 +1,37 @@ +/*============================================================================= + + qurt_ptrace.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013 by Qualcomm Technologies, Inc. All Rights Reserved. +=============================================================================*/ +#ifndef __SYS_PTRACE_H__ +#define __SYS_PTRACE_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +enum __ptrace_request +{ + /** + Indicates that the process making this request is requesting to be traced. + */ + PTRACE_TRACEME = 0, + PTRACE_EXT_IS_DEBUG_PERMITTED = 500 +}; + +long ptrace(enum __ptrace_request request, unsigned int pid, void*addr, void *data); + +#ifdef __cplusplus +} +#endif + +#endif //__SYS_PTRACE_H__ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi.h new file mode 100755 index 0000000000000..705408e5cfc6f --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi.h @@ -0,0 +1,185 @@ +#ifndef QDI_H +#define QDI_H + +/** + @file qurt_qdi.h + @brief Prototypes of QuRT Driver Invocation API functions + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + + +#include "qurt_qdi_constants.h" +#include "qurt_qdi_imacros.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_qdi_open + Opens the specified driver for subsequent operations. + qurt_qdi_open() is the primary mechanism by which a driver user can + obtain a QDI handle. The user provides the name of the driver to the + qurt_qdi_open call, and gets back a handle referencing + the named driver. \n + @note1hang For reasons related to the Hexagon standard for varargs functions, the + qurt_qdi_open function prototype is not actually defined as a varargs. + + + @param[in] p Driver name. + @param[in] ... Up to nine additional device-specific arguments can be passed as parameters, + and should follow the POSIX open() convention. \n + - flags -- Optional second parameter (POSIX flags), the handle + access requested (read-only, write-only, or read-write, + for instance) and other flags such as whether the call + should create a new device or only open an existing + device. \n + - mode -- Optional third parameter (POSIX mode); permissions to + configure when a new device is created. @tablebulletend + + @return + Negative value -- Error. \n + Non-negative value -- Success, this result value serves as a handle to the + opened driver. + @dependencies + None. + */ +// int qurt_qdi_open(); +#define qurt_qdi_open(p,...) 
\
+   qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,QDI_OPEN,(p),##__VA_ARGS__)
+
+#define qurt_qdi_open_dt(p,q,...) \
+   qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,QDI_OPEN_FROM_DT,(p),(q),##__VA_ARGS__)
+
+/**@ingroup func_qurt_qdi_handle_invoke
+ Performs a generic driver operation, which (depending on the specified operation) can
+ be either one of the predefined operations listed in @xhyperref{tbl:functionMapping,QDI function mapping}
+ or a driver-specific operation.
+ The user provides a QDI handle and an integer
+ method number, along with 0 to 8 optional 32-bit arguments.
+ The device driver invocation function is invoked with the
+ same method number and 0 to 8 optional arguments. The
+ return value from the invocation function is passed back to
+ the user as the return value of qurt_qdi_handle_invoke.
+
+ @note1hang For reasons related to the Hexagon standard for varargs functions, the
+            qurt_qdi_handle_invoke() function prototype is not actually defined as a
+            varargs function (and would break if it were defined this way).
+
+ @param[in] h   Driver handle.
+ @param[in] m   Integer number for the operation to perform.
+ @param[in] ... Up to eight optional arguments can be passed to the device driver as operation-specific parameters: \n
+                arg1 -- First parameter \n
+                arg2 -- Second parameter \n
+                arg3 -- Third parameter \n
+                arg4 -- Fourth parameter \n
+                arg5 -- Fifth parameter \n
+                arg6 -- Sixth parameter \n
+                arg7 -- Seventh parameter \n
+                arg8 -- Eighth parameter
+
+ @return
+ Integer value defined by the device driver. \n
+ -1 -- Error.
+
+ @dependencies
+ None.
+ */
+// int qurt_qdi_handle_invoke();
+#define qurt_qdi_handle_invoke(h,m,...) \
+   _QDMPASTE(_QDMHI,_QDMCNT(QDI_HANDLE_LOCAL_CLIENT,h,m,##__VA_ARGS__))(QDI_HANDLE_LOCAL_CLIENT,h,m,##__VA_ARGS__)
+#define _QDMHI3(a,b,c) qurt_qdi_qhi3(0,b,c)
+#define _QDMHI4(a,b,c,d) qurt_qdi_qhi4(0,b,c,(int)(d))
+#define _QDMHI5(a,b,c,d,e) qurt_qdi_qhi5(0,b,c,(int)(d),(int)(e))
+#define _QDMHI6(a,b,c,d,e,f) qurt_qdi_qhi6(0,b,c,(int)(d),(int)(e),(int)(f))
+#define _QDMHI7(a,b,c,d,e,f,g) qurt_qdi_qhi7(8,b,c,(int)(d),(int)(e),(int)(f),(int)(g))
+#define _QDMHI8(a,b,c,d,e,f,g,h) qurt_qdi_qhi8(8,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h))
+#define _QDMHI9(a,b,c,d,e,f,g,h,i) qurt_qdi_qhi9(16,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i))
+#define _QDMHI10(a,b,c,d,e,f,g,h,i,j) qurt_qdi_qhi10(16,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j))
+#define _QDMHI11(a,b,c,d,e,f,g,h,i,j,k) qurt_qdi_qhi11(24,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k))
+#define _QDMHI12(a,b,c,d,e,f,g,h,i,j,k,l) qurt_qdi_qhi12(24,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k),(int)(l))
+int qurt_qdi_qhi3(int,int,int);
+int qurt_qdi_qhi4(int,int,int,int);
+int qurt_qdi_qhi5(int,int,int,int,int);
+int qurt_qdi_qhi6(int,int,int,int,int,int);
+int qurt_qdi_qhi7(int,int,int,int,int,int,int);
+int qurt_qdi_qhi8(int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi9(int,int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi10(int,int,int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi11(int,int,int,int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi12(int,int,int,int,int,int,int,int,int,int,int,int);
+
+/**@ingroup func_qurt_qdi_write
+ Writes data to the specified driver.
+ A predefined invocation routine for drivers that
+ support a POSIX-like write functionality.
+ qurt_qdi_write(handle, buf, len) is equivalent to:
+ qurt_qdi_handle_invoke(handle, QDI_WRITE, handle, buf, len);
+
+ @param[in] handle Driver handle.
+ @param[in] buf    Pointer to the memory address where the data to write is stored.
+ @param[in] len    Number of bytes of data to write.
+
+ @return
+ Non-negative integer -- Number of bytes written. \n
+ Negative error code -- Write could not take place.
+
+ @dependencies
+ None.
+ */
+int qurt_qdi_write(int handle, const void *buf, unsigned len);
+
+/**@ingroup func_qurt_qdi_read
+ User-visible API to read data from a QDI handle.
+ A predefined invocation routine for drivers that
+ support a POSIX-like read functionality.
+ qurt_qdi_read(handle, buf, len) is equivalent to:
+ qurt_qdi_handle_invoke(handle, QDI_READ, handle, buf, len);
+
+ @param[in] handle Driver handle.
+ @param[in] buf    Pointer to the memory address where the data read is stored.
+ @param[in] len    Number of bytes of data to read.
+
+ @return
+ Non-negative integer -- Number of bytes read. \n
+ Negative error code -- Read could not take place.
+
+ @dependencies
+ None.
+ */
+int qurt_qdi_read(int handle, void *buf, unsigned len);
+
+/**@ingroup func_qurt_qdi_close
+ Closes the specified driver, releasing any resources associated with the open driver.
+ User-visible API to close a QDI handle.
+
+ This API should be called when the user is done using a
+ QDI-based handle. When this function is called, the driver can release
+ any resources held and perform other necessary cleanup
+ operations. qurt_qdi_close(handle) is equivalent to:
+ qurt_qdi_handle_invoke(handle, QDI_CLOSE, handle)
+
+ @param[in] handle Driver handle.
+
+ @return
+ 0 -- Success. \n
+ Negative error code -- Failure.
+
+ @dependencies
+ None.
+ */
+int qurt_qdi_close(int handle);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_constants.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_constants.h
new file mode 100755
index 0000000000000..4866fada067f0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_constants.h
@@ -0,0 +1,193 @@
+#ifndef QDI_CONSTANTS_H
+#define QDI_CONSTANTS_H
+
+/**
+  @file qurt_qdi_constants.h
+  @brief Predefined invocation methods for drivers.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2013-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+|| Method numbers used for QDI.
+||
+|| Intended grouping of method numbers for QDI
+|| including future usage:
+||
+|| Method 0 should always be unused and not responded to by
+|| any driver.
+|| Methods 1 and 2 are reserved for name registration and
+|| name lookup.
+|| Methods 3 through 31 are reserved for POSIX-type operations
+|| on open handles.
+|| Methods 32 through 127 are reserved for the QDI infrastructure
+|| and may be extended in the future to provide standard
+|| driver debug services, management services, and system
+|| notifications.
+|| Methods 128 through 255 are reserved for the use of automatically
+|| generated methods such as might be generated by an IDL (interface
+|| definition language).
The infrastructure may be extended to +|| perform services on these methods based on information provided +|| by the IDL, such as automatic buffer validation, etc. These +|| method numbers should not be used for any "ad hoc" methods. +|| Methods with number >= 256 are "private" method numbers that are +|| outside the scope of the QDI infrastructure. Drivers that want +|| to generate and consume their own "ad hoc" methods are free to +|| use these method numbers as they wish. The infrastructure does +|| not generate these method numbers or respond to them, but +|| passes them on unmolested. +|| +|| All driver implementations *should* return a value of +|| -1 when called with an unsupported method. The standard error +|| return value for POSIX APIs is -1, so we emulate that behavior +|| here. +*/ +/** @cond */ +#define QDI_UNUSED 0 +#define QDI_DEVNAME_REGISTER 1 +#define QDI_OPEN 2 +#define QDI_CLOSE 3 +#define QDI_READ 4 +#define QDI_WRITE 5 +#define QDI_IOCTL 6 +#define QDI_MMAP 7 +#define QDI_OS_FILEOPEN 8 +#define QDI_FLEN 9 +#define QDI_UNLINK 10 +#define QDI_FTELL 22 +#define QDI_SEEK 23 +#define QDI_FSTAT 24 + +#define QDI_FSNAME_REGISTER 150 +#define QDI_FS_OPEN 151 +#define QDI_MMAP2 153 +#define QDI_MPROTECT2 154 +#define QDI_MUNMAP2 155 + +#define QDI_CLIENT_HANDLE_OBJREF_GET 10 + +#define QDI_OS_PROCESS_LOAD 12 +#define QDI_OS_PROCESS_CHOOSE_ASID 13 + +#define QDI_OS_SET_GP 26 +#define QDI_CLIENT_HANDLE_CALLBACK 27 + +#define QDI_CLIENT_HANDLE_ISLAND_HANDLE_CREATE_FROM_OBJ_T 19 //reused +#define QDI_CLIENT_HANDLE_HANDLE_CREATE_FROM_OBJ_T 80 +#define QDI_CLIENT_HANDLE_HANDLE_RELEASE 81 +#define QDI_CLIENT_HANDLE_COPY_FROM_USER 82 +#define QDI_CLIENT_HANDLE_COPY_TO_USER 83 +#define QDI_CLIENT_HANDLE_SIGNAL_GROUP_CREATE 86 +#define QDI_CLIENT_HANDLE_SAFE_CACHE_OPS 87 + +#define QDI_CLIENT_HANDLE_BUFFER_LOCK 41 +#define QDI_CLIENT_HLOSPOOL_INFO_GET 90 +#define QDI_CLIENT_HLOSPOOL2_INFO_GET 96 + +#define QDI_CLIENT_PID 44 +#define QDI_CLIENT_ASID QDI_CLIENT_PID + +#define QDI_OS_CLIENT_INFO_GET 48 + +#define QDI_OS_MEM_LOOKUP_PHYSADDR 57 + +#define QDI_OS_THREAD_ITERATOR_CREATE 68 +#define QDI_OS_THREAD_ITERATOR_NEXT 69 + +#define QDI_OS_SYSENV 78 + +#define QDI_REGION_USERMALLOC_INIT 180 // This method is for generic handle + + +#define QDI_CLIENT_HANDLE_USER_MALLOC 84 +#define QDI_CLIENT_HANDLE_USER_FREE 85 + +#define QDI_SIGNAL_GROUP_SIGNAL_CREATE 96 +#define QDI_SIGNAL_GROUP_WAIT 98 +#define QDI_SIGNAL_GROUP_POLL 99 +#define QDI_SIGNAL_SET 96 +#define QDI_SIGNAL_CLEAR 97 +#define QDI_SIGNAL_WAIT 98 +#define QDI_SIGNAL_POLL 99 + +#define QDI_OS_WAIT_FOR_MAIN_REAPER 104 + +#define QDI_CLIENT_HANDLE_REFPROXY_INSTALL 105 +#define QDI_CLIENT_HANDLE_REFPROXY_ADD 106 +#define QDI_CLIENT_HANDLE_REFPROXY_REMOVE 107 + +#define QDI_CLIENT_HANDLE_DETACH 116 + +#define QDI_OS_RESERVED1 139 + +#define QDI_CLIENT_HANDLE_BUFFER_LOCK2 142 + +#define QDI_DT_REGISTER 158 +#define QDI_OPEN_DEVICE 159 +#define QDI_OPEN_FROM_DT 160 + +#define QDI_PRIVATE 256 /* Method numbers beginning at 256 + are private method numbers, which + are device-specific and available + for use by device implementors. */ +/* +|| Permission bitmasks for use with qurt_qdi_lock_buffer(). +|| +|| Make sure these match with permission values from qurt_perm_t. +*/ +/** @endcond */ + +/** @addtogroup driver_support_constants +@{ */ +#define QDI_PERM_W 2 /**< Write access. */ +#define QDI_PERM_R 1 /**< Read access. */ +#define QDI_PERM_RW (QDI_PERM_R | QDI_PERM_W) /**< Read/write access. 
*/ + +#define QDI_HANDLE_LOCAL_CLIENT 3 /**< Local client. */ +#define QDI_HANDLE_GENERIC 4 /**< Generic. */ + +#define QDI_REFCNT_BASE 0x510000 /**< */ +#define QDI_REFCNT_MAXED 0x51FFFD /**< */ +#define QDI_REFCNT_INIT 0x51FFFE /**< Driver object is temporary and is eventually deleted.*/ +#define QDI_REFCNT_PERM 0x51FFFF /**< Driver object is permanent and is never deleted. */ +/** @} */ /* end_addtogroup driver_support_constants */ + +/** @cond */ +/* +|| Flags used by process loaders. +*/ + +#define QDI_OS_PROCESS_FLAGS_ISLAND_RESIDENT 0x1 /* Set this flag to request the loaded process + to have island residency. */ +#define QDI_OS_PROCESS_FLAGS_ROOT_RESIDENT 0x2 /* Set this flag to request the loaded process + to have root residency, for example, DL Pager. */ +/* +|| Constants used for qurt_event register API, type field. +*/ + +#define QURT_PROCESS_EXIT 1 + +/* +|| Constants used by QDI extensions. +*/ + +#define QURT_QDI_SINGLETON_TYPE_TRUE 0 +#define QURT_QDI_SINGLETON_TYPE_FALSE 1 +#define QURT_QDI_SINGLETON_TYPE_PER_PROCESS 2 +/** @endcond */ +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QDI_CONSTANTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_driver.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_driver.h new file mode 100755 index 0000000000000..e044e25f1bb72 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_driver.h @@ -0,0 +1,868 @@ +#ifndef QURT_QDI_DRIVER_H +#define QURT_QDI_DRIVER_H + +/** + @file qurt_qdi_driver.h + @brief Definitions, macros, and prototypes used when writing a + QDI driver. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2018, 2019-2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include "stddef.h" +#include "qurt_qdi.h" +#include "qurt_types.h" +#include "qurt_callback.h" +#include "qurt_qdi_constants.h" +#include "qurt_qdi_imacros.h" +#include "qurt_mutex.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| This gives the canonical form for the arguments to a QDI +|| driver invocation function. The arguments are as follows: +|| +|| int client_handle (R0) QDI handle that represents the client +|| that made this QDI request. If the +|| client is remote, this is a +|| variable handle; if the client is local +|| (same thread and process), this is +|| set to QDI_HANDLE_LOCAL_CLIENT. +|| +|| qurt_qdi_obj_t *obj (R1) Points at the qdi_object_t structure +|| on which this QDI request is being made. +|| The qdi_object_t structure is usually +|| the first element of a larger structure +|| that contains state associated with the +|| object; because it is usually the first +|| element, the object pointers can be freely +|| interchanged through casts. +|| +|| int method (R2) Integer QDI method that represents +|| the request type. +|| +|| qurt_qdi_arg_t arg1 (R3) First three general purpose arguments +|| qurt_qdi_arg_t arg2 (R4) to the invocation function are passed in +|| qurt_qdi_arg_t arg3 (R5) these slots. +|| +|| qurt_qdi_arg_t arg4 (SP+0) Arguments beyond the first three are +|| qurt_qdi_arg_t arg5 (SP+4) passed on the stack. 
+|| qurt_qdi_arg_t arg6 (SP+8) +|| qurt_qdi_arg_t arg7 (SP+12) +|| qurt_qdi_arg_t arg8 (SP+16) +|| qurt_qdi_arg_t arg9 (SP+20) +|| +|| The canonical form of the invocation function takes a +|| total of 12 arguments, but not all of them are used. In general, +|| the QDI infrastructure only passes those arguments provided by +|| the caller; if the invocation function accesses additional +|| arguments beyond those provided by the caller, the values are not +|| useful. +*/ +/** @cond */ +#define QDI_INVOKE_ARGS \ + int, struct qdiobj *, int, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t + +#define QDI_EXT_INVOKE_ARGS \ + int, qurt_qdi_man_obj_t*, int, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t + +#define BUFFER_LOCK 1 +#define BUFFER_UNLOCK 0 + +struct qdiobj; +/** @endcond */ +/** @addtogroup driver_support_types +@{ */ +typedef union { + void *ptr; /**< Pointer to the driver handle. */ + int num; /**< Method number. */ +} qurt_qdi_arg_t; +/** @} */ /* end_addtogroup driver_support_types */ +/** @cond */ +/** QuRT QDI driver version */ +typedef union { + int num; + struct { + short major; /** Driver major version number. */ + short minor; /** Driver minor version number. */ + }; +} qurt_qdi_version_t; + +typedef int (*qurt_qdi_pfn_invoke_t)(QDI_INVOKE_ARGS); +typedef void (*qurt_qdi_pfn_release_t)(struct qdiobj *); +/** @endcond */ +/** @addtogroup driver_support_types +@{ */ +typedef struct qdiobj { + qurt_qdi_pfn_invoke_t invoke; /**< Invocation function that implements the driver methods.*/ + int refcnt; /**< Reference count, an integer value maintained by the QDI infrastructure that tracks the number of + references to a driver instance. 
*/
+    qurt_qdi_pfn_release_t release; /**< Release function that performs the cleanup associated with deleting an instance
+                                         of the driver object. */
+} qurt_qdi_obj_t;
+/** @} */ /* end_addtogroup driver_support_types */
+/** @cond */
+/** QuRT QDI managed object */
+typedef struct qurt_qdi_man_obj
+{
+    qurt_qdi_obj_t qdi_obj;
+    union
+    {
+        struct qurt_qdi_ext_driver * opener_obj;
+        struct qurt_qdi_ext_device * device_obj;
+    };
+}qurt_qdi_man_obj_t;
+
+typedef int (*qurt_qdi_ext_pfn_create_t)(int client_id, const char *name, qurt_qdi_version_t version, qurt_qdi_man_obj_t **qdi_obj);
+typedef int (*qurt_qdi_ext_pfn_create_device_t)(int client_id, const char *name, qurt_qdi_version_t version, struct qurt_qdi_ext_device * device, qurt_qdi_man_obj_t **qdi_obj);
+typedef int (*qurt_qdi_ext_pfn_invoke_t)(QDI_EXT_INVOKE_ARGS);
+typedef void (*qurt_qdi_ext_pfn_destroy_t)(qurt_qdi_man_obj_t *qdi_obj);
+typedef int (*qurt_qdi_ext_pfn_probe_t)(void *handle, struct qurt_qdi_ext_device **device);
+
+typedef struct qurt_qdi_ext_obj_info{
+    qurt_qdi_man_obj_t *obj;
+    int qdi_client_id;
+    struct qurt_qdi_ext_obj_info *next;
+}qurt_qdi_ext_obj_info_t;
+typedef struct qurt_qdi_ext_obj_info *qurt_qdi_ext_obj_info_ptr;
+
+/** QuRT QDI device */
+//temporarily add this back while there are still drivers that statically define this structure
+struct qurt_qdi_device {
+    qurt_qdi_obj_t opener_obj;
+    const char* name;
+    char island_resident;
+    unsigned char singleton;
+    qurt_qdi_ext_pfn_create_t create;
+    qurt_qdi_ext_pfn_invoke_t invoke;
+    qurt_qdi_ext_pfn_destroy_t destroy;
+    qurt_mutex_t qurt_qdi_ext_list_lock;
+    qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head;
+};
+typedef struct qurt_qdi_device qurt_qdi_man_device;
+
+struct qurt_qdi_ext_driver {
+    qurt_qdi_obj_t opener_obj;
+    const char* name;
+    char island_resident;
+    unsigned char singleton;
+    qurt_qdi_ext_pfn_create_t create;
+    qurt_qdi_ext_pfn_invoke_t invoke;
+    qurt_qdi_ext_pfn_destroy_t destroy;
+    qurt_mutex_t qurt_qdi_ext_list_lock;
+    qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head;
+    qurt_qdi_ext_pfn_create_device_t create_device;
+    qurt_qdi_version_t version;
+    qurt_qdi_ext_pfn_probe_t probe;
+    const char* compatible;
+    struct qurt_qdi_ext_device * device_list;
+    //qurt_qdi_ext_device_ptr device_list;
+};
+typedef struct qurt_qdi_ext_driver qurt_qdi_ext_driver_t;
+//above replaces qurt_qdi_man_device
+
+extern int qurt_qdi_obj_ref_inc(qurt_qdi_obj_t *);
+extern int qurt_qdi_obj_ref_dec(qurt_qdi_obj_t *);
+
+extern int qurt_qdi_ext_opener (QDI_INVOKE_ARGS);
+/** @endcond */
+/**@ingroup func_qurt_qdi_method_default
+ Processes a method that is unrecognized or unsupported in the driver invocation function.
+ All arguments passed to the current invocation function (Section @xref{sec:invocationFunction}) must be forwarded
+ to this function.
+
+ @note1hang Invocation functions must process all unrecognized or unsupported methods
+            by calling this function.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+extern int qurt_qdi_method_default(QDI_INVOKE_ARGS);
+
+/**@ingroup func_qurt_qdi_handle_create_from_obj_t
+ Allocates a new device handle for use with the specified driver object.
+
+ @param[in]  client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+ @param[out] obj           Pointer to the driver object.
+
+ @return
+ Non-negative integer -- Success; this value is the new handle. \n
+ Negative value -- Error.
+
+ @dependencies
+ None.
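+
+ A hedged sketch of typical use inside a driver invocation function (the
+ object type, its invoke/release functions, and the allocator are hypothetical):
+
+ @code
+ typedef struct { qurt_qdi_obj_t qdi; int my_state; } my_obj_t; // hypothetical
+ my_obj_t *obj = malloc(sizeof(*obj));
+ obj->qdi.invoke  = my_invoke;        // driver invocation function (hypothetical)
+ obj->qdi.refcnt  = QDI_REFCNT_INIT;  // temporary object; deleted on release
+ obj->qdi.release = my_release;       // driver release function (hypothetical)
+ return qurt_qdi_handle_create_from_obj_t(client_handle, &obj->qdi);
+ @endcode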
+*/
+static __inline int qurt_qdi_handle_create_from_obj_t(int client_handle, qurt_qdi_obj_t *obj)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_HANDLE_CREATE_FROM_OBJ_T,
+                                 obj);
+}
+
+/**@ingroup func_qurt_qdi_island_handle_create_from_obj_t
+ Allocates a new island device handle for use with the specified driver object.
+
+ @param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1).
+ @param[in] obj           Pointer.
+
+ @return
+ Non-negative integer value that is the new handle -- Success. \n
+ Negative return value -- Error.
+
+ @dependencies
+ None.
+*/
+static __inline int qurt_qdi_island_handle_create_from_obj_t(int client_handle, qurt_qdi_obj_t *obj)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_ISLAND_HANDLE_CREATE_FROM_OBJ_T,
+                                 obj);
+}
+
+/**@ingroup func_qurt_qdi_handle_release
+ Deallocates the specified device handle.
+
+ @param[in] client_handle     Obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+ @param[in] handle_to_release Handle to release.
+
+ @return
+ 0 -- Success. \n
+ Negative value -- Error.
+
+ @dependencies
+ None.
+*/
+static __inline int qurt_qdi_handle_release(int client_handle, int handle_to_release)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_HANDLE_RELEASE,
+                                 handle_to_release);
+}
+
+static __inline qurt_qdi_obj_t *
+qurt_qdi_objref_get_from_handle(int client_handle, int object_handle)
+{
+   qurt_qdi_obj_t *ret;
+
+   ret = NULL;
+
+   qurt_qdi_handle_invoke(client_handle,
+                          QDI_CLIENT_HANDLE_OBJREF_GET,
+                          object_handle,
+                          &ret);
+
+   return ret;
+}
+
+/**@ingroup func_qurt_client_add_memory
+ Adds a physical address range to the HLOS physpool of the caller user PD.
+
+ @param[in] client_handle Obtained from the current invocation function (Section 3.4.1).
+ @param[in] phys_addr     Starting address of the physical address range.
+ @param[in] size          Size.
+
+ @return
+ #QURT_EOK -- Pages successfully added.
+
+ @dependencies
+ None.
+*/
+int qurt_client_add_memory(int client_handle, qurt_addr_t phys_addr, qurt_size_t size);
+
+/**@ingroup func_qurt_client_add_memory2
+ Adds a physical address range to the HLOS physpool of the caller user PD.
+
+ @param[in] user_client_handle Obtained from the current invocation function (Section 3.4.1).
+ @param[in] phys_addr          Starting 36-bit address of the physical address range.
+ @param[in] size               Size.
+
+ @return
+ #QURT_EOK -- Pages successfully added.
+
+ @dependencies
+ None.
+*/
+int qurt_client_add_memory2(int user_client_handle, qurt_paddr_64_t phys_addr, qurt_size_t size);
+
+static __inline qurt_qdi_obj_t *
+qurt_qdi_objref_get_from_pointer(qurt_qdi_obj_t *objptr)
+{
+   qurt_qdi_obj_t * ret = NULL;
+
+   if (qurt_qdi_obj_ref_inc(objptr) < 0) {
+      ret = NULL;
+   } else {
+      ret = objptr;
+   }
+
+   return ret;
+}
+
+static __inline void
+qurt_qdi_objref_release(qurt_qdi_obj_t *objptr)
+{
+   if (qurt_qdi_obj_ref_dec(objptr) == 1) {
+      (*objptr->release)(objptr);
+   }
+}
+
+/**@ingroup func_qurt_qdi_copy_from_user
+ Copies the contents of a user memory buffer into the current driver.
+
+ @note1hang User buffer addresses are valid only for the duration of the current driver
+            invocation.
+
+ @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+ @param[in] dest          Base address of the driver buffer.
+ @param[in] src           Base address of the user buffer.
+ @param[in] len           Number of bytes to copy.
+
+ @return
+ Negative value -- Indicates a privilege or security violation; the copy operation
+                   has crossed a privilege boundary.
+
+ @dependencies
+ None.
+*/
+static __inline int qurt_qdi_copy_from_user(int client_handle, void *dest, const void *src, unsigned len)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_COPY_FROM_USER,
+                                 dest, src, len);
+}
+
+/**@ingroup func_qurt_qdi_copy_string_from_user
+ Copies the contents of a user memory buffer into the current driver.
+
+ @note1hang User buffer addresses are valid only for the duration of the current driver
+            invocation.
+
+ @param client_handle Obtained from the current invocation function (Section 3.4.1).
+ @param dest          Base address of the driver buffer.
+ @param src           Base address of the user buffer.
+ @param len           Number of bytes to copy. NOTE: This is the destination buffer length.
+
+ @return
+ Negative error result -- Privilege or security violation; the copy operation
+                          has crossed a privilege boundary.
+
+ @dependencies
+ None.
+*/
+int qurt_qdi_copy_string_from_user(int client_handle, char *dest, const char *src, unsigned len);
+
+/**@ingroup func_qurt_qdi_copy_to_user
+ Copies the contents of a driver memory buffer to user memory.
+
+ @note1hang User buffer addresses are valid only for the duration of the current driver
+            invocation.
+
+ @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+ @param[in] dest          Base address of the user buffer.
+ @param[in] src           Base address of the driver buffer.
+ @param[in] len           Number of bytes to copy.
+
+ @return
+ Negative value -- Indicates a privilege or security violation; the copy operation
+                   has crossed a privilege boundary.
+
+ @dependencies
+ None.
+*/
+static __inline int qurt_qdi_copy_to_user(int client_handle, void *dest, const void *src, unsigned len)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_COPY_TO_USER,
+                                 dest, src, len);
+}
+
+/**@ingroup func_qurt_qdi_safe_cache_ops
+ Performs cache operations on user memory.
+
+ @note1hang User buffer addresses are valid only for the duration of the current driver
+            invocation.
+
+ @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+ @param[in] addr          Base address of the user memory.
+ @param[in] size          Size of the user memory.
+ @param[in] opcode        Cache operation (QURT_MEM_CACHE_FLUSH, QURT_MEM_CACHE_INVALIDATE, ...).
+ @param[in] type          Cache type (QURT_MEM_ICACHE, QURT_MEM_DCACHE).
+
+ @return
+ Negative value -- Indicates a privilege or security violation; the operation
+                   has crossed a privilege boundary.
+
+ @dependencies
+ None.
+*/
+static __inline int qurt_qdi_safe_cache_ops(int client_handle, qurt_addr_t addr, qurt_size_t size,
+                                            qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_SAFE_CACHE_OPS,
+                                 addr, size, opcode, type);
+}
+
+
+/**@ingroup func_qurt_qdi_buffer_lock
+ Prepares for the direct manipulation of a potentially untrusted buffer provided by a QDI
+ client.
+
+ This function is used to permit a trusted driver to safely access memory that is
+ provided by a potentially untrusted client. A driver calls this function to obtain a safe buffer
+ pointer for accessing the memory.
+
+ This function performs the following security checks: \n
+ - Verifies that the entire buffer is accessible to the client.
\n + - Ensures that the pointer remains valid for the remainder of the QDI driver + operation. \n + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] buf Pointer to the base address of the client buffer address. + @param[in] len Buffer length (in bytes). + @param[in] perms Bitmask value that specifies the read or write access to perform on the + client buffer: \n + - #QDI_PERM_R -- Read access \n + - #QDI_PERM_W -- Write access \n + - #QDI_PERM_RW -- Read/write access @tablebulletend + @param[out] obuf Pointer to the buffer address that the driver must use to access the buffer. + + @return + Negative value -- Error; the operation crosses a privilege boundary, indicating a privilege or security violation. \n + Nonzero value -- User passed a buffer that does not fulfill the requested read/write access permission. + In this case the QDI driver call must be terminated cleanly, with an appropriate error code + returned to the client. \n + Zero -- Success; when this occurs the QDI driver must use the pointer at *obuf to access memory, and not the + pointer passed in as buf -- even if the user process changes the mapping of memory at buf, + the mapping of memory at *obuf remains valid until the driver invocation completes. + + @dependencies + None. +*/ +static __inline int qurt_qdi_buffer_lock(int client_handle, void *buf, unsigned len, + unsigned perms, void **obuf) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_BUFFER_LOCK, + buf, len, perms, obuf); +} + +/**@ingroup func_qurt_qdi_buffer_lock2 + Prepares for the direct manipulation of a possibly-untrusted buffer provided by a QDI + client. + This API permits a trusted driver to safely access memory + provided by a possibly-untrusted client. A driver calls this function to obtain a safe buffer + pointer for accessing the memory. + This function performs the following security checks: \n + -- Entire buffer is accessible to the client. \n + -- Entire buffer is mapped with permissions passed in perms field \n + -- Entire buffer is physically contiguous \n + In addition to the security checks, the API also locks the client mapping such that the client + cannot remove the mapping while the physical memory is used by the trusted + driver. \n + + @note1 Drivers are responsible for calling qurt_qdi_buffer_unlock() at appropriate time. Not + pairing qurt_qdi_buffer_unlock() with this API leads to resource leakages and + process exit failures. Drivers can keep track of which buffers are locked for + a particular client. If the client exits abruptly, the buffers can be + unlocked on driver release invocation for the exiting client. + + @note2 This API is supported in limited capacity when called from Island mode. Safe buffer + unmapping or user buffer unlock is not supported in Island mode. + + @param client_handle Obtained from the current invocation function (Section 3.4.1). + @param buf Pointer to the base address of the client buffer address. + @param len Buffer length (in bytes). + @param perms Bitmask value that specifies the read or write access to perform on the + client buffer: \n + -- #QDI_PERM_R -- Read access \n + -- #QDI_PERM_W -- Write access \n + -- #QDI_PERM_RW -- Read/write access \n + @param obuf Optional parameter that returns a pointer to the buffer address that + the driver must use to access the buffer. 
If NULL is passed, the API
+             only performs security checks and does not create a mapping to access the user buffer in
+             a safe way.
+
+ @return
+ QURT_EINVALID -- Arguments passed to the API are invalid; the user buffer pointer is NULL or the length of the
+                  buffer is 0. \n
+ QURT_EPRIVILEGE -- One of the security checks on the user buffer failed. \n
+ QURT_EFAILED -- Mapping cannot be created for the trusted driver. \n
+ QURT_EOK -- Lock operation was successful. When this occurs, the QDI driver must use the
+             pointer at *obuf to perform its memory accesses, and not the
+             pointer passed in as buf.
+
+ @dependencies
+ None.
+*/
+static __inline int qurt_qdi_buffer_lock2(int client_handle, void *buf, unsigned len,
+                                          unsigned perms, void **obuf)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_BUFFER_LOCK2,
+                                 BUFFER_LOCK, buf, len, perms, obuf);
+}
+
+/**@ingroup func_qurt_qdi_buffer_unlock
+ This API is paired with qurt_qdi_buffer_lock2(). The temporary overlapping mapping
+ created for the driver is removed, and the client mapping for the user buffer is
+ unlocked.
+
+ @note1 Drivers are responsible for pairing this with qurt_qdi_buffer_lock2(). Not
+        pairing qurt_qdi_buffer_lock2() with this API leads to resource leakages and
+        process exit failures. Drivers can keep track of which buffers are locked for
+        a particular client, and if the client exits abruptly, all the buffers can be
+        unlocked on the driver release invocation for the exiting client.
+
+ @note2 This API is supported in limited capacity when called from Island mode. Actual
+        unmapping of driver accessible memory or unlocking of the buffer is not
+        supported in Island mode.
+
+ @param client_handle Obtained from the current invocation function (Section 3.4.1).
+ @param buf Pointer to the base address of the client buffer.
+ @param len Buffer length (in bytes).
+ @param obuf Safe buffer address that was returned in the obuf field after calling
+             qurt_qdi_buffer_lock2().
+
+ @return
+ QURT_EINVALID -- Arguments passed to the API are invalid; the user buffer pointer is NULL or the length of the
+                  buffer is 0. \n
+ QURT_EOK -- Unlock operation was successful. \n
+ Other results -- Safe buffer unmapping or unlocking of the user buffer failed.
+
+ @dependencies
+ None.
+*/
+static __inline int qurt_qdi_buffer_unlock(int client_handle, void *buf, unsigned len,
+                                           void *obuf)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_BUFFER_LOCK2,
+                                 BUFFER_UNLOCK, buf, len, obuf);
+}
+
+/**@ingroup func_qurt_qdi_user_malloc
+ Allocates a memory area in the QDI heap that is read/write accessible to both the driver and
+ the client. \n
+ @note1hang The QDI heap has a limited amount of memory available, and only the
+            device driver can free the allocated memory.
+
+ @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+ @param size Size (in bytes) of the memory area to allocate.
+
+ @return
+ Nonzero -- Success; the returned value points to the allocated memory area. \n
+ Zero -- Error.
+
+ @dependencies
+ None.
+*/
+void *qurt_qdi_user_malloc(int client_handle, unsigned size);
+
+/**@ingroup func_qurt_qdi_user_free
+ Deallocates a memory area in the QDI heap.
+
+ @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+ @param ptr Pointer to the memory area to deallocate.
+
+ @dependencies
+ None.
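+
+ As an illustrative sketch (not part of this header), a driver can hand a
+ small result area to its client through the QDI heap and release it later:
+ @code
+ unsigned *res = (unsigned *)qurt_qdi_user_malloc(client_handle, sizeof(*res));
+ if (res != 0) {
+    *res = status_value;                     // hypothetical value, visible to the client
+    qurt_qdi_user_free(client_handle, res);  // only the driver may free it
+ }
+ @endcode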
+*/
+void qurt_qdi_user_free(int client_handle, void *ptr);
+
+/**@ingroup func_qurt_qdi_client_detach
+ Detaches a client (a process), indicating that the client does not
+ participate in the qurt_wait() mechanism. This behavior
+ is opt-in and irrevocable: once a client is detached, it
+ cannot be reattached.
+
+ @param client_handle Handle of the client to detach.
+
+ @return
+ Zero -- Success; detachable clients always return success. \n
+ Nonzero value -- client_handle does not refer to a
+ detachable user client.
+
+ @dependencies
+ None.
+*/
+static __inline int qurt_qdi_client_detach(int client_handle)
+{
+   return qurt_qdi_handle_invoke(client_handle, QDI_CLIENT_HANDLE_DETACH);
+}
+
+/**@ingroup func_qurt_qdi_signal_group_create
+ Creates a new signal group for use in a device driver.
+ A QDI signal group contains up to 32 signals, which can be operated on either
+ individually (using the qurt_qdi_signal_* functions) or as a group (using the
+ qurt_qdi_signal_group_* functions). \n
+ @note1hang The driver implementation is responsible for using the proper signal group
+            handle in any given situation. \n
+ For more information on signals, see the Hexagon QuRT RTOS User Guide (80-VB419-78).
+
+ @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+ @param p_signal_group_handle_local Returns a handle intended for use by code that
+        resides in the same context and process as the created signal group
+        (for example, the device driver implementation that allocated the
+        signal group).
+ @param p_signal_group_handle_remote Returns a handle intended for use by code
+        that resides in a different context and process than the created signal group
+        (for example, the user-mode client of an OS driver).
+
+ @return
+ Zero -- Success. \n
+ Negative value -- The signal group could not be created.
+
+ @dependencies
+ None.
+*/
+static __inline int qurt_qdi_signal_group_create(int client_handle,
+                                                 int *p_signal_group_handle_local,
+                                                 int *p_signal_group_handle_remote)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_SIGNAL_GROUP_CREATE,
+                                 p_signal_group_handle_local,
+                                 p_signal_group_handle_remote);
+}
+
+/**@ingroup func_qurt_qdi_signal_group_wait
+ Suspends the current thread until any of the signals are set in the specified signal group.
+
+ If a signal is set in a signal group object, and a thread waits on the signal group object,
+ the thread is awakened. If the awakened thread has higher priority than the current
+ thread, a context switch can occur.
+
+ @param signal_group_handle Handle of the signal group.
+
+ @return
+ If the client is remote:
+ QURT_EOK -- Wait complete. \n
+ QURT_ECANCEL -- Wait cancelled. \n
+ If the client is local, returns a 32-bit word with the current signals.
+
+ @dependencies
+ None.
+*/
+static __inline int qurt_qdi_signal_group_wait(int signal_group_handle)
+{
+   return qurt_qdi_handle_invoke(signal_group_handle,
+                                 QDI_SIGNAL_GROUP_WAIT);
+}
+
+/**@ingroup func_qurt_qdi_signal_group_poll
+ Returns a value that indicates whether any of the signals are set in the specified signal group.
+
+ @param signal_group_handle Handle of the signal group.
+
+ @return
+ 1 -- At least one of the signals is set in the signal group. \n
+ 0 -- None of the signals are set.
+
+ @dependencies
+ None.
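+
+ For illustration, a driver might poll before blocking (a sketch only; the
+ handles are assumed to come from qurt_qdi_signal_group_create()):
+ @code
+ int local_h, remote_h;
+ if (qurt_qdi_signal_group_create(client_handle, &local_h, &remote_h) == 0) {
+    if (qurt_qdi_signal_group_poll(local_h) == 0) {
+       qurt_qdi_signal_group_wait(local_h);  // suspend until any signal is set
+    }
+ }
+ @endcode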
+*/ +static __inline int qurt_qdi_signal_group_poll(int signal_group_handle) +{ + return qurt_qdi_handle_invoke(signal_group_handle, + QDI_SIGNAL_GROUP_POLL); +} + + +/**@ingroup func_qurt_qdi_signal_create + Creates a new signal in the specified signal group. + For more information on signals, see the Hexagon QuRT RTOS User Guide (80-VB419-78). + + @note1hang Driver implementation is responsible for using the proper signal handle in + any given situation. + + @param signal_group_handle Handle of an existing signal group. + @param p_signal_handle_local Returns a handle intended for use by code that resides in + the same context and process as the created signal (for example, + the device driver implementation that allocated the signal). + @param p_signal_handle_remote Returns a handle intended for use by code that resides in + a different context and process than the created signal (for + example, the user-mode client of an OS driver). + + @return + Nonzero value -- No more signals can be created in the specified + signal group. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_create(int signal_group_handle, + int *p_signal_handle_local, + int *p_signal_handle_remote) +{ + return qurt_qdi_handle_invoke(signal_group_handle, + QDI_SIGNAL_GROUP_SIGNAL_CREATE, + p_signal_handle_local, + p_signal_handle_remote); +} + +/**@ingroup func_qurt_qdi_signal_set + Sets the signal in the specified signal object. + + @param signal_handle Handle of the signal. + + @return + Always returns 0. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_set(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_SET); +} + +/**@ingroup func_qurt_qdi_signal_clear + Clears the signal in the specified signal object. + + @param signal_handle Handle of the signal. + + @return + Always returns 0. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_clear(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_CLEAR); +} + +/**@ingroup func_qurt_qdi_signal_wait + Suspends the current thread until the specified signal is set. + If a signal is set in a signal object, and a thread waits on the signal object, the + thread is awakened. If the awakened thread has higher priority than the current thread, a + context switch may occur. + + @param signal_handle Handle of the signal. + + @return + If client is remote: + QURT_EOK -- Wait complete. \n + QURT_ECANCEL -- Wait cancelled.\n + If client is local, return a 32-bit word with current signals. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_wait(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_WAIT); +} + +/**@ingroup func_qurt_qdi_signal_poll + Returns a value that indicates if the specified signal is set. + + @param signal_handle Handle of the signal. + + @return + 1 -- Signal is set. \n + 0 -- Signal is not set. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_poll(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_POLL); +} + +/**@ingroup func_qurt_qdi_devname_register + Registers a QDI device with the generic QDI object in the + current QDI context. + + This function registers an exact name or a directory prefix with a QDI opener object. + Future invocations of qurt_qdi_open() in the context of the caller invokes the + opener object if a match is detected. + + Directory prefix names are specified by ending the name with a forward slash character. 
+
+ Example of an exact name:
+ @code qurt_qdi_devname_register("/dev/foobar", foobar_opener);@endcode
+
+ Example of a directory prefix:
+ @code qurt_qdi_devname_register("/pipedev/", pipedev_opener);@endcode
+
+ Given the two registrations shown above, the only qurt_qdi_open() requests
+ directed to the foobar_opener object are requests for the exact name
+ "/dev/foobar". Any request beginning with "/pipedev/" is directed to the
+ pipedev_opener object.
+
+ The pipedev invocation function presumably examines the name argument to
+ determine exactly how to handle the request. The name is passed to the invocation
+ function in the a1.ptr argument (Section @xref{sec:invocationFunction}).
+
+ @param name Device name or device name prefix.
+ @param opener Pointer to the opener object for the device.
+
+ @return
+ 0 -- Device was successfully registered. \n
+ Negative error code -- Device was not registered.
+
+ @dependencies
+ None.
+ */
+static __inline int qurt_qdi_devname_register(const char *name,
+                                              qurt_qdi_obj_t *opener)
+{
+   return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,
+                                 QDI_DEVNAME_REGISTER,
+                                 name,
+                                 opener);
+}
+
+// Macros for backward compatibility with deprecated APIs
+// (These will go away soon)
+
+#define qurt_qdi_register_devname(name, opener) \
+   qurt_qdi_devname_register((name), (void *)(opener))
+#define qurt_qdi_new_handle_from_obj_t(handle, obj) \
+   qurt_qdi_handle_create_from_obj_t((handle), (obj))
+#define qurt_qdi_release_handle(client_handle, handle) \
+   qurt_qdi_handle_release((client_handle), (handle))
+#define qurt_qdi_lock_buffer(handle, buf, len, perms, obuf) \
+   qurt_qdi_buffer_lock((handle), (buf), (len), (perms), (obuf))
+#define qurt_qdi_usermalloc(handle, size) \
+   qurt_qdi_user_malloc((handle), (size))
+#define qurt_qdi_userfree(handle, ptr) \
+   qurt_qdi_user_free((handle), (ptr))
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_ext.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_ext.h
new file mode 100755
index 0000000000000..383e1799a15d6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_ext.h
@@ -0,0 +1,58 @@
+#ifndef QURT_QDI_EXT_H
+#define QURT_QDI_EXT_H
+
+/**
+  @file qurt_qdi_ext.h
+  @brief Definitions, macros, and prototypes used when writing a
+  QDI driver
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2018, 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_qdi_driver.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct qurt_qdi_ext_device {
+   qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head;
+   struct qurt_qdi_ext_device * next;
+   char * instance;
+   fdt_node_handle context;
+};
+typedef struct qurt_qdi_ext_device *qurt_qdi_ext_device_ptr;
+
+/**@ingroup func_qurt_qdi_dt_register
+ Registers a QDI device with the generic QDI object in the current QDI context,
+ if and only if a compatible device node is found in the device tree. This
+ function serves as a device tree aware wrapper for qurt_qdi_devname_register().
+
+ @param name Device name or device name prefix.
+ @param opener Pointer to QDI ext specialized opener object for the driver.
+
+ @return
+ 0 -- Device was successfully registered.
\n + Negative error code -- Device was not registered. +*/ +static __inline int qurt_qdi_dt_register(const char *name, qurt_qdi_obj_t *opener) +{ + return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, QDI_DT_REGISTER, name, opener); +} + +static inline void qurt_qdi_ext_deviceobj_set_name (struct qurt_qdi_ext_device * device, char * name) +{ + device->instance = name; +} + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_imacros.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_imacros.h new file mode 100755 index 0000000000000..c0a8448ac87f8 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_imacros.h @@ -0,0 +1,34 @@ +#ifndef QURT_QDI_IMACROS_H +#define QURT_QDI_IMACROS_H + +/** + @file qurt_qdi_imacros.h + @brief Internal macros used for QDI. Mostly consists of tricky (and ugly) + preprocessor hacks that permit us to do varargs function invocations + where we pass optional arguments in registers and where we can do + type casting and checking automatically. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define _QDMPASTE(a,b) _QDMPASTE_(a,b) +#define _QDMPASTE_(a,b) a##b +#define _QDMCNT(...) _QDMCNT_(__VA_ARGS__,12,11,10,9,8,7,6,5,4,3,2,1,0) +#define _QDMCNT_(a,b,c,d,e,f,g,h,i,j,k,l,cnt,...) cnt + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_proxy.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_proxy.h new file mode 100755 index 0000000000000..f1d8992ea8811 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_qdi_proxy.h @@ -0,0 +1,55 @@ +/*============================================================================= + + qurt_qdi_proxy.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. 
+=============================================================================*/
+#ifndef _QURT_QDI_PROXY_H
+#define _QURT_QDI_PROXY_H
+
+#include "qurt_qdi_driver.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* APIs allowing operation on the proxy object directly */
+int qurt_qdi_proxy_ref_create(void);
+
+/* APIs that operate on a proxy, given a known proxy handle
+ * 1) using the QDI handle of the object
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_qdi_proxy_ref_add_by_handle(int proxy_handle, int qdi_handle);
+int qurt_qdi_proxy_ref_sub_by_handle(int proxy_handle, int qdi_handle);
+
+/* 2) using an object reference
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_qdi_proxy_ref_add_by_object(int proxy_handle, qurt_qdi_obj_t *obj_ptr);
+int qurt_qdi_proxy_ref_sub_by_object(int proxy_handle, qurt_qdi_obj_t *obj_ptr);
+
+/* API that associates a proxy object with a particular client, given a client handle
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_client_proxy_ref_install(int client_handle, int proxy_handle);
+
+/* APIs allowing operation on the proxy object from a user client
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_client_proxy_ref_add(int qdi_handle);
+int qurt_client_proxy_ref_remove(int qdi_handle);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_QDI_PROXY_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_rmutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_rmutex.h
new file mode 100755
index 0000000000000..a013a0bbddb1d
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_rmutex.h
@@ -0,0 +1,200 @@
+#ifndef QURT_RMUTEX_H
+#define QURT_RMUTEX_H
+/**
+  @file qurt_rmutex.h
+  Prototypes of rmutex API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2013 - 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include <qurt_futex.h>
+#include <qurt_mutex.h>
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_rmutex_init
+ Initializes a recursive mutex object.
+ The recursive mutex is initialized in the unlocked state.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[out] lock Pointer to the recursive mutex object.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_rmutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_destroy
+ Destroys the specified recursive mutex. \n
+ @note1hang Recursive mutexes must not be destroyed while they are still in use. If this
+            occurs, the behavior of QuRT is undefined.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the recursive mutex object to destroy.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_rmutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_lock
+ Locks the specified recursive mutex. \n
+
+ If a thread performs a lock operation on a mutex that is not in use, the thread
+ gains access to the shared resource that the mutex protects, and continues executing.
+
+ If a thread performs a lock operation on a mutex that is already in use by another
+ thread, the thread is suspended.
When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked. However, the mutex does not become available to other threads until the + thread performs a balanced number of unlocks on the mutex. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex_lock(qurt_mutex_t *lock); + +/**@ingroup func_qurt_rmutex_lock_timed + Locks the specified recursive mutex. The wait must be terminated when the specified timeout expires.\n + + If a thread performs a lock operation on a mutex that is not in use, the thread + gains access to the shared resource that the mutex is protecting, and continues executing. + + If a thread performs a lock operation on a mutex that is already in use by another + thread, the thread is suspended. When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked by itself. However, the mutex does not become available to other threads until the + thread performs a balanced number of unlocks on the mutex. + If timeout expires, this wait must be terminated and no access to the mutex is granted. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the recursive mutex object to lock. + @param[in] duration Interval (in microseconds) duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION + + @return + #QURT_EOK -- Success \n + #QURT_ETIMEDOUT -- Timeout + + @dependencies + None. + + */ +int qurt_rmutex_lock_timed(qurt_mutex_t *lock, unsigned long long int duration); + +/**@ingroup func_qurt_rmutex_unlock + Unlocks the specified recursive mutex. \n + More than one thread can be suspended on a mutex. When the mutex is + unlocked, the thread waiting on the mutex awakens. If the awakened + thread has higher priority than the current thread, a context switch occurs. + + @note1hang When a thread unlocks a recursive mutex, the mutex is not available until + the balanced number of locks and unlocks has been performed on the mutex. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the recursive mutex object to unlock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex_unlock(qurt_mutex_t *lock); + +/**@ingroup func_qurt_rmutex_try_lock + Attempts to lock the specified recursive mutex.\n + + If a thread performs a try_lock operation on a recursive mutex that is not in use, the + thread gains access to the shared resource that is protected by the mutex, and continues + executing.\n + If a thread performs a try_lock operation on a recursive mutex that another thread has + already locked, qurt_rmutex_try_lock immediately returns with a nonzero result + value. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + */ +int qurt_rmutex_try_lock(qurt_mutex_t *lock); + +/**@ingroup func_qurt_rmutex_try_lock_block_once + Attempts to lock a mutex object recursively. If the mutex is available, + it locks the mutex. If the mutex is held by the current thread, + it increases the internal counter and returns 0. If not, it returns a + nonzero value. 
+ If the mutex is already locked by another thread, the caller thread is
+ suspended. When the mutex becomes available again (because the other
+ thread has unlocked it), the caller thread is awakened and tries to lock
+ the mutex again; if it fails, this function returns failure with a nonzero
+ value, and if it succeeds, it returns success with zero.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the qurt_mutex_t object.
+
+ @return
+ 0 -- Success. \n
+ Nonzero -- Failure.
+
+ @dependencies
+ None.
+ */
+int qurt_rmutex_try_lock_block_once(qurt_mutex_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_RMUTEX_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_rmutex2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_rmutex2.h
new file mode 100755
index 0000000000000..a37e7e4458c4b
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_rmutex2.h
@@ -0,0 +1,183 @@
+#ifndef QURT_RMUTEX2_H
+#define QURT_RMUTEX2_H
+/**
+  @file qurt_rmutex2.h
+  @brief Prototypes of rmutex2 API
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include <qurt_futex.h>
+#include <qurt_mutex.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup mutex_types
+@{ */
+/*=============================================================================
+                        TYPEDEFS
+=============================================================================*/
+
+/** QuRT rmutex2 type.
+    Mutex type used with rmutex2 APIs.
+ */
+typedef struct {
+   /** @cond */
+   unsigned int holder __attribute__((aligned(8))); /* UGP value of the mutex holder. */
+   unsigned short waiters;       /* Number of waiting threads. */
+   unsigned short refs;          /* Number of references to this mutex. */
+   unsigned int queue;           /* Kernel-maintained futex queue value. */
+   unsigned int excess_locks;    /* Number of excess times the holder has locked the mutex. */
+   /** @endcond */
+} qurt_rmutex2_t;
+/** @} */ /* end_addtogroup mutex_types */
+/** @cond internal_only*/
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_rmutex2_init
+
+   @deprecated Use #qurt_rmutex_init instead.
+
+   Initializes a recursive mutex object.
+
+   The recursive mutex is initially unlocked.
+
+   Objects of type rmutex2 solve a potential race condition between
+   unlock() and destroy() operations.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[out] lock Pointer to the recursive mutex object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_rmutex2_init(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_rmutex2_destroy
+
+   @deprecated Use #qurt_rmutex_destroy instead.
+
+   Destroys the specified recursive mutex. \n
+   @note1hang Recursive mutexes must not be destroyed while they are still in use. If this
+              occurs, the behavior of QuRT is undefined.
+   @note1cont In general, application code must destroy an rmutex2 object prior to
+              deallocating it; calling qurt_rmutex2_destroy() before deallocating it ensures
+              that all qurt_rmutex2_unlock() calls complete.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[in] lock Pointer to the recursive mutex object to destroy.
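+
+   For illustration, a balanced lock/unlock sequence must complete before the
+   mutex is destroyed (a sketch only, using only functions from this header):
+   @code
+   qurt_rmutex2_t m;
+   qurt_rmutex2_init(&m);
+   qurt_rmutex2_lock(&m);
+   qurt_rmutex2_lock(&m);     // relock by the holder; internal counter increases
+   qurt_rmutex2_unlock(&m);   // still held by this thread
+   qurt_rmutex2_unlock(&m);   // now released
+   qurt_rmutex2_destroy(&m);  // destroy only after all unlocks complete
+   @endcode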
+ + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_destroy(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_lock + + @deprecated use #qurt_rmutex_lock instead. + + Locks the specified recursive mutex. \n + + If a thread performs a lock operation on a recursive mutex that is not in use, the + thread gains access to the shared resource that the mutex protects, and continues + to execute. + + If a thread performs a lock operation on a recursive mutex that another thread is using, + the thread is suspended. When the mutex becomes available again + (because the other thread has unlocked it), the thread is awakened and given access to the + shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked, but the mutex does not become available until the thread performs a + balanced number of unlocks on the mutex. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_lock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_unlock + + @deprecated use #qurt_rmutex_unlock instead. + + Unlocks the specified recursive mutex. \n + More than one thread can be suspended on a recursive mutex. When the mutex is + unlocked, only the highest-priority thread waiting on the mutex awakens. If the + awakened thread has higher priority than the current thread, a context switch occurs. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to unlock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_unlock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_try_lock + + @deprecated use #qurt_rmutex_try_lock instead. + + Attempts to lock the specified recursive mutex.\n + + Non-blocking version of qurt_rmutex2_lock(). When a call to qurt_rmutex2_lock() + succeeds immediately, this function behaves similarly, returning 0 for success. + When a call to qurt_rmutex2_lock() does not succeed immediately, this function has + no effect and returns nonzero for failure. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + */ +int qurt_rmutex2_try_lock(qurt_rmutex2_t *lock); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_RMUTEX2_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_sclk.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_sclk.h new file mode 100755 index 0000000000000..a83cf5f1db889 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_sclk.h @@ -0,0 +1,145 @@ +#ifndef QURT_SCLK_H +#define QURT_SCLK_H +/** + @file qurt_sclk.h + @brief Header file describing the APIs supported by QuRT system SCLK + feature. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2018-2021, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ +=============================================================================*/ + + + + +/*============================================================================= + + INCLUDE FILES + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + + +/** + Conversion from microseconds to sleep ticks. + */ +#define QURT_SYSCLOCK_TIMETICK_FROM_US(us) ((us) * 192ULL / 10UL) +#define qurt_sysclock_timetick_from_us(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us) + + +/** + Conversion from timer ticks to microseconds at the nominal frequency. +*/ +#define QURT_SYSCLOCK_TIMETICK_TO_US(ticks) qurt_timer_timetick_to_us(ticks) + +/** + Maximum microseconds value for Qtimer is 1,042,499 hours. +*/ +#define QURT_SYSCLOCK_MAX_DURATION (1042499uLL * 3600uLL * 1000uLL * 1000uLL) +#define qurt_sysclock_max_duration() QURT_SYSCLOCK_MAX_DURATION +/** + Timer clock for Qtimer is 19.2 MHz. +*/ +#define QURT_SYSCLOCK_MAX_DURATION_TICKS (1042499uLL * 3600uLL * 19200000uLL) +#define qurt_sysclock_max_duration_ticks() QURT_SYSCLOCK_MAX_DURATION_TICKS +/** + Sleep timer error margin for Qtimer is 192 ticks ~10 us. +*/ +#define QURT_SYSCLOCK_ERROR_MARGIN 192U //QURT_TIMER_MIN_DURATION*timer_freq; +#define qurt_sysclock_error_margin() QURT_SYSCLOCK_ERROR_MARGIN + +/*============================================================================= + + DATA DECLARATIONS + +=============================================================================*/ + +/**@ingroup func_qurt_sysclock_get_hw_ticks + @xreflabel{sec:qurt_sysclock_get_hw_ticks} + Gets the hardware tick count.\n + Returns the current value of a 64-bit hardware counter. The value wraps around to zero + when it exceeds the maximum value. + + @note1hang This operation must be used with care because of the wrap-around behavior. + + @return + Integer -- Current value of 64-bit hardware counter. + + @dependencies + None. + */ +unsigned long long qurt_sysclock_get_hw_ticks (void); + + +/**@ingroup func_qurt_sysclock_get_hw_ticks_32 + @xreflabel{sec:qurt_sysclock_get_hw_ticks_32} + Gets the hardware tick count in 32 bits.\n + Returns the current value of a 32-bit hardware counter. The value wraps around to zero + when it exceeds the maximum value. + + @note1hang This operation is implemented as an inline C function, and should be called from a C/C++ program. + The returned 32 bits are the lower 32 bits of the Qtimer counter. + + @return + Integer -- Current value of the 32-bit timer counter. + + @dependencies + None. + */ +static inline unsigned long qurt_sysclock_get_hw_ticks_32 (void) +{ + //Beginning with v61 there is a HW register that can be read directly. + unsigned long count; + __asm__ __volatile__ (" %0 = c30 " : "=r"(count)); + return count; +} + + +/**@ingroup func_qurt_sysclock_get_hw_ticks_16 + @xreflabel{sec:qurt_sysclock_get_hw_ticks_16} + Gets the hardware tick count in 16 bits.\n + Returns the current value of a 16-bit timer counter. The value wraps around to zero + when it exceeds the maximum value. + + @note1hang This operation is implemented as an inline C function, and should be called from a C/C++ program. + The returned 16 bits are based on the value of the lower 32 bits in Qtimer + counter, right shifted by 16 bits. + + @return + Integer -- Current value of the 16-bit timer counter, calculated from the lower 32 bits in the + Qtimer counter, right shifted by 16 bits. + + @dependencies + None. 
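+
+ For illustration, elapsed time is typically measured with the 64-bit counter
+ and converted to microseconds (a sketch; do_work() is a hypothetical workload):
+ @code
+ unsigned long long t0 = qurt_sysclock_get_hw_ticks();
+ do_work();
+ unsigned long long elapsed_us =
+     QURT_SYSCLOCK_TIMETICK_TO_US(qurt_sysclock_get_hw_ticks() - t0);
+ @endcode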
+ */ + + +static inline unsigned short qurt_sysclock_get_hw_ticks_16 (void) +{ + unsigned long ticks; + + //Beginning with v61 there is a HW register that can be read directly. + __asm__ __volatile__ (" %0 = c30 " : "=r"(ticks)); + __asm__ __volatile__ ( "%0 = lsr(%0, #16) \n" :"+r"(ticks)); + + return (unsigned short)ticks; +} +unsigned long long qurt_timer_timetick_to_us(unsigned long long ticks); +#define qurt_sysclock_timetick_to_us(ticks) qurt_timer_timetick_to_us(ticks) + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif /* __cplusplus */ + +#endif /* QURT_SCLK_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_secure_proc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_secure_proc.h new file mode 100755 index 0000000000000..f40c7deb9bca1 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_secure_proc.h @@ -0,0 +1,53 @@ +#ifndef QURT_SECURE_PROC_H +#define QURT_SECURE_PROC_H + +/** + @file qurt_secure_proc.h + @brief Definitions, macros, and prototypes used for handling secure process + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2015, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup qurt_process_migrate_secure_process + Migrate the user process to Qurt secure process + + @param secure_phy_address Physical starting address of secure memory + @param secure_memory_size Size of secure memory + @param entry Entry function to secure process + + @return + EOK + Negative return value -- Error. + + @dependencies + None. +*/ +int qurt_process_migrate_secure_process(unsigned long long secure_phy_address, unsigned int secure_memory_size, void entry(unsigned)); + +/**@ingroup qurt_process_get_migration_mem_size + get the size of all writable memory regions in a user PD. This is for preparation on secure process migration. + + @return + size of all writable memory regions in a user PD. + + @dependencies + None. +*/ +int qurt_process_get_migration_mem_size(void); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_sem.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_sem.h new file mode 100755 index 0000000000000..ee5ce4b2d94ab --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_sem.h @@ -0,0 +1,252 @@ +#ifndef QURT_SEM_H +#define QURT_SEM_H +/** + @file qurt_sem.h + Prototypes of semaphore API. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup semaphore_types +@{ */ + +/** QuRT semaphore type. 
*/ +typedef union { + /** @cond */ + unsigned int raw[2] __attribute__((aligned(8))); + struct { + unsigned short val; /**< */ + unsigned short n_waiting; /**< */ + unsigned int reserved1; /**< */ + unsigned int queue; /**< */ + unsigned int reserved2; /**< */ + }X; /** @endcond */ +} qurt_sem_t; +/** @} */ /* end_addtogroup semaphore_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_sem_add + Releases access to a shared resource (the specified amount increments the semaphore count value).\n + When a thread performs an add operation on a semaphore, the specified value increments the semaphore count. + The result depends on the number of threads waiting + on the semaphore: \n + - When no threads are waiting, the current thread releases access to the shared resource + and continues executing. \n + - When one or more threads are waiting and the semaphore count value is nonzero, + the kernel repeatedly awakens the highest-priority waiting thread and decrements + the semaphore count value until either no waiting threads remain or the + semaphore count value is zero. If any of the awakened threads has higher priority + than the current thread, a context switch can occur. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + @param[in] amt Amount to increment the semaphore count value. + + @return + Unused integer value. + + @dependencies + None. + + */ +int qurt_sem_add(qurt_sem_t *sem, unsigned int amt); + +/**@ingroup func_qurt_sem_up + Releases access to a shared resource. When a thread performs an up operation on a semaphore, + the semaphore count value increments. The result depends on the number of threads waiting + on the semaphore: \n + - When no threads are waiting, the current thread releases access to the shared resource + and continues executing.\n + - When one or more threads are waiting and the semaphore count value is nonzero, + the kernel awakens the highest-priority waiting thread and decrements the + semaphore count value. If the awakened thread has higher priority than the current + thread, a context switch can occur. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + Unused integer value. + + @dependencies + None. + */ +static inline int qurt_sem_up(qurt_sem_t *sem) { return qurt_sem_add(sem,1); } + +/**@ingroup func_qurt_sem_down + Requests access to a shared resource. When a thread performs a down operation on a + semaphore, the result depends on the semaphore count value: \n + - When the count value is nonzero, it is decremented, and the thread gains access to the + shared resource and continues executing.\n + - When the count value is zero, it is not decremented, and the thread is suspended on the + semaphore. When the count value becomes nonzero (because another thread + released the semaphore) it is decremented, and the suspended thread is awakened + and gains access to the shared resource. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + Unused integer value. + + @dependencies + None. 
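+
+   Typical usage is a counting semaphore that guards a pool of resources
+   (an illustrative sketch only; use_resource() is hypothetical):
+   @code
+   qurt_sem_t pool_sem;
+   qurt_sem_init_val(&pool_sem, 4);  // four resources available
+
+   qurt_sem_down(&pool_sem);         // acquire one; may suspend when none are left
+   use_resource();
+   qurt_sem_up(&pool_sem);           // release it again
+   @endcode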
+ */ +int qurt_sem_down(qurt_sem_t *sem); + +/**@ingroup func_qurt_sem_down_timed + When a thread performs a down operation on a semaphore, the result depends on the + semaphore count value: \n + - When the count value is nonzero, it is decremented, and the thread gains access to the + shared resource and continues executing.\n + - When the count value is zero, it is not decremented, and the thread is suspended on the + semaphore. When the count value becomes nonzero (because another thread + released the semaphore) it is decremented, and the suspended thread is awakened + and gains access to the shared resource. Terminate the wait when the specified timeout expires. + If timeout expires, terminate this wait and grant no access to the shared resource. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + @param[in] duration Interval (in microseconds) duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION + + @return + #QURT_EOK -- Success \n + #QURT_ETIMEDOUT -- Timeout + + @dependencies + None. + */ +int qurt_sem_down_timed(qurt_sem_t *sem, unsigned long long int duration); + +/**@ingroup func_qurt_sem_try_down + @xreflabel{hdr:qurt_sem_try_down} + Requests access to a shared resource (without suspend). When a thread performs a try down + operation on a semaphore, the result depends on the semaphore count value: \n + - The count value is decremented when it is nonzero. The down operation returns 0 as + the function result, and the thread gains access to the shared resource and is free to + continue executing.\n + - The count value is not decremented when it is zero. The down operation returns -1 + as the function result, and the thread does not gain access to the shared resource + and should not continue executing. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + 0 -- Success. \n + -1 -- Failure. + + @dependencies + None. + + */ +int qurt_sem_try_down(qurt_sem_t *sem); + +/**@ingroup func_qurt_sem_init + Initializes a semaphore object. + The default initial value of the semaphore count value is 1. + + @param[out] sem Pointer to the initialized semaphore object. + + @return + None. + + @dependencies + None. + + */ +void qurt_sem_init(qurt_sem_t *sem); + +/**@ingroup func_qurt_sem_destroy + Destroys the specified semaphore.\n + @note1hang Semaphores must be destroyed when they are no longer in use. Failure to do + this causes resource leaks in the QuRT kernel.\n + @note1cont Semaphores must not be destroyed while they are still in use. If this occur, + the behavior of QuRT is undefined. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to destroy. + + @return + None. + + @dependencies + None. + */ +void qurt_sem_destroy(qurt_sem_t *sem); + +/**@ingroup func_qurt_sem_init_val + Initializes a semaphore object with the specified value. + + @datatypes + #qurt_sem_t + + @param[out] sem Pointer to the initialized semaphore object. + @param[in] val Initial value of the semaphore count value. + + @return + None. + + @dependencies + None. + + */ +void qurt_sem_init_val(qurt_sem_t *sem, unsigned short val); + +/**@ingroup func_qurt_sem_get_val + Gets the semaphore count value.\n + Returns the current count value of the specified semaphore. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + Integer semaphore count value + + @dependencies + None. 
+ */
+static inline unsigned short qurt_sem_get_val(qurt_sem_t *sem) { return sem->X.val; }
+int qurt_sem_down_cancellable(qurt_sem_t *sem);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SEM_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_shmem.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_shmem.h
new file mode 100755
index 0000000000000..980557323708a
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_shmem.h
@@ -0,0 +1,89 @@
+#ifndef QURT_SHMEM_H
+#define QURT_SHMEM_H
+
+/**
+  @file qurt_shmem.h
+
+  @brief
+  Prototypes of QuRT inter-process shared memory APIs
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef MODE_T
+#define MODE_T
+typedef unsigned int mode_t;
+#endif //MODE_T
+
+/**
+ * The shm_open() function establishes a connection between a shared memory object and a file descriptor.
+ * The file descriptor is used by other functions such as shm_mmap() to refer to that shared memory object.
+ *
+ * @param name Pointer to a string naming the shared memory object. The name has to start with "/shm/".
+ * @param oflag File status flags and file access modes of the open file description. The following
+ *              flags are defined in <fcntl.h> and supported:
+ *              O_RDONLY: Open for read access only
+ *              O_RDWR: Open for read or write access
+ *              O_CREAT: If the shared memory object does not exist, create one.
+ * @param mode Permission flags (currently ignored)
+ *
+ * @return file descriptor (positive number) if operation successful.
+ *         negative error code if failed
+ *
+*/
+
+int shm_open(const char * name, int oflag, mode_t mode);
+
+/**
+ * The shm_mmap() function creates a shared memory mapping in the virtual address space of
+ * the calling process.
+ *
+ * @param addr The starting address for the new mapping.
+ * @param len Specifies the length of the shared memory region.
+ * @param prot Describes the desired memory protection of the mapping. Same as the one in mmap of POSIX.
+ * @param flags Determines whether updates to the mapping are visible to other processes. Same as
+ *              the one in mmap of POSIX.
+ * @param fd File descriptor of the shared memory object.
+ * @param offset Unused.
+ *
+ * @return The starting address for the new mapping if the operation is successful.
+ *         negative error code if failed
+ *
+*/
+
+void *shm_mmap(void *addr, unsigned int len, int prot, int flags, int fd, unsigned int offset);
+
+/**
+ * The shm_close() function removes a connection between a shared memory object and a file descriptor.
+ * If no file descriptor remains connected to the shared memory object, the shared memory object is
+ * deleted automatically. A shared memory object has the same virtual address in every process; this
+ * is a restriction of the single virtual address space.
+ *
+ * @param fd File descriptor of the shared memory object.
+ *
+ * @return 0 if operation successful.
+ * negative error code if failed + * +*/ + + +int shm_close(int fd); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_signal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_signal.h new file mode 100755 index 0000000000000..3a89c53394ad5 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_signal.h @@ -0,0 +1,518 @@ +#ifndef QURT_SIGNAL_H +#define QURT_SIGNAL_H + +/** + @file qurt_signal.h + @brief Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup signals_types +@{ */ +#define QURT_SIGNAL_ATTR_WAIT_ANY 0x00000000 /**< Wait any. */ +#define QURT_SIGNAL_ATTR_WAIT_ALL 0x00000001 /**< Wait all. */ + +/*===================================================================== + Typedefs + ======================================================================*/ + + +/** QuRT signal type. + */ +typedef union { + /** @cond */ + unsigned long long int raw; + struct { + unsigned int signals; + unsigned int waiting; + unsigned int queue; + unsigned int attribute; + }X; + /** @endcond */ +} qurt_signal_t; + + +/** QuRT 64-bit signal type. + */ +typedef struct { + /** @cond */ + qurt_signal_t signal_sum; + unsigned long long signals; + unsigned long long waiting; + /** @endcond */ +} qurt_signal_64_t; +/** @} */ /* end_addtogroup signals_types */ +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_signal_init + Initializes a signal object. + Signal returns the initialized object. + The signal object is initially cleared. + + @note1hang Each signal-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_signal_destroy() + when this object is not used anymore + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the initialized object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_init(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_destroy + Destroys the specified signal object. + + @note1hang Signal objects must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_destroy(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_wait + @xreflabel{hdr:qurt_signal_wait} + Suspends the current thread until the specified signals are set. 
+ + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + waiting on a signal, and 0 indicates not waiting on the signal. + + If a thread is waiting on a signal object for any of the specified set of signals to set, + and one or more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, + and all of those signals are set in the signal object, the thread is awakened. + + The specified set of signals can be cleared when the signal is set. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread waits to set any of the signals, or to set all of + them. \n + @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + + @return + A 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal_wait(qurt_signal_t *signal, unsigned int mask, + unsigned int attribute); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_wait_timed + @xreflabel{hdr:qurt_signal_wait} + Suspends the current thread until the specified signals are set or until timeout. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + waiting on a signal, and 0 indicates not waiting. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, + and one or more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, + and all of those signals are set in the signal object, the thread is awakened. + + The specified set of signals can be cleared after the signal is set. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value that identifies the individual signals in the signal object to wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. \n + @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @param[out] signals Bitmask of signals that are set + @param[in] duration Duration (microseconds) to wait. Must be in the range + [#QURT_TIMER_MIN_DURATION ... #QURT_TIMER_MAX_DURATION] + + @return + #QURT_EOK -- Success; one or more signals were set \n + #QURT_ETIMEDOUT -- Timed-out \n + #QURT_EINVALID -- Duration out of range + + @dependencies + Timed-waiting support in the kernel. 
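+
+   An illustrative sketch (the signal object sig and the event bit assignments
+   are assumptions, not part of this API):
+   @code
+   #define EVT_RX  (1u << 0)   // hypothetical event bits
+   #define EVT_ERR (1u << 1)
+
+   unsigned int sigs = 0;
+   int rc = qurt_signal_wait_timed(&sig, EVT_RX | EVT_ERR,
+                                   QURT_SIGNAL_ATTR_WAIT_ANY,
+                                   &sigs, 10000ULL);  // wait up to 10 ms
+   if (rc == QURT_EOK && (sigs & EVT_RX) != 0) {
+      qurt_signal_clear(&sig, EVT_RX);  // wait operations do not clear signals
+   }
+   @endcode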
+*/
+/* ======================================================================*/
+int qurt_signal_wait_timed(qurt_signal_t *signal, unsigned int mask,
+                unsigned int attribute, unsigned int *signals, unsigned long long int duration);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_wait_any
+  Suspends the current thread until any of the specified signals are set.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+  to wait on a signal, and 0 indicates not to wait on it.
+
+  If a thread is waiting on a signal object for any of the specified set of signals to be set,
+  and one or more of those signals is set in the signal object, the thread is awakened.
+
+  @note1hang At most, one thread can wait on a signal object at any given time.
+
+  @datatypes
+  #qurt_signal_t
+
+  @param[in]  signal     Pointer to the signal object to wait on.
+  @param[in]  mask       Mask value identifying the individual signals in the signal object to
+                         wait on.
+
+  @return
+  32-bit word with current signals.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+static inline unsigned int qurt_signal_wait_any(qurt_signal_t *signal, unsigned int mask)
+{
+    return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_wait_all
+  Suspends the current thread until all of the specified signals are set.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+  to wait on a signal, and 0 indicates not to wait on it.
+
+  If a thread is waiting on a signal object for all of the specified set of signals to be set,
+  and all of those signals are set in the signal object, the thread is awakened.
+
+  @note1hang At most, one thread can wait on a signal object at any given time.
+
+  @datatypes
+  #qurt_signal_t
+
+  @param[in]  signal     Pointer to the signal object to wait on.
+  @param[in]  mask       Mask value identifying the individual signals in the signal object to
+                         wait on.
+
+  @return
+  A 32-bit word with current signals.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+static inline unsigned int qurt_signal_wait_all(qurt_signal_t *signal, unsigned int mask)
+{
+    return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ALL);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_set
+  Sets signals in the specified signal object.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+  to set the signal, and 0 indicates not to set it.
+
+  @datatypes
+  #qurt_signal_t
+
+  @param[in] signal Pointer to the signal object to modify.
+  @param[in] mask   Mask value identifying the individual signals to set in the signal
+                    object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void qurt_signal_set(qurt_signal_t *signal, unsigned int mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_get
+  Gets a signal from a signal object.
+
+  Returns the current signal values of the specified signal object.
+
+  @datatypes
+  #qurt_signal_t
+
+  @param[in] *signal Pointer to the signal object to access.
+ + @return + A 32-bit word with current signals + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal_get(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_clear + Clear signals in the specified signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear it. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_clear(qurt_signal_t *signal, unsigned int mask); + +/**@ingroup func_qurt_signal_wait_cancellable + @xreflabel{hdr:qurt_signal_wait_cancellable} + Suspends the current thread until either the specified signals are set or the wait operation is cancelled. + The operation is cancelled if the user process of the calling thread is killed, or if the calling thread + must finish its current QDI invocation and return to user space. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be waited on, and 0 indicates not to wait on it. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, and one or + more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, and all of + those signals are set in the signal object, the thread is awakened. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @note1cont When the operation is cancelled, the caller must assume that the signal is never set. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @param[out] return_mask Pointer to the 32-bit mask value that was originally passed to the function. + + + @return + #QURT_EOK -- Wait completed. \n + #QURT_ECANCEL -- Wait cancelled. + + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_signal_wait_cancellable(qurt_signal_t *signal, unsigned int mask, + unsigned int attribute, + unsigned int *return_mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_init + Initializes a 64-bit signal object.\n + The signal argument returns the initialized object. + The signal object is initially cleared. + + @note1hang Each signal-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_signal_destroy() + when this object is not used anymore. + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the initialized object. 
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+void qurt_signal_64_init(qurt_signal_64_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_destroy
+  Destroys the specified signal object.
+
+  @note1hang 64-bit signal objects must be destroyed when they are no longer in use. Failure
+             to do this causes resource leaks in the QuRT kernel.\n
+  @note1cont Signal objects must not be destroyed while they are still in use. If this
+             occurs, the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_signal_64_t
+
+  @param[in] signal Pointer to the signal object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+void qurt_signal_64_destroy(qurt_signal_64_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_wait
+  Suspends the current thread until all of the specified signals are set.
+
+  Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 indicates
+  that a signal must be waited on, and 0 indicates not to wait on it.
+
+  If a thread is waiting on a signal object for all of the specified set of signals to be set,
+  and all of those signals are set in the signal object, the thread is awakened.
+
+  @note1hang At most, one thread can wait on a signal object at any given time.
+
+  @datatypes
+  #qurt_signal_64_t
+
+  @param[in] signal    Pointer to the signal object to wait on.
+  @param[in] mask      Mask value, which identifies the individual signals in the signal object to
+                       wait on.
+  @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of
+                       them are set. \n
+                       @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n
+                       - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+                       - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+
+  @return
+  A 64-bit value with the current signals.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+unsigned long long qurt_signal_64_wait(qurt_signal_64_t *signal, unsigned long long mask,
+                unsigned int attribute);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_set
+  Sets signals in the specified signal object.
+
+  Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 indicates
+  that a signal must be set, and 0 indicates not to set it.
+
+  @datatypes
+  #qurt_signal_64_t
+
+  @param[in] signal Pointer to the signal object to modify.
+  @param[in] mask   Mask value identifying the individual signals to set in the signal
+                    object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void qurt_signal_64_set(qurt_signal_64_t *signal, unsigned long long mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_get
+  Gets a signal from a signal object.
+
+  Returns the current signal values of the specified signal object.
+
+  @datatypes
+  #qurt_signal_64_t
+
+  @param[in] *signal Pointer to the signal object to access.
+
+  @return
+  A 64-bit double word with current signals.
+
+  @dependencies
+  None.
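The 64-bit variant follows the same pattern with a wider mask, which matters once more than 32 distinct events share one object. Below is a short hypothetical sketch under the same assumptions as the 32-bit example earlier; the bit position is arbitrary.

```c
#include "qurt.h"

static qurt_signal_64_t g_evt64;

void wide_event_example(void)
{
    qurt_signal_64_init(&g_evt64);

    qurt_signal_64_set(&g_evt64, 1ULL << 40);   /* a bit above position 31 */

    /* Single-bit mask, so wait-all and wait-any behave identically here. */
    unsigned long long got =
        qurt_signal_64_wait(&g_evt64, 1ULL << 40, QURT_SIGNAL_ATTR_WAIT_ALL);

    qurt_signal_64_clear(&g_evt64, got);        /* still cleared explicitly */
    qurt_signal_64_destroy(&g_evt64);
}
```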
+*/ +/* ======================================================================*/ +unsigned long long qurt_signal_64_get(qurt_signal_64_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_clear + Clears signals in the specified signal object. + + Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear it. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_64_clear(qurt_signal_64_t *signal, unsigned long long mask); + +#ifdef __cplusplus +} +#endif + +#endif /* QURT_SIGNAL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_signal2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_signal2.h new file mode 100755 index 0000000000000..43975100cbf75 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_signal2.h @@ -0,0 +1,340 @@ +#ifndef QURT_SIGNAL2_H +#define QURT_SIGNAL2_H + +/** + @file qurt_signal2.h + @brief Prototypes of kernel signal2 API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define QURT_SIGNAL_ATTR_WAIT_ANY 0x00000000 +#define QURT_SIGNAL_ATTR_WAIT_ALL 0x00000001 + +/*===================================================================== + Typedefs + ======================================================================*/ + +/** @addtogroup signals2_types +@{ */ +/** qurt_signal2 type. + */ +typedef union { + /** @cond */ + struct{ + unsigned int cur_mask; /* Current set of signal bits that are set. */ + unsigned int sig_state; /* Current state. */ + /* Bit 0 -- in anysignal wait. */ + /* Bit 1 -- in allsignal wait. */ + /* Bit 2 -- in interrupt wait. */ + /* Bits 31-3 -- reference count field. */ + unsigned int queue; /* Kernel-maintained futex queue value. */ + unsigned int wait_mask; /* When sig_state indicates a waiter is present, this is the wait mask. */ + }; + unsigned long long int raw; + /** @endcond */ +} qurt_signal2_t; +/* @} */ /* end_addtogroup signals2_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_init + + @deprecated use #qurt_signal_init instead. + + Initializes a signal2 object. + Signal returns the initialized object. + The signal object is initially cleared. + + Objects of type signal2 solve a potential race condition between + set() and destroy() operations. + + @datatypes + #qurt_signal2_t + + @param[in] *signal Pointer to the initialized object. + + @return + None. 
+ + @dependencies + Each mutex-based object has an associated + kernel resource(s), therefore users must call qurt_signal2_destroy() + when this object no longer in use. + */ +/* ======================================================================*/ +void qurt_signal2_init(qurt_signal2_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_destroy + + @deprecated use #qurt_signal_destroy instead. + + Destroys the specified signal object. + + @note1cont Signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + @note1cont Application code should destroy a signal2 object prior to deallocating it. + Calling qurt_signal2_destroy() before deallocating a + signal2 object ensures completion of all qurt_signal2_set() calls. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal2_destroy(qurt_signal2_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_wait + + @deprecated use #qurt_signal_wait instead. + + Suspends the current thread until the specified signals are set. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + a signal to wait on. + + If a thread calls this API with QURT_SIGNAL_ATTR_WAIT_ANY, the thread will be awakened when + any of the signals specified in the mask are set. + + If a thread calls this API with QURT_SIGNAL_ATTR_WAIT_ALL, the thread will be awakened only + when all the signals specified in the mask are set. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to wait on. + @param[in] attribute Specifies whether the thread waits for any of the signals to be set, or for all of + them to be set. Values:\n + - QURT_SIGNAL_ATTR_WAIT_ANY \n + - QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @return + A 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal2_wait(qurt_signal2_t *signal, unsigned int mask, + unsigned int attribute); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_wait_any + + @deprecated use #qurt_signal_wait_any instead. + + Suspends the current thread until any of the specified signals are set. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + a signal to wait on. + + The thread will be awakened when any of the signals specified in the mask are set. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + + @return + 32-bit word with current signals. + + @dependencies + None. 
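Since every qurt_signal2_* entry point in this header is marked deprecated in favor of its qurt_signal_* counterpart, migration is mechanical; the following sketch (not from the SDK sources) shows the one-to-one mapping.

```c
#include "qurt.h"

void migration_example(void)
{
    /* Deprecated signal2 path. */
    qurt_signal2_t s2;
    qurt_signal2_init(&s2);
    qurt_signal2_set(&s2, 0x1u);
    (void)qurt_signal2_wait(&s2, 0x1u, QURT_SIGNAL_ATTR_WAIT_ANY);
    qurt_signal2_destroy(&s2);

    /* Recommended replacement: same shape, qurt_signal_* names. */
    qurt_signal_t s;
    qurt_signal_init(&s);
    qurt_signal_set(&s, 0x1u);
    (void)qurt_signal_wait(&s, 0x1u, QURT_SIGNAL_ATTR_WAIT_ANY);
    qurt_signal_destroy(&s);
}
```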
+*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal2_wait_any(qurt_signal2_t *signal, unsigned int mask) +{ + return qurt_signal2_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_wait_all + + @deprecated use #qurt_signal_wait_all instead. + + Suspends the current thread until all of the specified signals are set. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + a signal to wait on. + + The thread will be awakened only when all the signals specified in the mask are set. + + @note1hang At most one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal2_wait_all(qurt_signal2_t *signal, unsigned int mask) +{ + return qurt_signal2_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ALL); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_set + + @deprecated use #qurt_signal_set instead. + + Sets signals in the specified signal object. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be set, and 0 indicates not to set the signal. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to set in the signal + object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_signal2_set(qurt_signal2_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_get + + @deprecated use #qurt_signal_get instead. + + Gets a signal from a signal object. + + Returns the current signal values of the specified signal object. + + @datatypes + #qurt_signal2_t + + @param[in] *signal Pointer to the signal object to access. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal2_get(qurt_signal2_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_clear + + @deprecated use #qurt_signal_clear instead. + + Clear signals in the specified signal object. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear the signal. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. 
+ */ +/* ======================================================================*/ +void qurt_signal2_clear(qurt_signal2_t *signal, unsigned int mask); + +/**@ingroup func_qurt_signal2_wait_cancellable + + @deprecated use #qurt_signal_wait_cancellable instead. + + Suspends the current thread until either the specified signals are set or the wait operation is cancelled. + The operation is cancelled if the user process of the calling thread is killed, or if the calling thread + must finish its current QDI invocation and return to user space. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be waited on, and 0 indicates not to wait on it. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, and one or + more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, and all of + those signals are set in the signal object, the thread is awakened. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @note1cont When the operation is cancelled, the caller must assume that the signal is never set. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @param[out] p_returnmask Pointer to the 32-bit mask value that was originally passed to the function. + + + @return + #QURT_EOK -- Wait completed. \n + #QURT_ECANCEL -- Wait cancelled. + + + @dependencies + None. +*/ +int qurt_signal2_wait_cancellable(qurt_signal2_t *signal, + unsigned int mask, + unsigned int attribute, + unsigned int *p_returnmask); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SIGNAL2_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_space.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_space.h new file mode 100755 index 0000000000000..2c3f9e4496697 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_space.h @@ -0,0 +1,230 @@ +#ifndef QURT_SPACE_H +#define QURT_SPACE_H +/** + @file qurt_space.h + @brief Prototypes of QuRT process control APIs + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** This flag is a request to the OS to suspend the processes just before calling main() +But it is going to be obsoleted and replaced by QURT_PROCESS_SUSPEND_ON_STARTUP */ +#define SPAWNN_FLAG_SUSPEND_ON_STARTUP QURT_PROCESS_SUSPEND_ON_STARTUP + +/** + * Creates and starts a process from ELF of a specified name. The slash symbols + * "/" or "\" are ignored. Do not include the directory name in the input. This function + * accepts the the SPAWN flags. Multiple SPAWN flags can be specified by OR'ing the flags. + * + * @param name ELF name of the executable. 
Name shall not contain directories, + * use "dsp2.elf", instead of "/prj/qct/.../dsp2.elf" + * + * @param return + Process ID -- Success \n + Negative error code -- failure\n + #QURT_EPRIVILEGE -- Caller does not have enough privilege for this operation\n + #QURT_EMEM -- Not enough memory to perform the operation \n + #QURT_EFAILED -- Operation failed \n + #QURT_ENOTALLOWED -- Operation not allowed \n + #QURT_ENOREGISTERED -- Not registered \n + #QURT_ENORESOURCE -- Resource exhaustion \n + #QURT_EINVALID -- Invalid argument value +*/ + +int qurt_spawn_flags(const char * name, int flags); + +/** + Creates and starts a process from an ELF of the specified name. The slash symbols + "/" or "\" are ignored. Do not include the directory name in the input. + + @param name ELF name of the executable. Name shall not contain directories, + use "dsp2.elf", instead of "/prj/qct/.../dsp2.elf". + + @return + Process ID -- Success. \m + Negative error code -- Failure. + +*/ +static inline int qurt_spawn(const char *name) +{ + return qurt_spawn_flags(name,0); +} + +/** + * Returns the process ID of the current process. + * + * @return + * Process ID + * +*/ +#define qurt_getpid qurt_process_get_id + +/** + * The qurt_wait() function waits for status change in a child process. It could be used by parent + * process to block on any child process terminates. + * + * This API returns error if there are no user processes or all user processes got detached. + * + * @param status Pointer to status variable. The variable provides the status value of child process. + * The value comes from exit() system call made by child process. + * + * @return + Process ID of the child process that changes status -- Success \n + * Negative error code -- Failure + * +*/ + +int qurt_wait(int *status); + + +/** @cond */ +/* APIs that allow registering callbacks on spawn of user pd */ +typedef void (*QURT_SPAWN_PFN)(int client_handle, void *data_ptr); //no return, since we won't be error checking it in spawn +typedef int (*QURT_CB_PFN)(int client_handle, void *user_data, void *info); +typedef union { + QURT_SPAWN_PFN spawn_pfn; + QURT_CB_PFN cb_pfn; +} qurt_process_callback_pfn_t; +/** @endcond */ + +/** @cond internal_only */ + +/**@ingroup func_qurt_event_register +Sets the specified bits by mask in the signal passed by the caller. The signal gets set +when the client handle indicated by value goes away (at process exit). Multiple clients can register for the signal +to be set. + +@datatypes + +@param[in] type QURT_PROCESS_EXIT is the only event that can be registered for. +@param[in] value Indicates the client handle of the process for which the event is registered. +@param[in] signal Pointer to the signal object to set when the event occurs. +@param[in] mask Mask bits to set in the signal. +@param[out] data Pointer to the variable that would receive the exit code of the exiting process. +@param[in] datasize Size of the data variable. + +@return +#QURT_EOK -- Success \n +#QURT_EMEM -- Not enough memory to allocate resources \n +#QURT_EVAL -- Invalid values passed to the API + +@dependencies +None. +*/ +int qurt_event_register(int type, int value, qurt_signal_t *psig, unsigned int mask, void *data, unsigned int data_size); + +/**@ingroup func_qurt_callback_register_onspawn +Allows registering for a callback on spawn of any user process. + +@datatypes +#QURT_SPAWN_PFN + +@param[in] pFn Callback function to call when any user process is spawned. +@param[in] user_data Pointer to the argument that the callback must be called with. 
+
+
+@return If a positive value is obtained, it is the handle to use when deregistering the callback.
+        Multiple clients can register for the callback on spawn, and some clients might choose to deregister.
+
+        If the call fails, QURT_EFATAL is returned.
+
+@dependencies
+None.
+*/
+int qurt_callback_register_onspawn(QURT_SPAWN_PFN pFn, void *user_data);
+
+/**@ingroup func_qurt_callback_deregister_onspawn
+Allows de-registering the on-spawn callback.
+
+@param[in] callback_handle Handle returned by qurt_callback_register_onspawn.
+
+@return
+#QURT_EOK -- De-registering was successful.
+
+@dependencies
+None.
+*/
+int qurt_callback_deregister_onspawn(int callback_handle);
+
+/**@ingroup func_qurt_process_callback_register
+Allows registering for a callback during or after image loading.
+Generic callback types:
+   Functions similarly to qurt_callback_register_onspawn(). The callback is called after the process is
+   loaded, before the process thread starts. The callback has no return value and has no info provided
+   from the OS.
+   pFn  - QURT_SPAWN_PFN
+   type - QURT_PROCESS_CB_GENERIC
+   arg1 - not used
+   arg2 - not used
+   arg3 - not used
+Note callback types:
+   The callback is called during process loading: before segment loading (QURT_PROCESS_NOTE_CB_PRE_MAP),
+   or after segment loading (QURT_PROCESS_NOTE_CB_POST_MAP). The OS provides info to the callback. The info
+   argument in the callback is populated with a pointer to the mapped note corresponding to the callback.
+   The callback has a return value; the loader fails if the callback returns a value that is not QURT_EOK.
+   pFn  - QURT_CB_PFN
+   type - QURT_PROCESS_NOTE_CB_PRE_MAP or QURT_PROCESS_NOTE_CB_POST_MAP
+   arg1 - note type (ex: NOTE_TYPE_POOL_INFO, NOTE_TYPE_SEGMENT_INFO, NOTE_TYPE_ARB_INFO)
+   arg2 - note name
+   arg3 - not used
+
+@datatypes
+
+@param[in] pFn       Callback function to call.
+@param[in] type      Callback type.
+@param[in] user_data Pointer to the argument that the callback must be called with.
+@param[in] arg1      Argument interpreted by the OS based on the callback type.
+@param[in] arg2      Argument interpreted by the OS based on the callback type.
+@param[in] arg3      Argument interpreted by the OS based on the callback type (currently not used).
+
+
+@return If a positive value is obtained, it is the handle to use when deregistering the callback.
+        Multiple clients can register for the callback on spawn, and some clients might choose to deregister.
+
+        If the call fails, QURT_EFATAL is returned.
+
+@dependencies
+None.
+*/
+int qurt_process_callback_register(qurt_process_callback_pfn_t pFn,
+                                   qurt_process_cb_type_t type,
+                                   void *user_data,
+                                   qurt_process_callback_arg_t arg1,
+                                   qurt_process_callback_arg_t arg2,
+                                   qurt_process_callback_arg_t arg3);
+
+
+
+/**@ingroup func_qurt_process_callback_deregister
+Allows de-registering a callback for image loading.
+@param[in] callback_handle Handle returned by qurt_process_callback_register.
+
+@return
+#QURT_EOK -- De-registering was successful.
+
+@dependencies
+None.
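Tying the process APIs together, here is a hypothetical sketch of registering an on-spawn callback, spawning an image, and reaping it with qurt_wait(). The ELF name follows the documented convention (name only, no directory components); printf is assumed available purely for illustration.

```c
#include "qurt.h"
#include <stdio.h>

static void on_spawn_cb(int client_handle, void *data_ptr)
{
    (void)data_ptr;
    printf("process spawned, client handle %d\n", client_handle);
}

int spawn_and_reap(void)
{
    int cb = qurt_callback_register_onspawn(on_spawn_cb, NULL);
    if (cb < 0) {
        return cb;                        /* QURT_EFATAL on failure */
    }

    int pid = qurt_spawn("dsp2.elf");     /* name only, no directories */
    if (pid < 0) {
        (void)qurt_callback_deregister_onspawn(cb);
        return pid;                       /* negative error code */
    }

    int status = 0;
    int child = qurt_wait(&status);       /* block until a child exits */

    (void)qurt_callback_deregister_onspawn(cb);
    return (child == pid) ? status : child;
}
```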
+*/ +int qurt_process_callback_deregister(int callback_handle); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SPACE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_srm_consts.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_srm_consts.h new file mode 100755 index 0000000000000..48a8b6a38c402 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_srm_consts.h @@ -0,0 +1,32 @@ +#ifndef QURT_SRM_CONSTS_H +#define QURT_SRM_CONSTS_H +/** + @file qurt_srm_consts.h + @brief Type definitions for srm + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2020-2021, 2022 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @cond */ +#define QURT_SRM_WAKEUP_REQUEST 1U << 0 /**< Value = 1: Send wakeup request to the SRM server. */ +#define QURT_SRM_SET_HANDLE 1U << 1 /**< Value = 2: Set the client handle for a new SRM client. */ +#define QURT_SRM_ALLOC_KERNEL_PAGES 1U << 2 /**< Value = 4: Allocate pages from the kernel VA space. */ +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SRM_CONSTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_srm_driver.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_srm_driver.h new file mode 100755 index 0000000000000..5489e3dddbcca --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_srm_driver.h @@ -0,0 +1,140 @@ +#ifndef QURT_SRM_DRIVER_H +#define QURT_SRM_DRIVER_H +/** + @file qurt_srm_driver.h + @brief Definitions, macros, and prototypes used by SRM drivers. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + + =============================================================================*/ +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| Define qurt_srm_driver_t structure, which represents +|| the "registration" object for an SRM driver. +*/ +/** @cond internal_only */ +struct _qurt_srm_driver { + const char *name; + qurt_qdi_obj_t *obj; +}; + +typedef struct _qurt_srm_driver qurt_srm_driver_t; + +/* +|| qurt_srm_object_invoke() is an internal equivalent to qurt_qdi_handle_invoke(). +|| It behaves the same, but it takes a QDI object pointer instead of a handle. +*/ + +#define qurt_srm_object_invoke(o,m,...) 
\ + _QDMPASTE(_QDMSOI,_QDMCNT(QDI_HANDLE_LOCAL_CLIENT,o,m,##__VA_ARGS__))(QDI_HANDLE_LOCAL_CLIENT,o,m,##__VA_ARGS__) +#define _QDMSOI3(a,b,c) qurt_srm_oi3(a,b,c) +#define _QDMSOI4(a,b,c,d) qurt_srm_oi4(a,b,c,(int)(d)) +#define _QDMSOI5(a,b,c,d,e) qurt_srm_oi5(a,b,c,(int)(d),(int)(e)) +#define _QDMSOI6(a,b,c,d,e,f) qurt_srm_oi6(a,b,c,(int)(d),(int)(e),(int)(f)) +#define _QDMSOI7(a,b,c,d,e,f,g) qurt_srm_oi7(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g)) +#define _QDMSOI8(a,b,c,d,e,f,g,h) qurt_srm_oi8(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h)) +#define _QDMSOI9(a,b,c,d,e,f,g,h,i) qurt_srm_oi9(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i)) +#define _QDMSOI10(a,b,c,d,e,f,g,h,i,j) qurt_srm_oi10(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j)) +#define _QDMSOI11(a,b,c,d,e,f,g,h,i,j,k) qurt_srm_oi11(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k)) +#define _QDMSOI12(a,b,c,d,e,f,g,h,i,j,k,l) qurt_srm_oi12(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k),(int)(l)) + +int qurt_srm_oi3(int, qurt_qdi_obj_t *, int); +int qurt_srm_oi4(int, qurt_qdi_obj_t *, int, int); +int qurt_srm_oi5(int, qurt_qdi_obj_t *, int, int, int); +int qurt_srm_oi6(int, qurt_qdi_obj_t *, int, int, int, int); +int qurt_srm_oi7(int, qurt_qdi_obj_t *, int, int, int, int, int); +int qurt_srm_oi8(int, qurt_qdi_obj_t *, int, int, int, int, int, int); +int qurt_srm_oi9(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int); +int qurt_srm_oi10(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int); +int qurt_srm_oi11(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int, int); +int qurt_srm_oi12(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int, int, int); + +#define QDI_SRM_INIT 192 + +/* +|| QURT_SRM_DECLARE_DRIVER() declares an SRM driver to the SRM infrastructure. +|| +|| The three arguments are: +|| unique_id -- Unique C identifier, unused but must be a unique global symbol. +|| name -- Name of the driver by which an SRM client attempts to open it. +|| obj -- Pointer to the singleton object of the driver, which handles things such as +|| initialization and QDI_OPEN requests. +*/ + +#define QURT_SRM_DECLARE_DRIVER(unique_id, xname, xobj) \ + __attribute__((section(".srm.rodata.user.main.DECL"))) const qurt_srm_driver_t unique_id = \ + { .name = xname, .obj = xobj } + + +/*@ingroup func_qurt_srm_mapping_create + Creates a memory mapping in pagetable with specified attributes + + @param[in] client_handle Client handle representing the process for which + mapping would be created. + @param[in] pageno_virt pointer to the virtual page. NULL indicates SRM + would indicate the virtual memory. + @param[in] pageno_phys physical page to be used for the mapping + @param[in] page_count number of 4k pages to be mapped + @param[in] cache_attr cache attributes for the mapping + @param[in] perm permissions to be used for the mapping + + @return value greater than 0 indicates a handle which can be passed to + qdi_close() to remove the mapping. Negative value indicates + an error. + + @dependencies + None. +*/ +int qurt_srm_mapping_create(int client_handle, + unsigned *pageno_virt, + unsigned pageno_phys, + unsigned page_count, + qurt_mem_cache_mode_t cache_attr, + qurt_perm_t perm); + + +/**@ingroup func_qurt_srm_get_pid + Gets the PID for the client_handle that is passed. + + @param[in] client_handle Client handle for which PID is required. 
+
+   @return PID of the client.
+           A negative PID value '-1' is returned in case of error.
+
+   @dependencies
+   None.
+*/
+unsigned qurt_srm_get_pid(int client_handle);
+
+
+/*@ingroup func_qurt_srm_get_thread_id
+   Gets the thread ID of the client requesting a service from SRM.
+
+   @param[in] None.
+
+   @return Thread ID of the client thread.
+
+   @dependencies
+   None.
+*/
+qurt_thread_t qurt_srm_get_client_thread_id(void);
+
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SRM_DRIVER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_stid.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_stid.h
new file mode 100755
index 0000000000000..379f46aaa4b80
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_stid.h
@@ -0,0 +1,73 @@
+#ifndef QURT_STID_H
+#define QURT_STID_H
+/**
+  @file qurt_stid.h
+  Prototypes of software thread identifier (stid) interface APIs.
+  An stid is an 8-bit identifier that can be assigned to a software thread.
+  The performance monitor logic uses the stid as a counting match criterion
+  for maskable events. The stid is also used by the hardware debugger
+  (ISDB) to match breakpoints.
+
+  EXTERNAL FUNCTIONS
+   None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+  Copyright (c) 2024 Qualcomm Technologies, Inc.
+  All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_stid_alloc
+  Allocates a unique stid.
+
+  @param[in]  pid  Process identifier.
+  @param[out] stid Pointer to a variable in which to return the stid.
+
+  @return
+  QURT_EOK - Allocation success
+  QURT_ENORESOURCE - No stid available for allocation
+  QURT_EINVALID - Invalid input
+
+  @dependencies
+  None.
+ */
+int qurt_stid_alloc(unsigned int pid, unsigned int *stid);
+
+/**@ingroup func_qurt_stid_release
+  Releases the stid.
+
+
+  @param[in] pid  Process identifier.
+  @param[in] stid STID to release.
+
+  @note1hang
+  The user shall clear the released stid from the process or thread(s),
+  resetting it to the default value (QURT_STID_DEFAULT), before releasing
+  that stid.
+
+  @return
+  QURT_EOK - Release success
+  QURT_ENOTALLOWED - Operation not allowed for a pid
+  QURT_EINVALID - Invalid stid
+
+  @dependencies
+  None.
+ */
+int qurt_stid_release(unsigned int pid, unsigned int stid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_STID_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_thread.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_thread.h
new file mode 100755
index 0000000000000..499699e7c72e2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_thread.h
@@ -0,0 +1,1260 @@
+#ifndef QURT_THREAD_H
+#define QURT_THREAD_H
+/**
+  @file qurt_thread.h
+  @brief Prototypes of Thread API
+
+  EXTERNAL FUNCTIONS
+   None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018, 2020-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+ +=============================================================================*/ + + +/* The followings are for C code only */ +#ifndef __ASSEMBLER__ +#include +#include "qurt_pmu.h" +#include "qurt_api_version.h" +#endif /* __ASSEMBLER__ */ +#include "qurt_consts.h" +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + + +/* + Bitmask configuration to select DSP hardware threads. + To select all the hardware threads, use #QURT_THREAD_CFG_BITMASK_ALL + and the following: \n + - For QDSP6 V2/V3, all six hardware threads are selected \n + - For QDSP6 V3L, all four hardware threads are selected \n + - For QDSP6 V4, all three hardware threads are selected + */ + +#define QURT_THREAD_CFG_BITMASK_HT0 0x00000001 /**< HTO. */ +#define QURT_THREAD_CFG_BITMASK_HT1 0x00000002 /**< HT1. */ +#define QURT_THREAD_CFG_BITMASK_HT2 0x00000004 /**< HT2. */ +#define QURT_THREAD_CFG_BITMASK_HT3 0x00000008 /**< HT3. */ +#define QURT_THREAD_CFG_BITMASK_HT4 0x00000010 /**< HT4. */ +#define QURT_THREAD_CFG_BITMASK_HT5 0x00000020 /**< HT5. */ +/** @cond rest_reg_dist */ +/** @addtogroup thread_macros +@{ */ +/** @xreflabel{sec:qurt_thread_cfg} */ + +#define QURT_THREAD_CFG_BITMASK_ALL 0x000000ffU /**< Select all the hardware threads. */ +/** @} */ /* end_addtogroup thread_macros */ +/** @endcond */ + +#define QURT_THREAD_CFG_USE_RAM 0x00000000 /**< Use RAM. */ +#define QURT_THREAD_CFG_USE_TCM 0x00000100 /**< Use TCM. */ +/** @cond rest_reg_dist */ +/** @addtogroup thread_macros +@{ */ +#define QURT_THREAD_BUS_PRIO_DISABLED 0 /**< Thread internal bus priority disabled. */ +#define QURT_THREAD_BUS_PRIO_ENABLED 1 /**< Thread internal bus priority enabled. */ +/** @} */ /* end_addtogroup thread_macros */ +/** @endcond */ + +#define QURT_THREAD_AUTOSTACK_DISABLED 0 /**< Thread has autostack v2 feature disabled. */ +#define QURT_THREAD_AUTOSTACK_ENABLED 1 /**< Thread has autostack v2 feature enabled. */ + +/* + Macros for QuRT thread attributes. + */ +#define QURT_HTHREAD_L1I_PREFETCH 0x1 /**< Enables hardware L1 instruction cache prefetching. */ +#define QURT_HTHREAD_L1D_PREFETCH 0x2 /**< Enables hardware L1 data cache prefetching. */ +#define QURT_HTHREAD_L2I_PREFETCH 0x4 /**< Enables hardware L2 instruction cache prefetching. */ +#define QURT_HTHREAD_L2D_PREFETCH 0x8 /**< Enables hardware L2 data cache prefetching. */ +#define QURT_HTHREAD_DCFETCH 0x10 /**< Enables DC fetch to the provided virtual address. + DC fetch indicates the hardware that a data memory access is likely. + Instructions are dropped when there is high bus utilization. */ +/** @addtogroup thread_macros +@{ */ +/** @xreflabel{hdr:partition_tcm} */ +/* + Below value is used to create legacy QuRT threads by default. + If a thread has this as the detach_state, the thread can be joined + on until it exits. When we are able to change default behavior of all + QuRT threads to JOINABLE (posix default), we can remove this legacy + behavior. +*/ +#define QURT_THREAD_ATTR_CREATE_LEGACY 0U /**< Create a legacy QuRT thread by default. If a thread has this as a detach state, the thread can be joined on until it exits. */ +#define QURT_THREAD_ATTR_CREATE_JOINABLE 1U /**< Create a joinable thread. */ +#define QURT_THREAD_ATTR_CREATE_DETACHED 2U /**< Create a detached thread. 
*/ +/** @} */ /* end_addtogroup thread_macros */ + + +#define QURT_THREAD_ATTR_NAME_MAXLEN 16 /**< Maximum name length. */ +#define QURT_THREAD_ATTR_TCB_PARTITION_RAM 0 /**< Creates threads in RAM/DDR. */ +#define QURT_THREAD_ATTR_TCB_PARTITION_TCM 1 /**< Creates threads in TCM. */ +/** @cond rest_reg_dist */ +/** @addtogroup thread_macros +@{ */ +#define QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT QURT_THREAD_ATTR_TCB_PARTITION_RAM /**< Backward compatibility. */ +#define QURT_THREAD_ATTR_PRIORITY_DEFAULT 254 /**< Priority.*/ +#define QURT_THREAD_ATTR_ASID_DEFAULT 0 /**< ASID. */ +#define QURT_THREAD_ATTR_AFFINITY_DEFAULT (-1) /**< Affinity. */ +#define QURT_THREAD_ATTR_BUS_PRIO_DEFAULT 255 /**< Bus priority. */ +#define QURT_THREAD_ATTR_AUTOSTACK_DEFAULT 0 /**< Default autostack v2 disabled thread. */ +#define QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT (-2) /**< Timetest ID. */ +#define QURT_THREAD_ATTR_STID_DEFAULT QURT_STID_DEFAULT /**< STID. */ +#define QURT_THREAD_ATTR_STID_ENABLE 1 /**< Indicate to allocate STID during thread creation. */ + +#define QURT_PRIORITY_FLOOR_DEFAULT 255U /**< Default floor. */ +/** @} */ /* end_addtogroup thread_macros */ + +// Option for suspending thread +#define QURT_THREAD_SUSPEND_SYNCHRONOUS 0x0U // bit#0 +#define QURT_THREAD_SUSPEND_ASYNCHRONOUS 0x1U // bit#0 +#define QURT_THREAD_SUSPEND_KEEP_HMX 0x0U // bit#1 +#define QURT_THREAD_SUSPEND_DETACH_HMX 0x2U // bit#1 + +// Option for resuming thread +#define QURT_THREAD_RESUME_DEFAULT 0x0 + +// Thread property IDs +#define QURT_THREAD_PROPERTY_SUSPENDABLE 0x0U +#define QURT_THREAD_PROPERTY_RESUMABLE 0x1 + +// Thread group +#define QURT_THREAD_DEFAULT_GROUP_ID 0x0U +#define QURT_THREAD_GROUP_ID_MASK 0x3FU + +/** @endcond*/ + + +/* The followings are for C code only */ +#ifndef __ASSEMBLER__ +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup thread_types +@{ */ +/** @cond rest_reg_dist */ +typedef unsigned int qurt_cache_partition_t; /**< QuRT cache partition type. */ + +#define CCCC_PARTITION 0U /**< Use the CCCC page attribute bits to determine the main or auxiliary partition. */ +#define MAIN_PARTITION 1U /**< Use the main partition. */ +#define AUX_PARTITION 2U /**< Use the auxiliary partition. */ +#define MINIMUM_PARTITION 3U /**< Use the minimum. Allocates the least amount of cache (no-allocate policy possible) for this thread. */ +/** @endcond */ + +/** Thread ID type. */ +typedef unsigned int qurt_thread_t; + +/** @cond rest_reg_dist */ +/** Thread attributes. */ +typedef struct _qurt_thread_attr { + + char name[QURT_THREAD_ATTR_NAME_MAXLEN]; /**< Thread name. */ + unsigned char tcb_partition; /**< Indicates whether the thread TCB resides in RAM or + on chip memory (TCM). */ + unsigned char stid; /**< Software thread ID used to configure the stid register + for profiling purposes. */ + unsigned short priority; /**< Thread priority. */ + unsigned char autostack:1; /**< Autostack v2 enabled thread. */ + unsigned char group_id:6; /**< Group ID. */ + unsigned char reserved:1; /**< Reserved bits. */ + unsigned char bus_priority; /**< Internal bus priority. */ + unsigned short timetest_id; /**< Timetest ID. */ + unsigned int stack_size; /**< Thread stack size. */ + void *stack_addr; /**< Pointer to the stack address base. The range of the stack is + (stack_addr, stack_addr+stack_size-1). */ + unsigned short detach_state; /**< Detach state of the thread. 
*/
+
+} qurt_thread_attr_t;
+/** @endcond */
+
+/** @cond rest_reg_dist */
+/** Dynamic TLS attributes. */
+typedef struct qurt_tls_info {
+    unsigned int module_id;    /**< Module ID of the loaded dynamic linked library. */
+    unsigned int tls_start;    /**< Start address of the TLS data. */
+    unsigned int tls_data_end; /**< End address of the TLS RW data. */
+    unsigned int tls_end;      /**< End address of the TLS data. */
+}qurt_tls_info;
+/** @endcond */
+
+/** @} */ /* end_addtogroup thread_types */
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_thread_attr_init
+  Initializes the structure used to set the thread attributes when a thread is created.
+  After an attribute structure is initialized, explicitly set the individual attributes in the structure
+  using the thread attribute operations.
+
+  The initialize operation sets the following default attribute values: \n
+  - Name -- NULL string \n
+  - TCB partition -- QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT
+  - Priority -- QURT_THREAD_ATTR_PRIORITY_DEFAULT \n
+  - Autostack -- QURT_THREAD_ATTR_AUTOSTACK_DEFAULT \n
+  - Bus priority -- QURT_THREAD_ATTR_BUS_PRIO_DEFAULT \n
+  - Timetest ID -- QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT \n
+  - stack_size -- 0 \n
+  - stack_addr -- NULL \n
+  - detach state -- #QURT_THREAD_ATTR_CREATE_LEGACY \n
+  - STID -- #QURT_THREAD_ATTR_STID_DEFAULT
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_init (qurt_thread_attr_t *attr)
+{
+
+    attr->name[0] = '\0';
+    attr->tcb_partition = QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT;
+    attr->priority = QURT_THREAD_ATTR_PRIORITY_DEFAULT;
+    attr->autostack = QURT_THREAD_ATTR_AUTOSTACK_DEFAULT; /* Default attribute for autostack v2*/
+    attr->bus_priority = QURT_THREAD_ATTR_BUS_PRIO_DEFAULT;
+    attr->timetest_id = (unsigned short)QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT;
+    attr->stack_size = 0;
+    attr->stack_addr = NULL;
+    attr->detach_state = QURT_THREAD_ATTR_CREATE_LEGACY;
+    attr->stid = QURT_THREAD_ATTR_STID_DEFAULT;
+    attr->group_id = QURT_THREAD_DEFAULT_GROUP_ID;
+}
+
+/**@ingroup func_qurt_thread_attr_set_name
+  Sets the thread name attribute.\n
+  This function specifies the name a thread uses.
+  Thread names identify a thread during debugging or profiling.
+  The maximum name length is 16 characters. \n
+  @note1hang Thread names differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in]     name Pointer to the character string containing the thread name.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_name (qurt_thread_attr_t *attr, const char *name)
+{
+    strlcpy (attr->name, name, QURT_THREAD_ATTR_NAME_MAXLEN);
+    attr->name[QURT_THREAD_ATTR_NAME_MAXLEN - 1] = '\0';
+}
+
+
+/**@ingroup func_qurt_thread_attr_set_tcb_partition
+  Sets the thread TCB partition attribute.
+  Specifies the memory type where a TCB of a thread is allocated.
+  Allocates TCBs in RAM or TCM/LPM.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr          Pointer to the thread attribute structure.
+  @param[in]     tcb_partition TCB partition.
Values:\n + - 0 -- TCB resides in RAM \n + - 1 -- TCB resides in TCM/LCM @tablebulletend + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_tcb_partition (qurt_thread_attr_t *attr, unsigned char tcb_partition) +{ + attr->tcb_partition = tcb_partition; +} + +/**@ingroup func_qurt_thread_attr_set_priority + Sets the thread priority to assign to a thread. + Thread priorities are specified as numeric values in the range 1 to 254, with 1 representing + the highest priority. + Priority 0 and 255 are internally used by the kernel for special purposes. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] priority Thread priority. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_priority (qurt_thread_attr_t *attr, unsigned short priority) +{ + attr->priority = priority; +} + +/**@ingroup func_qurt_thread_attr_set_detachstate + Sets the thread detach state with which thread is created. + Thread detach state is either joinable or detached; specified by the following values: + - #QURT_THREAD_ATTR_CREATE_JOINABLE \n + - #QURT_THREAD_ATTR_CREATE_DETACHED \n + + When a detached thread is created (QURT_THREAD_ATTR_CREATE_DETACHED), its thread + ID and other resources are reclaimed as soon as the thread exits. When a joinable thread + is created (QURT_THREAD_ATTR_CREATE_JOINABLE), it is assumed that some + thread waits to join on it using a qurt_thread_join() call. + By default, detached state is QURT_THREAD_ATTR_CREATE_LEGACY + If detached state is QURT_THREAD_ATTR_CREATE_LEGACY then other + thread can join before thread exits but it will not wait other thread to join. + + @note1hang For a joinable thread (QURT_THREAD_ATTR_CREATE_JOINABLE), it is very + important that some thread joins on it after it terminates, otherwise + the resources of that thread are not reclaimed, causing memory leaks. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] detachstate Thread detach state. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_detachstate (qurt_thread_attr_t *attr, unsigned short detachstate) +{ + if(detachstate == QURT_THREAD_ATTR_CREATE_JOINABLE || detachstate == QURT_THREAD_ATTR_CREATE_DETACHED){ + attr->detach_state = detachstate; + } +} + + +/**@ingroup func_qurt_thread_attr_set_timetest_id + Sets the thread timetest attribute.\n + Specifies the timetest identifier to use by a thread. + + Timetest identifiers are used to identify a thread during debugging or profiling. \n + @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to + specify threads in the API thread operations. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] timetest_id Timetest identifier value. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_thread_attr_set_timetest_id (qurt_thread_attr_t *attr, unsigned short timetest_id) +{ + attr->timetest_id = timetest_id; +} + +/**@ingroup func_qurt_thread_attr_set_stack_size + @xreflabel{sec:set_stack_size} + Sets the thread stack size attribute.\n + Specifies the size of the memory area to use for a call stack of a thread. + + The thread stack address (Section @xref{sec:set_stack_addr}) and stack size specify the memory area used as a + call stack for the thread. 
The user is responsible for allocating the memory area used for + the stack. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] stack_size Size (in bytes) of the thread stack. + + @return + None. + + @dependencies + None. +*/ + +static inline void qurt_thread_attr_set_stack_size (qurt_thread_attr_t *attr, unsigned int stack_size) +{ + attr->stack_size = stack_size; +} + +/**@ingroup func_qurt_thread_attr_set_stack_size2 + @xreflabel{sec:set_stack_size} + Sets the thread stack size attribute for island threads that require a higher guest OS stack size than the stack size + defined in the configuration XML.\n + Specifies the size of the memory area to use for a call stack of an island thread in User and Guest mode. + + The thread stack address (Section @xref{sec:set_stack_addr}) and stack size specify the memory area used as a + call stack for the thread. The user is responsible for allocating the memory area used for + the stack. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] user_stack_size Size (in bytes) of the stack usage in User mode. + @param[in] root_stack_size Size (in bytes) of the stack usage in Guest mode. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_stack_size2 (qurt_thread_attr_t *attr, unsigned short user_stack_size, unsigned short root_stack_size) +{ + union qurt_thread_stack_info{ + unsigned int raw_size; + struct{ + unsigned short user_stack; + unsigned short root_stack; + }; + }user_root_stack_size; + user_root_stack_size.user_stack = user_stack_size; + user_root_stack_size.root_stack = root_stack_size; + + attr->stack_size = user_root_stack_size.raw_size; +} + +/**@ingroup func_qurt_thread_attr_set_stack_addr + @xreflabel{sec:set_stack_addr} + Sets the thread stack address attribute. \n + Specifies the base address of the memory area to use for a call stack of a thread. + + stack_addr must contain an address value that is 8-byte aligned. + + The thread stack address and stack size (Section @xref{sec:set_stack_size}) specify the memory area used as a + call stack for the thread. \n + @note1hang The user is responsible for allocating the memory area used for the thread + stack. The memory area must be large enough to contain the stack that the thread + creates. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] stack_addr Pointer to the 8-byte aligned address of the thread stack. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_stack_addr (qurt_thread_attr_t *attr, void *stack_addr) +{ + attr->stack_addr = stack_addr; +} + +/**@ingroup func_qurt_thread_attr_set_bus_priority + Sets the internal bus priority state in the Hexagon core for this software thread attribute. + Memory requests generated by the thread with bus priority enabled are + given priority over requests generated by the thread with bus priority disabled. + The default value of bus priority is disabled. + + @note1hang Sets the internal bus priority for Hexagon processor version V60 or greater. + The priority is not propagated to the bus fabric. + + @datatypes + #qurt_thread_attr_t + + @param[in] attr Pointer to the thread attribute structure. + + @param[in] bus_priority Enabling flag. Values: \n + - #QURT_THREAD_BUS_PRIO_DISABLED \n + - #QURT_THREAD_BUS_PRIO_ENABLED @tablebulletend + + @return + None + + @dependencies + None. 
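As a concrete illustration of the stack attributes described above, the sketch below carves a caller-owned stack out of a static buffer; the 8-byte alignment requirement is met by the buffer's element type. Sizes and names are invented for the example.

```c
#include "qurt.h"

#define MY_STACK_BYTES 4096u

/* unsigned long long elements guarantee 8-byte alignment. */
static unsigned long long my_stack[MY_STACK_BYTES / sizeof(unsigned long long)];

void fill_stack_attrs(qurt_thread_attr_t *attr)
{
    qurt_thread_attr_set_stack_addr(attr, my_stack);
    qurt_thread_attr_set_stack_size(attr, MY_STACK_BYTES);
}
```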
+*/
+static inline void qurt_thread_attr_set_bus_priority ( qurt_thread_attr_t *attr, unsigned short bus_priority)
+{
+    attr->bus_priority = (unsigned char)bus_priority;
+}
+
+/**@ingroup func_qurt_thread_attr_set_autostack
+  Enables the autostack v2 feature in the thread attributes.
+
+  When autostack is enabled by the subsystem and an autostack-enabled
+  thread takes a framelimit exception, the kernel allocates more stack for
+  the thread and returns it to normal execution.
+
+  If autostack is not enabled by the subsystem, or is not enabled
+  for the thread, the framelimit exception is fatal.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] autostack Autostack enable or disable flag. Values: \n
+         - #QURT_THREAD_AUTOSTACK_DISABLED \n
+         - #QURT_THREAD_AUTOSTACK_ENABLED @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_autostack ( qurt_thread_attr_t *attr, unsigned short autostack)
+{
+    attr->autostack = (unsigned char)autostack;
+}
+/**@ingroup qurt_thread_attr_enable_stid
+  Sets the STID in the thread attributes.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] enable_stid STID to set. Values: \n
+         - #QURT_THREAD_ATTR_STID_DEFAULT (0): Default STID. \n
+         - #QURT_THREAD_ATTR_STID_ENABLE (1): QuRT assigns an STID that is not already in use. \n
+         - #2 through #255 : User-provided STID. @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_enable_stid ( qurt_thread_attr_t *attr, char enable_stid)
+{
+    if (enable_stid != '\0') {
+        attr->stid = enable_stid;
+    }
+    else
+    {
+        attr->stid = QURT_THREAD_ATTR_STID_DEFAULT;
+    }
+}
+
+/**@ingroup func_qurt_thread_attr_set_stid
+  Sets the STID thread attribute.
+  The default STID value is #QURT_THREAD_ATTR_STID_DEFAULT.
+
+  @note1hang When a thread is created with a nondefault STID,
+             the STID set in the thread attribute is assigned to the thread.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] stid STID to set for the thread.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_stid( qurt_thread_attr_t *attr, unsigned int stid){
+    attr->stid = stid;
+}
+
+/**@ingroup func_qurt_thread_attr_set_group_id
+  Sets the group ID in the thread attributes.
+  The primordial (first) thread has group ID 0.
+  If a new thread is created without assigning a group ID, it
+  inherits the group ID of its parent thread.
+
+  @note1hang
+  1) The group ID can only be set before creating a thread. It cannot be
+     changed after the thread is created.
+  2) If a nonactivated group_id is passed, thread creation fails.
+  3) Only a thread with group ID 0 can set the group ID for its child threads.
+  4) If a thread with a nonzero group ID sets the group ID for its child threads,
+     QuRT ignores this parameter and the child threads inherit the parent
+     thread's group ID. However, if the passed group ID is not activated, thread
+     creation still fails.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] group_id Group identifier. The valid range is 0 through 63.
+
+  @return
+  None.
+
+  @dependencies
+  None.
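+
+  Illustrative sketch (hypothetical values; assumes group ID 3 was activated
+  beforehand and that the calling thread has group ID 0):
+  @code
+  qurt_thread_attr_set_group_id (&attr, 3u);  /* attr: a qurt_thread_attr_t */
+  @endcode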
+*/
+static inline void qurt_thread_attr_set_group_id(qurt_thread_attr_t *attr, unsigned int group_id)
+{
+    attr->group_id = group_id & QURT_THREAD_GROUP_ID_MASK;
+}
+
+/**@ingroup func_qurt_thread_set_autostack
+   Sets the autostack enable flag in the TCB.
+
+   @param[in] ugp Pointer to the UGP.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+
+void qurt_thread_set_autostack(void *);
+
+
+/**@ingroup func_qurt_thread_get_name
+  Gets the thread name of the current thread.\n
+  Returns the thread name of the current thread.
+  Thread names are assigned to threads as thread attributes, see qurt_thread_attr_set_name(). Thread names
+  identify a thread during debugging or profiling.
+
+  @param[out] name Pointer to a character string, which specifies the address where the returned thread name is stored.
+  @param[in] max_len Maximum length of the character string that can be returned.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_thread_get_name (char *name, unsigned char max_len);
+
+/**@ingroup func_qurt_thread_create
+  @xreflabel{hdr:qurt_thread_create}
+  Creates a thread with the specified attributes, and makes it executable.
+
+  @datatypes
+  #qurt_thread_t \n
+  #qurt_thread_attr_t
+
+  @param[out] thread_id Returns a pointer to the thread identifier if the thread was
+                        successfully created.
+  @param[in] attr Pointer to the initialized thread attribute structure that specifies
+                  the attributes of the created thread.
+  @param[in] entrypoint C function pointer, which specifies the main function of a thread.
+  @param[in] arg Pointer to a thread-specific argument structure.
+
+  @return
+  #QURT_EOK -- Thread created. \n
+  #QURT_EFAILED -- Thread not created.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_create (qurt_thread_t *thread_id, qurt_thread_attr_t *attr, void (*entrypoint) (void *), void *arg);
+
+/**@ingroup func_qurt_thread_stop
+  Stops the current thread, frees the kernel TCB, and yields to the next highest-priority ready thread.
+
+  @return
+  void
+
+  @dependencies
+  None.
+ */
+void qurt_thread_stop(void);
+
+/** @cond internal_only */
+/**@ingroup func_qurt_thread_resume
+  When a demand-loading paging solution is enabled, this function
+  resumes the execution of a thread that was suspended due to
+  a page miss.
+
+  @param[in] thread_id Thread identifier.
+
+  @return
+  #QURT_EOK -- Thread successfully resumed. \n
+  #QURT_EFATAL -- Resume operation failed.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_resume(unsigned int thread_id);
+/** @endcond */
+
+/**@ingroup func_qurt_thread_get_id
+  Gets the identifier of the current thread.\n
+  Returns the thread identifier for the current thread.
+
+  @return
+  Thread identifier -- Identifier of the current thread.
+
+  @dependencies
+  None.
+ */
+qurt_thread_t qurt_thread_get_id (void);
+
+
+/**@ingroup func_qurt_thread_get_l2cache_partition
+  Returns the current value of the L2 cache partition assigned to the caller thread.\n
+
+  @return
+  Value of the #qurt_cache_partition_t data type.
+
+  @dependencies
+  None.
+ */
+qurt_cache_partition_t qurt_thread_get_l2cache_partition (void);
+
+/**@ingroup func_qurt_thread_set_timetest_id
+  Sets the timetest identifier of the current thread.
+  Timetest identifiers are used to identify a thread during debugging or profiling.\n
+  @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @param[in] tid Timetest identifier.
+
+  @return
+  None.
+
+  @dependencies
+  None.
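+
+  Illustrative sketch (the tag value is arbitrary):
+  @code
+  qurt_thread_set_timetest_id (0x42u);  /* tag the current thread for profiling */
+  @endcode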
+ */
+void qurt_thread_set_timetest_id (unsigned short tid);
+
+/**@ingroup func_qurt_thread_set_cache_partition
+  Sets the cache partition for the current thread. This function uses the qurt_cache_partition_t type
+  to select the cache partition of the current thread for the L1 Icache, L1 Dcache, and L2 cache.
+
+  @datatypes
+  #qurt_cache_partition_t
+
+  @param[in] l1_icache L1 I cache partition.
+  @param[in] l1_dcache L1 D cache partition.
+  @param[in] l2_cache L2 cache partition.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_thread_set_cache_partition(qurt_cache_partition_t l1_icache, qurt_cache_partition_t l1_dcache, qurt_cache_partition_t l2_cache);
+
+
+/**@ingroup func_qurt_thread_get_timetest_id
+  Gets the timetest identifier of the current thread.\n
+  Returns the timetest identifier of the current thread.\n
+  Timetest identifiers are used to identify a thread during debugging or profiling. \n
+  @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @return
+  Integer -- Timetest identifier.
+
+  @dependencies
+  None.
+ */
+unsigned short qurt_thread_get_timetest_id (void);
+
+/**@ingroup func_qurt_thread_exit
+  @xreflabel{sec:qurt_thread_exit}
+  Stops the current thread, awakens threads joined to it, then destroys the stopped
+  thread.
+
+  Threads that are suspended on the current thread (by performing a thread join,
+  Section @xref{sec:thread_join}) are awakened and passed a user-defined status value
+  that indicates the status of the stopped thread.
+
+  @note1hang Exit must be called in the context of the thread to stop.
+
+  @param[in] status User-defined thread exit status value.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_thread_exit(int status);
+
+/**@ingroup func_qurt_thread_join
+  @xreflabel{sec:thread_join}
+  Waits for a specified thread to finish; the specified thread is another thread within
+  the same process.
+  The caller thread is suspended until the specified thread exits. When the specified thread
+  exits, the caller thread is awakened. \n
+  @note1hang If the specified thread has already exited, this function returns immediately
+             with the result value #QURT_ENOTHREAD. \n
+  @note1cont Two threads cannot call qurt_thread_join to wait for the same thread to finish.
+             If this occurs, QuRT generates an exception (see Section @xref{sec:exceptionHandling}).
+
+  @param[in] tid Thread identifier.
+  @param[out] status Destination variable for thread exit status. Returns an application-defined
+                     value that indicates the termination status of the specified thread.
+
+  @return
+  #QURT_ENOTHREAD -- Thread has already exited. \n
+  #QURT_EOK -- Thread successfully joined with valid status value.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_join(unsigned int tid, int *status);
+
+/**@ingroup qurt_thread_detach
+  @xreflabel{sec:thread_detach}
+  Detaches a joinable thread. The specified thread is another thread within the
+  same process. Create the thread as a joinable thread; only joinable threads
+  can be detached.
+  If a joinable thread is detached, it finishes execution and exits.
+
+  @param[in] tid Thread identifier.
+
+  @return
+  #QURT_ENOTHREAD -- Thread specified by TID does not exist. \n
+  #QURT_EOK -- Thread successfully detached.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_detach(unsigned int tid);
+
+
+/**@ingroup func_qurt_thread_get_priority
+  Gets the priority of the specified thread. \n
+  Returns the thread priority of the specified thread.\n
+  Thread priorities are specified as numeric values in a range that can be as large as 1 through 254, with lower
+  values representing higher priorities. 1 represents the highest possible thread priority. \n
+  Priorities 0 and 255 are used internally by the kernel for special purposes.
+
+  @note1hang QuRT can be configured to have different priority ranges.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] threadid Thread identifier.
+
+  @return
+  -1 -- Invalid thread identifier. \n
+  1 through 254 -- Thread priority value.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_get_priority (qurt_thread_t threadid);
+
+/**@ingroup func_qurt_thread_set_priority
+  Sets the priority of the specified thread.\n
+  Thread priorities are specified as numeric values in a range that can be as large as 1 through 254, with lower
+  values representing higher priorities. 1 represents the highest possible thread priority.
+  Priorities 0 and 255 are used internally by the kernel for special purposes.
+
+  @note1hang QuRT can be configured to have different priority ranges. For more
+             information, see Section @xref{sec:AppDev}.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] threadid Thread identifier.
+  @param[in] newprio New thread priority value.
+
+  @return
+  0 -- Priority successfully set. \n
+  -1 -- Invalid thread identifier. \n
+
+  @dependencies
+  None.
+ */
+int qurt_thread_set_priority (qurt_thread_t threadid, unsigned short newprio);
+
+
+
+/**@ingroup func_qurt_thread_attr_get
+  Gets the attributes of the specified thread.
+
+  @datatypes
+  #qurt_thread_t \n
+  #qurt_thread_attr_t
+
+  @param[in] thread_id Thread identifier.
+  @param[out] attr Pointer to the destination structure for thread attributes.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINVALID -- Invalid argument.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_attr_get (qurt_thread_t thread_id, qurt_thread_attr_t *attr);
+
+
+
+/**@ingroup func_qurt_thread_get_tls_base
+  Gets the base address of thread local storage (TLS) of a dynamically loaded module
+  for the current thread.
+
+  @datatypes
+  #qurt_tls_info
+
+  @param[in] info Pointer to the TLS information for a module.
+
+  @return
+  Pointer to the TLS object for the dynamically loaded module.\n
+  NULL -- TLS information is invalid.
+
+  @dependencies
+  None.
+ */
+void * qurt_thread_get_tls_base(qurt_tls_info* info);
+
+/**@ingroup func_qurt_thread_pktcount_get
+  Gets the PKTCOUNT of a specified thread.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] thread_id Thread identifier.
+
+  @return
+  PKTCOUNT of the specified thread.
+
+  @dependencies
+  None.
+ */
+
+long long int qurt_thread_pktcount_get (qurt_thread_t thread_id);
+
+/**@ingroup func_qurt_thread_pktcount_set
+  Sets the PKTCOUNT for the current QuRT thread.
+
+  @return
+  Value to which PKTCOUNT is set.
+
+  @dependencies
+  None.
+ */
+
+long long int qurt_thread_pktcount_set (long long int);
+
+/**@ingroup func_qurt_thread_stid_get
+  Gets the STID for a specified thread.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] thread_id Thread identifier.
+
+  @return
+  STID of the specified thread.
+
+  @dependencies
+  None.
+ */
+
+char qurt_thread_stid_get(qurt_thread_t thread_id);
+
+/**@ingroup func_qurt_thread_stid_get2
+  Returns the STID set for a thread.
+
+  @param[in] thread_id Thread identifier.
+  @param[out] stid Pointer to the variable in which to return the STID.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Operation not allowed for the thread. \n
+  #QURT_EINVALID -- Invalid input.
+
+  @dependencies
+  None.
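+
+  Illustrative sketch (not from the original header; assumes qurt_thread_t
+  converts to unsigned int, as the surrounding prototypes suggest):
+  @code
+  unsigned int stid;
+  if (qurt_thread_stid_get2 ((unsigned int)qurt_thread_get_id (), &stid) == QURT_EOK) {
+      /* stid now holds the STID of the current thread */
+  }
+  @endcode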
+ */
+int qurt_thread_stid_get2(unsigned int thread_id, unsigned int *stid);
+
+/**@ingroup func_qurt_thread_stid_set
+  Sets the STID for a thread.
+
+  @param[in] stid STID value to set.
+
+  @return
+  #QURT_EOK -- STID successfully set. \n
+  #QURT_EFAILED -- STID not set.
+
+  @dependencies
+  None.
+ */
+
+int qurt_thread_stid_set(char stid);
+
+/**@ingroup qurt_thread_stid_set2
+  Sets the STID for a specified thread.
+
+  @param[in] thread_id Thread identifier.
+  @param[in] stid STID to set for the thread.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+  #QURT_EVAL -- Failure because of invalid inputs.
+
+  @dependencies
+  None.
+*/
+int qurt_thread_stid_set2(unsigned int thread_id, unsigned int stid);
+
+/** @cond internal_only */
+/**@ingroup func_qurt_thread_get_running_ids
+  Returns the thread IDs of the running threads in the system; use only during fatal error handling.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in,out] * Array of thread identifiers of size #QURT_MAX_HTHREAD_LIMIT + 1.
+
+  @return
+  #QURT_EINVALID -- Incorrect argument. \n
+  #QURT_ENOTALLOWED -- API not called during error handling. \n
+  #QURT_EOK -- Success; returns a NULL-terminated array of thread IDs.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_get_running_ids(qurt_thread_t *);
+/** @endcond */
+
+
+/**@ingroup func_qurt_thread_get_thread_id
+  Gets the thread identifier of the thread with the matching name in the same process
+  as the caller.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[out] thread_id Pointer to the thread identifier.
+  @param[in] name Pointer to the name of the thread.
+
+  @return
+  #QURT_EINVALID -- No thread with a matching name in the process of the caller. \n
+  #QURT_EOK -- Success.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_get_thread_id (qurt_thread_t *thread_id, char *name);
+
+/**@ingroup func_qurt_sleep
+  Suspends the current thread for the specified amount of time.
+
+  @note1hang Because QuRT timers are deferrable, this call is guaranteed to block
+             at least for the specified amount of time. If power collapse is
+             enabled, the maximum amount of time this call can block depends on
+             the earliest wakeup from power collapse past the specified duration.
+
+  @param[in] duration Duration (in microseconds) for which the thread is suspended.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_sleep (unsigned long long int duration);
+
+
+/**@ingroup func_qurt_system_set_priority_floor
+  Sets a priority floor to move threads with thread priority lower than the floor out of the running state.
+  Running threads with thread priority lower than the priority floor are moved into the kernel ready queue, and they
+  are not scheduled to run while their thread priority is lower than the floor.
+  The caller should later reset the priority floor back to the default value #QURT_PRIORITY_FLOOR_DEFAULT.
+  Threads in the kernel ready queue are scheduled to run when their thread priority is higher than the floor.
+
+  The priority floor is set and associated with the user process of the caller. When the caller gets into QuRTOS and
+  sets a new floor, the new floor is associated with its original user process, not the QuRTOS process.
+  The floor associated with the user process is reset when the user process exits or is killed, but not at the time
+  when the user thread of the caller exits.
+
+  The priority floor cannot be set to a priority higher than the thread priority of the caller.
+
+  The priority floor cannot be set to a priority lower than the default #QURT_PRIORITY_FLOOR_DEFAULT system floor.
+
+  This function is not supported in Island mode.
+
+  After the system floor is set above #QURT_PRIORITY_FLOOR_DEFAULT, power collapse is skipped, and the sleep task
+  is not scheduled to run.
+
+  @param[in] priority_floor Priority floor.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Floor setting is not allowed.
+
+  @dependencies
+  None.
+ */
+int qurt_system_set_priority_floor (unsigned int priority_floor);
+
+
+/**@ingroup func_qurt_thread_suspend_thread
+  Suspends a QuRT thread specified by its thread identifier.
+  The target thread can be in a signed user process or an unsigned user process.
+  The caller thread can be a thread from the same user process as the target thread, or from its parent process.
+  After the target thread is suspended, the kernel does not schedule it to run until it is resumed later.
+
+  If the target thread is set as non-suspendable, this function call returns an error without suspending
+  the target thread.
+
+  If the target thread is already suspended, this function call returns success to confirm that
+  the target thread is suspended.
+
+  If the target thread is in a secure user process or CPZ process, this function call returns an error without
+  suspending the target thread.
+
+  If the target thread is running in the guest OS/root process via a QDI call, this function call does not suspend
+  the target thread in the guest OS, but marks the target thread as suspend-pending. The target thread is
+  suspended when it exits the guest OS, before executing the first instruction in the user process.
+  In this case, the function returns success even with the #QURT_THREAD_SUSPEND_SYNCHRONOUS option, while the target
+  thread can still run in the guest OS; it is suspended when exiting the guest OS.
+
+  QuRT debug monitor threads that are in a user process are non-suspendable. This function does not suspend
+  those threads.
+
+  @param[in] thread_id Thread identifier.
+  @param[in] option Optional argument; multiple options can be ORed. \n
+         #QURT_THREAD_SUSPEND_SYNCHRONOUS (default) -- synchronous function call;
+         the function returns after the thread is completely suspended. \n
+         #QURT_THREAD_SUSPEND_ASYNCHRONOUS -- asynchronous function call; the function returns
+         after the kernel acts to suspend the target thread. The target thread
+         might still be running before it is completely suspended. \n
+         #QURT_THREAD_SUSPEND_KEEP_HMX (default) -- keep the HMX attachment on the target thread
+         if it locks the HMX with qurt_hmx_lock(). In this case, the HMX cannot be re-used by other threads. \n
+         #QURT_THREAD_SUSPEND_DETACH_HMX -- detach the HMX from the target thread if it locks the HMX with qurt_hmx_lock().
+         Later, when the target thread resumes, the HMX is re-attached to the thread. Note that this option is only
+         supported for a caller from the same user process as the target thread, not for a caller from the parent
+         process of the target thread, or other processes. With the HMX detach option, QuRT does not save the HMX
+         context, so the HMX context state is lost. It is the responsibility of the caller to manage HMX operations
+         and save the HMX context state when calling qurt_thread_suspend_thread() with the HMX detach option.
+  If a thread from another process uses this detach option, #QURT_EHMXNOTDETACHABLE is returned; in this
+  case, if the caller is qualified to suspend the target thread, the target thread is moved to the suspended
+  state without the HMX detached.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINVALID -- Failure because of an invalid thread_id input. \n
+  #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process. \n
+  #QURT_EHMXNOTDETACHABLE -- Failure because the HMX is not detachable from the target thread.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_suspend_thread (unsigned int thread_id, unsigned int option);
+
+
+/**@ingroup func_qurt_thread_resume_thread
+  Resumes a QuRT thread specified by its thread identifier.
+  The target thread can be in a signed user process or an unsigned user process.
+  The caller thread can be a thread from the same user process as the target thread, or from its parent
+  process. After the target thread resumes, the kernel scheduler can schedule the thread to run based on
+  the thread priority.
+
+  The option argument currently supports a single default option,
+  #QURT_THREAD_RESUME_DEFAULT, which resumes the target thread in the default way.
+
+  By default, this is an asynchronous function. The function returns after the kernel moves the
+  target thread from the suspended state to the runnable state. The thread is scheduled to run based on its
+  thread priority.
+
+  If the target thread is set as non-resumable, this function call does not resume the target thread.
+
+  If the target thread has already resumed, this function confirms this by returning success.
+
+  If the target thread is in a secure user process or CPZ process, this function call returns an error without
+  resuming the target thread.
+
+  If the target thread runs in the guest OS/root process via a QDI call, this function call clears the
+  suspend-pending mark on the target thread, and the target thread is not suspended when it exits the
+  guest OS.
+
+  @param[in] thread_id Thread identifier.
+  @param[in] option Optional argument, #QURT_THREAD_RESUME_DEFAULT, which resumes the target thread.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINVALID -- Failure because of an invalid thread_id input. \n
+  #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process. \n
+  #QURT_EHMXNOTAVAIL -- Failure because, when resuming an HMX thread, the HMX is not available for the thread to resume.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_resume_thread (unsigned int thread_id, unsigned int option);
+
+
+/**@ingroup func_qurt_thread_set_thread_property
+  Sets a QuRT thread property for the thread specified by its thread identifier.
+  The target thread can be in a signed user process or an unsigned user process.
+  The caller thread can be from the same user process as the target thread, or from its parent process.
+
+  If the target thread is in a secure user process or CPZ process, this function call returns an error without
+  changing the property of the target thread.
+
+  @param[in] thread_id Thread identifier. \n
+  @param[in] property_id Thread property identifier. \n
+         #QURT_THREAD_PROPERTY_SUSPENDABLE -- thread is suspendable. Default is TRUE. \n
+         #QURT_THREAD_PROPERTY_RESUMEABLE -- thread is resumable. Default is TRUE.
+  @param[in] value Property value. Values: \n
+         TRUE (1) -- TRUE for the property \n
+         FALSE (0) -- FALSE for the property
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINVALID -- Failure because of an invalid thread_id input. \n
+  #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_set_thread_property( unsigned int thread_id, unsigned int property_id, unsigned int value );
+
+/**@ingroup func_qurt_thread_get_group_id
+  Gets the group ID of the thread specified by thread_id.\n
+
+  @param[in] thread_id Thread identifier.
+  @param[out] group_id Pointer to the destination variable for the group identifier.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINVALID -- Thread ID is invalid, or the process has no groups enabled. \n
+  #QURT_ENOTALLOWED -- Operation is not allowed. \n
+
+  @dependencies
+  None.
+*/
+int qurt_thread_get_group_id(qurt_thread_t thread_id, unsigned int* group_id);
+
+#endif /* __ASSEMBLER__ */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_THREAD_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_thread_context.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_thread_context.h
new file mode 100755
index 0000000000000..bab09deec8889
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_thread_context.h
@@ -0,0 +1,234 @@
+#ifndef QURT_THREAD_CONTEXT_H
+#define QURT_THREAD_CONTEXT_H
+/**
+  @file qurt_thread_context.h
+  @brief Kernel thread context structure
+
+EXTERNAL FUNCTIONS
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @cond internal_only */
+
+#define THREAD_ITERATOR_END ((qurt_thread_t)(-1)) /**< Thread iterator is complete. */
+
+
+/**@ingroup func_qurt_thread_iterator_create
+Enables the caller to enumerate the threads in the system.
+
+@return
+Handle of the newly created iterator; pass this handle to
+subsequent operations on the iterator.
+
+@dependencies
+None.
+*/
+static inline int qurt_thread_iterator_create(void)
+{
+    return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, QDI_OS_THREAD_ITERATOR_CREATE);
+}
+
+/**@ingroup func_qurt_thread_iterator_next
+Iterates over the list of threads in the system.
+
+@datatypes
+#qurt_thread_t
+
+@param[in] iter Iterator handle returned by qurt_thread_iterator_create().
+
+@return
+#THREAD_ITERATOR_END -- Iterator has reached the end of the thread list. \n
+Other values indicate a valid thread ID.
+
+@dependencies
+None.
+*/
+static inline qurt_thread_t qurt_thread_iterator_next(int iter)
+{
+    return (qurt_thread_t)qurt_qdi_handle_invoke(iter, QDI_OS_THREAD_ITERATOR_NEXT);
+}
+
+/**@ingroup func_qurt_thread_iterator_destroy
+Cleans up thread iterator resources.
+
+@param[in] iter Iterator handle returned by qurt_thread_iterator_create().
+
+@return
+#QURT_EOK -- Successful completion of operation \n
+#QURT_EFATAL -- Invalid handle passed
+
+@dependencies
+None.
+*/
+static inline int qurt_thread_iterator_destroy(int iter)
+{
+    return qurt_qdi_close(iter);
+}
+
+/**@ingroup func_qurt_thread_context_get_tname
+Gets the name of the thread from the specified thread ID.
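+
+Illustrative sketch (not from the original header) that combines the thread
+iterator with this call; QURT_MAX_NAME_LEN is used as the buffer size:
+@code
+char tname[QURT_MAX_NAME_LEN];
+int it = qurt_thread_iterator_create();
+qurt_thread_t tid;
+while ((tid = qurt_thread_iterator_next(it)) != THREAD_ITERATOR_END) {
+    if (qurt_thread_context_get_tname((unsigned int)tid, tname, sizeof(tname)) == QURT_EOK) {
+        /* inspect tname */
+    }
+}
+qurt_thread_iterator_destroy(it);
+@endcode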
+ +@param[in] thread_id Thread for which name is returned. +@param[in,out] name Pointer to the local buffer where name is copied back. +@param[in] max_len Size of the local buffer. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_tname(unsigned int thread_id, char *name, unsigned char max_len); + +/**@ingroup func_qurt_thread_context_get_prio +Gets the priority for the specified thread. + +@param[in] thread_id Thread for which priority is returned. +@param[in,out] prio Pointer to the local variable where priority is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_prio(unsigned int thread_id, unsigned char *prio); + +/**@ingroup func_qurt_thread_context_get_pcycles +Gets pcycles for the specified thread. + +@param[in] thread_id Thread for which processor cycles are returned. +@param[in,out] pcycles Pointer to the local variable where processor cycles are written. + +@return +#QURT_EOK -- Success \n +Failure otherwise. + +@dependencies +None. +*/ +int qurt_thread_context_get_pcycles(unsigned int thread_id, unsigned long long int *pcycles); + +/**@ingroup func_qurt_thread_context_get_stack_base +Gets the stack base address for the specified thread. + +@param[in] thread_id Thread for which stack base address is returned. +@param[in,out] sbase Pointer to the local variable where stack base address is written. + +@return +QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_stack_base(unsigned int thread_id, unsigned int *sbase); + +/**@ingroup func_qurt_thread_context_get_stack_size +Gets the stack size for the specified thread. + +@param[in] thread_id Thread for which stack size is returned. +@param[in,out] ssize Pointer to the local variable where stack size is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_stack_size(unsigned int thread_id, unsigned int *ssize); + +/**@ingroup func_qurt_thread_context_get_pid +Gets the process ID for the specified thread. + +@param[in] thread_id Thread for which process ID is returned. +@param[in,out] pid Pointer to the local variable where process id is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_pid(unsigned int thread_id, unsigned int *pid); + +/**@ingroup func_qurt_thread_context_get_pname +Gets the process name for the specified thread. + +@param[in] thread_id Represents the thread for which process name is returned. +@param[in, out] name Pointer to the local buffer where process name is copied back. +@param[in] len Length allocated to the local buffer. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_pname(unsigned int thread_id, char *name, unsigned int len); + +/** @addtogroup thread_types +@{ */ +/** Structure that defines how TCB is interpreted to crash dump tools.*/ +/* Keys are defined in consts.h */ +struct qurt_debug_thread_info { +/** @cond */ + char name[QURT_MAX_NAME_LEN]; /**< Name of the thread. */ + struct { + unsigned key; + unsigned val; + } os_info[40]; + unsigned gen_regs[32]; /**< General mode registers. */ + unsigned user_cregs[32]; /**< User mode registers. */ + unsigned guest_cregs[32]; /**< Guest mode registers. */ + unsigned monitor_cregs[64]; /**< Monitor mode registers. 
*/
+/** @endcond */
+}; /* should add up to 1K */
+/** @} */ /* end_addtogroup thread_types */
+
+
+/**@ingroup func_qurt_system_tcb_dump_get
+Copies the debug thread information (TCB dump) of the specified thread into the provided buffer.
+
+@datatypes
+#qurt_thread_t
+
+@param[in] thread_id Thread on which the operation must be performed.
+@param[in, out] ptr Pointer to the local buffer where contents are written.
+@param[in] size Size of the debug thread information structure obtained by calling
+                qurt_system_tcb_dump_get_size().
+
+@return
+#QURT_EOK -- Success \n
+Failure otherwise
+
+@dependencies
+None.
+*/
+int qurt_system_tcb_dump_get(qurt_thread_t thread_id, void *ptr, size_t size);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_THREAD_CONTEXT_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_timer.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_timer.h
new file mode 100755
index 0000000000000..7bdfdb8f3c3df
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_timer.h
@@ -0,0 +1,560 @@
+#ifndef QURT_TIMER_H
+#define QURT_TIMER_H
+/**
+  @file qurt_timer.h
+  @brief Prototypes of qurt_timer API
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+#include "qurt_anysignal.h"
+#include "qurt_signal2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        CONSTANTS AND MACROS
+=============================================================================*/
+/**@addtogroup timer_const_macros
+@{ */
+/**
+  Default values.
+*/
+/** @xreflabel{hdr:QURT_TIMER_ONESHOT}*/
+#define QURT_TIMER_DEFAULT_TYPE QURT_TIMER_ONESHOT /**< One shot.*/
+#define QURT_TIMER_DEFAULT_DURATION 1000uL /**< Default duration. */
+#define QURT_TIMER_DEFAULT_EXPIRY 0uL /**< Default expiration. */
+
+/**
+  Conversion from microseconds to timer ticks.
+ */
+#define QURT_TIMER_TIMETICK_FROM_US(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+/**
+  Conversion from timer ticks to microseconds at the nominal frequency.
+*/
+#define QURT_TIMER_TIMETICK_TO_US(ticks) qurt_timer_timetick_to_us(ticks)
+
+/** Minimum microseconds value is 100 microseconds (sleep timer).*/
+#define QURT_TIMER_MIN_DURATION 100uL
+
+/**
+  Maximum microseconds value for Qtimer is 1,042,499 hours.
+*/
+#define QURT_TIMER_MAX_DURATION QURT_SYSCLOCK_MAX_DURATION
+
+/**
+  Timer clock for Qtimer is 19.2 MHz.
+*/
+#define QURT_TIMER_MAX_DURATION_TICKS QURT_SYSCLOCK_MAX_DURATION_TICKS
+
+/**
+  Sleep timer error margin for Qtimer is 1,000 ticks ~52 us.
+*/
+#define QURT_TIMETICK_ERROR_MARGIN QURT_SYSCLOCK_ERROR_MARGIN
+
+/*
+  qurt_timer group defines.
+*/
+#define QURT_TIMER_MAX_GROUPS 5U /**< Maximum groups.*/
+#define QURT_TIMER_DEFAULT_GROUP 0U /**< Default group. */
+/** @} */ /* end_addtogroup timer_const_macros */
+
+/** @addtogroup timer_types
+@{ */
+/**
+  QuRT timer types.
+ */
+typedef enum
+{
+    QURT_TIMER_ONESHOT = 0, /**< One shot.*/
+    /** @xreflabel{hdr:QURT_TIMER_PERIODIC}*/
+    QURT_TIMER_PERIODIC /**< Periodic.
*/ +} qurt_timer_type_t; + + +/*============================================================================= + TYPEDEFS +=============================================================================*/ + +/** QuRT timer type.*/ +typedef unsigned int qurt_timer_t; + +/** QuRT timer duration type. */ +typedef unsigned long long qurt_timer_duration_t; + +/** QuRT timer time type. */ +typedef unsigned long long qurt_timer_time_t; + +typedef void (*pfn_t)(void); +/** QuRT timer attribute type. */ +typedef struct +{ + /** @cond */ + unsigned int magic; /**< Magic number to verify the qmsgq_attr_t pointer. */ + + qurt_timer_duration_t duration; /**< Specifies the duration of the new timer. */ + + qurt_timer_time_t expiry; /**< Specifies the absolute expiry of the new timer. */ + + qurt_timer_duration_t remaining; /**< Specifies the remaining time of an active timer. */ + + qurt_timer_type_t type; /**< Specifies the timer type; only #QURT_TIMER_ONESHOT and + #QURT_TIMER_PERIODIC are supported. */ + + unsigned int group; /**< Group number of the timer; the criterion used to disable or enable the set + of timers. */ + pfn_t pFn; /**< Callback other than the signal set */ + /** @endcond */ +} +qurt_timer_attr_t; + +/** @} */ /* end_addtogroup timer_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_timer_stop + @xreflabel{sec:qurt_timer_stop} + Stops a running timer. + The timer must be a one-shot timer. + + @note1hang Restart stopped timers with the timer restart operation, + see Section @xref{sec:qurt_timer_restart}. + + @datatypes + #qurt_timer_t + + @param[in] timer Timer object. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Invalid timer ID or duration value. \n + #QURT_ENOTALLOWED -- Timer is not a one shot timer. \n + #QURT_EMEM -- Out of memory error. + + @dependencies + None. + */ +int qurt_timer_stop (qurt_timer_t timer); + +/**@ingroup func_qurt_timer_restart + @xreflabel{sec:qurt_timer_restart} + Restarts a stopped timer with the specified duration. The timer must be a one-shot timer. + Timers stop after they have expired or after they are explicitly stopped with qurt_timer_stop(). + A restarted timer expires after the specified duration, the starting time is when the function is called. + + @note1hang Timers stop after they have expired or after they are explicitly + stopped with the timer stop operation, see Section @xref{sec:qurt_timer_stop}. + + @datatypes + #qurt_timer_t \n + #qurt_timer_duration_t + + @param[in] timer Timer object. + @param[in] duration Timer duration (in microseconds) before the restarted timer + expires again. + The valid range is #QURT_TIMER_MIN_DURATION to + #QURT_TIMER_MAX_DURATION. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Invalid timer ID or duration value. \n + #QURT_ENOTALLOWED -- Timer is not a one-shot timer. \n + #QURT_EMEM -- Out-of-memory error. + + @dependencies + None. + */ +int qurt_timer_restart (qurt_timer_t timer, qurt_timer_duration_t duration); + + +/**@ingroup func_qurt_timer_create + Creates a timer.\n + Allocates and initializes a timer object, and starts the timer. + + @note1hang A timer event handler must be defined to wait on the specified signal + to handle the timer event. + + @datatypes + #qurt_timer_t \n + #qurt_timer_attr_t \n + #qurt_anysignal_t + + @param[out] timer Pointer to the created timer object. 
+ @param[in] attr Pointer to the timer attribute structure. + @param[in] signal Pointer to the signal object set when timer expires. + @param[in] mask Signal mask, which specifies the signal to set in the signal object when the + time expires. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Not enough memory to create the timer. \n + #QURT_EINVALID -- One of the arguments in the attr field is invalid. \n + Other error code -- Operation failed. \n + + @dependencies + None. + */ +int qurt_timer_create (qurt_timer_t *timer, const qurt_timer_attr_t *attr, + const qurt_anysignal_t *signal, unsigned int mask); + +int qurt_timer_create_sig2 (qurt_timer_t *timer, const qurt_timer_attr_t *attr, + const qurt_signal2_t *signal, unsigned int mask); + +/**@ingroup func_qurt_timer_attr_init + Initializes the specified timer attribute structure with default attribute values: \n + - Timer duration -- #QURT_TIMER_DEFAULT_DURATION (Section @xref{dox:timers}) \n + - Timer type -- #QURT_TIMER_ONESHOT \n + - Timer group -- #QURT_TIMER_DEFAULT_GROUP + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the destination structure for the timer attributes. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_init(qurt_timer_attr_t *attr); + + +/*Tech Comm note: removed qurt_timer_attr_set_pfn from documentation 9/10/2020 +@ingroup func_qurt_timer_attr_set_pfn + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the destination structure for the timer attributes. + @param[in] pFn pFn. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_pfn(qurt_timer_attr_t *attr, pfn_t pFn); + + +/**@ingroup func_qurt_timer_attr_set_duration + Sets the timer duration in the specified timer attribute structure.\n + + The timer duration specifies the interval (in microseconds) between the creation of the + timer object and the generation of the corresponding timer event. + + The timer duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION (Section @xref{dox:timers}). Otherwise, the set operation is ignored. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] duration Timer duration (in microseconds). + Valid range is #QURT_TIMER_MIN_DURATION to + #QURT_TIMER_MAX_DURATION. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_duration(qurt_timer_attr_t *attr, qurt_timer_duration_t duration); + +/**@ingroup func_qurt_timer_attr_set_expiry + Sets the absolute expiry time in the specified timer attribute structure.\n + The timer expiry specifies the absolute time (in microseconds) of the generation of the + corresponding timer event.\n + Timer expiries are relative to when the system first began executing. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_time_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] time Timer expiry. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_expiry(qurt_timer_attr_t *attr, qurt_timer_time_t time); + +/**@ingroup func_qurt_timer_attr_get_duration + Gets the timer duration from the specified timer attribute structure. + The value returned is the duration that was originally set for the timer. + + @note1hang This function does not return the remaining time of an active timer; + use qurt_timer_attr_get_remaining() to get the remaining time. 
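+
+  Illustrative sketch (my_timer is a hypothetical, previously created timer):
+  @code
+  qurt_timer_attr_t attr;
+  qurt_timer_duration_t duration;
+  if (qurt_timer_get_attr (my_timer, &attr) == QURT_EOK) {
+      qurt_timer_attr_get_duration (&attr, &duration);
+  }
+  @endcode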
+ + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in] attr Pointer to the timer attributes object + @param[out] duration Pointer to the destination variable for timer duration. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_duration(qurt_timer_attr_t *attr, qurt_timer_duration_t *duration); + +/**@ingroup func_qurt_timer_attr_get_remaining + Gets the timer remaining duration from the specified timer attribute structure. \n + + The timer remaining duration indicates (in microseconds) how much time remains before + the generation of the next timer event on the corresponding timer. + In most cases this function assumes that the timer attribute structure was obtained by + calling qurt_timer_get_attr(). + + @note1hang This attribute is read-only and thus has no set operation defined for it. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in] attr Pointer to the timer attribute object. + @param[out] remaining Pointer to the destination variable for remaining time. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_remaining(qurt_timer_attr_t *attr, qurt_timer_duration_t *remaining); + +/**@ingroup func_qurt_timer_attr_set_type + Sets the timer type in the specified timer attribute structure. + + The timer type specifies the functional behavior of the timer: \n + - A one-shot timer (#QURT_TIMER_ONESHOT) waits for the specified timer duration + and then generates a single timer event. After this the timer is nonfunctional. \n + - A periodic timer (#QURT_TIMER_PERIODIC) repeatedly waits for the specified + timer duration and then generates a timer event. The result is a series of timer + events with interval equal to the timer duration. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_type_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] type Timer type. Values are: \n + - #QURT_TIMER_ONESHOT -- One-shot timer. \n + - #QURT_TIMER_PERIODIC -- Periodic timer. @tablebulletend + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_type(qurt_timer_attr_t *attr, qurt_timer_type_t type); + +/**@ingroup func_qurt_timer_attr_get_type + Gets the timer type from the specified timer attribute structure. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_type_t + + @param[in] attr Pointer to the timer attribute structure. + @param[out] type Pointer to the destination variable for the timer type. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_type(qurt_timer_attr_t *attr, qurt_timer_type_t *type); + +/**@ingroup func_qurt_timer_attr_set_group + Sets the timer group identifier in the specified timer attribute structure.\n + The timer group identifier specifies the group that the timer belongs to. Timer groups are + used to enable or disable one or more timers in a single operation. \n + The timer group identifier value must be between 0 and (#QURT_TIMER_MAX_GROUPS - 1). + See Section @xref{dox:timers}. + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the timer attribute object. + @param[in] group Timer group identifier; + Valid range is 0 to (#QURT_TIMER_MAX_GROUPS - 1). + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_group(qurt_timer_attr_t *attr, unsigned int group); + +/**@ingroup func_qurt_timer_attr_get_group + Gets the timer group identifier from the specified timer attribute structure. 
+ + @datatypes + #qurt_timer_attr_t + + @param[in] attr Pointer to the timer attribute structure. + @param[out] group Pointer to the destination variable for the timer group identifier. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_group(qurt_timer_attr_t *attr, unsigned int *group); + +/**@ingroup func_qurt_timer_get_attr + @xreflabel{hdr:qurt_timer_get_attr} + Gets the timer attributes of the specified timer when it was created. + + @datatypes + #qurt_timer_t \n + #qurt_timer_attr_t + + @param[in] timer Timer object. + @param[out] attr Pointer to the destination structure for timer attributes. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Argument passed is not a valid timer. + + @dependencies + None. + */ +int qurt_timer_get_attr(qurt_timer_t timer, qurt_timer_attr_t *attr); + +/**@ingroup func_qurt_timer_delete + Deletes the timer.\n + Destroys the specified timer and deallocates the timer object. + + @datatypes + #qurt_timer_t + + @param[in] timer Timer object. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Argument passed is not a valid timer. + + @dependencies + None. + */ +int qurt_timer_delete(qurt_timer_t timer); + +/**@ingroup func_qurt_timer_sleep + Suspends the current thread for the specified amount of time. + The sleep duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION (Section @xref{dox:timers}). + + @datatypes + #qurt_timer_duration_t + + @param[in] duration Interval (in microseconds) between when the thread is suspended + and when it is re-awakened. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Not enough memory to perform the operation. + + @dependencies + None. + */ + +int qurt_timer_sleep(qurt_timer_duration_t duration); + +/**@ingroup func_qurt_timer_group_disable + Disables all timers that are assigned to the specified timer group. + If a specified timer is already disabled, ignore it. + If a specified timer is expired, do not process it. + If the specified timer group is empty, do nothing. + + @note1hang When a timer is disabled its remaining time does not change, thus it + cannot generate a timer event. + + @param[in] group Timer group identifier. + + @return + #QURT_EOK -- Success. + + @dependencies + None. + */ +int qurt_timer_group_disable (unsigned int group); + +/**@ingroup func_qurt_timer_group_enable + Enables all timers that are assigned to the specified timer group. + If a specified timer is already enabled, ignore it. + If a specified timer is expired, process it. + If the specified timer group is empty, do nothing. + + @param[in] group Timer group identifier. + + @return + #QURT_EOK -- Success. + + @dependencies + None. + */ +int qurt_timer_group_enable (unsigned int group); + + +/** + Notifies the timer server recovery from power collapse. The server + must account for any missed interrupts during power collapse. + */ +void qurt_timer_recover_pc (void); + +/** + Determines whether the Qtimer is initialized. + + @return + 0 -- Not initialized. \n + Nonzero -- Initialized. + */ +static inline int qurt_timer_is_init (void) {return 1;} + +/**@ingroup func_qurt_timer_get_ticks + Gets current ticks. The ticks are accumulated since the RTOS + has started. Each tick is equal to a single timer clock + cycle, where the frequency is 32 KHz on RGPT or 19.2 MHz on Qtimer. + + @return + Ticks since system started. 
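+
+  Illustrative sketch: timing a code region and converting ticks to
+  microseconds with QURT_TIMER_TIMETICK_TO_US() (do_work() is hypothetical):
+  @code
+  unsigned long long start = qurt_timer_get_ticks ();
+  do_work ();
+  unsigned long long elapsed_us = QURT_TIMER_TIMETICK_TO_US (qurt_timer_get_ticks () - start);
+  @endcode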
+ */
+unsigned long long qurt_timer_get_ticks (void);
+
+#define qurt_timer_timetick_from_us(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_TIMER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_tlb.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_tlb.h
new file mode 100755
index 0000000000000..b1b2d261d31c0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_tlb.h
@@ -0,0 +1,215 @@
+#ifndef QURT_TLB_H
+#define QURT_TLB_H
+
+/**
+  @file qurt_tlb.h
+  @brief Prototypes of TLB API
+  The TLB APIs allow explicit control of the portion of the TLB between TLB_first_replaceable and TLB_LAST_REPLACEABLE.
+  Both are nonconfigurable for the time being. This portion of the TLB is permanently assigned/locked unless manually removed
+  by qurt_tlb_remove. The implementation does not change depending on the configuration, such as whether CONFIG_STATIC is set or not.
+  In CONFIG_STATIC=y, TLB_LAST_REPLACEABLE is set to the last TLB index, which indicates that the entire TLB is permanently
+  assigned and is not backed by a page table (no page table exists). TLB indices are maintained through a 64-bit bitmask.
+  A new entry is placed in the first available slot.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2013, 2021, 2023
+All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_tlb_entry_create
+  Creates a new TLB entry with the specified mapping attributes in the TLB of the Hexagon processor. \n
+  @note1hang If the specified attributes are not valid (such as if the address is not aligned with the
+             size), the entry is not created and an error result is returned.\n
+  @note1cont To set the G bit in the new TLB entry, set the ASID argument to -1.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_paddr_t \n
+  #qurt_mem_cache_mode_t \n
+  #qurt_perm_t
+
+  @param[out] entry_id TLB entry identifier.
+  @param[in] vaddr Virtual memory address.
+  @param[in] paddr Physical memory address.
+  @param[in] size Size of memory region to map (in bytes).
+  @param[in] cache_attribs Cache mode (writeback, and so on).
+  @param[in] perms Access permissions.
+  @param[in] asid ASID (space ID).
+
+  @return
+  #QURT_EOK -- TLB entry successfully created.\n
+  #QURT_EFATAL -- Entry is not created; the TLB is full. \n
+  #QURT_ETLBCREATESIZE -- Entry is not created; the incorrect size was specified. \n
+  #QURT_ETLBCREATEUNALIGNED -- Entry is not created; an unaligned address was specified. \n
+  #QURT_EINVALID -- Invalid cache attributes / permissions provided.
+
+ */
+int qurt_tlb_entry_create (unsigned int *entry_id, qurt_addr_t vaddr, qurt_paddr_t paddr, qurt_size_t size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perms, int asid);
+
+/**@ingroup func_qurt_tlb_entry_create_64
+  Creates a new TLB entry with the specified mapping attributes in the TLB of the Hexagon processor.
\n + @note1hang If the specified attributes are not valid (the address is not aligned with the + size), the entry is not created, and an error result is returned.\n + @note1cont To set the G bit in the new TLB entry, set the asid argument to -1. + + @param[out] entry_id TLB entry identifier. + @param[in] vaddr Virtual memory address. + @param[in] paddr_64 64-bit physical memory address. + @param[in] size Size of memory region to map (in bytes). + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perms Access permissions. + @param[in] asid ASID (space ID). + + @return + #QURT_EOK -- TLB entry successfully created.\n + #QURT_EFATAL -- Entry was not created; the TLB is full. \n + #QURT_ETLBCREATESIZE -- Entry was not created; the incorrect size was specified. \n + #QURT_ETLBCREATEUNALIGNED -- Entry was not created; an unaligned address was specified. \n + #QURT_EINVALID -- Invalid cache attributes / permissions provided. + + */ +int qurt_tlb_entry_create_64 (unsigned int *entry_id, qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perms, int asid); + +/**@ingroup func_qurt_tlb_entry_delete + Deletes the specified TLB entry from the TLB of the Hexagon processor. + If the specified entry does not exist, no deletion occurs and an error result is returned. + + @param[in] entry_id TLB entry identifier. + + @return + #QURT_EOK -- TLB entry successfully deleted. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_delete (unsigned int entry_id); + +/**@ingroup func_qurt_tlb_entry_query + Searches for the specified TLB entry in the TLB of the Hexagon processor. + If the TLB entry is found, its entry identifier is returned. + + @datatypes + #qurt_addr_t + + @param[out] entry_id TLB entry identifier. + @param[in] vaddr Virtual memory address. + @param[in] asid ASID (space ID). + + @return + #QURT_EOK -- TLB entry successfully returned. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_query (unsigned int *entry_id, qurt_addr_t vaddr, int asid); + +/**@ingroup func_qurt_tlb_entry_set + Sets the TLB entry by storing an entry at the specified location + in the TLB of the Hexagon processor. + + @param[in] entry_id TLB entry identifier. + @param[in] entry 64-bit TLB entry to store. + + @return + #QURT_EOK -- Entry successfully stored in the TLB. \n + #QURT_EFATAL -- Entry not set at the specified location. + + @dependencies + None. + **/ +int qurt_tlb_entry_set (unsigned int entry_id, unsigned long long int entry); + +/**@ingroup func_qurt_tlb_entry_get + Gets the TLB entry. \n + Returns the specified 64-bit TLB entry in the TLB of the Hexagon processor. + + @param[in] entry_id TLB entry identifier. + @param[out] entry 64-bit TLB entry. + + @return + #QURT_EOK -- TLB entry successfully returned. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_get (unsigned int entry_id, unsigned long long int *entry); + +/**@ingroup func_qurt_tlb_get_pager_physaddrs + Searches the TLB of the Hexagon processor, and returns all physical addresses that belong to the pager. + Each returned address indicates the starting address of an active page. + +The function return value indicates the number of addresses returned. + + @param[out] pager_phys_addrs Pointer to the return array of pager physical addresses. + + @return + Integer -- Number of addresses returned in array. + + @dependencies + None. 
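+
+ Illustrative sketch:
+ @code
+ unsigned int *phys_addrs;
+ unsigned int count = qurt_tlb_get_pager_physaddr (&phys_addrs);
+ for (unsigned int i = 0U; i < count; i++) {
+     /* each phys_addrs[i] is the start of an active page */
+ }
+ @endcode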
+*/ + +unsigned int qurt_tlb_get_pager_physaddr(unsigned int** pager_phys_addrs); + +/**@ingroup func_qurt_tlb_get_pager_virtaddr + Searches the TLB of the Hexagon processor, and returns all virtual addresses that belong to the pager. + Each returned address indicates the starting address of an active page. + +The function return value indicates the number of addresses returned. + + @param[out] pager_virt_addrs Pointer to the return array of pager virtual addresses. + + @return + Integer -- Number of addresses returned in the array. + + @dependencies + None. +*/ + +unsigned int qurt_tlb_get_pager_virtaddr(unsigned int** pager_virt_addrs); + + +/**@ingroup func_qurt_tlb_entry_set2 + Sets the TLB entry by storing an entry at the specified location + in the TLB of the Hexagon processor. An additional option can be passed + to lock the TLB entry in the TLB of the Hexagon processor. + + @param[in] id TLB entry identifier. + @param[in] tlb 64-bit TLB entry to store. + @param[in] lock Nonzero value indicates that the TLB entry must be locked in the hardware TLB. + + @return + #QURT_EOK -- Entry successfully stored in the TLB. \n + #QURT_EFATAL -- Entry not set at the specified location. + + @dependencies + None. + **/ +int qurt_tlb_entry_set2(unsigned id, unsigned long long tlb, unsigned lock); + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TLB_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_tls.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_tls.h new file mode 100755 index 0000000000000..6ec3b39ff5cb0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_tls.h @@ -0,0 +1,100 @@ +#ifndef QURT_TLS_H +#define QURT_TLS_H +/** + @file qurt_tls.h + @brief Prototypes of TLS APIs + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_tls_create_key + @xreflabel{sec:tls_create_key} + Creates a key for accessing a thread local storage data item.\n + Subsequent get and set operations use the key value. + + @note1hang The destructor function performs any clean-up operations needed by a thread + local storage item when its containing thread is deleted (Section @xref{sec:qurt_thread_exit}). + + @param[out] key Pointer to the newly created thread local storage key value. + @param[in] destructor Pointer to the key-specific destructor function. Passing NULL + specifies that no destructor function is defined for the key. + + @return + #QURT_EOK -- Key successfully created. \n + #QURT_ETLSAVAIL -- No free TLS key available. + + @dependencies + None. + */ +int qurt_tls_create_key (int *key, void (*destructor)(void *)); + +/**@ingroup func_qurt_tls_set_specific + Stores a data item to thread local storage along with the specified key. + + @param[in] key Thread local storage key value. + @param[in] value Pointer to user data value to store. + + @return + #QURT_EOK -- Data item successfully stored. \n + #QURT_EINVALID -- Invalid key. \n + #QURT_EFAILED -- Invoked from a non-thread context. 
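+
+  Illustrative sketch combining qurt_tls_create_key() with the set/get
+  operations (per_thread_state is a hypothetical per-thread object; no
+  destructor is used):
+  @code
+  static int per_thread_state;  /* hypothetical per-thread object */
+  int key;
+  if (qurt_tls_create_key (&key, NULL) == QURT_EOK) {
+      qurt_tls_set_specific (key, &per_thread_state);
+      void *p = qurt_tls_get_specific (key);
+  }
+  @endcode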
+ */ +int qurt_tls_set_specific (int key, const void *value); + +/**@ingroup func_qurt_tls_get_specific + Loads the data item from thread local storage. \n + Returns the data item that is stored in thread local storage with the specified key. + The data item is always a pointer to user data. + + @param[in] key Thread local storage key value. + + @return + Pointer -- Data item indexed by key in thread local storage. \n + 0 (NULL) -- Key out of range. + + @dependencies + None. + */ +void * __attribute__((section(".text.qurt_tls_get_specific "))) qurt_tls_get_specific (int key); + + +/**@ingroup func_qurt_tls_delete_key + Deletes the specified key from thread local storage. + + @note1hang Explicitly deleting a key does not execute any destructor function that is + associated with the key (Section @xref{sec:tls_create_key}). + + @param[in] key Thread local storage key value to delete. + + @return + #QURT_EOK -- Key successfully deleted. \n + #QURT_ETLSENTRY -- Key already free. + + @dependencies + None. + */ +int qurt_tls_delete_key (int key); + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TLS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_trace.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_trace.h new file mode 100755 index 0000000000000..541f8f1d34bf6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_trace.h @@ -0,0 +1,317 @@ +#ifndef QURT_TRACE_H +#define QURT_TRACE_H +/** + @file qurt_trace.h + @brief Prototypes of system call tracing helpers API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021-2023 by Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + GLOBAL VARIABLES +=============================================================================*/ +/** @cond internal_only */ +/** @addtogroup etm_macros +@{ */ +/* ETM trace types. */ +#define QURT_ETM_TYPE_PC_ADDR (1U<<0) /**< PC address.*/ +#define QURT_ETM_TYPE_MEMORY_ADDR (1U<<1) /**< Memory address. */ +#define QURT_ETM_TYPE_TESTBUS (1U<<2) /**< Test bus. */ +#define QURT_ETM_TYPE_CYCLE_ACCURATE (1U<<3) /**< Cycle accurate. */ +#define QURT_ETM_TYPE_CYCLE_COARSE (1U<<4) /**< Cycle coarse. */ +#define QURT_ETM_TYPE_PC_AND_MEMORY_ADDR (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_MEMORY_ADDR) /**< PC and memory address. */ +#define QURT_ETM_TYPE_PC_ADDR_AND_TESTBUS (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_TESTBUS) /**< PC address and test bus. */ +#define QURT_ETM_TYPE_MEMORY_ADDR_AND_TESTBUS (QURT_ETM_TYPE_MEMORY_ADDR|QURT_ETM_TYPE_TESTBUS) /**< Memory address and test bus.*/ +#define QURT_ETM_TYPE_PC_AND_MEMORY_ADDR_AND_TESTBUS (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_MEMORY_ADDR|QURT_ETM_TYPE_TESTBUS) /**< PC, memory address, and test bus. */ + +/* ETM routes. */ +#define QURT_ETM_ROUTE_TO_QDSS 0U /**< ETM route to QDSS. */ +#define QURT_ETM_ROUTE_TO_Q6ETB 1U /**< ETM route to Q6ETB. */ + +/* ETM filters. */ +#define QURT_ETM_TRACE_FILTER_ALL_DEFAULT 0U /*< Filter all as default. */ +#define QURT_ETM_TRACE_FILTER_HNUM0 (1U<<0) /*< Filter HNUM0. */ +#define QURT_ETM_TRACE_FILTER_HNUM1 (1U<<1) /*< Filter HNUM1. */ +#define QURT_ETM_TRACE_FILTER_HNUM2 (1U<<2) /*< Filter HNUM2. 
*/ +#define QURT_ETM_TRACE_FILTER_HNUM3 (1U<<3) /*< Filter HNUM3. */ +#define QURT_ETM_TRACE_FILTER_HNUM4 (1U<<4) /*< Filter HNUM4. */ +#define QURT_ETM_TRACE_FILTER_HNUM5 (1U<<5) /*< Filter HNUM5. */ +#define QURT_ETM_TRACE_FILTER_HNUM6 (1U<<6) /*< Filter HNUM6. */ +#define QURT_ETM_TRACE_FILTER_HNUM7 (1U<<7) /*< Filter HNUM7. */ +#define QURT_ETM_TRACE_FILTER_HNUM8 (1U<<8) /*< Filter HNUM8. */ +#define QURT_ETM_TRACE_FILTER_HNUM9 (1U<<9) /*< Filter HNUM9. */ +#define QURT_ETM_TRACE_FILTER_HNUM10 (1U<<10) /*< Filter HNUM10. */ +#define QURT_ETM_TRACE_FILTER_HNUM11 (1U<<11) /*< Filter HNUM11. */ +#define QURT_ETM_TRACE_FILTER_HNUM12 (1U<<12) /*< Filter HNUM12. */ +#define QURT_ETM_TRACE_FILTER_HNUM13 (1U<<13) /*< Filter HNUM13. */ +#define QURT_ETM_TRACE_FILTER_HNUM14 (1U<<14) /*< Filter HNUM14. */ +#define QURT_ETM_TRACE_FILTER_HNUM15 (1U<<15) /*< Filter HNUM15. */ +#define QURT_ETM_TRACE_FILTER_ALL QURT_ETM_TRACE_FILTER_ALL_DEFAULT + +#define QURT_ETM_TRACE_FILTER_CLUSTER0 (1<<16) /*< Filter trace cluster0 address. */ +#define QURT_ETM_TRACE_FILTER_CLUSTER1 (1<<17) /*< Filter trace cluster1 address. */ +#define QURT_ETM_TRACE_FILTER_PC_RANGE (1<<19) /*< Filter PC address range. */ + +/* ETM memory source - PC or data access */ +#define QURT_ETM_SOURCE_PC 0U /**< ETM memory source of SAC* is PC. */ +#define QURT_ETM_SOURCE_DATA 1U /**< ETM memory source of SAC* is data. */ + +/* Period between synchronization traces */ +#define QURT_ETM_ASYNC_PERIOD 0 /**< Async.*/ +#define QURT_ETM_ISYNC_PERIOD 1 /**< Isync.*/ +#define QURT_ETM_GSYNC_PERIOD 2 /**< Gsync. */ + +/* ETM enable flags */ +#define QURT_ETM_OFF 0U /**< ETM off. */ +#define QURT_ETM_ON 1U /**< ETM on. */ +/** @endcond */ +/** @} */ /* end_addtogroup etm_macros */ + +/** @addtogroup function_tracing_macro +@{ */ +/* ETM setup return values */ +#define QURT_ETM_SETUP_OK 0 /**< ETM setup OK. */ +#define QURT_ETM_SETUP_ERR 1 /**< ETM setup error. */ +/** @} */ /* end_addtogroup function_tracing_macro */ +/* ETM breakpoint types */ +#define QURT_ETM_READWRITE_BRKPT 0U /**< ETM read/write breakpoint. */ +#define QURT_ETM_READ_BRKPT 1U /**< ETM read breakpoint. */ +#define QURT_ETM_WRITE_BRKPT 2U /**< ETM write breakpoint. */ +#define QURT_ETM_BRKPT_INVALIDATE 3U /**< Invalidate breakpoint. */ +/** @addtogroup function_tracing_macro +@{ */ +/* ATB status flags */ +#define QURT_ATB_OFF 0 /**< ATB off. */ +#define QURT_ATB_ON 1 /**< ATB on. */ +/** @} */ /* end_addtogroup function_tracing_macro */ +/* DTM enable flags */ +#define QURT_DTM_OFF 0 /**< DTM off. */ +#define QURT_DTM_ON 1 /**< DTM on. */ + +/** @addtogroup function_tracing_datatypes +@{ */ +/**STM trace information. */ +typedef struct qurt_stm_trace_info { + /** @cond */ + unsigned int stm_port_addr[6]; /* STM port address to which trace data must be written.*/ + unsigned int thread_event_id; /* Event ID for context switches.*/ + unsigned int interrupt_event_id; /* Event ID for interrupts. */ + unsigned int marker; /* Marker value that must be written at the beginning of the trace. */ + /** @endcond */ +} qurt_stm_trace_info_t; +/** @} */ /* end_addtogroup function_tracing_datatypes */ +/*============================================================================= + GLOBAL FUNCTIONS +=============================================================================*/ + + +/**@ingroup func_qurt_trace_get_marker + Gets the kernel trace marker.\n + Returns the current value of the kernel trace marker. 
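+
+ A typical pattern, sketched below, brackets a code region with this
+ function and qurt_trace_changed(); do_work() is hypothetical, and the
+ mask 0x3 assumes interrupt and context-switch tracing are enabled:
+ @code
+ unsigned int marker = qurt_trace_get_marker();
+ do_work();
+ if (qurt_trace_changed(marker, 0x3)) {
+     // An interrupt or context switch occurred during do_work().
+ }
+ @endcode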
+ The marker consists of a hardware thread identifier and an index into the kernel trace + buffer. The trace buffer records kernel events. + + @note1hang Using this function with qurt_trace_changed() + determines whether certain kernel events occurred in a block of code. + + @return + Integer -- Kernel trace marker. + + @dependencies + None. +*/ +unsigned int qurt_trace_get_marker(void); + +/**@ingroup func_qurt_trace_changed + Determines whether specific kernel events have occurred. \n + Returns a value that indicates whether the specified kernel events are recorded in the + kernel trace buffer since the specified kernel trace marker was obtained. + + The prev_trace_marker parameter specifies a kernel trace marker that was obtained by calling + qurt_trace_get_marker(). + @cond rest_dist For more information on the mask value, see the description of the trace_mask element in + @xhyperref{80VB41992,80-VB419-92}. \n @endcond + + @note1hang Used with qurt_trace_get_marker(), this function determines whether + certain kernel events occurred in a block of code.\n + @note1cont This function cannot determine whether a specific kernel event type has + occurred unless that event type has been enabled in the trace_mask element + of the system configuration file. \n + @note1cont QuRT supports the recording of interrupt and context switch events only (such as + a trace_mask value of 0x3). + + @param[in] prev_trace_marker Previous kernel trace marker. + @param[in] trace_mask Mask value that indicates which kernel events to check for. + + @returns + 1 -- Kernel events of the specified type have occurred since the + specified trace marker was obtained.\n + 0 -- No kernel events of the specified type have occurred since the + specified trace marker was obtained. + + @dependencies + None. +*/ +int qurt_trace_changed(unsigned int prev_trace_marker, unsigned int trace_mask); + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/** @addtogroup function_tracing_macro +@{ */ +#ifndef QURT_DEBUG +#define QURT_TRACE(str, ...) __VA_ARGS__ + /**< Function tracing is implemented with the QURT_TRACE debug macro, which + optionally generates printf statements both before and after every function call that is + passed as a macro argument. + + For example, in the following macro calls in the source code: + @code + QURT_TRACE(myfunc, my_func(33)) + + @endcode + generates the following debug output: + @code + myfile:nnn: my_func >>> calling my_func(33) + myfile:nnn: my_func >>> returned my_func(33) + @endcode + The debug output includes the source file and line number of the function call, along with + the text of the call. Compile the client source file with -D __FILENAME__ + defined for its file name. + + The library function qurt_printf() generates the debug output. + The QURT_DEBUG symbol controls generation of the debug output. If this symbol is + not defined, function tracing is not generated.\n + @note1hang The debug macro is accessed through the QuRT API header file. + */ +#else +#define QURT_TRACE(str, ...) \ + do { \ + qurt_printf("%s:%d: %s: >>> calling %s\n",__FILENAME__,__LINE__,(str),#__VA_ARGS__); \ + __VA_ARGS__; \ + qurt_printf("%s:%d: %s: <<< %s returned\n",__FILENAME__,__LINE__,(str),#__VA_ARGS__); \ + } while (0); +#endif +/** @} */ /* end_addtogroup function_tracing_macro */ + +/**@ingroup func_qurt_etm_set_pc_range + Sets the PC address range for ETM filtering. 
+ Depending on the Hexagon core design, a maximum of four PC ranges are supported. + + @param[in] range_num 0 to 3. + @param[in] low_addr Lower boundary of PC address range. + @param[in] high_addr Higher boundary of PC address range. + + @returns + #QURT_ETM_SETUP_OK -- Success. \n + #QURT_ETM_SETUP_ERR -- Failure. + + @dependencies + None. +*/ +unsigned int qurt_etm_set_pc_range(unsigned int range_num, unsigned int low_addr, unsigned int high_addr); + +/**@ingroup func_qurt_etm_set_range + Sets the address range for ETM filtering. + The source type of the addresses can be selected: #QURT_ETM_SOURCE_PC or #QURT_ETM_SOURCE_DATA. + + @param[in] addr_source_type Type of the address source:\n + - #QURT_ETM_SOURCE_PC \n + - #QURT_ETM_SOURCE_DATA @tablebulletend + @param[in] trig_block_num 0 to 3. + @param[in] pid PID of the process: \n + - Any valid PID number enables ASID-based trace filtering. \n + - QURT_ETM_NO_PID disables ASID-based trace filtering. + @param[in] low_addr Lower boundary of PC address range. + @param[in] high_addr Higher boundary of PC address range. + + @returns + #QURT_ETM_SETUP_OK -- Success. \n + #QURT_ETM_SETUP_ERR -- Failure. + + @dependencies + None. +*/ +unsigned int qurt_etm_set_range(unsigned int addr_source_type, unsigned int trig_block_num, unsigned int pid, unsigned int low_addr, unsigned int high_addr); + +/**@ingroup func_qurt_etm_set_atb + Sets the advanced trace bus (ATB) state to notify QuRT that the ATB is actively enabled or disabled. + QuRT performs the corresponding actions at low power management. + + @param[in] flag Values: \n + #QURT_ATB_ON \n + #QURT_ATB_OFF + + @returns + #QURT_ETM_SETUP_OK -- Success. \n + #QURT_ETM_SETUP_ERR -- Failure. + + @dependencies + None. +*/ +unsigned int qurt_etm_set_atb(unsigned int flag); + +/**@ingroup func_qurt_etm_set_sync_period + Sets the period for each type of synchronization trace packet. \n + ASYNC defines the period between alignment synchronization packets; the period is in terms of bytes in the packet stream. \n + ISYNC defines the period between instruction synchronization packets; the period is per thread and is defined as the bytes sent out for that thread. \n + GSYNC is the defined period in thread cycles between GSYNC packets. + + @param[in] sync_type Type of synchronization packets: \n + #QURT_ETM_ASYNC_PERIOD \n + #QURT_ETM_ISYNC_PERIOD \n + #QURT_ETM_GSYNC_PERIOD + @param[in] period Period value. + + @return + #QURT_ETM_SETUP_OK -- Success. \n + #QURT_ETM_SETUP_ERR -- Failure. + + @dependencies + None. + */ +unsigned int qurt_etm_set_sync_period(unsigned int sync_type, unsigned int period); + +/**@ingroup func_qurt_stm_trace_set_config + Sets up an STM port for tracing events. + + @datatypes + #qurt_stm_trace_info_t + + @param[in] stm_config_info Pointer to the STM trace information used to set up the trace + in the kernel. + The structure must have the following:\n + - One port address per hardware thread \n + - Event ID for context switches \n + - Event ID for interrupt tracing \n + - Header or marker to identify the beginning of the trace. @tablebulletend + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Failure; possibly because the passed port address is not in the page table. + + @dependencies + None.
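+
+ A hedged configuration sketch; every value below is a placeholder, since
+ real STM port addresses and event IDs are target-specific:
+ @code
+ qurt_stm_trace_info_t cfg = {0};
+ cfg.stm_port_addr[0]   = 0xFE000000u; // hypothetical port for hardware thread 0
+ cfg.thread_event_id    = 1u;          // hypothetical context-switch event ID
+ cfg.interrupt_event_id = 2u;          // hypothetical interrupt event ID
+ cfg.marker             = 0x534D54u;   // hypothetical trace-start marker
+ unsigned int rc = qurt_stm_trace_set_config(&cfg);
+ @endcode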
+ */ +unsigned int qurt_stm_trace_set_config(qurt_stm_trace_info_t *stm_config_info); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TRACE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_types.h new file mode 100755 index 0000000000000..bdb83a3fe2fb2 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_types.h @@ -0,0 +1,294 @@ +#ifndef QURT_TYPES_H +#define QURT_TYPES_H +/** + @file qurt_types.h + @brief Contains types common to all configurations + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +//#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define PGA_BITFIELD_MASK(hi,lo) (((~0u)>>(31U-((hi)-(lo))))<<(lo)) +#define PGA_BITFIELD_GET(x,hi,lo) (((x)&PGA_BITFIELD_MASK((hi),(lo)))>>(lo)) +#define PGA_BITFIELD_INS(hi,lo,v) (((v)<<(lo))&PGA_BITFIELD_MASK((hi),(lo))) +#define PGA_BITFIELD_SET(x,hi,lo,v) ((x)=((x)&~PGA_BITFIELD_MASK((hi),(lo)))|PGA_BITFIELD_INS((hi),(lo),(v))) +#define QURT_PGATTR_C_GET(pga) PGA_BITFIELD_GET((pga).pga_value, 3U, 0U) /* Bits 3-0: cache */ +#define QURT_PGATTR_A_GET(pga) PGA_BITFIELD_GET((pga).pga_value, 5U, 4U) /* Bits 5-4: bus attr */ +#define QURT_PGATTR_C_SET(pga,v) PGA_BITFIELD_SET((pga).pga_value, 3U, 0U, (v)) /* Bits 3-0: cache */ +#define QURT_PGATTR_A_SET(pga,v) PGA_BITFIELD_SET((pga).pga_value, 5U, 4U, (v)) /* Bits 5-4: bus attr */ +#define QURT_PGATTR_MKRAW(v) ((qurt_pgattr_t){.pga_value = (v)}) +#define QURT_PGATTR_MK(c,a) QURT_PGATTR_MKRAW(PGA_BITFIELD_INS(3U,0U,(c))|PGA_BITFIELD_INS(5U,4U,(a))) + +/*return types for qurt_island_get_status2*/ +#define QURT_ISLAND_MODE_NORMAL 0U /**< Normal operating mode */ +#define QURT_ISLAND_MODE_ISLAND 1U /**< Island mode */ +#define QURT_ISLAND_MODE_EXITING 2U /**< In transition from Island mode to Normal mode */ + +/*============================================================================= + FORWARD DECLARATIONS & TYPEDEFS +=============================================================================*/ +/** @addtogroup memory_management_types +@{ */ +typedef unsigned int qurt_addr_t; /**< QuRT address type.*/ +typedef unsigned int qurt_paddr_t; /**< QuRT physical memory address type. */ +/** @cond rest_reg_dist */ +typedef unsigned long long qurt_addr_64_t; /**< QuRT 64-bit memory address type. */ +typedef unsigned long long qurt_paddr_64_t; /**< QuRT 64-bit physical memory address type. */ +typedef unsigned int qurt_mem_region_t; /**< QuRT memory regions type. */ +typedef unsigned int qurt_mem_fs_region_t; /**< QuRT memory FS region type. */ +/**@endcond */ +typedef unsigned int qurt_mem_pool_t; /**< QuRT memory pool type.*/ +typedef unsigned int qurt_size_t; /**< QuRT size type. */ +/** @cond */ +typedef unsigned long long qurt_mmu_entry_t;/**< QuRT MMU entry type. 
*/ +#define QURT_PHYSPOOL_NAME_LEN (32) +typedef char qurt_physpool_name_t[QURT_PHYSPOOL_NAME_LEN]; + + +/* + * Mapping type + * + * QMEM_MAPPING_VIRTUAL is the default mode, in which the system + * picks an available range of virtual addresses and maps it to + * available contiguous physical addresses. Physical-to-virtual + * is not guaranteed to be 1:1; both virtual and physical memory are + * contiguous. + * + * In QMEM_MAPPING_IDEMPOTENT mode, the user provides the physical address; + * the kernel allocates 1:1 physical-to-virtual memory. The primary use + * of this mapping is to allocate physical-to-virtual memory 1:1. + * + * In QMEM_MAPPING_PHYS_CONTIGUOUS mode, the virtual address might + * not be the same as the physical address, but the physical memory of the + * region is guaranteed to be contiguous, starting at the provided + * address; a fixed physical address must be provided. The primary + * use of this mapping is to allocate physical memory from a particular + * address, where 1:1 physical-to-virtual is not required. + * + * QMEM_MAPPING_NONE mode must be used to reserve a virtual memory + * area (VMA); no physical memory is reserved or mapped to this virtual + * space; all standard qmem_region APIs apply to a VMA, however the physical + * address is always INVALID_ADDR. qmem_region_create() in this mode + * returns a handle to the VMA; both virt_addr and phys_addr must + * be set to INVALID_ADDR, and the kernel allocates any available virtual + * memory of the specified size. Obtain the starting virtual address + * of the VMA through qmem_region_attr_getvirtaddr(). + * The primary purpose of this mapping mode is to provide a mechanism for + * delayed binding in QuRT, for example, reserving virtual memory and mapping it at + * some later time to possibly discontiguous physical blocks. Thus, a + * single VMA can be partitioned among several physical-virtual mappings + * created via qmem_region_create() with QMEM_VIRTUAL_FIXED mapping mode. + * Each VMA keeps track of associated mapped regions. + * Deletion of a VMA succeeds only if all associated "virtual_fixed" + * regions are freed prior to VMA deletion. + * + * Use QMEM_MAPPING_VIRTUAL_FIXED mode to create a region + * from virtual space that has been reserved via qmem_region_create() + * with QMEM_MAPPING_NONE mapping. A valid virt_addr is required; if + * phys_addr is specified, the kernel attempts to map it accordingly, and + * if no phys_addr is specified, the kernel maps any available physical + * memory. All standard qmem_region APIs apply to such a region. Remapping + * a virtual range without first freeing the region is not permitted. + * When such a region is deleted, its corresponding VMA remains intact. + * + * QMEM_MAPPING_PHYS_DISCONTIGUOUS mode obtains contiguous + * virtual memory, while the physical memory can be discontiguous. This method + * tries to combine small physical memory blocks to satisfy the requested + * size and is useful when no contiguous block of the requested size is + * available. If the client does not need contiguous physical memory + * (for example, if it does not use physical addressing), this helps + * use smaller physical memory blocks rather than contiguous memory. + * Note: When memory is allocated through this method, a physical address is + * not returned to the caller by the qurt_mem_region_attr_get() API, as there might + * not be a single physical address. + * + */ +/**@endcond */ +/** QuRT memory region mapping type. */ +typedef enum { + QURT_MEM_MAPPING_VIRTUAL=0, /**< Default mode.
The region virtual address range maps to an + available contiguous area of physical memory. For the most + efficient use of virtual memory, the QuRT system + chooses the base address in physical memory. This works for most memory + use cases.*/ + QURT_MEM_MAPPING_PHYS_CONTIGUOUS = 1, /**< The region virtual address space must be mapped to a + contiguous area of physical memory. This is necessary when the + memory region is accessed by external devices that bypass Hexagon + virtual memory addressing. The base address in physical + memory must be explicitly specified.*/ + QURT_MEM_MAPPING_IDEMPOTENT=2, /**< Region virtual address space maps + to the identical area of physical memory. */ + QURT_MEM_MAPPING_VIRTUAL_FIXED=3, /**< Virtual address space of the region maps either to the + specified area of physical memory or (if no area is specified) + to available physical memory. Use this mapping to create + regions from virtual space that was reserved by calling + qurt_mem_region_create() with mapping. */ + QURT_MEM_MAPPING_NONE=4, /**< Reserves a virtual memory area (VMA). Remapping a virtual range is not + permitted without first deleting the memory region. When such a region is + deleted, its corresponding virtual memory addressing remains intact. */ + QURT_MEM_MAPPING_VIRTUAL_RANDOM=7, /**< System chooses a random virtual address and + maps it to available contiguous physical addresses.*/ + QURT_MEM_MAPPING_PHYS_DISCONTIGUOUS=8, /**< While virtual memory is contiguous, allocates in discontiguous physical + memory blocks. This helps when there are smaller contiguous blocks + than the requested size. + Physical address is not provided as part of the get_attr call */ + QURT_MEM_MAPPING_INVALID=10, /**< Reserved as an invalid mapping type. */ +} qurt_mem_mapping_t; + + +/** QuRT cache mode type. */ +typedef enum { + QURT_MEM_CACHE_WRITEBACK=7, /**< Write back. */ + QURT_MEM_CACHE_NONE_SHARED=6, /**< Normal uncached memory that can be shared with other subsystems.*/ + QURT_MEM_CACHE_WRITETHROUGH=5, /**< Write through. */ + QURT_MEM_CACHE_WRITEBACK_NONL2CACHEABLE=0, /**< Write back non-L2-cacheable.*/ + QURT_MEM_CACHE_WRITETHROUGH_NONL2CACHEABLE=1, /**< Write through non-L2-cacheable. */ + QURT_MEM_CACHE_WRITEBACK_L2CACHEABLE=QURT_MEM_CACHE_WRITEBACK, /**< Write back L2 cacheable. */ + QURT_MEM_CACHE_WRITETHROUGH_L2CACHEABLE=QURT_MEM_CACHE_WRITETHROUGH, /**< Write through L2 cacheable. */ + QURT_MEM_CACHE_DEVICE = 4, /**< Volatile memory-mapped device. Access to device memory cannot be cancelled by interrupts, re-ordered, or replayed.*/ + QURT_MEM_CACHE_NONE = 4, /**< Deprecated -- use #QURT_MEM_CACHE_DEVICE instead. */ + QURT_MEM_CACHE_DEVICE_SFC = 2, /**< Enables placing limitations on the number of outstanding transactions. */ + QURT_MEM_CACHE_INVALID=10, /**< Reserved as an invalid cache type. */ +} qurt_mem_cache_mode_t; + +/** Memory access permission. */ +#define QURT_PERM_NONE 0x0U /**< No permission. */ +#define QURT_PERM_READ 0x1U /**< Read permission. */ +#define QURT_PERM_WRITE 0x2U /**< Write permission. */ +#define QURT_PERM_EXECUTE 0x4U /**< Execution permission. */ +#define QURT_PERM_NODUMP 0x8U + /**< Skip dumping the mapping. During process domain dump, must skip + some mappings on host memory to avoid a race condition + where the memory is removed from the host and DSP process + crashed before the mapping is removed. */ +#define QURT_PERM_FULL QURT_PERM_READ | QURT_PERM_WRITE | QURT_PERM_EXECUTE /**< Read, write, and execute permission. 
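+      Because the macro expands without surrounding parentheses, combined
+      permissions are typically built explicitly, for example:
+ @code
+ qurt_perm_t perms = (qurt_perm_t)(QURT_PERM_READ | QURT_PERM_WRITE);
+ @endcode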
*/ + +typedef unsigned char qurt_perm_t; + + +/** @cond rest_reg_dist*/ +/** QuRT cache type; specifies data cache or instruction cache. */ +typedef enum { + QURT_MEM_ICACHE, /**< Instruction cache.*/ + QURT_MEM_DCACHE /**< Data cache.*/ +} qurt_mem_cache_type_t; + +/** QuRT cache operation code type. */ +typedef enum { + QURT_MEM_CACHE_FLUSH, /**< Flush. */ + QURT_MEM_CACHE_INVALIDATE, /**< Invalidate */ + QURT_MEM_CACHE_FLUSH_INVALIDATE, /**< Flush invalidate. */ + QURT_MEM_CACHE_FLUSH_ALL, /**< Flush all. */ + QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL, /**< Flush invalidate all. */ + QURT_MEM_CACHE_TABLE_FLUSH_INVALIDATE, /**< Table flush invalidate. */ + QURT_MEM_CACHE_FLUSH_INVALIDATE_L2, /**< L2 flush invalidate.*/ +} qurt_mem_cache_op_t; + +/** QuRT memory region type. */ +typedef enum { + QURT_MEM_REGION_LOCAL=0, /**< Local. */ + QURT_MEM_REGION_SHARED=1, /**< Shared.*/ + QURT_MEM_REGION_USER_ACCESS=2, /**< User access. */ + QURT_MEM_REGION_FS=4, /**< FS. */ + QURT_MEM_REGION_INVALID=10, /**< Reserved as an invalid region type. */ +} qurt_mem_region_type_t; + +/* Cache and bus attributes are combined into a value of this type for convenience, + and macros for combining and extracting fields are defined here. */ +/** @cond */ +struct qurt_pgattr { + unsigned pga_value; /**< PGA value.*/ +}; +typedef struct qurt_pgattr qurt_pgattr_t; +/** @endcond */ +/** QuRT memory region attributes type.*/ +/* QMEM_MAPPING_IDEMPOTENT and QMEM_MAPPING_PHYS_CONTIGUOUS mode can specify physaddr. + virtaddr cannot be specified for a memory region, it can only be queried by the + qmem_attr_getvirtaddr() function. + */ +typedef struct { + /** @cond */ + qurt_mem_mapping_t mapping_type; + unsigned char perms; + unsigned short owner; + qurt_pgattr_t pga; + unsigned ppn; //physical page number (physical>>12) + qurt_addr_t virtaddr; + qurt_mem_region_type_t type; + qurt_size_t size; + /** @endcond */ +} qurt_mem_region_attr_t; + + +/** QuRT user physical memory pool type. */ +typedef struct { + /** @cond */ + char name[32]; + struct ranges{ + unsigned int start; + unsigned int size; + } ranges[MAX_POOL_RANGES]; + /** @endcond */ +} qurt_mem_pool_attr_t; + +/** QuRT memory pool status type.*/ +typedef struct _qurt_mem_pool_status { + + qurt_size_t contig_size; /**< Largest contiguous free memory in bytes. */ + qurt_size_t free_size; /**< Total free memory in bytes. */ + qurt_size_t total_size; /**< Total declared memory in bytes. */ + +} qurt_mem_pool_status_t; + +typedef enum { + HEXAGON_L1_I_CACHE = 0, /**< Hexagon L1 instruction cache. */ + HEXAGON_L1_D_CACHE = 1, /**< Hexagon L1 data cache. */ + HEXAGON_L2_CACHE = 2 /**< Hexagon L2 cache. */ +} qurt_cache_type_t; + +typedef enum { + FULL_SIZE = 0, /**< Fully shared cache, without partitioning. */ + HALF_SIZE = 1, /**< 1/2 for main, 1/2 for auxiliary. */ + THREE_QUARTER_SIZE = 2, /**< 3/4 for main, 1/4 for auxiliary. */ + SEVEN_EIGHTHS_SIZE = 3 /**< 7/8 for main, 1/8 for auxiliary; for L2 cache only. */ +} qurt_cache_partition_size_t; + +typedef enum { + QURT_PROCESS_CB_GENERIC, /**< generic unconditional cb called after image loading. */ + QURT_PROCESS_NOTE_CB_PRE_MAP, /**< note cb called before segment loading. */ + QURT_PROCESS_NOTE_CB_POST_MAP /**< note cb called after segment loading. 
*/ +} qurt_process_cb_type_t; + +typedef union { + void *ptr; + int num; +} qurt_process_callback_arg_t; + + +/**@endcond*/ + +/** @} */ /* end_addtogroup memory_management_types */ +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TYPES_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_user_dma.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_user_dma.h new file mode 100755 index 0000000000000..e05a6429fd703 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_user_dma.h @@ -0,0 +1,44 @@ +#ifndef QURT_USER_DMA_H +#define QURT_USER_DMA_H + +/** + @file qurt_user_dma.h + @brief Definitions, macros, and prototypes used for handling user DMA. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup qurt_user_dma_dmsyncht + Sends the DMSyncht command to the user DMA engine. + + Call this function to ensure all posted DMA memory operations are + complete. + + This stalls the current thread until the instruction + is complete and returns. + + @return + QURT_EOK - On dmsyncht completion \n + QURT_ENOTSUPPORTED - User DMA not supported + + @dependencies + None. +*/ +int qurt_user_dma_dmsyncht(void); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_vtlb.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_vtlb.h new file mode 100755 index 0000000000000..e064042e447ac --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/include/qurt/qurt_vtlb.h @@ -0,0 +1,76 @@ +/*============================================================================= + + qurt_vtlb.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2019, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +=============================================================================*/ +#ifndef QURT_VTLB_H +#define QURT_VTLB_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| Names starting with "qurt_i_vtlb" are the internal low-level functions. +|| These should be considered subject to change. 
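+||
+|| For example, a statistics query might look like the sketch below
+|| (array layout per the note at the declaration that follows):
+||
+||     unsigned stats[3];
+||     (void)qurt_i_vtlb_statistics(stats);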
+*/ + +int qurt_i_vtlb_entry_create(unsigned *pIndex, + unsigned tlb_lo, + unsigned tlb_hi, + unsigned extension); + +int qurt_i_vtlb_entry_create_with_pid(unsigned *pIndex, + unsigned tlb_lo, + unsigned tlb_hi, + unsigned extension, + unsigned target_pid); + +int qurt_i_vtlb_entry_delete(unsigned index); + +int qurt_i_vtlb_entry_read(unsigned index, unsigned *tlbinfo); + +int qurt_i_vtlb_entry_write(unsigned index, unsigned tlb_lo, unsigned tlb_hi, unsigned extension); + +int qurt_i_vtlb_entry_write_with_pid(unsigned index, unsigned tlb_lo, unsigned tlb_hi, unsigned extension, unsigned target_pid); + +int qurt_i_vtlb_entry_probe(const void *vaddr, unsigned *tlbinfo, unsigned *pIndex); + +int qurt_i_vtlb_entry_probe_with_pid(const void *vaddr, unsigned *tlbinfo, unsigned *pIndex, unsigned target_pid); + + +int qurt_i_vtlb_statistics(unsigned *stats); // Returns stats[0] -- total number of VTLB entries + // stats[1] -- number of available VTLB entries + // stats[2] -- max size of VTLB tree since boot + +//can return index to an entry that was specialed, change it to take addresses instead of pages +int qurt_i_vtlb_set_special(int index, unsigned pageno, unsigned asid, unsigned size); + +int qurt_i_vtlb_queue_ppage(unsigned pageno, unsigned vtlb_index); + +#define QURT_VTLB_EXT_DEFAULT 0U +#define QURT_VTLB_EXT_LOCKED 1U +#define QURT_VTLB_EXT_EXCLUDE_DUMP 2U /* Temporary ability to skip certain mappings in pd dump */ +#define QURT_VTLB_EXT_FREELIST 0x800000u + +#define QURT_VTLB_ERR_OVERLAP -64 +#define QURT_VTLB_ERR_TREE_NO_SPACE -65 +#define QURT_VTLB_ERR_INVALID_SIZE -68 +#define QURT_VTLB_ERR_INVALID_EXT -69 +#define QURT_VTLB_ERR_DEL_PGT_LOCKED -70 +#define QURT_VTLB_ERR_PGT_LOCK_CNT -71 + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif // QURT_VTLB_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libposix.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libposix.a new file mode 100755 index 0000000000000..fd0c274b7ca0e Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libposix.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libqurt.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libqurt.a new file mode 100755 index 0000000000000..23238a59eaa87 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libqurt.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libqurtcfs.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libqurtcfs.a new file mode 100755 index 0000000000000..85f9ad9d41bce Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libqurtcfs.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libtimer_island.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libtimer_island.a new file mode 100755 index 0000000000000..b4a6a40af02a8 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libtimer_island.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libtimer_main.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libtimer_main.a new file mode 100755 index 0000000000000..472857ff02a1f Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/libtimer_main.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libposix.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libposix.a new file mode 100755 
index 0000000000000..566d5c66d3f03 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libposix.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libqurt.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libqurt.a new file mode 100755 index 0000000000000..fffad1d70a51c Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libqurt.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libqurtcfs.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libqurtcfs.a new file mode 100755 index 0000000000000..85f9ad9d41bce Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libqurtcfs.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libtimer.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libtimer.a new file mode 100755 index 0000000000000..89aa8ae9e03bb Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev68/lib/pic/libtimer.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/bits/confname.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/bits/confname.h new file mode 100755 index 0000000000000..d9ca3135501e3 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/bits/confname.h @@ -0,0 +1,528 @@ +#ifndef CONFNAME_H +#define CONFNAME_H +/** + @file confname.h + @brief Named literals for 'name' argument of sysconf, pathconf + +EXTERNAL FUNCTIONS + None + +INITIALIZATION AND SEQUENCING REQUIREMENTS + DONT include this header directly. Instead include unistd.h. For now since + toolchain doesnt provide a hook by including bits/confname.h, we stick this + header in QuRT's sys/types.h + +Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ +/* Values for the NAME argument to `pathconf' and `fpathconf'. */ +enum +{ + _PC_LINK_MAX, +#define _PC_LINK_MAX _PC_LINK_MAX + _PC_MAX_CANON, +#define _PC_MAX_CANON _PC_MAX_CANON + _PC_MAX_INPUT, +#define _PC_MAX_INPUT _PC_MAX_INPUT + _PC_NAME_MAX, +#define _PC_NAME_MAX _PC_NAME_MAX + _PC_PATH_MAX, +#define _PC_PATH_MAX _PC_PATH_MAX + _PC_PIPE_BUF, +#define _PC_PIPE_BUF _PC_PIPE_BUF + _PC_CHOWN_RESTRICTED, +#define _PC_CHOWN_RESTRICTED _PC_CHOWN_RESTRICTED + _PC_NO_TRUNC, +#define _PC_NO_TRUNC _PC_NO_TRUNC + _PC_VDISABLE, +#define _PC_VDISABLE _PC_VDISABLE + _PC_SYNC_IO, +#define _PC_SYNC_IO _PC_SYNC_IO + _PC_ASYNC_IO, +#define _PC_ASYNC_IO _PC_ASYNC_IO + _PC_PRIO_IO, +#define _PC_PRIO_IO _PC_PRIO_IO + _PC_SOCK_MAXBUF, +#define _PC_SOCK_MAXBUF _PC_SOCK_MAXBUF + _PC_FILESIZEBITS, +#define _PC_FILESIZEBITS _PC_FILESIZEBITS + _PC_REC_INCR_XFER_SIZE, +#define _PC_REC_INCR_XFER_SIZE _PC_REC_INCR_XFER_SIZE + _PC_REC_MAX_XFER_SIZE, +#define _PC_REC_MAX_XFER_SIZE _PC_REC_MAX_XFER_SIZE + _PC_REC_MIN_XFER_SIZE, +#define _PC_REC_MIN_XFER_SIZE _PC_REC_MIN_XFER_SIZE + _PC_REC_XFER_ALIGN, +#define _PC_REC_XFER_ALIGN _PC_REC_XFER_ALIGN + _PC_ALLOC_SIZE_MIN, +#define _PC_ALLOC_SIZE_MIN _PC_ALLOC_SIZE_MIN + _PC_SYMLINK_MAX, +#define _PC_SYMLINK_MAX _PC_SYMLINK_MAX + _PC_2_SYMLINKS +#define _PC_2_SYMLINKS _PC_2_SYMLINKS +}; + +/* Values for the argument to `sysconf'. 
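+   For example, a typical query (assuming the C library provides sysconf()):
+
+       long pagesz = sysconf(_SC_PAGESIZE);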
*/ +enum +{ + _SC_ARG_MAX, +#define _SC_ARG_MAX _SC_ARG_MAX + _SC_CHILD_MAX, +#define _SC_CHILD_MAX _SC_CHILD_MAX + _SC_CLK_TCK, +#define _SC_CLK_TCK _SC_CLK_TCK + _SC_NGROUPS_MAX, +#define _SC_NGROUPS_MAX _SC_NGROUPS_MAX + _SC_OPEN_MAX, +#define _SC_OPEN_MAX _SC_OPEN_MAX + _SC_STREAM_MAX, +#define _SC_STREAM_MAX _SC_STREAM_MAX + _SC_TZNAME_MAX, +#define _SC_TZNAME_MAX _SC_TZNAME_MAX + _SC_JOB_CONTROL, +#define _SC_JOB_CONTROL _SC_JOB_CONTROL + _SC_SAVED_IDS, +#define _SC_SAVED_IDS _SC_SAVED_IDS + _SC_REALTIME_SIGNALS, +#define _SC_REALTIME_SIGNALS _SC_REALTIME_SIGNALS + _SC_PRIORITY_SCHEDULING, +#define _SC_PRIORITY_SCHEDULING _SC_PRIORITY_SCHEDULING + _SC_TIMERS, +#define _SC_TIMERS _SC_TIMERS + _SC_ASYNCHRONOUS_IO, +#define _SC_ASYNCHRONOUS_IO _SC_ASYNCHRONOUS_IO + _SC_PRIORITIZED_IO, +#define _SC_PRIORITIZED_IO _SC_PRIORITIZED_IO + _SC_SYNCHRONIZED_IO, +#define _SC_SYNCHRONIZED_IO _SC_SYNCHRONIZED_IO + _SC_FSYNC, +#define _SC_FSYNC _SC_FSYNC + _SC_MAPPED_FILES, +#define _SC_MAPPED_FILES _SC_MAPPED_FILES + _SC_MEMLOCK, +#define _SC_MEMLOCK _SC_MEMLOCK + _SC_MEMLOCK_RANGE, +#define _SC_MEMLOCK_RANGE _SC_MEMLOCK_RANGE + _SC_MEMORY_PROTECTION, +#define _SC_MEMORY_PROTECTION _SC_MEMORY_PROTECTION + _SC_MESSAGE_PASSING, +#define _SC_MESSAGE_PASSING _SC_MESSAGE_PASSING + _SC_SEMAPHORES, +#define _SC_SEMAPHORES _SC_SEMAPHORES + _SC_SHARED_MEMORY_OBJECTS, +#define _SC_SHARED_MEMORY_OBJECTS _SC_SHARED_MEMORY_OBJECTS + _SC_AIO_LISTIO_MAX, +#define _SC_AIO_LISTIO_MAX _SC_AIO_LISTIO_MAX + _SC_AIO_MAX, +#define _SC_AIO_MAX _SC_AIO_MAX + _SC_AIO_PRIO_DELTA_MAX, +#define _SC_AIO_PRIO_DELTA_MAX _SC_AIO_PRIO_DELTA_MAX + _SC_DELAYTIMER_MAX, +#define _SC_DELAYTIMER_MAX _SC_DELAYTIMER_MAX + _SC_MQ_OPEN_MAX, +#define _SC_MQ_OPEN_MAX _SC_MQ_OPEN_MAX + _SC_MQ_PRIO_MAX, +#define _SC_MQ_PRIO_MAX _SC_MQ_PRIO_MAX + _SC_VERSION, +#define _SC_VERSION _SC_VERSION + _SC_PAGESIZE, +#define _SC_PAGESIZE _SC_PAGESIZE +#define _SC_PAGE_SIZE _SC_PAGESIZE + _SC_RTSIG_MAX, +#define _SC_RTSIG_MAX _SC_RTSIG_MAX + _SC_SEM_NSEMS_MAX, +#define _SC_SEM_NSEMS_MAX _SC_SEM_NSEMS_MAX + _SC_SEM_VALUE_MAX, +#define _SC_SEM_VALUE_MAX _SC_SEM_VALUE_MAX + _SC_SIGQUEUE_MAX, +#define _SC_SIGQUEUE_MAX _SC_SIGQUEUE_MAX + _SC_TIMER_MAX, +#define _SC_TIMER_MAX _SC_TIMER_MAX + + /* Values for the argument to `sysconf' + corresponding to _POSIX2_* symbols. 
*/ + _SC_BC_BASE_MAX, +#define _SC_BC_BASE_MAX _SC_BC_BASE_MAX + _SC_BC_DIM_MAX, +#define _SC_BC_DIM_MAX _SC_BC_DIM_MAX + _SC_BC_SCALE_MAX, +#define _SC_BC_SCALE_MAX _SC_BC_SCALE_MAX + _SC_BC_STRING_MAX, +#define _SC_BC_STRING_MAX _SC_BC_STRING_MAX + _SC_COLL_WEIGHTS_MAX, +#define _SC_COLL_WEIGHTS_MAX _SC_COLL_WEIGHTS_MAX + _SC_EQUIV_CLASS_MAX, +#define _SC_EQUIV_CLASS_MAX _SC_EQUIV_CLASS_MAX + _SC_EXPR_NEST_MAX, +#define _SC_EXPR_NEST_MAX _SC_EXPR_NEST_MAX + _SC_LINE_MAX, +#define _SC_LINE_MAX _SC_LINE_MAX + _SC_RE_DUP_MAX, +#define _SC_RE_DUP_MAX _SC_RE_DUP_MAX + _SC_CHARCLASS_NAME_MAX, +#define _SC_CHARCLASS_NAME_MAX _SC_CHARCLASS_NAME_MAX + + _SC_2_VERSION, +#define _SC_2_VERSION _SC_2_VERSION + _SC_2_C_BIND, +#define _SC_2_C_BIND _SC_2_C_BIND + _SC_2_C_DEV, +#define _SC_2_C_DEV _SC_2_C_DEV + _SC_2_FORT_DEV, +#define _SC_2_FORT_DEV _SC_2_FORT_DEV + _SC_2_FORT_RUN, +#define _SC_2_FORT_RUN _SC_2_FORT_RUN + _SC_2_SW_DEV, +#define _SC_2_SW_DEV _SC_2_SW_DEV + _SC_2_LOCALEDEF, +#define _SC_2_LOCALEDEF _SC_2_LOCALEDEF + + _SC_PII, +#define _SC_PII _SC_PII + _SC_PII_XTI, +#define _SC_PII_XTI _SC_PII_XTI + _SC_PII_SOCKET, +#define _SC_PII_SOCKET _SC_PII_SOCKET + _SC_PII_INTERNET, +#define _SC_PII_INTERNET _SC_PII_INTERNET + _SC_PII_OSI, +#define _SC_PII_OSI _SC_PII_OSI + _SC_POLL, +#define _SC_POLL _SC_POLL + _SC_SELECT, +#define _SC_SELECT _SC_SELECT + _SC_UIO_MAXIOV, +#define _SC_UIO_MAXIOV _SC_UIO_MAXIOV + _SC_IOV_MAX = _SC_UIO_MAXIOV, +#define _SC_IOV_MAX _SC_IOV_MAX + _SC_PII_INTERNET_STREAM, +#define _SC_PII_INTERNET_STREAM _SC_PII_INTERNET_STREAM + _SC_PII_INTERNET_DGRAM, +#define _SC_PII_INTERNET_DGRAM _SC_PII_INTERNET_DGRAM + _SC_PII_OSI_COTS, +#define _SC_PII_OSI_COTS _SC_PII_OSI_COTS + _SC_PII_OSI_CLTS, +#define _SC_PII_OSI_CLTS _SC_PII_OSI_CLTS + _SC_PII_OSI_M, +#define _SC_PII_OSI_M _SC_PII_OSI_M + _SC_T_IOV_MAX, +#define _SC_T_IOV_MAX _SC_T_IOV_MAX + + /* Values according to POSIX 1003.1c (POSIX threads). 
*/ + _SC_THREADS, +#define _SC_THREADS _SC_THREADS + _SC_THREAD_SAFE_FUNCTIONS, +#define _SC_THREAD_SAFE_FUNCTIONS _SC_THREAD_SAFE_FUNCTIONS + _SC_GETGR_R_SIZE_MAX, +#define _SC_GETGR_R_SIZE_MAX _SC_GETGR_R_SIZE_MAX + _SC_GETPW_R_SIZE_MAX, +#define _SC_GETPW_R_SIZE_MAX _SC_GETPW_R_SIZE_MAX + _SC_LOGIN_NAME_MAX, +#define _SC_LOGIN_NAME_MAX _SC_LOGIN_NAME_MAX + _SC_TTY_NAME_MAX, +#define _SC_TTY_NAME_MAX _SC_TTY_NAME_MAX + _SC_THREAD_DESTRUCTOR_ITERATIONS, +#define _SC_THREAD_DESTRUCTOR_ITERATIONS _SC_THREAD_DESTRUCTOR_ITERATIONS + _SC_THREAD_KEYS_MAX, +#define _SC_THREAD_KEYS_MAX _SC_THREAD_KEYS_MAX + _SC_THREAD_STACK_MIN, +#define _SC_THREAD_STACK_MIN _SC_THREAD_STACK_MIN + _SC_THREAD_THREADS_MAX, +#define _SC_THREAD_THREADS_MAX _SC_THREAD_THREADS_MAX + _SC_THREAD_ATTR_STACKADDR, +#define _SC_THREAD_ATTR_STACKADDR _SC_THREAD_ATTR_STACKADDR + _SC_THREAD_ATTR_STACKSIZE, +#define _SC_THREAD_ATTR_STACKSIZE _SC_THREAD_ATTR_STACKSIZE + _SC_THREAD_PRIORITY_SCHEDULING, +#define _SC_THREAD_PRIORITY_SCHEDULING _SC_THREAD_PRIORITY_SCHEDULING + _SC_THREAD_PRIO_INHERIT, +#define _SC_THREAD_PRIO_INHERIT _SC_THREAD_PRIO_INHERIT + _SC_THREAD_PRIO_PROTECT, +#define _SC_THREAD_PRIO_PROTECT _SC_THREAD_PRIO_PROTECT + _SC_THREAD_PROCESS_SHARED, +#define _SC_THREAD_PROCESS_SHARED _SC_THREAD_PROCESS_SHARED + + _SC_NPROCESSORS_CONF, +#define _SC_NPROCESSORS_CONF _SC_NPROCESSORS_CONF + _SC_NPROCESSORS_ONLN, +#define _SC_NPROCESSORS_ONLN _SC_NPROCESSORS_ONLN + _SC_PHYS_PAGES, +#define _SC_PHYS_PAGES _SC_PHYS_PAGES + _SC_AVPHYS_PAGES, +#define _SC_AVPHYS_PAGES _SC_AVPHYS_PAGES + _SC_ATEXIT_MAX, +#define _SC_ATEXIT_MAX _SC_ATEXIT_MAX + _SC_PASS_MAX, +#define _SC_PASS_MAX _SC_PASS_MAX + + _SC_XOPEN_VERSION, +#define _SC_XOPEN_VERSION _SC_XOPEN_VERSION + _SC_XOPEN_XCU_VERSION, +#define _SC_XOPEN_XCU_VERSION _SC_XOPEN_XCU_VERSION + _SC_XOPEN_UNIX, +#define _SC_XOPEN_UNIX _SC_XOPEN_UNIX + _SC_XOPEN_CRYPT, +#define _SC_XOPEN_CRYPT _SC_XOPEN_CRYPT + _SC_XOPEN_ENH_I18N, +#define _SC_XOPEN_ENH_I18N _SC_XOPEN_ENH_I18N + _SC_XOPEN_SHM, +#define _SC_XOPEN_SHM _SC_XOPEN_SHM + + _SC_2_CHAR_TERM, +#define _SC_2_CHAR_TERM _SC_2_CHAR_TERM + _SC_2_C_VERSION, +#define _SC_2_C_VERSION _SC_2_C_VERSION + _SC_2_UPE, +#define _SC_2_UPE _SC_2_UPE + + _SC_XOPEN_XPG2, +#define _SC_XOPEN_XPG2 _SC_XOPEN_XPG2 + _SC_XOPEN_XPG3, +#define _SC_XOPEN_XPG3 _SC_XOPEN_XPG3 + _SC_XOPEN_XPG4, +#define _SC_XOPEN_XPG4 _SC_XOPEN_XPG4 + + _SC_CHAR_BIT, +#define _SC_CHAR_BIT _SC_CHAR_BIT + _SC_CHAR_MAX, +#define _SC_CHAR_MAX _SC_CHAR_MAX + _SC_CHAR_MIN, +#define _SC_CHAR_MIN _SC_CHAR_MIN + _SC_INT_MAX, +#define _SC_INT_MAX _SC_INT_MAX + _SC_INT_MIN, +#define _SC_INT_MIN _SC_INT_MIN + _SC_LONG_BIT, +#define _SC_LONG_BIT _SC_LONG_BIT + _SC_WORD_BIT, +#define _SC_WORD_BIT _SC_WORD_BIT + _SC_MB_LEN_MAX, +#define _SC_MB_LEN_MAX _SC_MB_LEN_MAX + _SC_NZERO, +#define _SC_NZERO _SC_NZERO + _SC_SSIZE_MAX, +#define _SC_SSIZE_MAX _SC_SSIZE_MAX + _SC_SCHAR_MAX, +#define _SC_SCHAR_MAX _SC_SCHAR_MAX + _SC_SCHAR_MIN, +#define _SC_SCHAR_MIN _SC_SCHAR_MIN + _SC_SHRT_MAX, +#define _SC_SHRT_MAX _SC_SHRT_MAX + _SC_SHRT_MIN, +#define _SC_SHRT_MIN _SC_SHRT_MIN + _SC_UCHAR_MAX, +#define _SC_UCHAR_MAX _SC_UCHAR_MAX + _SC_UINT_MAX, +#define _SC_UINT_MAX _SC_UINT_MAX + _SC_ULONG_MAX, +#define _SC_ULONG_MAX _SC_ULONG_MAX + _SC_USHRT_MAX, +#define _SC_USHRT_MAX _SC_USHRT_MAX + + _SC_NL_ARGMAX, +#define _SC_NL_ARGMAX _SC_NL_ARGMAX + _SC_NL_LANGMAX, +#define _SC_NL_LANGMAX _SC_NL_LANGMAX + _SC_NL_MSGMAX, +#define _SC_NL_MSGMAX _SC_NL_MSGMAX + _SC_NL_NMAX, +#define _SC_NL_NMAX _SC_NL_NMAX + 
_SC_NL_SETMAX, +#define _SC_NL_SETMAX _SC_NL_SETMAX + _SC_NL_TEXTMAX, +#define _SC_NL_TEXTMAX _SC_NL_TEXTMAX + + _SC_XBS5_ILP32_OFF32, +#define _SC_XBS5_ILP32_OFF32 _SC_XBS5_ILP32_OFF32 + _SC_XBS5_ILP32_OFFBIG, +#define _SC_XBS5_ILP32_OFFBIG _SC_XBS5_ILP32_OFFBIG + _SC_XBS5_LP64_OFF64, +#define _SC_XBS5_LP64_OFF64 _SC_XBS5_LP64_OFF64 + _SC_XBS5_LPBIG_OFFBIG, +#define _SC_XBS5_LPBIG_OFFBIG _SC_XBS5_LPBIG_OFFBIG + + _SC_XOPEN_LEGACY, +#define _SC_XOPEN_LEGACY _SC_XOPEN_LEGACY + _SC_XOPEN_REALTIME, +#define _SC_XOPEN_REALTIME _SC_XOPEN_REALTIME + _SC_XOPEN_REALTIME_THREADS, +#define _SC_XOPEN_REALTIME_THREADS _SC_XOPEN_REALTIME_THREADS + + _SC_ADVISORY_INFO, +#define _SC_ADVISORY_INFO _SC_ADVISORY_INFO + _SC_BARRIERS, +#define _SC_BARRIERS _SC_BARRIERS + _SC_BASE, +#define _SC_BASE _SC_BASE + _SC_C_LANG_SUPPORT, +#define _SC_C_LANG_SUPPORT _SC_C_LANG_SUPPORT + _SC_C_LANG_SUPPORT_R, +#define _SC_C_LANG_SUPPORT_R _SC_C_LANG_SUPPORT_R + _SC_CLOCK_SELECTION, +#define _SC_CLOCK_SELECTION _SC_CLOCK_SELECTION + _SC_CPUTIME, +#define _SC_CPUTIME _SC_CPUTIME + _SC_THREAD_CPUTIME, +#define _SC_THREAD_CPUTIME _SC_THREAD_CPUTIME + _SC_DEVICE_IO, +#define _SC_DEVICE_IO _SC_DEVICE_IO + _SC_DEVICE_SPECIFIC, +#define _SC_DEVICE_SPECIFIC _SC_DEVICE_SPECIFIC + _SC_DEVICE_SPECIFIC_R, +#define _SC_DEVICE_SPECIFIC_R _SC_DEVICE_SPECIFIC_R + _SC_FD_MGMT, +#define _SC_FD_MGMT _SC_FD_MGMT + _SC_FIFO, +#define _SC_FIFO _SC_FIFO + _SC_PIPE, +#define _SC_PIPE _SC_PIPE + _SC_FILE_ATTRIBUTES, +#define _SC_FILE_ATTRIBUTES _SC_FILE_ATTRIBUTES + _SC_FILE_LOCKING, +#define _SC_FILE_LOCKING _SC_FILE_LOCKING + _SC_FILE_SYSTEM, +#define _SC_FILE_SYSTEM _SC_FILE_SYSTEM + _SC_MONOTONIC_CLOCK, +#define _SC_MONOTONIC_CLOCK _SC_MONOTONIC_CLOCK + _SC_MULTI_PROCESS, +#define _SC_MULTI_PROCESS _SC_MULTI_PROCESS + _SC_SINGLE_PROCESS, +#define _SC_SINGLE_PROCESS _SC_SINGLE_PROCESS + _SC_NETWORKING, +#define _SC_NETWORKING _SC_NETWORKING + _SC_READER_WRITER_LOCKS, +#define _SC_READER_WRITER_LOCKS _SC_READER_WRITER_LOCKS + _SC_SPIN_LOCKS, +#define _SC_SPIN_LOCKS _SC_SPIN_LOCKS + _SC_REGEXP, +#define _SC_REGEXP _SC_REGEXP + _SC_REGEX_VERSION, +#define _SC_REGEX_VERSION _SC_REGEX_VERSION + _SC_SHELL, +#define _SC_SHELL _SC_SHELL + _SC_SIGNALS, +#define _SC_SIGNALS _SC_SIGNALS + _SC_SPAWN, +#define _SC_SPAWN _SC_SPAWN + _SC_SPORADIC_SERVER, +#define _SC_SPORADIC_SERVER _SC_SPORADIC_SERVER + _SC_THREAD_SPORADIC_SERVER, +#define _SC_THREAD_SPORADIC_SERVER _SC_THREAD_SPORADIC_SERVER + _SC_SYSTEM_DATABASE, +#define _SC_SYSTEM_DATABASE _SC_SYSTEM_DATABASE + _SC_SYSTEM_DATABASE_R, +#define _SC_SYSTEM_DATABASE_R _SC_SYSTEM_DATABASE_R + _SC_TIMEOUTS, +#define _SC_TIMEOUTS _SC_TIMEOUTS + _SC_TYPED_MEMORY_OBJECTS, +#define _SC_TYPED_MEMORY_OBJECTS _SC_TYPED_MEMORY_OBJECTS + _SC_USER_GROUPS, +#define _SC_USER_GROUPS _SC_USER_GROUPS + _SC_USER_GROUPS_R, +#define _SC_USER_GROUPS_R _SC_USER_GROUPS_R + _SC_2_PBS, +#define _SC_2_PBS _SC_2_PBS + _SC_2_PBS_ACCOUNTING, +#define _SC_2_PBS_ACCOUNTING _SC_2_PBS_ACCOUNTING + _SC_2_PBS_LOCATE, +#define _SC_2_PBS_LOCATE _SC_2_PBS_LOCATE + _SC_2_PBS_MESSAGE, +#define _SC_2_PBS_MESSAGE _SC_2_PBS_MESSAGE + _SC_2_PBS_TRACK, +#define _SC_2_PBS_TRACK _SC_2_PBS_TRACK + _SC_SYMLOOP_MAX, +#define _SC_SYMLOOP_MAX _SC_SYMLOOP_MAX + _SC_STREAMS, +#define _SC_STREAMS _SC_STREAMS + _SC_2_PBS_CHECKPOINT, +#define _SC_2_PBS_CHECKPOINT _SC_2_PBS_CHECKPOINT + + _SC_V6_ILP32_OFF32, +#define _SC_V6_ILP32_OFF32 _SC_V6_ILP32_OFF32 + _SC_V6_ILP32_OFFBIG, +#define _SC_V6_ILP32_OFFBIG _SC_V6_ILP32_OFFBIG + _SC_V6_LP64_OFF64, +#define 
_SC_V6_LP64_OFF64 _SC_V6_LP64_OFF64 + _SC_V6_LPBIG_OFFBIG, +#define _SC_V6_LPBIG_OFFBIG _SC_V6_LPBIG_OFFBIG + + _SC_HOST_NAME_MAX, +#define _SC_HOST_NAME_MAX _SC_HOST_NAME_MAX + _SC_TRACE, +#define _SC_TRACE _SC_TRACE + _SC_TRACE_EVENT_FILTER, +#define _SC_TRACE_EVENT_FILTER _SC_TRACE_EVENT_FILTER + _SC_TRACE_INHERIT, +#define _SC_TRACE_INHERIT _SC_TRACE_INHERIT + _SC_TRACE_LOG, +#define _SC_TRACE_LOG _SC_TRACE_LOG + + _SC_LEVEL1_ICACHE_SIZE, +#define _SC_LEVEL1_ICACHE_SIZE _SC_LEVEL1_ICACHE_SIZE + _SC_LEVEL1_ICACHE_ASSOC, +#define _SC_LEVEL1_ICACHE_ASSOC _SC_LEVEL1_ICACHE_ASSOC + _SC_LEVEL1_ICACHE_LINESIZE, +#define _SC_LEVEL1_ICACHE_LINESIZE _SC_LEVEL1_ICACHE_LINESIZE + _SC_LEVEL1_DCACHE_SIZE, +#define _SC_LEVEL1_DCACHE_SIZE _SC_LEVEL1_DCACHE_SIZE + _SC_LEVEL1_DCACHE_ASSOC, +#define _SC_LEVEL1_DCACHE_ASSOC _SC_LEVEL1_DCACHE_ASSOC + _SC_LEVEL1_DCACHE_LINESIZE, +#define _SC_LEVEL1_DCACHE_LINESIZE _SC_LEVEL1_DCACHE_LINESIZE + _SC_LEVEL2_CACHE_SIZE, +#define _SC_LEVEL2_CACHE_SIZE _SC_LEVEL2_CACHE_SIZE + _SC_LEVEL2_CACHE_ASSOC, +#define _SC_LEVEL2_CACHE_ASSOC _SC_LEVEL2_CACHE_ASSOC + _SC_LEVEL2_CACHE_LINESIZE, +#define _SC_LEVEL2_CACHE_LINESIZE _SC_LEVEL2_CACHE_LINESIZE + _SC_LEVEL3_CACHE_SIZE, +#define _SC_LEVEL3_CACHE_SIZE _SC_LEVEL3_CACHE_SIZE + _SC_LEVEL3_CACHE_ASSOC, +#define _SC_LEVEL3_CACHE_ASSOC _SC_LEVEL3_CACHE_ASSOC + _SC_LEVEL3_CACHE_LINESIZE, +#define _SC_LEVEL3_CACHE_LINESIZE _SC_LEVEL3_CACHE_LINESIZE + _SC_LEVEL4_CACHE_SIZE, +#define _SC_LEVEL4_CACHE_SIZE _SC_LEVEL4_CACHE_SIZE + _SC_LEVEL4_CACHE_ASSOC, +#define _SC_LEVEL4_CACHE_ASSOC _SC_LEVEL4_CACHE_ASSOC + _SC_LEVEL4_CACHE_LINESIZE, +#define _SC_LEVEL4_CACHE_LINESIZE _SC_LEVEL4_CACHE_LINESIZE + /* Leave room here, maybe we need a few more cache levels some day. */ + + _SC_IPV6 = _SC_LEVEL1_ICACHE_SIZE + 50, +#define _SC_IPV6 _SC_IPV6 + _SC_RAW_SOCKETS, +#define _SC_RAW_SOCKETS _SC_RAW_SOCKETS + + _SC_V7_ILP32_OFF32, +#define _SC_V7_ILP32_OFF32 _SC_V7_ILP32_OFF32 + _SC_V7_ILP32_OFFBIG, +#define _SC_V7_ILP32_OFFBIG _SC_V7_ILP32_OFFBIG + _SC_V7_LP64_OFF64, +#define _SC_V7_LP64_OFF64 _SC_V7_LP64_OFF64 + _SC_V7_LPBIG_OFFBIG, +#define _SC_V7_LPBIG_OFFBIG _SC_V7_LPBIG_OFFBIG + + _SC_SS_REPL_MAX, +#define _SC_SS_REPL_MAX _SC_SS_REPL_MAX + + _SC_TRACE_EVENT_NAME_MAX, +#define _SC_TRACE_EVENT_NAME_MAX _SC_TRACE_EVENT_NAME_MAX + _SC_TRACE_NAME_MAX, +#define _SC_TRACE_NAME_MAX _SC_TRACE_NAME_MAX + _SC_TRACE_SYS_MAX, +#define _SC_TRACE_SYS_MAX _SC_TRACE_SYS_MAX + _SC_TRACE_USER_EVENT_MAX, +#define _SC_TRACE_USER_EVENT_MAX _SC_TRACE_USER_EVENT_MAX + + _SC_XOPEN_STREAMS, +#define _SC_XOPEN_STREAMS _SC_XOPEN_STREAMS + + _SC_THREAD_ROBUST_PRIO_INHERIT, +#define _SC_THREAD_ROBUST_PRIO_INHERIT _SC_THREAD_ROBUST_PRIO_INHERIT + _SC_THREAD_ROBUST_PRIO_PROTECT +#define _SC_THREAD_ROBUST_PRIO_PROTECT _SC_THREAD_ROBUST_PRIO_PROTECT + +}; +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/bits/posix1_lim.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/bits/posix1_lim.h new file mode 100755 index 0000000000000..0739958c5a6c4 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/bits/posix1_lim.h @@ -0,0 +1,34 @@ +#ifndef POSIX1_LIM_H +#define POSIX1_LIM_H +/** + @file posix1_lim.h + @brief POSIX Minimum values + +EXTERNAL FUNCTIONS + None + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None + +TODO + This header should be ideally relocated under api/posix/bits (something that + doesnt exist today) and be included from api/posix/bits/limits.h which inturn + should be 
included from toolchain's limits.h + +Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ + +#ifndef _POSIX_PATH_MAX +/** @brief Maximum number of bytes in a pathname, including the terminating + nul character */ +#define _POSIX_PATH_MAX 256 +#endif + +#ifndef _POSIX_SEM_NSEMS_MAX +/** @brief Maximum number of semaphores that a process may have */ +#define _POSIX_SEM_NSEMS_MAX 16 +#endif +#endif + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/common/time.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/common/time.h new file mode 100755 index 0000000000000..76b0d39ab7039 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/common/time.h @@ -0,0 +1 @@ +#include \ No newline at end of file diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/fcntl.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/fcntl.h new file mode 100755 index 0000000000000..c80ec98a449b6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/fcntl.h @@ -0,0 +1,51 @@ +#ifndef _FCNTL_H +#define _FCNTL_H + +/*========================================================================== + * FILE: fcntl.h + * + * SERVICES: POSIX fcntl.h + * + * DESCRIPTION: The header is needed by the open() and fcntl() + * system calls, which have a variety of parameters and + * flags. They are described here. + * + * The formats of the calls to each of these are: + * + * open(path, oflag [,mode]) open a file + * fcntl(fd, cmd [,arg]) get or set file attributes + * + * Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Oflag values for open(). POSIX Table 6-4. */ +#define POSIX_O_CREAT 0x100 /* creat file if it doesn't exist */ +#define POSIX_O_EXCL 0x200 /* exclusive use flag */ +#define POSIX_O_NOCTTY 0x400 /* do not assign a controlling terminal */ +#define POSIX_O_TRUNC 0x1000 /* truncate flag */ + +/* File status flags for open() and fcntl(). POSIX Table 6-5. */ +#define POSIX_O_APPEND 0x2000 /* set append mode */ +#define POSIX_O_NONBLOCK 0x4000 /* no delay */ + +/* File access modes for open() and fcntl(). POSIX Table 6-6. */ +#define POSIX_O_RDONLY 0 /* open(name, POSIX_O_RDONLY) opens read only */ +#define POSIX_O_WRONLY 1 /* open(name, POSIX_O_WRONLY) opens write only */ +#define POSIX_O_RDWR 2 /* open(name, POSIX_O_RDWR) opens read/write */ + +/* Mask for use with file access modes. POSIX Table 6-7. 
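+   For example, the access mode bits of an oflag value passed to open()
+   are extracted by masking:
+
+       if ((oflag & POSIX_O_ACCMODE) == POSIX_O_RDWR) ...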
*/ +#define POSIX_O_ACCMODE 0x3 /* mask for file access modes */ + +#ifdef __cplusplus +} +#endif + +#endif /* _FCNTL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/hooks/unistd.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/hooks/unistd.h new file mode 100755 index 0000000000000..1c618bfe36b4f --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/hooks/unistd.h @@ -0,0 +1,115 @@ +#ifndef UNISTD_H +#define UNISTD_H +/** + @file posix/hooks/unistd.h + @brief POSIX related declarations in that are missing in toolchain + header + +EXTERNAL FUNCTIONS + None + +INITIALIZATION AND SEQUENCING REQUIREMENTS + DONT include this header directly! Instead include unistd.h. + +Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ +#include /* For various POSIX ID types from toolchain headers */ + +#ifdef __cplusplus +extern "C" { +#endif +extern long pathconf (char const * path, int name); + +/* Process*/ + +/** The getppid() function shall return the parent process ID of the calling process. + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] the parent process ID + */ +pid_t getppid(void); + +/** The getpgid() function shall return the process group ID of the process whose process ID is equal to pid + * Please refer to POSIX standard for details. + * @param thread [in] process ID + * @param value_ptr [out] process group ID + */ +pid_t getpgid(pid_t pid); + +/** The getpgrp() function shall return the process group ID of the calling process + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] process group ID of the calling process + */ +pid_t getpgrp(void); + +/**The getuid() function shall return the real user ID of the calling process. + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] the real user ID of the calling process. + */ +uid_t getuid(void); + +/** The geteuid() function shall return the effective user ID of the calling process + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] effective user ID of the calling process + */ +uid_t geteuid(void); + +/** The getegid() function shall return the effective group ID of the calling process. + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] effective group ID of the calling process. + */ +gid_t getegid(void); + +/** The getgid() function shall return the real group ID of the calling process + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] real group ID of the calling process. + */ + gid_t getgid(void); + +/** seteuid set effective user ID + * Please refer to POSIX standard for details. + * @param thread [in] effective user ID + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. + */ +int seteuid(uid_t uid); + +/** setpgrp - set the process group ID + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. 
+ */
+pid_t setpgrp(void);
+
+/** setuid - set user ID
+ * Please refer to POSIX standard for details.
+ * @param thread [in] user ID
+ * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+int setuid(uid_t uid);
+
+/** setpgid - set process group ID for job control
+ * Please refer to POSIX standard for details.
+ * @param thread [in] PID of process, PGID to be set
+ * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+int setpgid(pid_t pid, pid_t pgid);
+
+/** setsid - create session and set process group ID
+ * Please refer to POSIX standard for details.
+ * @param thread [in] none
+ * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+pid_t setsid(void);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/mqueue.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/mqueue.h
new file mode 100755
index 0000000000000..74dcc2fa202c6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/mqueue.h
@@ -0,0 +1,203 @@
+#ifndef _POSIX_MQUEUE_H_
+#define _POSIX_MQUEUE_H_
+
+/*==========================================================================
+ * FILE: mqueue.h
+ *
+ * SERVICES: POSIX Message Queue API interface
+ *
+ * DESCRIPTION: POSIX Message Queue API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016, 2023 Qualcomm Technologies, Inc.
+ * All Rights Reserved.
+ * Confidential and Proprietary - Qualcomm Technologies, Inc.
+ *==========================================================================*/
+
+#include <sys/types.h> /* ssize_t */
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MQ_PRIO_MAX 255 /* max priority */
+#define MQ_PRIO_DEFAULT 0 /* default priority */
+
+typedef int mqd_t;
+
+struct mq_attr
+{
+ long mq_flags; /* message queue flags */
+ long mq_maxmsg; /* maximum number of messages */
+ long mq_msgsize; /* maximum message size */
+ long mq_curmsgs; /* number of messages currently queued */
+};
+
+typedef struct mq_attr mqueue_attr;
+
+/** \details
+ * This provides POSIX Message Queue API.
+ *
+ * mq_notify is not supported.
+ *
+ * Since this implementation of POSIX kernel API is a subset of PSE51,
+ * it only supports message sending and receiving within one process.
+ * Message sending and receiving among processes are not supported.
+ */
+
+/** \defgroup mqueue POSIX Message Queue API */
+/** \ingroup mqueue */
+/** @{ */
+
+/** Open a message queue.
+ * Please refer to POSIX standard for details.
+ */
+mqd_t mq_open(const char *name, int oflag, /* mode_t mode, struct mq_attr *attr */...);
+
+/** Close a message queue.
+ * Please refer to POSIX standard for details.
+ */
+int mq_close(mqd_t mq_desc);
+
+/** Remove a message queue.
+ * Please refer to POSIX standard for details.
+ */
+int mq_unlink(const char *name);
+
+/** Send a message to a message queue.
+ * Please refer to POSIX standard for details.
+ *
+ * If the queue is full, instead of blocking the sender, this function
+ * will return -1 with errno EAGAIN, in this implementation. This behavior
+ * may change in the future.
+ */
+int mq_send(mqd_t mqdes, const char *msg_ptr, size_t msg_len, unsigned int msg_prio);
+
+/** Send a message to a message queue with timeout.
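+ *
+ * Example of the non-blocking mq_send() behavior described above
+ * (editor's sketch, not vendor text; mqd, buf, and len are illustrative):
+ * @code
+ *   if (mq_send(mqd, buf, len, MQ_PRIO_DEFAULT) == -1 && errno == EAGAIN) {
+ *       // queue full: this implementation returns instead of blocking
+ *   }
+ * @endcode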
+ * Please refer to POSIX standard for details.
+ * @param abs_timeout [in] Only abs_timeout={0,0} is supported in this
+ * implementation. This behavior may change in the future.
+ */
+int mq_timedsend(mqd_t mqdes, const char *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout);
+
+/** Receive a message from a message queue.
+ * Please refer to POSIX standard for details.
+ */
+ssize_t mq_receive(mqd_t mqdes, char *msg_ptr, size_t msg_len, unsigned int *msg_prio);
+
+/** Receive a message from a message queue with timeout.
+ * Please refer to POSIX standard for details.
+ * @param abs_timeout [in] Only abs_timeout={0,0} is supported in this
+ * implementation. This behavior may change in the future.
+ */
+ssize_t mq_timedreceive(mqd_t mqdes, char *restrict msg_ptr, size_t msg_len, unsigned int *restrict msg_prio, const struct timespec *restrict abs_timeout);
+
+/** Get message queue attributes.
+ * Please refer to POSIX standard for details.
+ */
+int mq_getattr(mqd_t mqdes, struct mq_attr *mqstat);
+
+/** Set message queue attributes.
+ * Please refer to POSIX standard for details.
+ */
+int mq_setattr(mqd_t mqdes, const struct mq_attr *restrict mqstat, struct mq_attr *restrict omqstat);
+
+/** @} */
+
+#define NBBY 8U /* number of bits in a byte */
+
+/*
+ * Select uses bit masks of file descriptors in longs. These macros
+ * manipulate such bit fields (the filesystem macros use chars).
+ * FD_SETSIZE may be defined by the user, but the default here should
+ * be enough for most uses.
+ */
+#ifndef FD_SETSIZE
+#define FD_SETSIZE 256U
+#endif
+
+typedef unsigned long fd_mask;
+#define NFDBITS (sizeof(fd_mask) * (unsigned int)NBBY) /* bits per mask */
+
+#ifndef howmany
+#define howmany(x, y) (((x) + ((y) - 1U)) / (y))
+#endif
+
+// equivalent of fd_set for WinNT env
+typedef struct fd_set
+{
+ fd_mask fds_bits[howmany(FD_SETSIZE, NFDBITS)];
+} fd_set;
+
+/** \addtogroup mqueue */
+/** @{ */
+
+/** Sets the bit for the file descriptor fd in the file descriptor set fdset.
+ */
+#define FD_SET(n, p) ((p)->fds_bits[((unsigned int) (n)) / NFDBITS] |= (1UL << (((unsigned int) (n)) % NFDBITS)))
+
+/** Clears the bit for the file descriptor fd in the file descriptor set fdset.
+ */
+#define FD_CLR(n, p) ((p)->fds_bits[((unsigned int) (n)) / NFDBITS] &= ~(1UL << (((unsigned int) (n)) % NFDBITS)))
+
+/** Returns a non-zero value if the bit for the file descriptor fd is set in the file descriptor set pointed to by fdset, and 0 otherwise.
+ */
+#define FD_ISSET(n, p) ((unsigned long)(p)->fds_bits[((unsigned int) (n)) / NFDBITS] & (unsigned long)((unsigned)1U << (((unsigned int) (n)) % NFDBITS)))
+
+/** Copies the file descriptor set.
+ */
+#define FD_COPY(f, t) (void)(memcpy)((t), (f), sizeof(*(f)))
+
+/** Initializes the file descriptor set fdset to have zero bits for all file descriptors.
+ */
+#define FD_ZERO(p) (void)memset((p), 0, sizeof(*(p)))
+
+/** Error check the file descriptor set.
+ */
+#define FD_BAD(fd) ((fd) < 0 /*|| fd >= fd_arraylen || fd_array[fd].obj == 0*/)
+
+/*! Wait for both message queues and signals. In this implementation, only
+ * message queue file descriptors are supported.
+ * @param nfds [in] This is an integer one more than the maximum of any file
+ *   descriptor in any of the sets. In other words, while you are busy
+ *   adding file descriptors to your sets, you must calculate the maximum
+ *   integer value of all of them, then increment this value by one, and
+ *   then pass this as nfds to select().
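+ *
+ * Example of the nfds rule above (editor's sketch, not vendor text):
+ * with descriptors 3 and 7 in the read set, nfds must be 8.
+ * @code
+ *   fd_set rfds;
+ *   FD_ZERO(&rfds);
+ *   FD_SET(3, &rfds);
+ *   FD_SET(7, &rfds);
+ *   // nfds = max fd + 1 = 8; only a {0,0} timeout is supported here
+ * @endcode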
+ * @param readfds [in] the file descriptor set on all message queues. + * @param writefds [in] ignored in this implementation. + * @param errorfds [in] ignored in this implementation. + * @param timeout [in] Only timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +int pselect(int nfds, fd_set *restrict readfds, + fd_set *restrict writefds, fd_set *restrict errorfds, + const struct timespec *restrict timeout, + const sigset_t *restrict sigmask); + +/*! Wait for multiple message queues. In this implementation, only + * message queue file descriptors are supported. + * @param nfds [in] This is an integer one more than the maximum of any file + * descriptor in any of the sets. In other words, while you are busy + * adding file descriptors to your sets, you must calculate the maximum + * integer value of all of them, then increment this value by one, and + * then pass this as nfds to select(). + * @param readfds [in] the file descriptor set on all message queues. + * @param writefds [in] ignored in this implementation. + * @param errorfds [in] ignored in this implementation. + * @param timeout [in] Only timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +int select(int nfds, fd_set *restrict readfds, + fd_set *restrict writefds, fd_set *restrict errorfds, + struct timeval *restrict timeout); + +/** @} */ + +/* this function is needed for test framework which needs to clean up memory when teardown */ +void _mq_teardown(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/pthread.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/pthread.h new file mode 100755 index 0000000000000..f64242e8dc683 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/pthread.h @@ -0,0 +1,287 @@ +#ifndef QURT_PTHREAD_H +#define QURT_PTHREAD_H + +/*========================================================================== + * FILE: pthread.h + * + * SERVICES: POSIX pthread API interface + * + * DESCRIPTION: POSIX pthread API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013,2016,2023 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + *========================================================================== + * + * EDIT HISTORY FOR MODULE + * + * This section contains comments describing changes made to the module. + * Notice that changes are listed in reverse chronological order. + * + * + * + * when who what, where, why + * -------- --- ------------------------------------------------------- + * 10/13/08 cz Initial version. + *==========================================================================*/ + +#include +#include "sys/sched.h" /* For struct sched_param */ +#include "sys/errno.h" /* error values */ +#include +#include +#include +#include +#include +#include "pthread_types.h" +#ifdef __cplusplus +extern "C" { +#endif + +/* the range of the set supported by the kernel data type used to represent CPU sets. */ +#define CONFIG_NR_CPUS QURT_THREAD_CFG_BITMASK_ALL + +#define UNIMPLEMENTED(FUNC, RETURNTYPE, ARGS) static inline RETURNTYPE FUNC ARGS { qurt_printf("Unimplemented: %s... 
exiting\n", __FUNCTION__); exit(1); } + +/** @brief Magic (non-portable) value for a stack's address to enable usage + of auto-stack feature (if available) */ +#define PTHREAD_AUTO_STACK_MAGIC_ADDR_NP ((void *)0xFFF) + +/** \details + * This provides POSIX thread API. + * + */ + +/** \defgroup pthread POSIX pthread API */ +/** \ingroup pthread */ +/** @{ */ + +/** Compare Two Threads. + * Please refer to POSIX standard for details. + */ +static inline int pthread_equal(pthread_t t1, pthread_t t2) +{ + return (t1 == t2) ? 1 : 0; +} + +/** Create Thread. + * Please refer to POSIX standard for details. + */ +int pthread_create(pthread_t * tid, const pthread_attr_t * attr, void *(*start)(void *), void *arg); + +/** Terminate Calling Thread. + * Please refer to POSIX standard for details. + */ +void pthread_exit(void *value_ptr); + +/** Wait for thread termination. + * Please refer to POSIX standard for details. + * @param thread [in] the thread to be joined + * @param value_ptr [out] the pointer of the exit status + */ +int pthread_join(pthread_t thread, void **value_ptr); + +/** Detach a joinable thread. + * Please refer to POSIX standard for details. + * @param id [in] id of the tread the thread to be detached. + */ +int pthread_detach(pthread_t id); + +/** Dynamic package initialisation + * Please refer to POSIX standard for details. + */ +int pthread_once(pthread_once_t *once_control, void (*init_routine)(void)); + +pthread_t pthread_self(void); +int pthread_cancel(pthread_t thread); +static inline void pthread_yield(void) +{ + return; +} + +int pthread_kill(pthread_t thread, int sig); + +/** + * @brief Return name of thread + * @warning Donot call this in the error handling path as it may cause deadlock + * due to underlying OS calls + * @param thread [in] thread Thread whose name is to be retrieved + * @param name [out] name Buffer used to return thread name + * @param len [in] len Number of bytes available in name + * @return 0 on success, ESRCH, ERANGE on failure + */ +extern int pthread_getname_np (pthread_t thread, char * name, size_t len); + +int pthread_getschedparam(pthread_t thread, int *restrict policy, struct sched_param *restrict param); +int pthread_setschedparam(pthread_t thread, int policy, const struct sched_param *param); +int pthread_setschedprio(pthread_t thread, int prio); +int pthread_setcancelstate(int state, int *oldstate); +int pthread_setcanceltype(int type, int *oldtype); + +/* Attribute functions */ +int pthread_attr_init(pthread_attr_t *attr); +int pthread_attr_destroy(pthread_attr_t *attr); +int pthread_attr_setschedparam(pthread_attr_t *restrict attr, const sched_param *restrict param); +int pthread_attr_getschedparam(const pthread_attr_t *restrict attr, sched_param *restrict param); +int pthread_attr_setstacksize(pthread_attr_t *attr, size_t stacksize); +int pthread_attr_getstacksize(const pthread_attr_t *attr, size_t *stacksize); +int pthread_attr_setstackaddr(pthread_attr_t *attr, void * stackaddr); +int pthread_attr_getstackaddr(const pthread_attr_t *attr, void ** stackaddr); +int pthread_attr_getdetachstate(const pthread_attr_t *attr, int *detachstate); +int pthread_attr_setdetachstate(pthread_attr_t *attr, int detachstate); +int pthread_attr_setstack(pthread_attr_t *attr, void *stackaddr, size_t stacksize); +int pthread_attr_getstack(const pthread_attr_t *attr, void **stackaddr, size_t *stacksize); +int pthread_attr_setscope(pthread_attr_t *attr, int scope); +int pthread_attr_getscope(const pthread_attr_t *attr, int *scope); +int 
pthread_attr_setinheritsched(pthread_attr_t *attr, int inheritsched); +int pthread_attr_getinheritsched(const pthread_attr_t *attr, int *inheritsched); +int pthread_attr_getguardsize(const pthread_attr_t * attr, size_t * guardsize); +int pthread_attr_setautostack(pthread_attr_t *attr); +int pthread_attr_setbuspriority(pthread_attr_t *attr, unsigned short bus_priority); + +/* Qualcomm additions to pthread get/set attribute functions */ +int pthread_attr_setthreadname(pthread_attr_t *attr, const char * name); +int pthread_attr_getthreadname(const pthread_attr_t *attr, char * name, int size); +int pthread_attr_settimetestid(pthread_attr_t *attr, unsigned int tid); +int pthread_attr_gettimetestid(const pthread_attr_t *attr, unsigned int* tid); + +/* Mutexes */ +int pthread_mutex_init(pthread_mutex_t *mutex, pthread_mutexattr_t *attr); +int pthread_mutex_lock(pthread_mutex_t *mutex); +int pthread_mutex_unlock(pthread_mutex_t *mutex); +int pthread_mutex_trylock(pthread_mutex_t *mutex); +int pthread_mutex_destroy(pthread_mutex_t *mutex); +int pthread_mutex_getprioceiling(const pthread_mutex_t *restrict mutex, int *restrict prioceiling); +int pthread_mutex_setprioceiling(pthread_mutex_t *restrict mutex, int prioceiling, int *restrict old_ceiling); + +/* For Mutex with type PTHREAD_MUTEX_NORMAL, Priority Inheritance is not + * supported even PTHREAD_PRIO_INHERIT is defined since QURT does not support + * this kind of Mutex */ +int pthread_mutexattr_init(pthread_mutexattr_t *attr); +int pthread_mutexattr_destroy(pthread_mutexattr_t *attr); +int pthread_mutexattr_gettype(const pthread_mutexattr_t *restrict, int *restrict); +int pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type); +int pthread_mutexattr_getprotocol(const pthread_mutexattr_t *restrict, int *restrict); +int pthread_mutexattr_setprotocol(pthread_mutexattr_t *attr, int protocol); +int pthread_mutexattr_getpshared(const pthread_mutexattr_t *restrict, int *restrict); +int pthread_mutexattr_setpshared(pthread_mutexattr_t *, int); +int pthread_mutexattr_getprioceiling(const pthread_mutexattr_t *restrict attr, int *restrict prioceiling); +int pthread_mutexattr_setprioceiling(pthread_mutexattr_t *attr, int prioceiling); + +/* Spinlocks */ +int pthread_spin_init(pthread_spinlock_t *lock, int pshared); +int pthread_spin_destroy(pthread_spinlock_t *lock); +int pthread_spin_lock(pthread_spinlock_t *lock); +int pthread_spin_trylock(pthread_spinlock_t *lock); +int pthread_spin_unlock(pthread_spinlock_t *lock); + +/* Condition variables */ +int pthread_condattr_init(pthread_condattr_t *attr); +int pthread_condattr_destroy(pthread_condattr_t *attr); +int pthread_condattr_setpshared(pthread_condattr_t *attr, int pshared); +int pthread_condattr_getpshared(const pthread_condattr_t *restrict attr, int *restrict pshared); +int pthread_condattr_setclock(pthread_condattr_t *attr, clockid_t clock); +int pthread_condattr_getclock(const pthread_condattr_t *restrict attr, clockid_t *restrict clock); +int pthread_cond_init(pthread_cond_t *cond, pthread_condattr_t *attr); +int pthread_cond_destroy(pthread_cond_t *cond); +int pthread_cond_signal(pthread_cond_t *cond); +int pthread_cond_broadcast(pthread_cond_t *cond); +int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex); +int pthread_cond_timedwait(pthread_cond_t * cond, pthread_mutex_t * mutex, const struct timespec *time); + +/* Barriers */ +int pthread_barrier_init(pthread_barrier_t *restrict barrier, const pthread_barrierattr_t *restrict attr, unsigned count); +int 
pthread_barrier_destroy(pthread_barrier_t *barrier);
+int pthread_barrier_wait(pthread_barrier_t *barrier);
+int pthread_barrierattr_init(pthread_barrierattr_t *attr);
+int pthread_barrierattr_destroy(pthread_barrierattr_t *attr);
+int pthread_barrierattr_getpshared(const pthread_barrierattr_t *restrict attr, int *restrict pshared);
+
+
+/*Read-Write locks*/
+int pthread_rwlock_init(pthread_rwlock_t *, const pthread_rwlockattr_t *);
+int pthread_rwlock_destroy(pthread_rwlock_t *);
+int pthread_rwlockattr_init(pthread_rwlockattr_t *);
+int pthread_rwlockattr_destroy(pthread_rwlockattr_t *);
+int pthread_rwlockattr_getpshared(const pthread_rwlockattr_t *, int *);
+int pthread_rwlockattr_setpshared(pthread_rwlockattr_t *, int);
+int pthread_rwlock_rdlock(pthread_rwlock_t *);
+int pthread_rwlock_tryrdlock(pthread_rwlock_t *);
+int pthread_rwlock_wrlock(pthread_rwlock_t *);
+int pthread_rwlock_trywrlock(pthread_rwlock_t *);
+int pthread_rwlock_unlock(pthread_rwlock_t *);
+
+
+/** please refer to POSIX standard document
+ */
+int pthread_barrierattr_setpshared(pthread_barrierattr_t *attr, int pshared);
+
+/** set CPU affinity attribute in thread attributes object.
+
+ * @param attr [in] pthread attributes
+ * @param cpusetsize [in] The argument cpusetsize is the length (in bytes)
+                     of the buffer pointed to by cpuset. Typically,
+                     this argument would be specified as
+                     sizeof(cpu_set_t).
+ * @param cpuset [in] This data set is a bitset where each bit represents
+                  a CPU (hw thread). How the system's CPUs are mapped
+                  to bits in the bitset is system dependent.
+                  For QURT kernel, bit 0 corresponds to hw thread 0,
+                  and so on. If the corresponding bit is set to 1, then
+                  the software thread is eligible to run on that hw
+                  thread. 0x3f means it can run on any hw thread; 0x0
+                  also means it can run on any hw thread.
+   @return On success, this function returns 0; on error, it returns a
+           non-zero error number.
+           EINVAL - cpuset specified a CPU that was outside the set supported
+                    by the kernel. (The kernel configuration option
+                    CONFIG_NR_CPUS defines the range of the set supported by
+                    the kernel data type used to represent CPU sets.)
+ * @note This function is a non-standard GNU extension; hence the suffix "_np"
+   (non-portable) in the name.
+ */
+int pthread_attr_setaffinity_np(pthread_attr_t *attr, size_t cpusetsize, const cpu_set_t *cpuset);
+
+/** get CPU affinity attribute in thread attributes object.
+ * @param attr [in] pthread attributes
+ * @param cpusetsize [in] The argument cpusetsize is the length (in bytes)
+                     of the buffer pointed to by cpuset. Typically,
+                     this argument would be specified as
+                     sizeof(cpu_set_t).
+ * @param cpuset [out] This data set is a bitset where each bit represents
+                  a CPU (hw thread). How the system's CPUs are mapped
+                  to bits in the bitset is system dependent.
+                  For QURT kernel, bit 0 corresponds to hw thread 0,
+                  and so on. If the corresponding bit is set to 1, then
+                  the software thread is eligible to run on that hw
+                  thread. 0x3f means it can run on any hw thread; 0x0
+                  also means it can run on any hw thread.
+   @return On success, this function returns 0; on error, it returns a
+           non-zero error number.
+           EINVAL - cpusetsize is smaller than the size of the affinity mask
+                    used by the kernel.
+ * @note This function is a non-standard GNU extension; hence the suffix "_np"
+   (non-portable) in the name.
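+ *
+ * Usage sketch for both affinity calls (editor's addition; the
+ * two-hw-thread mask is purely illustrative):
+ * @code
+ *   pthread_attr_t attr;
+ *   cpu_set_t mask = 0x3;          // eligible on hw threads 0 and 1
+ *   pthread_attr_init(&attr);
+ *   pthread_attr_setaffinity_np(&attr, sizeof(mask), &mask);
+ *   pthread_attr_getaffinity_np(&attr, sizeof(mask), &mask);
+ * @endcode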
+ */
+int pthread_attr_getaffinity_np(pthread_attr_t *attr, size_t cpusetsize, cpu_set_t *cpuset);
+
+/* TLS */
+int pthread_key_create(pthread_key_t *key, void (*destructor)(void*));
+int pthread_key_delete(pthread_key_t key);
+int pthread_setspecific(pthread_key_t key, const void *value);
+void *pthread_getspecific(pthread_key_t key);
+int pthread_getattr_np(pthread_t thread, pthread_attr_t * restrict attr);
+
+/** @} */
+
+/* Non-pthread code calls this function to create a pthread TCB w/o creating an actual thread */
+int pthread_fake(pthread_t * restrict thread, const pthread_attr_t * restrict attr);
+int pthread_fake_destroy(pthread_t thread);
+
+//amitkulk: move these to unistd.h after we move that header within qurt
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+void exit(int status);
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* QURT_PTHREAD_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/pthread_types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/pthread_types.h
new file mode 100755
index 0000000000000..51c3b9dbca243
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/pthread_types.h
@@ -0,0 +1,193 @@
+#ifndef _PTHREAD_TYPES_H_
+#define _PTHREAD_TYPES_H_
+
+/*==========================================================================
+ * FILE: pthread_types.h
+ *
+ * SERVICES: types used in POSIX API interface
+ *
+ * DESCRIPTION: POSIX API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2016, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __GNUC__
+#define restrict __restrict__
+#else
+#define restrict
+#endif
+
+#define _SSIZE_T
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+#define PTHREAD_MAX_THREADS 512U
+
+#define PTHREAD_NAME_LEN 16
+#define PTHREAD_MIN_STACKSIZE 512 //4096
+#define PTHREAD_MAX_STACKSIZE 1048576
+#define PTHREAD_DEFAULT_STACKSIZE 16384
+
+#define PTHREAD_STACK_MIN (4096U*2U)
+#define PTHREAD_MIN_PRIORITY 0U
+#define PTHREAD_MAX_PRIORITY 255U
+#define PTHREAD_DEFAULT_PRIORITY 1
+
+/* Mutex initialization status */
+#define PTHREAD_MUTEX_ATTR_UNINITIALIZED 0
+#define PTHREAD_MUTEX_ATTR_INITIALIZED 1
+
+/* Conditional attributes initialization status */
+#define PTHREAD_COND_ATTR_UNINITIALIZED 0
+#define PTHREAD_COND_ATTR_INITIALIZED 1
+
+#define PTHREAD_DEFAULT_NAME "Anonymous"
+
+#define PTHREAD_MUTEX_INITIALIZER ((pthread_mutex_t) 0xFFFFFFFFU)
+
+#define PTHREAD_COND_INITIALIZER ((pthread_cond_t) 0xFFFFFFFFU)
+
+/* mutex and cond_var shared */
+#define PTHREAD_PROCESS_PRIVATE 0
+#define PTHREAD_PROCESS_SHARED 1
+
+/* mutex type */
+#define PTHREAD_MUTEX_ERRORCHECK 0
+#define PTHREAD_MUTEX_NORMAL 1
+#define PTHREAD_MUTEX_RECURSIVE 2
+#define PTHREAD_MUTEX_DEFAULT 3
+
+/* mutex protocol */
+#define PTHREAD_PRIO_NONE 0
+#define PTHREAD_PRIO_INHERIT 1
+#define PTHREAD_PRIO_PROTECT 2
+
+#define PTHREAD_SPINLOCK_UNLOCKED 0
+#define PTHREAD_SPINLOCK_LOCKED 1
+
+#define PTHREAD_ONCE_INIT (0)
+
+#define PTHREAD_MUTEX_OPAQUE //ToDo: amitkulk: debug
+
+typedef signed int ssize_t;
+
+/* detachstate of a pthread */
+#define PTHREAD_CREATE_JOINABLE 1
+#define PTHREAD_CREATE_DETACHED 0
+
+/* contention scope */
+#define PTHREAD_SCOPE_PROCESS 1
+#define PTHREAD_SCOPE_SYSTEM 0
+
+/* scheduler */
+#define PTHREAD_INHERIT_SCHED 1
+#define PTHREAD_EXPLICIT_SCHED 0
+
+/*
+ * Types and structure definitions
+ *
+ */
+typedef unsigned int cpu_set_t;
+
+typedef unsigned int pthread_t;
+
+typedef struct pthread_attr_t
+{
+    void *stackaddr;
+    int internal_stack; /* this flag==1 means the stack needs to be freed by posix */
+    size_t stacksize;
+    int priority;
+    unsigned short timetest_id;
+    /* This flag indicates whether the thread will be an autostack thread */
+    unsigned short autostack:1;
+    /* This flag indicates the thread's bus_priority high/low
+       bus_priority = 0 -- Bus_priority is low
+       bus_priority = 1 -- Bus_priority is high
+       bus_priority = 3 -- Bus_priority is default (takes the default set for the process)
+    */
+    unsigned short bus_priority:2;
+    unsigned short reserved:13;
+    cpu_set_t cpumask;
+    char name[PTHREAD_NAME_LEN];
+    /* This flag indicates whether pthread lib should create thread contexts for other OSALs */
+    /* This is used internally by POSIX and not available for general usage */
+    int ext_context;
+    int detachstate;
+} pthread_attr_t;
+
+//mutex attr
+typedef struct pthread_mutexattr_t pthread_mutexattr_t;
+struct pthread_mutexattr_t
+{
+    int is_initialized;
+    int type;
+    int pshared;
+    int protocol;
+};
+
+typedef unsigned int pthread_mutex_t;
+
+typedef unsigned int pthread_spinlock_t;
+
+typedef struct pthread_condattr_t
+{
+    int is_initialized;
+    int pshared;
+    clockid_t clock_id;
+} pthread_condattr_t;
+
+typedef unsigned int pthread_cond_t;
+
+typedef struct pthread_barrierattr_t
+{
+    int is_initialized;
+    int pshared;
+} pthread_barrierattr_t;
+
+typedef unsigned int pthread_barrier_t;
+
+typedef int pthread_key_t;
+
+typedef int pthread_once_t;
+
+
+/*Read-Write locks*/
+#define PTW32_RWLOCK_MAGIC 0xfacade2
+#define PTHREAD_RWLOCK_INITIALIZER ((pthread_rwlock_t)(size_t) -1)
+
+struct pthread_rwlockattr_t_
+{
+    int pshared;
+};
+
+struct pthread_rwlock_t_
+{
+    pthread_mutex_t mtxExclusiveAccess;
+    pthread_mutex_t mtxSharedAccessCompleted;
+    pthread_cond_t cndSharedAccessCompleted;
+    int nSharedAccessCount;
+    int nExclusiveAccessCount;
+    int nCompletedSharedAccessCount;
+    int nMagic;
+};
+
+typedef struct pthread_rwlock_t_ * pthread_rwlock_t;
+typedef struct pthread_rwlockattr_t_ * pthread_rwlockattr_t;
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _PTHREAD_TYPES_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sched.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sched.h
new file mode 100755
index 0000000000000..faf3365be9f82
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sched.h
@@ -0,0 +1,21 @@
+/*=============================================================================
+
+                sched.h
+
+GENERAL DESCRIPTION
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved.
+=============================================================================*/
+#ifndef __SCHED_H__
+#define __SCHED_H__
+
+#include "sys/sched.h"
+
+#endif //__SCHED_H__
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/semaphore.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/semaphore.h
new file mode 100755
index 0000000000000..d9145b295ae62
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/semaphore.h
@@ -0,0 +1,114 @@
+#ifndef SEMAPHORE_H
+#define SEMAPHORE_H
+
+/*==========================================================================
+ * FILE: semaphore.h
+ *
+ * SERVICES: POSIX semaphore API interface
+ *
+ * DESCRIPTION: POSIX semaphore API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+#include <sys/types.h> // Get all C sys types - includes POSIX specific
+#include "sys/errno.h" // error values
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                    TYPEDEFS
+=============================================================================*/
+/** User facing semaphore container with opaque pointer to implementation */
+typedef struct
+{
+    unsigned int *opaque;
+} sem_t;
+#define _SEM_T
+
+/*=============================================================================
+                    CONSTANTS AND MACROS
+=============================================================================*/
+/* constant definitions */
+#define SEM_FAILED ((sem_t*) 0)
+
+/* @todo siqbal Should we put such configuration items in a common place
+   instead of this user-facing header? */
+#define SEM_VALUE_MAX ((unsigned int) 30) // If need be increase this
+
+/*=============================================================================
+                    FUNCTIONS
+=============================================================================*/
+
+/** \details
+ * POSIX standard comes with two kinds of semaphores: named and unnamed
+ * semaphores.
+ *
+ * This implementation of POSIX kernel API provides both unnamed and named
+ * semaphores.
+ *
+ *
+ * sem_timedwait() is not provided.
+ */
+
+/** \defgroup semaphore POSIX Semaphore API */
+
+/** \ingroup semaphore */
+/** @{ */
+
+/** Initialize an unnamed semaphore.
+ * Please refer to POSIX standard for details.
+ * @param pshared [in] This implementation does not support non-zero value,
+ * i.e., semaphore cannot be shared between processes in this implementation.
+ */
+int sem_init(sem_t *sem, int pshared, unsigned int value);
+
+/** Lock a semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_wait(sem_t *sem);
+
+/** Try to lock a semaphore without blocking.
+ * Please refer to POSIX standard for details.
+ */
+int sem_trywait(sem_t *sem);
+
+/** Unlock a semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_post(sem_t *sem);
+
+/** Get the value of a semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_getvalue(sem_t *sem, int *value);
+
+/** Destroy an unnamed semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_destroy(sem_t *sem);
+
+/** Creates and initializes a named semaphore.
+ * Please refer to POSIX standard for details.
+ */
+sem_t * sem_open(const char* name , int oflag , ...);
+
+/** Closes a semaphore.
+ * Please refer to POSIX standard for details.
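+ *
+ * Example of the unnamed-semaphore lifecycle described above (editor's
+ * sketch, not vendor text; pshared must be 0 in this implementation):
+ * @code
+ *   sem_t s;
+ *   if (sem_init(&s, 0, 1U) == 0) {
+ *       sem_wait(&s);
+ *       // ...critical section...
+ *       sem_post(&s);
+ *       sem_destroy(&s);
+ *   }
+ * @endcode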
+ */
+int sem_close(sem_t *sem);
+
+/** Unlinks a named semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_unlink(const char *name);
+/** @} */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SEMAPHORE_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/signal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/signal.h
new file mode 100755
index 0000000000000..35cb1f1a9a319
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/signal.h
@@ -0,0 +1,201 @@
+#ifndef _SIGNAL_H_
+#define _SIGNAL_H_
+
+/*==========================================================================
+ * FILE: signal.h
+ *
+ * SERVICES: POSIX Signal API interface
+ *
+ * DESCRIPTION: POSIX Signal API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016, 2023 Qualcomm Technologies, Inc.
+ * All Rights Reserved.
+ * Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+ *==========================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* POSIX signal bits */
+
+#define POSIX_MSG 7 /* POSIX msg type used in Qube API */
+#define POSIX_NOTIF 8 /* POSIX msg type used in Qube API */
+#define SIGKILL 9 /* kill (cannot be caught or ignored) */
+
+#define SIGRTMIN 10
+#define SIGRTMAX 32
+
+/* Notification Types. */
+/* No asynchronous notification is delivered when the event of interest occurs. */
+#define SIGEV_NONE 0
+/* The signal specified in sigev_signo shall be generated for the process when
+   the event of interest occurs. */
+#define SIGEV_SIGNAL 1
+/* A notification function is called to perform notification. */
+#define SIGEV_THREAD 2
+#define SA_SIGINFO 1
+
+/*
+ * Flags for sigprocmask:
+ */
+#define SIG_BLOCK 1 /* block specified signal set */
+#define SIG_UNBLOCK 2 /* unblock specified signal set */
+#define SIG_SETMASK 3 /* set specified signal set */
+
+typedef unsigned long int sigset_t;
+
+union sigval
+{
+    int sival_int; /* Integer signal value. */
+    void *sival_ptr; /* Pointer signal value. */
+};
+
+typedef struct sigevent sigevent;
+struct sigevent
+{
+    int sigev_notify; /* Notification type. */
+    int sigev_signo; /* Signal number. */
+    union sigval sigev_value; /* Signal value. */
+    void (*sigev_notify_function)(union sigval); /* Notification function. */
+    pthread_attr_t *sigev_notify_attributes;
+};
+
+typedef struct siginfo_t siginfo_t;
+struct siginfo_t
+{
+    int si_signo;
+    int si_code;
+    union sigval si_value;
+/*  int si_errno;
+    pid_t si_pid;
+    uid_t si_uid;
+    void *si_addr;
+    int si_status;
+    long si_band;*/
+};
+struct sigaction
+{
+    void (*sa_handler)(int);
+    sigset_t sa_mask;
+    int sa_flags;
+    void (*sa_sigaction)(int, siginfo_t *, void *);
+};
+
+/* Signal functions */
+
+/** \details
+ * This provides POSIX Signal API. Please note that this
+ * implementation does not fully comply with POSIX standard.
+ *
+ * In POSIX standard, Signal can be used as 'interrupt', which means
+ * an incoming signal will interrupt a running thread. After the
+ * registered signal handler is executed, the thread will resume.
+ * This behavior cannot be implemented w/o modifying the L4 or QuRT kernel.
+ * On the other hand, applications need to be carefully written to avoid
+ * problems caused by 'interrupting' signals.
+ * + * Therefore, in this implementation of POSIX signal, thread will + * only receive signals when it explicitly waits for signals, i.e., when + * the thread calls either sigwait() or sigsuspend(). + * + * Therefore, pthread_sigmask(), which set or get signal mask for a thread, + * is not supported, since the signal mask will be set by sigwait() and + * sigsuspend(). + * + * Since this implementation of POSIX kernel API is a subset of PSE51, + * only threads can send and receive signals. The functions related to + * signal operations with processes, such as kill(), sigqueue(), + * sigprocmask(), are not provided. + * + * Queued signal is not supported. + * + * Applications will use signals from SIGRTMIN to SIGRTMAX. + * + * SIGEV_SIGNAL and SIGEV_THREAD are supported. SIGEV_NONE is not + * supported. + * + */ + +/** \defgroup signal POSIX Signal API */ +/** \ingroup signal */ +/** @{ */ + +/** Wait for signals. This implementation does not support queued signals. + * + * Please refer to POSIX standard for details. + */ +int sigwait(const sigset_t *restrict set, int *restrict sig); + +/** Examine and Change Signal Action. + * Please refer to POSIX standard for details. + * + * @param act [in] A pointer to the sigaction structure that describes the + * action to be taken for the signal. Can be NULL. + * The following flags for sa_flags field in struct sigaction are not + * supported: SA_NOCLDSTOP, SA_ONSTACK, SA_RESETHAND, SA_RESTART, + * SA_NOCLDWAIT and SA_NODEFER. Only flag SA_SIGINFO is supported. + * + * @note Define sigaction as macro to avoid a warning when included from + * C++ code - it's causing a "sigaction(...) hides constructor for + * 'struct sigaction'" warning. + */ +/*lint -esym(123,sigaction) Suppress "macro used with no arguments" */ +#define sigaction(sig,act,oact) _sigaction((sig),(act),(oact)) + +/** Wait for signals. + * Please refer to POSIX standard for details. + */ +int sigsuspend(const sigset_t *sigmask); + +/** Add Signal to Signal Set. + * Please refer to POSIX standard for details. + */ +int sigaddset(sigset_t *set, int signo); + +/** Delete Signal from Signal Set. + * Please refer to POSIX standard for details. + */ +int sigdelset(sigset_t *set, int signo); + +/** Initialize and Empty Signal Set. + * Please refer to POSIX standard for details. + */ +int sigemptyset(sigset_t *set); + +/** Initialize and Fill Signal Set. + * Please refer to POSIX standard for details. + */ +int sigfillset(sigset_t *set); + +/** Test for Signal in Signal Set. + * Please refer to POSIX standard for details. + */ +int sigismember(const sigset_t *set, int signo); + +/** @} */ + +/* this is not a public api function */ +int _sigaction(int sig, const struct sigaction *act, struct sigaction *oact); + +/* have to move #include here to solve circular include problems between time.h and signal.h */ +#include + +/** Wait for the time interval specified in the timespec structure referenced + * by timeout. This implementation does not support queued signals. + * For struct siginfo_t, si_code and si_value are ignored in this implementation. + * + * Please refer to POSIX standard for details. 
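+ *
+ * Example of the explicit-wait model described above (editor's sketch,
+ * not vendor text):
+ * @code
+ *   sigset_t set;
+ *   int sig;
+ *   sigemptyset(&set);
+ *   sigaddset(&set, SIGRTMIN);
+ *   if (sigwait(&set, &sig) == 0) {
+ *       // sig now holds the delivered signal number
+ *   }
+ * @endcode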
+ */
+int sigtimedwait(const sigset_t *restrict set, siginfo_t *restrict info,
+                 const struct timespec *restrict timeout);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SIGNAL_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sys/errno.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sys/errno.h
new file mode 100755
index 0000000000000..b9edf57bab6c3
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sys/errno.h
@@ -0,0 +1,20 @@
+#ifndef _SYS_ERRNO_H_
+#define _SYS_ERRNO_H_
+
+/*==========================================================================
+ * FILE: errno.h
+ *
+ * SERVICES: POSIX errno header file
+ *
+ * DESCRIPTION: POSIX errno based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#include <errno.h>
+#ifndef EOK
+#define EOK 0
+#endif
+
+#endif /* _SYS_ERRNO_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sys/sched.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sys/sched.h
new file mode 100755
index 0000000000000..2acc34d821725
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sys/sched.h
@@ -0,0 +1,67 @@
+#ifndef _POSIX_SCHED_H_
+#define _POSIX_SCHED_H_
+
+/*==========================================================================
+ * FILE: sched.h
+ *
+ * SERVICES: POSIX Thread sched API interface
+ *
+ * DESCRIPTION: POSIX Thread sched API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+
+ *==========================================================================*/
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SCHED_FIFO 0 /* First in, first out (FIFO) scheduling policy. */
+#define SCHED_RR 1 /* Round robin scheduling policy. */
+#define SCHED_SPORADIC 2 /* Sporadic server scheduling policy. */
+#define SCHED_OTHER 3 /* Another scheduling policy. */
+
+typedef struct sched_param sched_param;
+struct sched_param
+{
+    void *unimplemented;
+    int sched_priority;
+};
+
+/** \details
+ * This provides POSIX sched API.
+ */
+
+/** \defgroup sched POSIX sched API */
+/** \ingroup sched */
+/** @{ */
+
+/** Relinquish the CPU.
+ * Please refer to POSIX standard for details.
+ */
+static inline int sched_yield(void)
+{
+    return 0;
+}
+
+/** Get the maximum priority.
+ * Please refer to POSIX standard for details.
+ * @param policy [in] SCHED_FIFO is the only valid input for this implementation.
+ */
+int sched_get_priority_max(int policy);
+
+/** Get the minimum priority.
+ * Please refer to POSIX standard for details.
+ * @param policy [in] SCHED_FIFO is the only valid input for this implementation.
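+ *
+ * Example of querying the priority range (editor's sketch, not vendor
+ * text):
+ * @code
+ *   int lo = sched_get_priority_min(SCHED_FIFO);
+ *   int hi = sched_get_priority_max(SCHED_FIFO);  // expect hi >= lo
+ * @endcode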
+ */
+int sched_get_priority_min(int policy);
+
+/** @} */
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _POSIX_SCHED_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sys/types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sys/types.h
new file mode 100755
index 0000000000000..700026f9f9e4e
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/sys/types.h
@@ -0,0 +1,35 @@
+#ifndef _SYS_TYPES_H_
+#define _SYS_TYPES_H_
+
+/*==========================================================================
+ * FILE: types.h
+ *
+ * SERVICES: types used in POSIX API interface
+ *
+ * DESCRIPTION: POSIX API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#if !defined( _PID_T ) || !defined( __pid_t_defined )
+/* POSIX defines pid_t as signed 32-bit type. Hexagon toolchain's header
+   defines it as unsigned 32-bit type citing conflict with QuRT POSIX
+   compatibility layer. If any such conflicts exist, we should fix them.
+   pid_t is being defined *BEFORE* inclusion of generic/sys/types.h
+   *INTENTIONALLY* to fix this */
+typedef int pid_t;
+#define _PID_T
+#define __pid_t_defined
+#endif
+#include
+#include
+#include
+#include
+
+#ifndef __DEFINED_off_t
+typedef long off_t;
+#define __DEFINED_off_t
+#endif
+
+#endif /* _SYS_TYPES_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/time.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/time.h
new file mode 100755
index 0000000000000..13aeb1ea9920d
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/posix/time.h
@@ -0,0 +1,142 @@
+#ifndef _POSIX_TIME_H_
+#define _POSIX_TIME_H_
+
+/*==========================================================================
+ * FILE: time.h
+ *
+ * SERVICES: POSIX Timer API interface
+ *
+ * DESCRIPTION: POSIX Timer API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+ *==========================================================================*/
+
+
+#include
+
+typedef int clockid_t; /* ignored */
+#define _CLOCKID_T
+#define _PROVIDE_POSIX_TIME_DECLS 1
+#include <time.h>
+/* @todo anandj sys/time.h has definition for struct timeval but is not
+   included by generic/time.h */
+#include <sys/time.h>
+
+#define CLOCK_FREQ_NOT_DEFINED -1
+/* Frequency of Sclk used */
+#define TIME_CONV_SCLK_FREQ 19200000
+
+#define RES_CONV_FACTOR1 1
+#define RES_CONV_FACTOR2 1000000000
+
+#if !defined(CLOCK_REALTIME)
+# define CLOCK_REALTIME 0
+#endif
+
+#if !defined(CLOCK_MONOTONIC)
+# define CLOCK_MONOTONIC 1
+#endif
+
+#if !defined(CLOCK_THREAD_CPUTIME_ID)
+# define CLOCK_THREAD_CPUTIME_ID 2
+#endif
+
+#if !defined(CLOCK_PROCESS_CPUTIME_ID)
+# define CLOCK_PROCESS_CPUTIME_ID 3
+#endif
+
+#if !defined(CLOCK_MONOTONIC_RAW)
+# define CLOCK_MONOTONIC_RAW 4
+#endif
+
+#if !defined(CLOCK_REALTIME_COARSE)
+# define CLOCK_REALTIME_COARSE 5
+#endif
+
+#if !defined(CLOCK_MONOTONIC_COARSE)
+# define CLOCK_MONOTONIC_COARSE 6
+#endif
+
+#if !defined(CLOCK_BOOTTIME)
+# define CLOCK_BOOTTIME 7
+#endif
+
+struct itimerspec
+{
+    struct timespec it_interval; /* Timer period. */
+    struct timespec it_value; /* Timer expiration.
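+                                  A zero it_interval arms a one-shot
+                                  timer; a non-zero it_interval re-arms
+                                  the timer periodically (editor's note;
+                                  see the \details section below).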
*/ +}; + +/* have to move #include here to solve circular include problems between time.h and signal.h */ +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Timer functions */ + +/** \details + * POSIX timers can be either of two types: a one-shot type or a periodic + * type. + * + * A one-shot is an armed timer that is set to an expiration time relative + * to either a current time or an absolute time. The timer expires once and + * is disarmed. + * + * A periodic timer is armed with an initial expiration time and a repetition + * interval. Every time the interval timer + * expires, the timer is reloaded with the repetition interval. The timer + * is then rearmed. + */ + +/** \defgroup timer POSIX Timer API */ + +/** \ingroup timer */ +/** @{ */ + +/** Create a POSIX timer. + * Please refer to POSIX standard for details. + * @param clockid [in] ignored in this implementation + * @param evp [in] if non-NULL, points to a sigevent structure. This + * structure, allocated by the application, defines the asynchronous + * notification to occur when the timer expires. If the evp argument is + * NULL, the effect is as if the evp argument pointed to a sigevent + * structure with the sigev_notify member having the value SIGEV_SIGNAL, + * the sigev_signo having a default signal number (SIGALRM), and the + * sigev_value member having the value of the timer ID. + */ +int timer_create(clockid_t clockid, struct sigevent *restrict evp, + timer_t *restrict timerid); + +/** Delete a POSIX timer. + * Please refer to POSIX standard for details. + */ +int timer_delete(timer_t timerid); + +/** Get the time remaining on a POSIX timer. + * Please refer to POSIX standard for details. + */ +int timer_gettime(timer_t timerid, struct itimerspec *value); + + +/** Set the time remaining on a POSIX timer. + * Please refer to POSIX standard for details. + * @param flags [in] ignored in this implementation + */ +int timer_settime(timer_t timerid, int flags, + const struct itimerspec *restrict value, + struct itimerspec *restrict ovalue); +/** Obtain ID of a process CPU-time clock + * @param pid [in] Process ID + * @param clock_id [out] Clock ID + * @return Error values as per POSIX standard + */ +int clock_getcpuclockid (pid_t pid, clockid_t * clock_id); +/** @} */ + +#ifdef __cplusplus +} +#endif + +#endif /* _POSIX_TIME_H_ */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qube/qube.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qube/qube.h new file mode 100755 index 0000000000000..1e31e2deedb38 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qube/qube.h @@ -0,0 +1,51 @@ +#ifndef QUBE_H +#define QUBE_H +/*============================================================================= + + qube.h -- H E A D E R F I L E + +GENERAL DESCRIPTION + Prototypes of qpd API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ +=============================================================================*/ + + + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* Define Error codes as QuRT error codes preceed with QURT_ */ +#ifndef EOK +#define EOK QURT_EOK +#endif /* EOK */ +#ifndef EVAL +#define EVAL QURT_EVAL +#endif /* EVAL */ +#ifndef EMEM +#define EMEM QURT_EMEM +#endif /* EMEM */ +#ifndef EINVALID +#define EINVALID QURT_EINVALID +#endif /* EINVALID */ + + +/*============================================================================= + FUNCTION DECLARATIONS +=============================================================================*/ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QUBE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/atomic_ops.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/atomic_ops.h new file mode 100755 index 0000000000000..0a9a9f8ba7db5 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/atomic_ops.h @@ -0,0 +1,197 @@ +#ifndef ATOMIC_OPS_H +#define ATOMIC_OPS_H +/** + @file atomic_ops.h + + @brief Type definitions backwards compatible. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. +=============================================================================*/ + + +/* + * Australian Public Licence B (OZPLB) + * + * Version 1-0 + * + * Copyright (c) 2007, Open Kernel Labs, Inc. + * + * All rights reserved. + * + * Developed by: Embedded, Real-time and Operating Systems Program (ERTOS) + * National ICT Australia + * http://www.ertos.nicta.com.au + * + * Permission is granted by National ICT Australia, free of charge, to + * any person obtaining a copy of this software and any associated + * documentation files (the "Software") to deal with the Software without + * restriction, including (without limitation) the rights to use, copy, + * modify, adapt, merge, publish, distribute, communicate to the public, + * sublicense, and/or sell, lend or rent out copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject + * to the following conditions: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimers in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of National ICT Australia, nor the names of its + * contributors, may be used to endorse or promote products derived + * from this Software without specific prior written permission. + * + * EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT + * PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS", AND + * NATIONAL ICT AUSTRALIA AND ITS CONTRIBUTORS MAKE NO REPRESENTATIONS, + * WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS + * REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, + * THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF + * ERRORS, WHETHER OR NOT DISCOVERABLE. 
+ * + * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL + * NATIONAL ICT AUSTRALIA OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL + * THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION OF CONTRACT, + * NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER + * LIABILITY, INCLUDING (WITHOUT LIMITATION) LOSS OF PRODUCTION OR + * OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS + * OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR + * OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT, + * CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN + * CONNECTION WITH THIS LICENCE, THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS WITH THE SOFTWARE, EVEN IF NATIONAL ICT AUSTRALIA OR ITS + * CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS, + * DAMAGES OR OTHER LIABILITY. + * + * If applicable legislation implies representations, warranties, or + * conditions, or imposes obligations or liability on National ICT + * Australia or one of its contributors in respect of the Software that + * cannot be wholly or partly excluded, restricted or modified, the + * liability of National ICT Australia or the contributor is limited, to + * the full extent permitted by the applicable legislation, at its + * option, to: + * a. in the case of goods, any one or more of the following: + * i. the replacement of the goods or the supply of equivalent goods; + * ii. the repair of the goods; + * iii. the payment of the cost of replacing the goods or of acquiring + * equivalent goods; + * iv. the payment of the cost of having the goods repaired; or + * b. in the case of services: + * i. the supplying of the services again; or + * ii. the payment of the cost of having the services supplied again. + * + * The construction, validity and performance of this licence is governed + * by the laws in force in New South Wales, Australia. + */ + +/* + * Author: Malcolm Purvis + * Author: Carlos Dyonisio + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef unsigned int atomic_plain_word_t; + +/*-------------------------------------------------------------------------*/ + /* Atomic Ops API. */ + +/* + * IMPORTANT! + * If you plan to change the structure atomic_word_t, please add the new + * elements after value. For more information, read the comment in + * arch/arm/libs/atomic_ops/v5/src/arm_atomic_ops.spp:66 + */ + +typedef struct { + volatile atomic_plain_word_t value; +} atomic_word_t; + +#define ATOMIC_INIT(i) { (i) } + +static inline void +atomic_init(atomic_word_t *a, atomic_plain_word_t v) +{ + a->value = v; +} + +#if defined(ARCH_ARM) && defined(ARCH_VER) && (ARCH_VER < 6) && \ + (!defined(__ATOMIC_OPS_IN_KERNEL__) || defined(MACHINE_SMP)) + +/* + * If it is ARMv4/v5, the function declarations may change + * and are defined in the arch specific header file, + * as some of then cannot be declared static because of + * the assembler implementation. + */ + +#else + +/* Arithmetic operations. */ + +void atomic_sub(atomic_word_t *target, atomic_plain_word_t v); + +/* Architecture independent definitions. 
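+
+   Example use of the 32-bit atomic type (editor's sketch, not vendor
+   text):
+
+     atomic_word_t counter = ATOMIC_INIT(5);
+     atomic_sub(&counter, 1);                        // counter.value == 4
+     atomic_plain_word_t v = atomic_read(&counter);  // defined just below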
*/ + +static inline atomic_plain_word_t atomic_read(atomic_word_t *target) +{ + return target->value; +} + +typedef unsigned long long atomic64_plain_word_t; + +typedef struct { + volatile atomic64_plain_word_t value; +} atomic64_word_t; + +static inline void +atomic64_init(atomic64_word_t *a, atomic64_plain_word_t v) +{ + a->value = v; +} + +/********************* + Support 64-bit + *********************/ + +atomic64_plain_word_t atomic64_set(atomic64_word_t* target, + atomic64_plain_word_t value); + +void atomic64_xor(atomic64_word_t* target, + atomic64_plain_word_t mask); + +/*---------------------------------------------------------------------------*/ + +/* Architecture independent definitions. */ + +static inline atomic64_plain_word_t atomic64_read(atomic64_word_t *target) +{ + return target->value; +} + +#endif + + +/* Architecture dependent definitions. */ +#include + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* ATOMIC_OPS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/atomic_ops_plat.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/atomic_ops_plat.h new file mode 100755 index 0000000000000..b54b3ff83d978 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/atomic_ops_plat.h @@ -0,0 +1,86 @@ +#ifndef ATOMIC_OPS_PLAT_H +#define ATOMIC_OPS_PLAT_H +/** + @file atomic_ops_plat.h + + @brief Prototypes of atomic operations API backwards compatible. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. +=============================================================================*/ + + +#include + +#ifdef __cplusplus +extern "C" { +#endif +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define atomic_set(a,b) qurt_atomic_set((unsigned int *)(a),(unsigned int)(b)) +#define atomic_and(a,b) qurt_atomic_and((unsigned int *)(a),(unsigned int)(b)) +#define atomic_and_return(a,b) qurt_atomic_and_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_or(a,b) qurt_atomic_or((unsigned int *)(a),(unsigned int)(b)) +#define atomic_or_return(a,b) qurt_atomic_or_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_xor(a,b) qurt_atomic_xor((unsigned int *)(a),(unsigned int)(b)) +#define atomic_xor_return(a,b) qurt_atomic_xor_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_set_bit(a,b) qurt_atomic_set_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_clear_bit(a,b) qurt_atomic_clear_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_change_bit(a,b) qurt_atomic_change_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add(a,b) qurt_atomic_add((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add_return(a,b) qurt_atomic_add_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add_unless(a,b,c) qurt_atomic_add_unless((unsigned int *)(a),(unsigned int)(b),(unsigned int)(c)) +#define atomic_sub(a,b) qurt_atomic_sub((unsigned int *)(a),(unsigned int)(b)) +#define atomic_sub_return(a,b) qurt_atomic_sub_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_inc(a) qurt_atomic_inc((unsigned int *)(a)) +#define atomic_inc_return(a) qurt_atomic_inc_return((unsigned int *)(a)) +#define atomic_dec(a) qurt_atomic_dec((unsigned 
int *)(a)) +#define atomic_dec_return(a) qurt_atomic_dec_return((unsigned int *)(a)) +#define atomic_compare_and_set(a,b,c) qurt_atomic_compare_and_set((unsigned int *)(a),(unsigned int)(b),(unsigned int)(c)) +#define atomic_barrier qurt_atomic_barrier +#define atomic_barrier_write qurt_atomic_barrier_write +#define atomic_barrier_write_smp qurt_atomic_barrier_write_smp +#define atomic_barrier_read_smp qurt_atomic_barrier_read_smp +#define atomic_barrier_smp qurt_atomic_barrier_smp + +/*============================ + * 64 bits support + *============================ */ +#define atomic64_set(a,b) qurt_atomic64_set((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_and(a,b) qurt_atomic64_and((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_and_return(a,b) qurt_atomic64_and_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_or(a,b) qurt_atomic64_or((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_or_return(a,b) qurt_atomic64_or_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_xor(a,b) qurt_atomic64_xor((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_xor_return(a,b) qurt_atomic64_xor_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_set_bit(a,b) qurt_atomic64_set_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_clear_bit(a,b) qurt_atomic64_clear_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_change_bit(a,b) qurt_atomic64_change_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_add(a,b) qurt_atomic64_add((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_add_return(a,b) qurt_atomic64_add_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_sub(a,b) qurt_atomic64_sub((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_sub_return(a,b) qurt_atomic64_sub_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_inc(a) qurt_atomic64_inc((unsigned long long *)(a)) +#define atomic64_inc_return(a) qurt_atomic64_inc_return((unsigned long long *)(a)) +#define atomic64_dec(a) qurt_atomic64_dec((unsigned long long *)(a)) +#define atomic64_dec_return(a) qurt_atomic64_dec_return((unsigned long long *)(a)) +#define atomic64_compare_and_set(a,b,c) qurt_atomic64_compare_and_set((unsigned long long *)(a),(unsigned long long )(b),(unsigned long long )(c)) +#define atomic64_barrier qurt_atomic64_barrier +#define atomic64_barrier_write qurt_atomic64_barrier_write +#define atomic64_barrier_write_smp qurt_atomic64_barrier_write_smp +#define atomic64_barrier_read_smp qurt_atomic64_barrier_read_smp +#define atomic64_barrier_smp qurt_atomic64_barrier_smp + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* ATOMIC_OPS_PLAT_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt.h new file mode 100755 index 0000000000000..4d25c9b2b6243 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt.h @@ -0,0 +1,111 @@ +#ifndef QURT_H +#define QURT_H + +/** + @file qurt.h + @brief Contains kernel header files that provide kernel OS API functions, constants, and + definitions + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013,2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ +/*====================================================================== + * + * EDIT HISTORY FOR FILE + * + * This section contains comments describing changes made to the + * module. Notice that changes are listed in reverse chronological + * order. + * + * + * + * + * when who what, where, why + * ---------- --- ------------------------------------------------ + * 2011-02-25 op Add Header file + 2012-12-16 cm (Tech Pubs) Edited/added Doxygen comments and markup. + ======================================================================*/ + + +#ifdef __cplusplus +extern "C" { +#endif + +#include "qurt_consts.h" +#include "qurt_api_version.h" +#include "qurt_alloc.h" +#include "qurt_futex.h" +#include "qurt_mutex.h" +#include "qurt_pipe.h" +#include "qurt_printf.h" +#include "qurt_assert.h" +#include "qurt_thread.h" +#include "qurt_trace.h" +#include "qurt_cycles.h" +#include "qurt_profile.h" +#include "qurt_sem.h" +#include "qurt_cond.h" +#include "qurt_barrier.h" +#include "qurt_fastint.h" +#include "qurt_allsignal.h" +#include "qurt_anysignal.h" +#include "qurt_signal.h" +#include "qurt_rmutex.h" +#include "qurt_pimutex.h" +#include "qurt_signal2.h" +#include "qurt_rmutex2.h" +#include "qurt_pimutex2.h" +#include "qurt_int.h" +#include "qurt_lifo.h" +#include "qurt_power.h" +#include "qurt_event.h" +#include "qurt_pmu.h" +#include "qurt_stid.h" +//#include "qurt_version.h" +#include "qurt_tlb.h" +#include "qurt_vtlb.h" +#include "qurt_memory.h" +#include "qurt_qdi.h" +#include "qurt_sclk.h" +#include "qurt_space.h" +#include "qurt_process.h" +#include "qurt_timer.h" +#include "qurt_tls.h" +#include "qurt_thread_context.h" +#include "qurt_hvx.h" +#include "qurt_hmx.h" +#include "qurt_mailbox.h" +#include "qurt_island.h" +#include "qurt_qdi_proxy.h" +#include "qurt_l2cfg.h" +#include "qurt_mmap.h" +#include "qurt_isr.h" +#include "qurt_busywait.h" +#include "qurt_ecc.h" +#include "qurt_callback.h" +#include "qurt_error.h" +#include "qurt_except.h" +#include "qurt_mq.h" +#include "qurt_user_dma.h" +#include "qurt_fs_hub.h" +#include "qurt_os_services.h" + +#ifndef MAIN_ONLY +#define INCLUDE_ISLAND_CONTENTS +#endif +#ifndef ISLAND_ONLY +#define INCLUDE_MAIN_CONTENTS +#endif + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_alloc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_alloc.h new file mode 100755 index 0000000000000..da37a4c0a714e --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_alloc.h @@ -0,0 +1,145 @@ +#ifndef QURT_ALLOC_H +#define QURT_ALLOC_H + +/** + @file qurt_alloc.h + @brief Prototypes of kernel memory allocation API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +/*======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_malloc + Dynamically allocates the specified array on the QuRT system heap. + The return value is the address of the allocated memory area. 
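+
+ As an illustration, a minimal allocate/use/free sketch (the 256-byte
+ size is arbitrary; the caller owns the NULL check):
+
+ @code
+ char *buf = (char *)qurt_malloc(256U);
+ if (buf != NULL) {
+     buf[0] = 'x';     // use the area
+     qurt_free(buf);   // return it to the system heap
+ }
+ @endcode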
+ + @note1hang The allocated memory area is automatically initialized to zero. + + @param[in] size Size (in bytes) of the memory area. + + @return + Nonzero -- Pointer to the allocated memory area. \n + 0 -- Not enough memory in heap to allocate memory area. + + @dependencies + None. + + */ +/* ======================================================================*/ +void *qurt_malloc( unsigned int size); + +/*======================================================================*/ +/**@ingroup func_qurt_calloc + Dynamically allocates the specified array on the QuRT system heap. + The return value is the address of the allocated array. + + @note1hang The allocated memory area is automatically initialized to zero. + + @param[in] elsize Size (in bytes) of each array element. + @param[in] num Number of array elements. + + @return + Nonzero -- Pointer to allocated array.\n + Zero -- Not enough memory in heap to allocate array. + + @dependencies + None. + + */ + /* ======================================================================*/ +void *qurt_calloc(unsigned int elsize, unsigned int num); + +/*======================================================================*/ +/**@ingroup func_qurt_realloc + Reallocates memory on the heap. \n + Changes the size of a memory area that is already allocated on the QuRT system heap. + The reallocate memory operation is functionally similar to realloc. It accepts a pointer + to an existing memory area on the heap, and resizes the memory area to the specified size + while preserving the original contents of the memory area. + + @note1hang This function might change the address of the memory area. + If the value of ptr is NULL, this function is equivalent to + qurt_malloc(). + If the value of new_size is 0, it is equivalent to qurt_free(). + If the memory area is expanded, the added memory is not initialized. + + @param[in] *ptr Pointer to the address of the memory area. + @param[in] newsize Size (in bytes) of the reallocated memory area. + + @return + Nonzero -- Pointer to reallocated memory area. \n + 0 -- Not enough memory in heap to reallocate the memory area. + + @dependencies + None. + + */ + /* ======================================================================*/ +void *qurt_realloc(void *ptr, int newsize); + +/*======================================================================*/ +/**@ingroup func_qurt_free + Frees allocated memory from the heap.\n + Deallocates the specified memory from the QuRT system heap. + + @param[in] *ptr Pointer to the address of the memory to deallocate. + + @return + None. + + @dependencies + The memory item that the ptr value specifies must have been previously + allocated using one of the qurt_calloc(), + qurt_malloc(), or qurt_realloc() memory allocation functions. + Otherwise the behavior of QuRT is undefined. + + */ + /* ======================================================================*/ +void qurt_free( void *ptr); + + +void *qurt_memalign(unsigned int alignment, unsigned int size); + +/* +|| Macro to define a static heap for a QuRT program. +|| +|| Usage: +|| Declare at the top-level of any C source file that +|| is part of the build (and is guaranteed +|| to actually be pulled into the build). Place +|| it in the same function with main(): +|| +|| QURT_DECLARE_STATIC_HEAP(512000); +|| +|| The only argument is the size in bytes, and it is +|| rounded up to the nearest 64 bytes (size of an +|| L2 cache block). 
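+||
+|| For example, under the 64-byte rounding described above, a
+|| hypothetical request of 1000 bytes reserves 1024 bytes
+|| (16 L2 cache blocks):
+||
+||     QURT_DECLARE_STATIC_HEAP(1000);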
+|| +*/ + +#define QURT_DECLARE_STATIC_HEAP(sz) \ + static struct qurt_static_heap { \ + char space[(sz)] __attribute__((aligned(64))); \ + } static_heap[1]; \ + void * const override_heap_Base = &static_heap[0]; \ + void * const override_heap_Limit = &static_heap[1] + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ALLOC_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_allsignal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_allsignal.h new file mode 100755 index 0000000000000..5dc89e495130d --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_allsignal.h @@ -0,0 +1,176 @@ + +#ifndef QURT_ALLSIGNAL_H +#define QURT_ALLSIGNAL_H + +/** + @file qurt_allsignal.h + @brief Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup all_signal_types +@{ */ +/*===================================================================== + Typedefs + ======================================================================*/ + +/** +qurt_signal_t supersedes qurt_allsignal_t. This type definition was added for backwards compatibility. */ +typedef union { + /** @cond */ + unsigned long long int raw; + struct { + unsigned int waiting; /**< */ + unsigned int signals_in; /**< */ + unsigned int queue; /**< */ + unsigned int reserved; /**< */ + }X; + /** @endcond */ +} qurt_allsignal_t; +/** @} */ /* end_addtogroup all_signal_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_init + Initializes an all-signal object.\n + The all-signal object is initially cleared. + + @datatypes + #qurt_allsignal_t + + @param[out] signal Pointer to the all-signal object to initialize. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_allsignal_init(qurt_allsignal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_destroy + Destroys the specified all-signal object.\n + @note1hang All-signal objects must be destroyed when they are no longer in use. + Failure to do this causes resource leaks in the QuRT kernel. \n + @note1cont All-signal objects must not be destroyed while they are still in use. + If this occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_allsignal_destroy(qurt_allsignal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_get + Gets signal values from the all-signal object. + + Returns the current signal values of the specified all-signal object. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to access. 
+ + @return + Bitmask with current signal values. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_allsignal_get(qurt_allsignal_t *signal) +{ return signal->X.signals_in; } + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_wait + Waits on the all-signal object.\n + Suspends the current thread until all of the specified signals are set. + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 that it is not to be waited on. + + If a signal is set in an all-signal object, and a thread is waiting on the all-signal object for + that signal, the thread is awakened. If the awakened thread has higher priority than + the current thread, a context switch can occur. + + Unlike any-signals, all-signals do not need to explicitly clear any set signals in an all-signal + object before waiting on them again -- clearing is done automatically by the wait + operation. + + @note1hang At most, one thread can wait on an all-signal object at any given time. + Because signal clearing is done by the wait operation, no clear operation is + defined for all-signals. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to wait on. + @param[in] mask Signal mask value, which identifies the individual signals in the all-signal object + to wait on. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_allsignal_wait(qurt_allsignal_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_set + Set signals in the specified all-signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit + value of 1 indicates that a signal must be set, and 0 indicates not to set the signal. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to modify. + @param[in] mask Signal mask value identifying the individual signals to + set in the all-signal object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_allsignal_set(qurt_allsignal_t *signal, unsigned int mask); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ALLSIGNAL_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_anysignal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_anysignal.h new file mode 100755 index 0000000000000..9619e2de562b4 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_anysignal.h @@ -0,0 +1,225 @@ +#ifndef QURT_ANYSIGNAL_H +#define QURT_ANYSIGNAL_H +/** + @file qurt_anysignal.h + Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + +Copyright (c) 2021 Qualcomm Technologies, Inc. +All rights reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ ======================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*===================================================================== +Typedefs +======================================================================*/ + +/**@ingroup anysignals_types + qurt_signal_t supersedes qurt_anysignal_t. This type definition was added for backwards compatibility. */ +typedef qurt_signal_t qurt_anysignal_t; + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_init + Initializes an any-signal object.\n + The any-signal object is initially cleared. + + @datatypes + #qurt_anysignal_t + + @param[out] signal Pointer to the initialized any-signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline void qurt_anysignal_init(qurt_anysignal_t *signal) +{ + qurt_signal_init(signal); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_destroy + Destroys the specified any-signal object. + + @note1hang Any-signal objects must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Any-signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_anysignal_t + + @param[in] signal Pointer to the any-signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline void qurt_anysignal_destroy(qurt_anysignal_t *signal) +{ + qurt_signal_destroy(signal); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_wait + Wait on the any-signal object. \n + Suspends the current thread until any one of the specified signals is set. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 indicates not to wait on the signal. + If a signal is set in an any-signal object, and a thread is waiting on the any-signal object for + that signal, the thread is awakened. If the awakened thread has higher priority than + the current thread, a context switch can occur. + + @note1hang At most, one thread can wait on an any-signal object at any given time. + + @datatypes + #qurt_anysignal_t + + @param[in] signal Pointer to the any-signal object to wait on. + @param[in] mask Signal mask value, which specifies the individual signals in the any-signal + object to wait on. + + @return + Bitmask of current signal values. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline unsigned int qurt_anysignal_wait(qurt_anysignal_t *signal, unsigned int mask) +{ + return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_set + Sets signals in the specified any-signal object. \n + Signals are represented as bits 0 through 31 in the 32-bit mask value. 
A mask bit value of 1
+ indicates that a signal must be set, and 0 indicates not to set the signal.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to modify.
+ @param[in] mask Signal mask value identifying the individual signals to
+ set in the any-signal object.
+
+ @return
+ Bitmask of old signal values (before set).
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+unsigned int qurt_anysignal_set(qurt_anysignal_t *signal, unsigned int mask);
+
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_get
+ Gets signal values from the any-signal object.\n
+ Returns the current signal values of the specified any-signal object.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to access.
+
+ @return
+ A bitmask with the current signal values of the specified any-signal object.
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+static inline unsigned int qurt_anysignal_get(qurt_anysignal_t *signal)
+{
+ return qurt_signal_get(signal);
+}
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_clear
+ @xreflabel{sec:anysignal_clear}
+ Clears signals in the specified any-signal object.\n
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+ indicates that a signal must be cleared, and 0 indicates not to clear the signal.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to modify.
+ @param[in] mask Signal mask value identifying the individual signals to
+ clear in the any-signal object.
+
+ @return
+ Bitmask -- Old signal values (before clear).
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+unsigned int qurt_anysignal_clear(qurt_anysignal_t *signal, unsigned int mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_wait_timed
+ Waits on the any-signal object. \n
+ Suspends the current thread until any of the specified signals is set or the timeout expires.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+ indicates that a signal must be waited on, and 0 indicates not to wait on the signal.
+ If a signal is set in an any-signal object, and a thread is waiting on the any-signal object for
+ that signal, the thread is awakened. If the awakened thread has higher priority than
+ the current thread, a context switch can occur.
+
+ @note1hang At most, one thread can wait on an any-signal object at any given time.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to wait on.
+ @param[in] mask Signal mask value, which specifies the individual signals in the any-signal
+ object to wait on.
+ @param[out] signals Bitmask of current signal values.
+ @param[in] duration Interval (in microseconds); the duration value must be between #QURT_TIMER_MIN_DURATION and
+ #QURT_TIMER_MAX_DURATION.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_ETIMEDOUT -- Timeout. \n
+ #QURT_EINVALID -- Duration out of range.
+
+ @dependencies
+ None.
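+
+ A minimal usage sketch (assumes a qurt_anysignal_t sig initialized
+ with qurt_anysignal_init(); the 0x3 mask and 10 ms duration are
+ arbitrary):
+
+ @code
+ unsigned int fired = 0;
+ int rc = qurt_anysignal_wait_timed(&sig, 0x3U, &fired, 10000ULL);
+ if (rc == QURT_EOK) {
+     // at least one of signal bits 0..1 was set; values are in fired
+ }
+ @endcode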
+ */ +/* ======================================================================*/ + +int qurt_anysignal_wait_timed(qurt_anysignal_t *signal, unsigned int mask, unsigned int *signals, unsigned long long int duration); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ANYSIGNAL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_api_version.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_api_version.h new file mode 100755 index 0000000000000..dfe53ae755054 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_api_version.h @@ -0,0 +1,77 @@ +#ifndef QURT_API_VERSION_H +#define QURT_API_VERSION_H +/*============================================================================== + +qurt_api_version.h + +GENERAL DESCRIPTION + API version file + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ + +/*============================================================================== + CONSTANTS AND DEFINITIONS +==============================================================================*/ +/** + * Each field of the QURT_API_VERSION definitions is an 8-bit unsigned integer. + * Main release has first 3 fields updated - Major, Minor and Release. + * - QURT_API_VERSION = Major, Minor, Release. + * Patch releases are supported by adding the extra field. + * - QURT_API_VERSION = Major, Minor, Release, Patch. + */ +// Major version is incremented for incompatible API changes. +#define QURT_API_VER_MAJOR 1 + +// Minor version is incremented for backward-compatible enhancements in the API +// set. +#define QURT_API_VER_MINOR 4 + +// RELEASE version is incremented for each release within a `MAJOR.MINOR` +// release. +#define QURT_API_VER_RELEASE 1 + +// Patch version is incremented when new API content is introduced on older LTS +// release. +#define QURT_API_VER_PATCH 0 + +/* Update the QURT_API_VERSION function macro. */ +#define QURT_API_VERSION_ENCODE(major, minor, release, patch) \ + ((((major) & 0xFF) << 24) | (((minor) & 0xFF) << 16) | \ + (((release) & 0xFF) << 8) | ((patch) & 0xFF)) + +/* Update the QURT_API_VERSION Macro. */ +#define QURT_API_VERSION \ + QURT_API_VERSION_ENCODE(QURT_API_VER_MAJOR, QURT_API_VER_MINOR, \ + QURT_API_VER_RELEASE, QURT_API_VER_PATCH) + +/** Usage: + * + * #if QURT_API_VERSION >= QURT_API_VERSION_ENCODE(1,4,0,0) + * qurt_func_2(a,b,c); + * #else + * qurt_func(a); + * #endif + * + */ +/* + Gets the QuRT API version. + + @return + QuRT API version. + + @dependencies + None. + */ +unsigned int qurt_api_version(void); + +#endif /* QURT_API_VERSION_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_assert.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_assert.h new file mode 100755 index 0000000000000..13cc2afd2e973 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_assert.h @@ -0,0 +1,51 @@ +#ifndef QURT_ASSERT_H +#define QURT_ASSERT_H +/** + @file qurt_assert.h + @brief Prototypes of qurt_assert API + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/**@ingroup func_qurt_assert_error + Writes diagnostic information to the debug buffer, and raises an error to the QuRT kernel. + + @datatypes + None. + + @param[in] filename Pointer to the file name string. + @param[in] lineno Line number. + + @return + None. + + @dependencies + None. + */ +void qurt_assert_error(const char *filename, int lineno) __attribute__((noreturn)); + +#define qurt_assert(cond) ((cond)?(void)0:qurt_assert_error(__QURTFILENAME__,__LINE__)) + +/** @} */ /* end_ingroup func_qurt_assert */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ASSERT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_atomic_ops.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_atomic_ops.h new file mode 100755 index 0000000000000..d9b2cff7d737c --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_atomic_ops.h @@ -0,0 +1,1298 @@ +#ifndef QURT_ATOMIC_OPS_H +#define QURT_ATOMIC_OPS_H +/** + @file qurt_atomic_ops.h + @brief Prototypes of kernel atomic operations API. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021, 2022 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +/* + * Australian Public Licence B (OZPLB) + * + * Version 1-0 + * + * Copyright (c) 2007, Open Kernel Labs, Inc. + * + * All rights reserved. + * + * Developed by: Embedded, Real-time and Operating Systems Program (ERTOS) + * National ICT Australia + * http://www.ertos.nicta.com.au + * + * Permission is granted by National ICT Australia, free of charge, to + * any person obtaining a copy of this software and any associated + * documentation files (the "Software") to deal with the Software without + * restriction, including (without limitation) the rights to use, copy, + * modify, adapt, merge, publish, distribute, communicate to the public, + * sublicense, and/or sell, lend or rent out copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject + * to the following conditions: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimers in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of National ICT Australia, nor the names of its + * contributors, may be used to endorse or promote products derived + * from this Software without specific prior written permission. 
+ * + * EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT + * PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS", AND + * NATIONAL ICT AUSTRALIA AND ITS CONTRIBUTORS MAKE NO REPRESENTATIONS, + * WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS + * REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, + * THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF + * ERRORS, WHETHER OR NOT DISCOVERABLE. + * + * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL + * NATIONAL ICT AUSTRALIA OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL + * THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION OF CONTRACT, + * NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER + * LIABILITY, INCLUDING (WITHOUT LIMITATION) LOSS OF PRODUCTION OR + * OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS + * OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR + * OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT, + * CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN + * CONNECTION WITH THIS LICENCE, THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS WITH THE SOFTWARE, EVEN IF NATIONAL ICT AUSTRALIA OR ITS + * CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS, + * DAMAGES OR OTHER LIABILITY. + * + * If applicable legislation implies representations, warranties, or + * conditions, or imposes obligations or liability on National ICT + * Australia or one of its contributors in respect of the Software that + * cannot be wholly or partly excluded, restricted or modified, the + * liability of National ICT Australia or the contributor is limited, to + * the full extent permitted by the applicable legislation, at its + * option, to: + * a. in the case of goods, any one or more of the following: + * i. the replacement of the goods or the supply of equivalent goods; + * ii. the repair of the goods; + * iii. the payment of the cost of replacing the goods or of acquiring + * equivalent goods; + * iv. the payment of the cost of having the goods repaired; or + * b. in the case of services: + * i. the supplying of the services again; or + * ii. the payment of the cost of having the services supplied again. + * + * The construction, validity and performance of this licence is governed + * by the laws in force in New South Wales, Australia. + */ + +/* + * Author: Malcolm Purvis + * + * This file is only included by the main atomic_ops.h, so all of that + * file's definitions are available. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + +///* Sanity check to ensure the smp flag is set in machines.py */ +//#if defined(__ATOMIC_OPS_IN_KERNEL__) && !defined(MACHINE_SMP) && CONFIG_NUM_UNITS > 1 +//#error CONFIG_NUM_UNITS > 1 but smp not defined in machines.py. +//#endif +#define QURT_INLINE __attribute__((always_inline)) + +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_atomic_set + Sets the atomic variable with the specified value. 
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] value Value to set.
+
+ @return
+ Value successfully set.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_set(unsigned int* target, unsigned int value)
+{
+    unsigned long tmp;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       memw_locked(%2, p0) = %3\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (tmp),"+m" (*target)
+        : "r" (target), "r" (value)
+        : "p0");
+    return value;
+}
+
+/**@ingroup func_qurt_atomic_and
+ Bitwise AND operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise AND.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_and(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = and(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target),"r" (mask)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_and_return
+ Bitwise AND operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise AND.
+
+ @return
+ AND result of atomic variable with mask.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_and_return(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = and(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_or
+ Bitwise OR operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise OR.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_or(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = or(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_or_return
+ Bitwise OR operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise OR.
+
+ @return
+ Returns the OR result of the atomic variable with mask.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_or_return(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = or(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_xor
+ Bitwise XOR operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise XOR.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_xor(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = xor(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_xor_return
+ Bitwise XOR operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise XOR.
+
+ @return
+ XOR result of atomic variable with mask.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_xor_return(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = xor(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_set_bit
+ Sets a bit in the atomic variable at a specified position.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] bit Bit position to set.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_set_bit(unsigned int *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit % ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int *wtarget = (unsigned int *)&target[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = setbit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget), "r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_clear_bit
+ Clears a bit in the atomic variable at a specified position.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] bit Bit position to clear.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_clear_bit(unsigned int *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit % ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int *wtarget = (unsigned int *)&target[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = clrbit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget), "r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_change_bit
+ Toggles a bit in an atomic variable at a specified bit position.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] bit Bit position to toggle.
+
+ @return
+ None.
+
+ @dependencies
+ None.
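+
+ For illustration, a hypothetical flag word shared between threads:
+
+ @code
+ static unsigned int flags;
+ qurt_atomic_change_bit(&flags, 3U);   // atomically flips bit 3
+ @endcode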
+*/ +static inline QURT_INLINE void +qurt_atomic_change_bit(unsigned int *target, unsigned int bit) +{ + unsigned int result; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1fU; + unsigned int *wtarget= (unsigned int *)&target[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = togglebit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget),"r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic_add + Adds an integer to atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to add. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_add(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic_add_return + Adds an integer to atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to add. + + @return + Result of arithmetic sum. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_add_return(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_add_unless + Adds the delta value to an atomic variable unless the current value in the target + matches the unless variable. + + @note1hang The function retries until load lock and store conditional + are successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] delta Value to add to the current value. + @param[in] unless Perform the addition only when the current value is not + equal to this unless value. + @return + TRUE -- 1 - Addition was performed. \n + FALSE -- 0 - Addition was not done. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_add_unless(unsigned int* target, + unsigned int delta, + unsigned int unless) +{ + unsigned int current_val; + unsigned int new_val; + + __asm__ __volatile__( + "1: %0 = memw_locked(%3)\n" + " p0 = cmp.eq(%0, %5)\n" + " if p0 jump 2f\n" + " %1 = add(%0, %4)\n" + " memw_locked(%3, p0) = %1\n" + " if !p0 jump 1b\n" + "2:\n" + : "=&r" (current_val),"=&r" (new_val),"+m" (*target) + : "r" (target), "r" (delta), "r" (unless) + : "p0"); + + return (unsigned int)(current_val != unless); +} + +/**@ingroup func_qurt_atomic_sub + Subtracts an integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to subtract. + + @return + None. + + @dependencies + None. 
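+
+ For illustration, a hypothetical credit counter:
+
+ @code
+ static unsigned int credits = 8U;
+ qurt_atomic_sub(&credits, 2U);   // credits drops to 6 in one atomic step
+ @endcode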
+*/ +static inline QURT_INLINE void +qurt_atomic_sub(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic_sub_return + Subtracts an integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to subtract. + + @return + Result of arithmetic subtraction. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_sub_return(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_inc + Increments an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_inc(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); +} + +/**@ingroup func_qurt_atomic_inc_return + Increments an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Incremented value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_inc_return(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_dec + Decrements an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_dec(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #-1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); +} + +/**@ingroup func_qurt_atomic_dec_return + Decrements an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Decremented value. + + @dependencies + None. 
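+
+ A common reference-count sketch (refs is hypothetical and assumed to
+ start above zero):
+
+ @code
+ if (qurt_atomic_dec_return(&refs) == 0U) {
+     // last reference dropped; safe to release the resource
+ }
+ @endcode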
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_dec_return(unsigned int *target)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = add(%0, #-1)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_compare_and_set
+ Compares the current value of the atomic variable with the
+ specified value and sets it to a new value when the compare is successful.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] old_val Old value to compare.
+ @param[in] new_val New value to set.
+
+ @return
+ FALSE -- Specified value is not equal to the current value. \n
+ TRUE -- Specified value is equal to the current value.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_compare_and_set(unsigned int* target,
+                            unsigned int old_val,
+                            unsigned int new_val)
+{
+    unsigned int current_val;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       p0 = cmp.eq(%0, %3)\n"
+        "       if !p0 jump 2f\n"
+        "       memw_locked(%2, p0) = %4\n"
+        "       if !p0 jump 1b\n"
+        "2:\n"
+        : "=&r" (current_val),"+m" (*target)
+        : "r" (target), "r" (old_val), "r" (new_val)
+        : "p0");
+
+    return (unsigned int)(current_val == old_val);
+}
+
+/**@ingroup func_qurt_atomic_barrier
+ Allows the compiler to enforce an ordering constraint on memory operations issued
+ before and after the function.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_barrier(void)
+{
+    __asm__ __volatile__ (
+        ""
+        :
+        :
+        :
+        "memory");
+}
+
+
+/**@ingroup func_qurt_atomic64_set
+ Sets the 64-bit atomic variable with the specified value.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] value 64-bit value to set.
+
+ @return
+ Successfully set value.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_set(unsigned long long* target, unsigned long long value)
+{
+    unsigned long long tmp;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       memd_locked(%2, p0) = %3\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (tmp),"+m" (*target)
+        : "r" (target), "r" (value)
+        : "p0");
+    return value;
+}
+
+/**@ingroup func_qurt_atomic64_and_return
+ Bitwise AND operation of a 64-bit atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask 64-bit mask for bitwise AND.
+
+ @return
+ AND result of 64-bit atomic variable with mask.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_and_return(unsigned long long* target, unsigned long long mask)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = and(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_or
+ Bitwise OR operation of a 64-bit atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask 64-bit mask for bitwise OR.
+
+ @return
+ None.
+
+ @dependencies
+ None.
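+
+ For illustration, a hypothetical 64-bit event mask:
+
+ @code
+ static unsigned long long events;
+ qurt_atomic64_or(&events, 1ULL << 40);   // post event bit 40
+ @endcode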
+*/ +static inline QURT_INLINE void +qurt_atomic64_or(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_or_return + Bitwise OR operation of a 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise OR. + + @return + OR result of the atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_or_return(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_xor_return + Bitwise XOR operation of 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise XOR. + + @return + XOR result of atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_xor_return(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = xor(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_set_bit + Sets a bit in a 64-bit atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to set. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_set_bit(unsigned long long *target, unsigned int bit) +{ + unsigned int result; + unsigned int *wtarget; + unsigned int *pwtarget = (unsigned int *)target; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1FU; + wtarget = (unsigned int *)&pwtarget[aword]; + + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = setbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_clear_bit + Clears a bit in a 64-bit atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to clear. + + @return + None. + + @dependencies + None. 
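+
+ For illustration (events is a hypothetical 64-bit mask; bit positions
+ 0 through 63 address the full 64-bit word):
+
+ @code
+ static unsigned long long events;
+ qurt_atomic64_clear_bit(&events, 40U);   // acknowledge event bit 40
+ @endcode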
+*/ +static inline QURT_INLINE void +qurt_atomic64_clear_bit(unsigned long long *target, unsigned int bit) +{ + unsigned int result; + unsigned int *wtarget; + unsigned int *pwtarget = (unsigned int *)target; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1FU; + wtarget = (unsigned int *)&pwtarget[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = clrbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_change_bit + Toggles a bit in a 64-bit atomic variable at a bit position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to toggle. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_change_bit(unsigned long long *target, unsigned int bit) +{ + unsigned int result; + unsigned int *wtarget; + unsigned int *pwtarget = (unsigned int *)target; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1FU; + wtarget = (unsigned int *)&pwtarget[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = togglebit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget),"r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_add + Adds a 64-bit integer to 64-bit atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v 64-bit integer value to add. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_add(unsigned long long *target, unsigned long long v) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_add_return + Adds a 64-bit integer to 64-bit atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v 64-bit integer value to add. + + @return + Result of arithmetic sum. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_add_return(unsigned long long *target, unsigned long long v) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_sub_return + Subtracts a 64-bit integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v 64-bit integer value to subtract. + + @return + Result of arithmetic subtraction. + + @dependencies + None. 
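+
+ For illustration, a hypothetical byte budget drained by consumers:
+
+ @code
+ static unsigned long long budget = 4096ULL;
+ if (qurt_atomic64_sub_return(&budget, 512ULL) < 1024ULL) {
+     // running low; apply back-pressure
+ }
+ @endcode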
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_sub_return(unsigned long long *target, unsigned long long v)
+{
+   unsigned long long result;
+
+   __asm__ __volatile__(
+       "1:     %0 = memd_locked(%2)\n"
+       "       %0 = sub(%0, %3)\n"
+       "       memd_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target), "r" (v)
+       : "p0");
+
+   return result;
+}
+
+/**@ingroup func_qurt_atomic64_inc
+   Increments a 64-bit atomic variable by one.
+
+   @note1hang The function retries until the load locked and store conditional
+              sequence succeeds.
+
+   @param[in,out] target Pointer to the atomic variable.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_inc(unsigned long long *target)
+{
+   unsigned long long result;
+   unsigned long long inc = 1;
+
+   __asm__ __volatile__(
+       "1:     %0 = memd_locked(%2)\n"
+       "       %0 = add(%0, %3)\n"
+       "       memd_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target),"r" (inc)
+       : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_inc_return
+   Increments a 64-bit atomic variable by one.
+
+   @note1hang The function retries until the load locked and store conditional
+              sequence succeeds.
+
+   @param[in,out] target Pointer to the atomic variable.
+
+   @return
+   Incremented value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_inc_return(unsigned long long *target)
+{
+   unsigned long long result;
+   unsigned long long inc = 1;
+
+   __asm__ __volatile__(
+       "1:     %0 = memd_locked(%2)\n"
+       "       %0 = add(%0, %3)\n"
+       "       memd_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target),"r" (inc)
+       : "p0");
+
+   return result;
+}
+
+/**@ingroup func_qurt_atomic64_dec_return
+   Decrements a 64-bit atomic variable by one.
+
+   @note1hang The function retries until the load locked and store conditional
+              sequence succeeds.
+
+   @param[in,out] target Pointer to the atomic variable.
+
+   @return
+   Decremented value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_dec_return(unsigned long long *target)
+{
+   unsigned long long result;
+   long long minus1 = 0xFFFFFFFFFFFFFFFFLL;
+
+   __asm__ __volatile__(
+       "1:     %0 = memd_locked(%2)\n"
+       "       %0 = add(%0, %3)\n"
+       "       memd_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target),"r" (minus1)
+       : "p0");
+
+   return result;
+}
+
+/**@ingroup func_qurt_atomic64_compare_and_set
+   Compares the current value of a 64-bit atomic variable with
+   a specified value and sets it to a new value when the compare is successful.
+
+   @note1hang The function keeps retrying until the load locked and store
+              conditional sequence succeeds.
+
+   @param[in,out] target  Pointer to the atomic variable.
+   @param[in]     old_val 64-bit old value to compare.
+   @param[in]     new_val 64-bit new value to set.
+
+   @return
+   FALSE -- Specified value is not equal to the current value. \n
+   TRUE  -- Specified value is equal to the current value.
+
+   @dependencies
+   None.
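+
+   A typical retry-loop sketch (illustrative only; 'counter' and the capped
+   increment policy are assumptions, not part of the QuRT API):
+   @code
+   static unsigned long long counter;
+
+   unsigned long long old_val, new_val;
+   do {
+       old_val = counter;   // racy snapshot is fine; the CAS validates it
+       new_val = (old_val < 100ULL) ? old_val + 1ULL : old_val;
+   } while (!qurt_atomic64_compare_and_set(&counter, old_val, new_val));
+   @endcode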
+*/
+static inline QURT_INLINE int
+qurt_atomic64_compare_and_set(unsigned long long *target,
+                              unsigned long long old_val,
+                              unsigned long long new_val)
+{
+   unsigned long long current_val;
+
+   __asm__ __volatile__(
+       "1:     %0 = memd_locked(%2)\n"
+       "       p0 = cmp.eq(%0, %3)\n"
+       "       if !p0 jump 2f\n"
+       "       memd_locked(%2, p0) = %4\n"
+       "       if !p0 jump 1b\n"
+       "2:\n"
+       : "=&r" (current_val),"+m" (*target)
+       : "r" (target), "r" (old_val), "r" (new_val)
+       : "p0");
+
+   return (int)(current_val == old_val);
+}
+
+/**@ingroup func_qurt_atomic64_barrier
+   Allows the compiler to enforce an ordering constraint on memory operations
+   issued before and after the function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_barrier(void)
+{
+   /** @cond */
+   __asm__ __volatile__ (
+       ""
+       :
+       :
+       :
+       "memory");
+   /** @endcond */
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ATOMIC_OPS_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_barrier.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_barrier.h
new file mode 100755
index 0000000000000..7c6f787d43bc2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_barrier.h
@@ -0,0 +1,140 @@
+#ifndef QURT_BARRIER_H
+#define QURT_BARRIER_H
+
+/**
+  @file qurt_barrier.h
+  @brief Prototypes of kernel barrier API functions.
+
+  EXTERNALIZED FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021 Qualcomm Technologies, Inc. All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+  ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup barrier_types
+@{ */
+/*=====================================================================
+ Constants and macros
+======================================================================*/
+#define QURT_BARRIER_SERIAL_THREAD 1 /**< Serial thread. */
+#define QURT_BARRIER_OTHER         0 /**< Other. */
+
+#ifndef ASM
+#include 
+
+/*=====================================================================
+Typedefs
+======================================================================*/
+
+/** QuRT barrier type.
+ */
+typedef union {
+   /** @cond */
+   struct {
+       unsigned short threads_left;
+       unsigned short count;
+       unsigned int threads_total;
+       unsigned int queue;
+       unsigned int reserved;
+   };
+   unsigned long long int raw;
+   /** @endcond */
+} qurt_barrier_t;
+
+/** @} */ /* end_addtogroup barrier_types */
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/*======================================================================*/
+/**@ingroup func_qurt_barrier_init
+   Initializes a barrier object.
+
+   @datatypes
+   #qurt_barrier_t
+
+   @param[out] barrier       Pointer to the barrier object to initialize.
+   @param[in]  threads_total Total number of threads to synchronize on the barrier.
+
+   @return
+   Unused integer value.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_barrier_init(qurt_barrier_t *barrier, unsigned int threads_total);
+
+/*======================================================================*/
+/**@ingroup func_qurt_barrier_destroy
+   Destroys the specified barrier.
+
+   @note1hang Barriers must be destroyed when they are no longer in use. 
Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Barriers must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_barrier_t + + @param[in] barrier Pointer to the barrier object to destroy. + + @return + Unused integer value. + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_barrier_destroy(qurt_barrier_t *barrier); + +/*======================================================================*/ +/**@ingroup func_qurt_barrier_wait + Waits on the barrier.\n + Suspends the current thread on the specified barrier. \n + The function return value indicates whether the thread was the last one to + synchronize on the barrier. + When a thread waits on a barrier, it is suspended on the barrier: \n + - If the total number of threads waiting on the barrier is less than the assigned value + of the barrier, no other action occurs. \n + - If the total number of threads waiting on the barrier equals the assigned value of the + barrier, all threads currently waiting on the barrier are awakened, allowing them to + execute past the barrier. + + @note1hang After its waiting threads are awakened, a barrier is automatically reset + and can be used again in the program without the need for re-initialization. + + @datatypes + #qurt_barrier_t + + @param[in] barrier Pointer to the barrier object to wait on. + + @return + #QURT_BARRIER_OTHER -- Current thread awakened from barrier. \n + #QURT_BARRIER_SERIAL_THREAD -- Current thread is last caller of barrier. + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_barrier_wait(qurt_barrier_t *barrier); + + +#endif + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_BARRIER_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_busywait.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_busywait.h new file mode 100755 index 0000000000000..a4dab80a2520a --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_busywait.h @@ -0,0 +1,62 @@ +#ifndef QURT_BUSYWAIT_H +#define QURT_BUSYWAIT_H + +/** + @file qurt_busywait.h + @brief Implementation of the busywait() function for + hardware based blocking waits that use the QTIMER as a reference. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ============================================================================*/ +/*============================================================================= + * + * EDIT HISTORY FOR FILE + * + * This section contains comments describing changes made to the + * module. Changes are listed in reverse chronological + * order. 
+ *
+ *
+ * when       who     what, where, why
+ * ---------- ---     -------------------------------------------------------
+ * 2018-03-20 pg      Add Header file
+ ============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_busywait
+   Pauses the execution of a thread for a specified time.\n
+   Use for small microsecond delays.
+
+   @note1hang The function does not return to the caller until
+              the time duration has expired.
+
+   @param[in] pause_time_us Time to pause in microseconds.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_busywait (unsigned int pause_time_us);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_BUSYWAIT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_callback.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_callback.h
new file mode 100755
index 0000000000000..dc9b896c63454
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_callback.h
@@ -0,0 +1,235 @@
+#ifndef QURT_CALLBACK_H
+#define QURT_CALLBACK_H
+
+/**
+  @file qurt_callback.h
+  Definitions, macros, and prototypes for the QuRT callback framework.
+
+  The QDI framework allows the development of root process drivers and services that
+  a user process client can interact with in a secure manner. The QDI framework does
+  this by elevating the privilege of the user process thread, temporarily allowing
+  the thread to execute in root context and letting it fall back to user context once
+  the QDI invocation is finished.
+
+  The QuRT callback framework provides a safe mechanism for root process drivers
+  to execute callback functions in a user process. The framework hosts
+  dedicated worker threads in the corresponding processes that handle the execution
+  of the callback function. This ensures that the callbacks occur in the context of
+  the appropriate process thread, thereby maintaining privilege boundaries.
+
+  Prerequisites for use of this framework are:
+  1. The driver is a QDI driver and the client communicates with the driver using QDI
+     invocations.
+  2. The appropriate callback configuration is specified in cust_config.xml for
+     the user process that intends to use this framework.
+
+  qurt_cb_data_t is the public data structure that allows a client to store all
+  the required information about the callback, including the callback function
+  and the arguments to pass to this function when it executes.
+  The client uses the QDI interface to register this structure with the root driver.
+
+  The callback framework provides the following APIs that a root driver can use to
+  invoke a callback. These functions are described in the qurt_qdi_driver.h header file.
+
+  qurt_qdi_cb_invoke_async() triggers an asynchronous callback wherein the
+  invoking thread does not wait for the callback to finish executing.
+
+  qurt_qdi_cb_invoke_sync() triggers a synchronous callback. Upon invocation,
+  the invoking thread is suspended until the callback function finishes execution.
+
+  qurt_qdi_cb_invoke_sync_with_data() invokes a synchronous callback similar to
+  qurt_qdi_cb_invoke_sync(). It allows the user to pass large data along with
+  the callback invocation, to be utilized during the callback execution.
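+
+  A minimal client-side sketch (illustrative only; my_event_cb, MY_EVENT_ID,
+  and the QDI method used to hand the data to the driver are assumptions):
+
+    qurt_cb_data_t cb_data;
+    qurt_cb_data_init(&cb_data);
+    qurt_cb_data_set_cbfunc(&cb_data, (void *)my_event_cb);
+    qurt_cb_data_set_cbarg(&cb_data, MY_EVENT_ID);
+    // Hand cb_data to the root driver through a QDI invocation; the driver
+    // can later run it with qurt_qdi_cb_invoke_async()/_sync().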
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+  ======================================================================*/
+#include "qurt_qdi.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int qurt_cb_result_t;
+
+/* Callback framework error codes.
+   The callback framework returns a nonzero value if a callback invocation is unsuccessful.
+   The following macros describe the cause of failure in more detail.
+*/
+#define QURT_CB_ERROR            -1 /* Callback registration failed.\n*/
+#define QURT_CB_OK                0 /* Success.\n*/
+#define QURT_CB_MALLOC_FAILED    -2 /* QuRTOS malloc failure.\n*/
+#define QURT_CB_WAIT_CANCEL      -3 /* Process exit cancelled wait operation.\n*/
+#define QURT_CB_CONFIG_NOT_FOUND -4 /* Callback configuration for process was not found.\n*/
+#define QURT_CB_QUEUE_FULL       -5 /* Callback queue is serving at maximum capacity.*/
+/** @addtogroup cb_types
+@{ */
+/** Callback registration data structure.
+    This data structure is used by a client attempting to register a callback with a QDI driver.
+    It holds the address of the callback function and the argument supplied to the callback
+    function when it executes.
+*/
+typedef struct {
+   /** @cond */
+   void*    cb_func; /*< Pointer to the callback function. */
+   unsigned cb_arg;  /*< Not interpreted by the framework.*/
+   /** @endcond */
+} qurt_cb_data_t;
+
+/** @cond */
+/* Defines used as default if cust_config does not specify them. */
+#define CALLBACK_WORKER_STACK_SIZE 0x2000
+/** @endcond */
+/** @} */ /* end_addtogroup cb_types */
+/**@ingroup func_qurt_cb_data_init
+   Initializes the callback data structure.
+   An entity registering a callback with the root process driver must call this function
+   to initialize the callback registration data structure to its default value.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data Pointer to the callback data structure.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_init (qurt_cb_data_t* cb_data){
+   cb_data->cb_func = NULL;
+   cb_data->cb_arg = 0;
+}
+
+/**@ingroup func_qurt_cb_data_set_cbfunc
+   Sets up the callback function in the callback registration data structure.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data Pointer to the callback data structure.
+   @param[in] cb_func Pointer to the callback function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
static inline void qurt_cb_data_set_cbfunc (qurt_cb_data_t* cb_data, void* cb_func){
+   cb_data->cb_func = cb_func;
+}
+
+/**@ingroup func_qurt_cb_data_set_cbarg
+   Sets up the callback argument.
+   This function sets up the argument passed to the callback function when it executes.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data Pointer to the callback data structure.
+   @param[in] cb_arg  Argument for the callback function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_set_cbarg (qurt_cb_data_t* cb_data, unsigned cb_arg){
+   cb_data->cb_arg = cb_arg;
+}
+
+/** @cond */
+/**@ingroup driver_support_functions
+   Invokes an asynchronous callback for a specified process.
+   A driver that resides in the root process calls this API to launch a callback in
+   a process described by the client_handle.
+   After the callback is invoked, the framework queues the callback as per its
+   priority and subsequently executes it.
+   The caller of this function is not suspended during the callback execution period.
+   The API returns immediately with a success/failure error code.
+
+   @note1hang This function is only accessible to drivers in the root process.
+              User process invocations shall fail with a negative error code return value.
+
+   @param client_handle Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data       Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio          Priority at which the callback should execute.
+                        This parameter is optional. If -1 is passed, the callback framework
+                        executes the callback at the priority of the API caller.
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_async(int client_handle,
+                                          qurt_cb_data_t* cb_data,
+                                          int prio);
+
+
+/**@ingroup driver_support_functions
+   Invokes a synchronous callback for a specified process.
+   A driver that resides in a root process calls this API to launch a sync callback in
+   a process described by the client_handle.
+   After the callback is invoked, the framework queues the callback as per its
+   priority and subsequently executes it.
+   The caller of this function is suspended during the callback execution period.
+   If the process in which to execute the callback exits or terminates, the caller is
+   woken up with error code #QURT_CB_WAIT_CANCEL (refer to qurt_callback.h).
+
+   @note1hang This function is only accessible to drivers in the root process.
+              User process invocations shall fail with a negative error code return value.
+
+   @param client_handle Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data       Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio          Priority at which the callback should execute.
+                        This parameter is optional. If -1 is passed, the callback framework
+                        executes the callback at the priority of the API caller.
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_sync(int client_handle,
+                                         qurt_cb_data_t* cb_data,
+                                         int prio);
+
+/**@ingroup driver_support_functions
+   Invokes a synchronous callback for a specified process, passing driver data to the user PD.
+   This function is similar to qurt_qdi_cb_invoke_sync() and allows the driver to pass arbitrary data to
+   the user process as part of the callback invocation.
+
+   @param client_handle Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data       Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio          Priority at which the callback should execute.
+                        This parameter is optional. If -1 is passed, the callback framework
+                        executes the callback at the priority of the API caller.
+   @param data          Arbitrary driver data to pass to the user process. Memory pointed to by data
+                        must be accessible to the user PD. The root driver can allocate such memory by
+                        using qurt_mem_mmap().
+   @param data_len      Length of the driver data.
+
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
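+
+   A root-driver sketch (illustrative only; 'buf' and 'buf_len' are
+   assumptions, and the buffer must already be user-PD accessible):
+
+     // rc follows the qurt_cb_result_t error codes defined in this header.
+     qurt_cb_result_t rc = qurt_qdi_cb_invoke_sync_with_data(
+         client_handle, &cb_data, -1 /* caller priority */, buf, buf_len);
+     if (rc != QURT_CB_OK) {
+         // callback could not be delivered; inspect rc
+     }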
+ */ +qurt_cb_result_t qurt_qdi_cb_invoke_sync_with_data( int client_handle, + qurt_cb_data_t* cb_data, + int prio, + void *data, + unsigned data_len + ); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_clade.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_clade.h new file mode 100755 index 0000000000000..d7442cf98dd94 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_clade.h @@ -0,0 +1,62 @@ +#ifndef QURT_CLADE_H +#define QURT_CLADE_H +/** + @file qurt_clade.h + @brief Prototypes of Cache Line Accelerated Decompression Engine (CLADE) API. + CLADE is a cache line level memory compression system that is used to + decrease DRAM usage. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_clade2_get + Reads the value of the clade2 register. + + @param[in] offset Offset from the clade2 cfg base. + @param[out] *value Pointer to the register value read from the offset. + + @return + #QURT_EOK - Successfully read the value from the register at offset \n + #QURT_EINVALID - Offset passed is incorrect + + @dependencies + None. + */ +int qurt_clade2_get(unsigned short offset, unsigned int *value); + +/**@ingroup func_qurt_clade2_set + Sets the PMU register; only PMU_SEL register can be set. + + @param[in] offset Offset from the QURTK_clade2_cfg_base. + @param[in] value Value to set at offset. + + @return + #QURT_EOK -- Successfully set the value at offset. \n + #QURT_ENOTALLOWED -- Set operation performed at an offset other than CLADE2_PMU_SELECTION_REG. + + @dependencies + None. + */ +int qurt_clade2_set(unsigned short offset, unsigned int value); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_CLADE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_cond.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_cond.h new file mode 100755 index 0000000000000..6e65ed82a8393 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_cond.h @@ -0,0 +1,219 @@ +#ifndef QURT_COND_H +#define QURT_COND_H +/** + @file qurt_cond.h + @brief Prototypes of kernel condition variable object API functions. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 Qualcomm Technologies, Inc. + All rights reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup condition_variables_types +@{ */ +/*===================================================================== + Typedefs + ======================================================================*/ + +/** QuRT condition variable type. 
*/
+typedef union {
+   /** @cond */
+   unsigned long long raw;
+   struct {
+       unsigned int count;
+       unsigned int n_waiting;
+       unsigned int queue;
+       unsigned int reserved;
+   }X;
+   /** @endcond */
+} qurt_cond_t;
+
+/** @} */ /* end_addtogroup condition_variables_types */
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_init
+   Initializes a condition variable object.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[out] cond Pointer to the initialized condition variable object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+/* ======================================================================*/
+void qurt_cond_init(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_destroy
+   Destroys the specified condition variable.
+
+   @note1hang Condition variables must be destroyed when they are no longer in use. Failure to do
+              this causes resource leaks in the QuRT kernel.\n
+   @note1cont Condition variables must not be destroyed while they are still in use. If this occurs,
+              the behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[in] cond Pointer to the condition variable object to destroy.
+
+   @return
+   None.
+
+ */
+/* ======================================================================*/
+void qurt_cond_destroy(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_signal
+   Signals a waiting thread that the specified condition is true. \n
+
+   When a thread wishes to signal that a condition is true on a shared data item, it must
+   perform the following procedure: \n
+   -# Lock the mutex that controls access to the data item. \n
+   -# Perform the signal condition operation. \n
+   -# Unlock the mutex.
+
+   @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause
+              the threads to never be suspended (or to be suspended but never awakened).
+
+   @note1cont Use condition variables only with regular mutexes -- attempting to use
+              recursive mutexes or priority inheritance mutexes results in undefined behavior.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[in] cond Pointer to the condition variable object to signal.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+/* ======================================================================*/
+void qurt_cond_signal(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_broadcast
+   Signals multiple waiting threads that the specified condition is true.\n
+   When a thread wishes to broadcast that a condition is true on a shared data item, it must
+   perform the following procedure: \n
+   -# Lock the mutex that controls access to the data item. \n
+   -# Perform the broadcast condition operation. \n
+   -# Unlock the mutex.\n
+
+   @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause
+              the threads to never be suspended (or to be suspended but never awakened).
+
+   @note1cont Use condition variables only with regular mutexes -- attempting to use
+              recursive mutexes or priority inheritance mutexes results in undefined behavior.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[in] cond Pointer to the condition variable object to signal.
+
+   @return
+   None.
+
+   @dependencies
+   None.
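+
+   A producer/consumer sketch (illustrative only; 'mtx', 'cond', and
+   'data_ready' are assumptions, not part of the QuRT API):
+   @code
+   // Producer: publish under the mutex, then broadcast.
+   qurt_mutex_lock(&mtx);
+   data_ready = 1;
+   qurt_cond_broadcast(&cond);
+   qurt_mutex_unlock(&mtx);
+
+   // Consumers: re-check the predicate after every wakeup.
+   qurt_mutex_lock(&mtx);
+   while (data_ready == 0) {
+       qurt_cond_wait(&cond, &mtx);   // unlocks mtx while suspended
+   }
+   qurt_mutex_unlock(&mtx);
+   @endcode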
+ */ +/* ======================================================================*/ +void qurt_cond_broadcast(qurt_cond_t *cond); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_wait + Suspends the current thread until the specified condition is true. + When a thread wishes to wait for a specific condition on a shared data item, it must + perform the following procedure: \n + -# Lock the mutex that controls access to the data item. \n + -# If the condition is not satisfied, perform the wait condition operation on the + condition variable (suspends the thread and unlocks the mutex). + + @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause + the threads to never be suspended (or suspended but never awakened). + + @note1cont Use condition variables only with regular mutexes -- attempting to use + recursive mutexes or priority inheritance mutexes results in undefined behavior. + + @datatypes + #qurt_cond_t \n + #qurt_mutex_t + + @param[in] cond Pointer to the condition variable object to wait on. + @param[in] mutex Pointer to the mutex associated with condition variable to wait on. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_cond_wait(qurt_cond_t *cond, qurt_mutex_t *mutex); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_wait2 + Suspends the current thread until the specified condition is true. + When a thread wishes to wait for a specific condition on a shared data item, it must + perform the following procedure: \n + -# Lock the mutex that controls access to the data item. \n + -# If the condition is not satisfied, perform the wait condition operation on the + condition variable, which suspends the thread and unlocks the mutex. + + @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause + the threads to never be suspended (or suspended but never awakened). + + @note1cont Use condition variables only with regular mutexes -- attempting to use + recursive mutexes or priority inheritance mutexes results in undefined behavior. + + @note1cont This is the same API as qurt_cond_wait(), use this version + when using mutexes of type #qurt_rmutex2_t. + + @datatypes + #qurt_cond_t \n + #qurt_rmutex2_t + + @param[in] cond Pointer to the condition variable object to wait on. + @param[in] mutex Pointer to the mutex associated with the condition variable to wait on. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_cond_wait2(qurt_cond_t *cond, qurt_rmutex2_t *mutex); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_COND_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_consts.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_consts.h new file mode 100755 index 0000000000000..b1e35998e73b6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_consts.h @@ -0,0 +1,315 @@ +#ifndef QURT_CONSTS_H +#define QURT_CONSTS_H + +/** + @file qurt_consts.h + @brief QuRT constants and definitions + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2013-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Constants and macros
+ ======================================================================*/
+
+/* Definitions of system events. System events suspend
+   a thread and put it into suspending_list.
+   The system event number is saved in the CONTEXT::error::cause field
+   of the suspended thread. An event handler thread, such as the
+   page fault handler or system error handler, can wake up the suspended
+   thread.
+ */
+#define QURT_EVENT_PAGEFAULT    0x1 /* Page fault event. */
+#define QURT_EVENT_SYSTEM_ERR   0x2 /* System error event. */
+#define QURT_EVENT_SUSPEND      0x3
+#define QURT_EVENT_PROCESS_EXIT 0x4 /* Process termination event.*/
+
+#define QURT_SYSENV_MAX_THREADS_TYPE         1  /* Maximum threads object. */
+#define QURT_SYSENV_PROCNAME_TYPE            2  /* Process name object. */
+#define QURT_SYSENV_MAX_PI_PRIO_TYPE         3  /* Maximum PI priority object. */
+#define QURT_SYSENV_ARCH_REV_TYPE            4  /* Architecture version object. */
+#define QURT_SYSENV_APP_HEAP_TYPE            5  /* Application heap object. */
+#define QURT_SYSENV_REGION_ATTR_DEFAULT      7  /* Default region attributes. */
+#define QURT_SYSENV_STACK_PROFILE_COUNT_TYPE 8  /* Stack profile count type. */
+#define QURT_SYSENV_ISLAND_CONFIG_TYPE       9  /* Island configuration check. */
+#define QURT_SYSENV_HTHREADS_TYPE            10 /* Active threads object. */
+#define QURT_SYSENV_CONFIG_IMAGE_START_LO    11 /* Config image start address for DTB parsing. */
+#define QURT_SYSENV_CONFIG_IMAGE_START_HI    12 /* Config image start address for DTB parsing. */
+#define QURT_SYSENV_CHIPPARAMS_LO            13 /* ChipParams for DTB parsing. */
+#define QURT_SYSENV_CHIPPARAMS_HI            14 /* ChipParams for DTB parsing. */
+#define QURT_SYSENV_PLATPARAMS               15 /* PlatformParams for DTB parsing. */
+#define QURT_SYSENV_CONFIG_IMAGE_SIZE        16 /* Config image size for DTB parsing. */
+#define QURT_SYSENV_L2_CACHE_LINE_SIZE       17 /* L2 cache line size. */
+
+/* Get Q6 registers. */
+#define QURT_GET_SSR     1
+#define QURT_GET_CCR     2
+#define QURT_GET_CFGBASE 3
+#define QURT_GET_SYSCFG  4
+#define QURT_GET_REV     5
+
+
+/** @cond rest_reg_dist */
+/** @addtogroup performance_monitor_macros
+@{ */
+
+/* PMU */
+#define QURT_PMUCNT0   0 /**< */
+#define QURT_PMUCNT1   1 /**< */
+#define QURT_PMUCNT2   2 /**< */
+#define QURT_PMUCNT3   3 /**< */
+#define QURT_PMUCFG    4 /**< */
+#define QURT_PMUEVTCFG 5 /**< */
+
+/* New since V55. */
+#define QURT_PMUCNT4    6  /**< */
+#define QURT_PMUCNT5    7  /**< */
+#define QURT_PMUCNT6    8  /**< */
+#define QURT_PMUCNT7    9  /**< */
+#define QURT_PMUEVTCFG1 10 /**< */
+
+/* New since V61. */
+#define QURT_PMUSTID0 11 /**< */
+#define QURT_PMUSTID1 12 /**< */
+
+#define QURT_PMUCNTSTID0 13 /**< */
+#define QURT_PMUCNTSTID1 14 /**< */
+#define QURT_PMUCNTSTID2 15 /**< */
+#define QURT_PMUCNTSTID3 16 /**< */
+#define QURT_PMUCNTSTID4 17 /**< */
+#define QURT_PMUCNTSTID5 18 /**< */
+#define QURT_PMUCNTSTID6 19 /**< */
+#define QURT_PMUCNTSTID7 20 /**< */
+
+/** @} */ /* end_addtogroup performance_monitor_macros */
+/** @endcond */
+
+/*
+   Power collapse operation
+*/
+#define QURT_POWER_SHUTDOWN                 0 /**< */
+#define QURT_TCXO_SHUTDOWN                  1 /**< */
+#define QURT_POWER_CMD_PREPARE              0 /**< */
+#define QURT_POWER_CMD_PERFORM              1 /**< */
+#define QURT_POWER_CMD_EXIT                 2 /**< */
+#define QURT_POWER_CMD_FAIL_EXIT            3 /**< */
+#define QURT_POWER_CMD_PERFORM_L2_RETENTION 4 /**< */
+#define QURT_POWER_CMD_PERFORM_SAVE_TCM     5 /**< */
+#define QURT_POWER_CMD_DEEP_SLEEP           6 /**< */
+
+
+/** 
@addtogroup thread_macros +@{ */ +#define QURT_MAX_HTHREAD_LIMIT 8U /**< Limit on the maximum number of hardware threads supported by QuRT for any + Hexagon version. Use this definition to define arrays, and so on, in + target independent code. */ +/** @} */ /* end_addtogroup thread_macros */ + +/** @cond internal_only */ +/** @addtogroup power_management_macros +@{ */ +/** + L2 cache retention mode +*/ +#define QURT_POWER_SHUTDOWN_TYPE_L2NORET QURT_POWER_CMD_PERFORM /**< */ +#define QURT_POWER_SHUTDOWN_TYPE_L2RET QURT_POWER_CMD_PERFORM_L2_RETENTION /**< */ +#define QURT_POWER_SHUTDOWN_TYPE_SAVETCM QURT_POWER_CMD_PERFORM_SAVE_TCM /**< */ +/** @} */ /* end_addtogroup power_management_macros */ +/** @endcond */ + +/* + QURT_system_state + Use for debugging the shutdown/startup process. + + State transition for cold boot: + QURT_BOOT_SETUP_ISDB --> QURT_CBOOT_BSP_INIT --> + QURT_CBOOT_END_CLEAN_INIT --> QURT_CBOOT_END_OS_INIT --> + QURT_CBOOT_KERNEL_INIT_DONE --> QURT_CBOOT_PLAT_CONFIG_DONE --> + QURT_CBOOT_ROOT_TASK_STARTED + + State transition for power collapse: + QURT_PREPARE_SINGLE_MODE --> QURT_PERFORM_IPEND --> + QURT_PERFORM_SAVE_TLB --> QURT_PERFORM_SWITCH_PC --> + cache flush states (dependent on L2 retention config) + + State transition for warm boot: + QURT_BOOT_SETUP_ISDB --> QURT_WBOOT_INIT_TLB --> + QURT_WBOOT_SET_1TO1_MAP --> QURT_WBOOT_REMOVE_1TO1_MAP --> + QURT_CBOOT_END_CLEAN_INIT --> QURT_CBOOT_END_OS_INIT +*/ +#define QURT_PREPARE_SINGLE_MODE 1 /**< */ +#define QURT_PREPARE_END 2 /**< */ +#define QURT_PERFORM_IPEND 3 /**< */ +#define QURT_PERFORM_SAVE_ISDP 4 /**< */ +#define QURT_PERFORM_SAVE_PMU 5 /**< */ +#define QURT_PERFORM_SAVE_TLB 6 /**< */ +#define QURT_PERFORM_SWITCH_PC 7 /**< */ +#define QURT_PERFORM_EXIT 8 /**< */ +#define QURT_FLUSH_L1CACHE 9 /**< */ +#define QURT_FLUSH_L2CACHE 0xA /**< */ +#define QURT_FLUSH_CACHE_DONE 0xB /**< */ +#define QURT_SWITCH_PC_DONE 0xC /**< */ +#define QURT_BOOT_SETUP_ISDB 0xD /**< */ +#define QURT_WBOOT_INIT_TLB 0xE /**< */ +#define QURT_WBOOT_SET_1TO1_MAP 0xF /**< */ +#define QURT_WBOOT_CFG_ADV_SYSCFG 0x10 /**< */ +#define QURT_WBOOT_REMOVE_1TO1_MAP 0x11 /**< */ +#define QURT_CBOOT_BSP_INIT 0x12 /**< */ +#define QURT_CBOOT_END_CLEAN_L1CACHE 0x13 /**< */ +#define QURT_CBOOT_END_CLEAN_INIT 0x14 /**< */ +#define QURT_CBOOT_END_OS_INIT 0x15 /**< */ +#define QURT_CBOOT_TLB_DUMP_LOAD 0x16 /**< */ +#define QURT_CBOOT_TLB_STATIC_LOAD 0x17 /**< */ +#define QURT_CBOOT_KERNEL_INIT_DONE 0x18 /**< */ +#define QURT_CBOOT_PLAT_CONFIG_DONE 0x19 /**< */ +#define QURT_CBOOT_ROOT_TASK_STARTED 0x1A /**< */ +#define QURT_IMPRECISE_EXCEPTION 0x1B /**< */ +#define QURT_WBOOT_DEBUG_L2_START 0x1C /**< */ +#define QURT_WBOOT_DEBUG_L2_END 0x1D /**< */ +#define QURT_NMI_SAVE_L2VIC_COMPLETE 0x1E /**< */ +#define QURT_NMI_HANDLER_COMPLETE 0x1F /**< */ +#define QURT_NMI_AFTER_SAVE_GLOBAL 0x20 /**< */ +#define QURT_WBOOT_START 0x21 /**< */ +#define QURT_ENTER_ISLAND 0x22 /**< */ +#define QURT_EXIT_ISLAND 0x23 /**< */ +#define QURT_LOAD_NOTIFIER_TCB 0x24 /**< */ +#define QURT_ABNORMAL_RESET 0x25 /**< */ +/* + Thread attributes +*/ + +#define QURT_THREAD_ATTR_GP 0x00000002 /*< */ +#define QURT_THREAD_ATTR_UGP 0x00000003 /*< User general pointer (UGP)*/ +#define QURT_THREAD_ATTR_PREFETCH 0x00000004 /*< */ +#define QURT_THREAD_ATTR_TID 0x00000005 /*< */ +#define QURT_THREAD_ATTR_CACHE_PART 0x00000007 /*< */ +#define QURT_THREAD_ATTR_COPROCESSOR 0x00000008 /*< */ +#define QURT_THREAD_ATTR_GET_L2CACHE_PART 0x00000009 /*< */ +#define QURT_THREAD_ATTR_SET_FRML 
0x0000000A /*< */ +#define QURT_THREAD_ATTR_STID_GET 0x0000000B /*< */ +#define QURT_THREAD_ATTR_STID_SET 0x0000000C /*< */ +#define QURT_THREAD_ATTR_AUTOSTACK 0x0000000D /*< */ +#define QURT_THREAD_ATTR_SYSTEM_THREAD 0x0000000E /*< */ +#define QURT_THREAD_ATTR_STID_SET2 0x0000000F /*< */ +#define QURT_THREAD_ATTR_STID_SET2_ACKNOWLEDGE 0x00000010 /*< */ +#define QURT_THREAD_ATTR_STID_GET2 0x00000011 /*< */ + +/** Cache operations*/ +#define QURT_DCCLEAN 0U /* Clean Dcache. */ +#define QURT_DCINV 1U /* Invalidate Dcache. */ +#define QURT_DCCLEANINV 2U /* Clean and invalidate Dcache. */ +#define QURT_ICINV 3U /* Invalidate Icache. */ +#define QURT_DUMP_DCTAGS 4U /* For testing purpose. */ +#define QURT_FLUSH_ALL 5U /* Flush entire L1 and L2 cache. */ +#define QURT_TABLE_FLUSH 6U /* Flush based on table of physical pages */ +#define QURT_CLEAN_INVALIDATE_ALL 7U /* Flush and invalidate entire L1 and L2 cache. */ +#define QURT_L2CACHE_LOCK_LINES 8U /* l2 cache lock lines */ +#define QURT_L2CACHE_UNLOCK_LINES 9U /* l2 cache unlock lines */ +#define QURT_CLEAN 10U /* Flush L1 and L2 cache */ +#define QURT_CLEAN_INVALIDATE 11U /* Flush and invalidate L1 and L2 cache. */ +#define QURT_CLEAN_INVALIDATE_L2 12U /* Flush and invalidate entire L2 cache. */ + +/**@ingroup chapter_prefined_symbols */ +/**@xreflabel{hdr:QURT_API_VERSION}*/ + + +/* Process state. */ +#define QURT_UPDATE_PROCESS_STATE 0 /**< */ +#define QURT_MP_INIT 1 /*< */ +#define QURT_MP_RUNNING 2 /*< */ +#define QURT_MP_STOPPED 3 /*< */ + +/* QuRT reset reason. */ +#define QURT_NORMAL_BOOT 0 /* Normal boot. */ +#define QURT_WARM_BOOT 1 /* Power collapse warm boot. */ +#define QURT_WARM_BOOT_L2_RETENTION 2 /* Power collapse with L2 retention warm boot. */ +#define QURT_WARM_BOOT_SAVE_TCM 3 /* Power collapse with saving TCM. */ +#define QURT_QUICK_BOOT 4 /* Deep sleep. */ + +/* QuRT Wait for Idle command */ +#define QURT_WAIT_FOR_IDLE_DISABLE 0 /*< */ +#define QURT_WAIT_FOR_IDLE_ENABLE 1 /*< */ +#define QURT_WAIT_FOR_IDLE 2 /*< */ +#define QURT_WAIT_FOR_IDLE_CANCEL 3 /*< */ + +/*QuRT island exit stages */ +#define QURT_ISLAND_EXIT_STAGE1 1 /*< */ +#define QURT_ISLAND_EXIT_STAGE2 2 /*< */ + +#define QURT_MAX_NAME_LEN 64 /*< */ + +#define MAX_POOL_RANGES 16 /*< */ + +/* key definitions for debug thread info */ +//#define MAX_TCB_KEY 40 //whatever is a good number or makes debug thread structure be 1K +#define KEY_SCHDULER_STATE 1 /*< */ +#define KEY_PRIORITY 2 /*< */ +#define KEY_PRIORITY_ORIG 3 /*< */ +#define KEY_STACK_BOTTOM 4 // Currently not populated +#define KEY_STACK_TOP 5 // Currently not populated +#define KEY_HVX_STATE 6 /*< */ +#define KEY_FUTEX_OBJECT 7 /*< */ +#define KEY_THREAD_ID 8 /*< */ +#define KEY_PROFILE_CYCLE_LO 9 // Currently not populated +#define KEY_PROFILE_CYCLE_HI 10 // Currently not populated +#define KEY_ERROR_ADDRESS 11 // This holds the BADVA +#define KEY_ERROR_CAUSE 12 // This is the same as QURT_error_info.cause +#define KEY_ERROR_CAUSE2 13 // This is the same as QURT_error_info.cause2 +#define KEY_ERROR_SSR 14 /*< Holds the SSR value */ +#define QURT_RESERVED -1 + +/* VTLB method IDs. 
*/ +#define QURT_VTLB_ENTRY_CREATE 0U +#define QURT_VTLB_ENTRY_DELETE 1U +#define QURT_VTLB_ENTRY_READ 2U +#define QURT_VTLB_ENTRY_WRITE 3U +#define QURT_VTLB_ENTRY_PROBE 4U +#define QURT_VTLB_ENTRY_SPLIT 5U +#define QURT_VTLB_ENTRY_MERGE 6U +#define QURT_VTLB_ENTRY_STATISTICS 7U +#define QURT_VTLB_ENTRY_SET_SPECIAL 8U +#define QURT_VTLB_QUEUE_PPAGE 9U +#define QURT_VTLB_RECLAIM_STACK_PAGES 10U +#define QURT_VTLB_ASID_SET_STATE_FAST 11U +#define QURT_VTLB_ASID_SET_STATE 12U +#define QURT_VTLB_ENTRY_SET_EXTENSION 13U +#define QURT_VTLB_ENTRY_CLEAR_EXTENSION 14U + +/* VTCM window access control HWIO programming. */ +#define QURT_VTCM_WINDOW_ENABLE 1U +#define QURT_VTCM_WINDOW_DISABLE 0U +#define QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT 0xFFFU +#define QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT 0U + +/** @cond */ +/* ETM source - PC or data access */ +#define QURT_ETM_SOURCE_PC 0U /**< Memory source of SAC* is PC. */ +#define QURT_ETM_SOURCE_DATA 1U /**< Memory source of SAC* is data. */ + +/* ETM PID status flags */ +#define QURT_ETM_NO_PID 0xFFFFFFFF /**< No PID is selected. */ +/** @endcond */ + +/* execution context */ +#define QURT_CTX_USER 1 +#define QURT_CTX_GUEST 2 + +/* Profiling STID */ +#define QURT_STID_DEFAULT 0U + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_CONSTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_cycles.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_cycles.h new file mode 100755 index 0000000000000..b599493f5d563 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_cycles.h @@ -0,0 +1,301 @@ + +#ifndef QURT_CYCLES_H +#define QURT_CYCLES_H 1 +/** + @file qurt_cycles.h + Prototypes of kernel pcycle API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + /*===================================================================== + Functions + ======================================================================*/ + +/*======================================================================*/ + +/**@ingroup func_qurt_profile_reset_idle_pcycles + @xreflabel{hdr:qurt_profile_reset_idle_pcycles} + Sets the per-hardware-thread idle cycle counts to zero. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_profile_reset_idle_pcycles (void); + +/*======================================================================*/ +/**@ingroup func_qurt_profile_get_thread_pcycles + @xreflabel{hdr:qurt_profile_get_thread_pcycles} + Gets the count of the running processor cycles for the current thread.\n + Returns the current running processor cycle count for the current QuRT thread. + + @note1hang Profiling shall be enabled first to start the cycle counting. + The cycles are accumulated once the profiling is enabled and + resets on #qurt_profile_reset_threadid_pcycles + + @return + Integer -- Running processor cycle count for current thread. + + @dependencies + None. 
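+
+   A minimal measurement sketch (illustrative only; do_work() is a
+   placeholder for the code being profiled):
+   @code
+   qurt_profile_enable(1);                                    // start counting
+   unsigned long long t0 = qurt_profile_get_thread_pcycles();
+   do_work();
+   unsigned long long used = qurt_profile_get_thread_pcycles() - t0;
+   qurt_profile_enable(0);                                    // stop counting
+   @endcode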
+*/
+/* ======================================================================*/
+unsigned long long int qurt_profile_get_thread_pcycles(void);
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_core_pcycles
+   @xreflabel{hdr:qurt_get_core_pcycles}
+   Gets the count of core processor cycles executed.\n
+   Returns the current number of running processor cycles executed since the Hexagon
+   processor was last reset.
+
+   This value is based on the hardware core clock, which varies in speed according to the
+   processor clock frequency.
+
+   @note1hang Because the hardware core clock stops running when the processor shuts
+              down (due to all of the hardware threads being idle), treat the cycle values returned
+              by this operation as relative rather than absolute.
+
+   @note1cont Thread cycle counts are valid only in the V4 Hexagon processor version.
+
+   @return
+   Integer -- Current count of core processor cycles.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+unsigned long long int qurt_get_core_pcycles(void);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_idle_pcycles
+
+   @deprecated Use #qurt_profile_get_idle_pcycles2 instead.
+
+   Gets the current idle processor cycle counts for a maximum of 6 hardware threads. Use
+   #qurt_profile_get_idle_pcycles2 to read pcycles without a limit on the maximum number
+   of hardware threads.
+
+   This operation accepts a pointer to a user-defined array, and writes to the array the current
+   idle cycle count for each hardware thread.
+
+   Each count value represents the number of processor cycles that have elapsed on the
+   corresponding hardware thread while that thread has been in Wait mode.\n
+
+   @note1hang This operation does not return the idle cycles that occur when the Hexagon
+              processor shuts down (due to all of the hardware threads being idle).
+              Idle cycle counts accumulate irrespective of whether profiling is enabled,
+              and reset on #qurt_profile_reset_idle_pcycles.
+
+   @param[out] pcycles User array where the function stores the current idle cycle count values.
+                       The array size should be at least the number of hardware threads intended.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_get_idle_pcycles (unsigned long long *pcycles);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_idle_pcycles2
+   Gets the current idle processor cycle counts for the maximum available hardware threads.
+
+   This operation accepts a pointer to a user-defined array with its length in bytes, and writes
+   to the array the current idle cycle count for each hardware thread.
+
+   Each count value represents the number of processor cycles that have elapsed on the
+   corresponding hardware thread while that thread has been in Wait mode.\n
+
+   @note1hang This operation does not return the idle cycles that occur when the Hexagon
+              processor shuts down (due to all of the hardware threads being idle).
+              Idle cycle counts accumulate irrespective of profiling enable status, and
+              reset on #qurt_profile_reset_idle_pcycles.
+
+   @param[out] pcycles User array where the function stores the current idle cycle count values.
+                       The array size should match the number of hardware threads intended.
+                       Call #qurt_sysenv_get_max_hw_threads to determine the array size required.
+
+   @param[in] length_in_bytes Length of the pcycles array in bytes. If the array is smaller
+                              than required for the maximum available hardware threads,
+                              an error code is returned.
+
+   @return
+   #QURT_EOK     -- Successful operation; all data was stored to the destination array.
+   #QURT_EFAILED -- Operation failed because the #pcycles array is too small.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_profile_get_idle_pcycles2 (unsigned long long *pcycles, unsigned int length_in_bytes);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_threadid_pcycles
+
+   @deprecated Use #qurt_profile_get_threadid_pcycles2 instead.
+
+   Gets the current per-hardware-thread running cycle counts for the specified QuRT
+   thread for a maximum of 6 hardware threads.
+
+   Each count value represents the number of processor cycles that have elapsed on the
+   corresponding hardware thread while that thread has been scheduled for the specified
+   QuRT thread.
+
+   @note1hang Profiling shall be enabled first to start the cycle counting.
+              The cycles are accumulated once profiling is enabled and
+              reset on #qurt_profile_reset_threadid_pcycles.
+
+   @param[in]  thread_id Valid thread identifier.
+   @param[out] pcycles   Pointer to a user array where the function stores the current running
+                         cycle count values. The array size should be at least the number of
+                         hardware threads intended.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_get_threadid_pcycles (int thread_id, unsigned long long *pcycles);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_threadid_pcycles2
+
+   Gets the current per-hardware-thread running cycle counts for the specified QuRT
+   thread for the maximum available hardware threads.
+
+   Each count value represents the number of processor cycles that have elapsed on the
+   corresponding hardware thread while that thread has been scheduled for the specified
+   QuRT thread.
+
+   @note1hang Profiling shall be enabled first to start the cycle counting.
+              The cycles are accumulated once profiling is enabled and
+              reset on #qurt_profile_reset_threadid_pcycles.
+
+   @param[in]  thread_id       Thread identifier.
+   @param[out] pcycles         Pointer to a user array where the function stores the current running
+                               cycle count values. The array size should match the number of
+                               hardware threads intended.
+                               Call #qurt_sysenv_get_max_hw_threads to determine the array size required.
+   @param[in]  length_in_bytes Length of the pcycles array in bytes. If the array is smaller
+                               than required for the maximum available hardware threads,
+                               an error code is returned.
+
+   @return
+   #QURT_EOK       -- Successful operation; all data was stored to the destination array.
+   #QURT_EFAILED   -- Operation failed because the #pcycles array is too small.
+   #QURT_ENOTHREAD -- Operation failed due to an invalid #thread_id.
+
+   @dependencies
+   None.
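+
+   A sizing sketch (illustrative only; it assumes qurt_sysenv_get_max_hw_threads()
+   returns the hardware-thread count through its output parameter, and 'thread_id'
+   is the target thread's identifier):
+   @code
+   unsigned int hw_threads = 0;
+   (void)qurt_sysenv_get_max_hw_threads(&hw_threads);
+   unsigned long long pcycles[QURT_MAX_HTHREAD_LIMIT];   // worst-case bound
+   int rc = qurt_profile_get_threadid_pcycles2(thread_id, pcycles,
+                hw_threads * (unsigned int)sizeof(pcycles[0]));
+   @endcode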
+*/
+/* ======================================================================*/
+int qurt_profile_get_threadid_pcycles2 (int thread_id, unsigned long long *pcycles, unsigned int length_in_bytes);
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_reset_threadid_pcycles
+   @xreflabel{hdr:qurt_profile_reset_threadid_pcycles}
+   Sets the per-hardware-thread running cycle counts to zero for the specified QuRT thread.
+
+   @param[in] thread_id Thread identifier.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_reset_threadid_pcycles (int thread_id);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_enable
+   @xreflabel{hdr:qurt_profile_enable}
+   Enables profiling.\n
+   Enables or disables cycle counting of the running and idle processor cycles.
+   Profiling is disabled by default. \n
+
+   @note1hang Enabling profiling does not automatically reset the cycle counts -- this must be
+              done explicitly by calling the reset operations before starting cycle counting.
+              Cycle counting starts from the instant profiling is enabled using this API, and
+              halts when profiling is disabled.
+
+   @param[in] enable Profiling. Values: \n
+                     - 0 -- Disable profiling \n
+                     - 1 -- Enable profiling @tablebulletend
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_enable (int enable);
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_hthread_pcycles
+   @xreflabel{hdr:qurt_get_hthread_pcycles}
+   Reads the GCYCLE_nT register to allow performance measurement when N threads are in run mode.\n
+
+   @note1hang Returns 0 when the architecture is earlier than v67 or for an invalid HW thread ID.
+
+   @param[in] n Threads in run mode. Valid values are 1 through .
+
+   @return
+   Value read from the GCYCLE_nT register. This value indicates the total number of pcycles executed
+   from reset to the current point of execution when n threads are in run mode.
+
+   @dependencies
+   PMU must be enabled.
+*/
+/* ======================================================================*/
+unsigned int qurt_get_hthread_pcycles(int n);
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_hthread_commits
+   @xreflabel{hdr:qurt_get_hthread_commits}
+   Reads the GCOMMIT_nT register to allow performance measurement when N threads are in run mode.\n
+
+   @note1hang Returns 0 when the architecture is earlier than v67 or for an invalid HW thread ID.
+
+   @param[in] n Threads in run mode. Valid values: 1 through .
+
+   @return
+   Value read from the GCOMMIT_nT register. This value indicates the total number of packets
+   committed from reset to the current point of execution when n threads are in run mode.
+
+   @dependencies
+   PMU must be enabled.
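+
+   A derived-metric sketch (illustrative only): packets per cycle while
+   exactly n hardware threads are running.
+   @code
+   unsigned int cycles  = qurt_get_hthread_pcycles(n);
+   unsigned int packets = qurt_get_hthread_commits(n);
+   // Guard against a zero count (for example, pre-v67 parts return 0).
+   double ppc = (cycles != 0U) ? (double)packets / (double)cycles : 0.0;
+   @endcode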
+*/
+/* ======================================================================*/
+unsigned int qurt_get_hthread_commits(int n);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_devtree.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_devtree.h
new file mode 100755
index 0000000000000..4adee45bb44a2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_devtree.h
@@ -0,0 +1,161 @@
+#ifndef QURT_DEVTREE_H
+#define QURT_DEVTREE_H
+/**
+  @file qurt_devtree.h
+  @brief Prototypes and structures for device tree aware QuRT library functions.
+
+Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+*/
+/* qurt_callback is included by qurt_qdi_driver.h and depends on NULL being defined.
+   The callback is not used here, so define NULL here to avoid including the world. */
+#ifndef NULL
+#define NULL ((void *) 0)
+#endif
+
+#include "libfdt.h"
+#include "DTBExtnLib.h"
+#include "qurt_qdi_ext.h"
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define INVALID_BLOB_ID (-1)
+#define DEFAULT_BLOB_ID 0
+
+/** QuRT device tree mapping macros. */
+#define QURT_DT_MAPPING_FAILED (-1)
+#define QURT_DT_FLAG_ISLAND    0x1
+#define QURT_DT_FLAG_PHYSADDR  0x2
+
+/** Device tree type for the root PD device tree.
+    The root PD device tree will typically describe the hardware in the subsystem.
+    This is the /soc portion of the device tree. */
+#define QURT_DT_BLOB_TYPE_ROOT 0
+
+/** Device tree type for the local device tree.
+    The local device tree will typically contain the software settings.
+    This is the /sw portion of the device tree. */
+#define QURT_DT_BLOB_TYPE_LOCAL 1
+
+int qurt_devtree_init(void);
+
+/**@ingroup func_qurt_dt_mapping_create
+   Creates a memory mapping from the specified property of the specified device
+   tree node. Returns virtual addresses and sizes.
+
+   @param[in]  devtreeNode Device tree node.
+   @param[in]  flags       Flags to configure memory. Overloaded as the property
+                           index if regionName is NULL.
+   @param[in]  regionName  Identifies the property to use for mapping; should
+                           resemble a region.
+   @param[in]  regionIdx   Index of the range to use within the property.
+   @param[out] vaddr       Return pointer for the virtual region address.
+   @param[out] size        Return pointer for the virtual region size.
+
+   @return
+   Result code indicating success or failure. \n
+*/
+int qurt_dt_mapping_create(fdt_node_handle *devtreeNode, int flags, char *regionName, int regionIdx,
+                           unsigned long long *vaddr, unsigned long long *size);
+
+/**@ingroup func_qurt_dt_mapping_create2
+
+   Creates a memory mapping from the specified property of the specified device
+   tree node.
+
+   Returns virtual addresses and sizes according to the architecture (that is, either 32-bit or 64-bit).
+
+   @param[in] devtreeNode Device tree node.
+
+   @param[in] dt_map_flags Flags to configure the memory mapping; reserved for future use.
+                           (0) - Default value; assumes the details from the DT node are physical
+                           address and size.
+                           QURT_DT_FLAG_ISLAND
+
+                           NOTE: The PA needs to be added to the corresponding island spec to
+                           create an island mapping.
+
+   @param[in] regionName  NULL, or the name of the range to return; should
+                          resemble a region. Ex: reg-names = "base", "rx", "tx";
+
+   @param[in] regionIdx   Index of the range to return. Ex: reg = <0x1000 0x20>, <0x10000 0x100>, <0x18000 0x100 >;
+
+                          NOTE: If the client specifies both regionName and regionIdx,
+                          regionName takes precedence and regionIdx is ignored.
+
+  @param[in] dt_map_perm Mapping access permissions (R/W):
+                         QURT_PERM_READ
+                         QURT_PERM_WRITE
+
+  @param[in] cache_attr QuRT cache mode types:
+                        QURT_MEM_CACHE_DEVICE
+                        QURT_MEM_CACHE_WRITEBACK
+                        Other required cache type enums in qurt_types.h can also be passed.
+
+                        NOTE: No default value for cache & perm is present.
+                        The client always needs to pass one of the defined flags.
+
+  @param[out] vaddr Return pointer to the variable that holds the virtual address.
+  @param[out] size  Return pointer for the virtual region size.
+
+  @return
+  #QURT_EOK               Success indicating the mapping was created properly.
+  #QURT_DT_MAPPING_FAILED Failed to create the mapping.
+  #QURT_EINVALID          Mismatch in the architecture.
+
+  else FdtLib or third-party error code.
+
+*/
+int qurt_dt_mapping_create2(fdt_node_handle *devtreeNode, unsigned int dt_map_flags,
+        char *regionName, int regionIdx, unsigned int dt_map_perm, int cache_attr, void **vaddr, size_t *size);
+
+/**@ingroup func_qurt_dt_isr_register
+  Device tree aware registration of an interrupt service routine (ISR) to an ISR thread.
+  The interrupt defined in the specified device tree node is enabled when this function returns success.
+
+  @datatypes
+  #qurt_thread_t \n
+  #fdt_node_handle
+
+  @param[in] dt_node       Device tree node that specifies the interrupt property.
+  @param[in] dt_int_index  Index of the specific interrupt to use within the device tree node structure.
+                           Specify either this or dt_int_name; use -1 if the name string is used.
+  @param[in] dt_int_name   Name of the specific interrupt to use within the device tree node structure.
+                           Specify either this or dt_int_index; use NULL if the index is used.
+  @param[in] isr_thread_id ISR thread ID, returned from qurt_isr_create(), defined by qurt_isr_register2().
+  @param[in] prio          Priority of the ISR, defined by qurt_isr_register2().
+  @param[in] flags         Defines the ACK type. Values: \n
+                           #QURT_INT_NON_DELAYED_ACK - ISR is acknowledged by the interrupt handler routine
+                           in the kernel.
+                           #QURT_INT_DELAYED_ACK - Client chooses to acknowledge.
+                           Defined by qurt_isr_register2().
+  @param[in] isr           ISR with prototype void isr (void *arg, int int_num), defined by qurt_isr_register2().
+  @param[in] arg           First argument of the ISR when it is called to service the interrupt, defined by qurt_isr_register2().
+
+  @return
+  #QURT_EOK        -- Successfully registered the ISR for the interrupt \n
+  #QURT_EINT       -- Interrupt not configured \n
+  #QURT_EINVALID   -- Invalid thread ID \n
+  #QURT_EDISABLED  -- The feature is disabled \n
+  #QURT_EDUPLICATE -- Interrupt is already registered
+
+  @dependencies
+  Create the thread ID with qurt_isr_create().
+  Complete ISR registration with qurt_isr_register2().
+ */
+int qurt_dt_isr_register(fdt_node_handle *dt_node, int dt_int_index, char * dt_int_name, qurt_thread_t isr_thread_id,
+                         unsigned short prio, unsigned short flags, void (*isr) (void *, int), void *arg);
+
+/**@ingroup func_qurt_dt_blob_id_get
+  Returns the Blob ID for the Blob type passed.
+  The value returned from this API can be passed as the Blob ID parameter to DTBExtnLib APIs.
+
+  @param[in] blob_type Blob type to look up.
+  @return Blob ID for the passed Blob Type.
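+
+  A minimal usage sketch (hypothetical):
+  @code
+  int blob_id = qurt_dt_blob_id_get(QURT_DT_BLOB_TYPE_LOCAL);
+  if (blob_id != INVALID_BLOB_ID) {
+      // blob_id can be passed as the Blob ID parameter to DTBExtnLib APIs.
+  }
+  @endcode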
+*/ +int qurt_dt_blob_id_get(unsigned int blob_type); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_ecc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_ecc.h new file mode 100755 index 0000000000000..09312684e99af --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_ecc.h @@ -0,0 +1,168 @@ +#ifndef QURT_ECC_H +#define QURT_ECC_H + + +/*===================================================================== + + @file qurt_ecc.h + @brief Prototypes of QuRT memory ECC API functions + + Copyright (c) 2018, 2020-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup exception_handling_types +@{ */ +// ECC memory definition +typedef enum { + QURT_ECC_MEM_L1_ICACHE = 0, /**< ECC memory L1 ICache. */ + QURT_ECC_MEM_L1_DCACHE = 1, /**< ECC memory L1 DCache.*/ + QURT_ECC_MEM_L2_CACHE = 2, /**< ECC memory L2 Cache.*/ + QURT_ECC_MEM_VTCM = 3 /**< ECC memory VTCM.*/ +} qurt_ecc_memory_t; +/** @} */ /* end_addtogroup exception_handling_types */ + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/** @addtogroup exception_handling_macros +@{ */ + +#define QURT_ECC_ERR_DETECTED_STATUS 0 /**< ECC error detected. */ +#define QURT_ECC_ERR_TYPE 1 /**< ECC error type.*/ +// ECC status type + +#define QURT_ECC_CORRECTABLE_COUNT (1<<0) /**< ECC correctable count.*/ +#define QURT_ECC_UNCORRECTABLE_COUNT (1<<1) /**< ECC uncorrectable count.*/ +#define QURT_ECC_REGION_LOGGING (1<<2) /**< ECC region logging.*/ +// ECC enable/disable definition + +#define QURT_ECC_PROTECTION_DISABLE (0<<0) /**< Bit 0. */ +#define QURT_ECC_PROTECTION_ENABLE (1<<0) /**< Bit 0. */ +/** @} */ /* end_addtogroup exception_handling_macros */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + + +/**@ingroup func_qurt_ecc_enable + Enables or disables ECC protection on a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values: + - #QURT_ECC_MEM_L1_ICACHE + - #QURT_ECC_MEM_L1_DCACHE + - #QURT_ECC_MEM_L2_CACHE + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] enable Set to one of the following values: + - #QURT_ECC_PROTECTION_ENABLE + - #QURT_ECC_PROTECTION_DISABLE @tablebulletend + + @return + - #QURT_EOK -- ECC enabling or disabling setup is performed successfully + - Others -- Failure + + @dependencies + None. + */ +int qurt_ecc_enable( qurt_ecc_memory_t memory, unsigned int enable ); + + +/**@ingroup func_qurt_ecc_get_error_status + Gets ECC error status for a specified memory. 
+
+  @datatypes
+  #qurt_ecc_memory_t
+
+  @param[in] memory Set to one of the following:
+                    - #QURT_ECC_MEM_L1_ICACHE
+                    - #QURT_ECC_MEM_L1_DCACHE
+                    - #QURT_ECC_MEM_L2_CACHE
+                    - #QURT_ECC_MEM_VTCM @tablebulletend
+
+  @param[in] type Set to one of the following:
+                  - #QURT_ECC_ERR_DETECTED_STATUS
+                  - #QURT_ECC_ERR_TYPE @tablebulletend
+
+  @return
+  Returns the following when the type is #QURT_ECC_ERR_DETECTED_STATUS:
+  - 0 -- No error detected \n
+  - 1 -- At least one error detected \n
+  Returns the following when the type is #QURT_ECC_ERR_TYPE: \n
+  - 0 through 1 -- Correctable error \n
+  - 2 -- Uncorrectable error
+
+  @dependencies
+  None.
+ */
+int qurt_ecc_get_error_status( qurt_ecc_memory_t memory, unsigned int type );
+
+
+/**@ingroup func_qurt_ecc_get_error_count
+  Gets the ECC error count for a specified memory.
+
+  @datatypes
+  #qurt_ecc_memory_t
+
+  @param[in] memory Set to one of the following values:\n
+                    - #QURT_ECC_MEM_L1_ICACHE \n
+                    - #QURT_ECC_MEM_L1_DCACHE \n
+                    - #QURT_ECC_MEM_L2_CACHE \n
+                    - #QURT_ECC_MEM_VTCM @tablebulletend
+
+  @param[in] type Set to one of the following values: \n
+                  - #QURT_ECC_CORRECTABLE_COUNT \n
+                  - #QURT_ECC_UNCORRECTABLE_COUNT @tablebulletend
+
+  @return
+  Error count for the specified error type.
+
+  @dependencies
+  None.
+ */
+int qurt_ecc_get_error_count( qurt_ecc_memory_t memory, unsigned int type );
+
+
+/**@ingroup func_qurt_ecc_clear_error_count
+  Clears the ECC error count or region logging for a specified memory.
+
+  @datatypes
+  #qurt_ecc_memory_t
+
+  @param[in] memory Set to one of the following values: \n
+                    - #QURT_ECC_MEM_L1_ICACHE \n
+                    - #QURT_ECC_MEM_L1_DCACHE \n
+                    - #QURT_ECC_MEM_L2_CACHE \n
+                    - #QURT_ECC_MEM_VTCM @tablebulletend
+
+  @param[in] type Set to one of the following values, or to multiple values OR'ed together: \n
+                  - #QURT_ECC_CORRECTABLE_COUNT \n
+                  - #QURT_ECC_UNCORRECTABLE_COUNT \n
+                  - #QURT_ECC_REGION_LOGGING @tablebulletend
+
+  @return
+  #QURT_EOK -- Error count successfully cleared \n
+  Others -- Failure at clearing the error count
+
+  @dependencies
+  None.
+ */
+int qurt_ecc_clear_error_count( qurt_ecc_memory_t memory, unsigned int type );
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ECC_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_error.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_error.h
new file mode 100755
index 0000000000000..f4666b396c378
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_error.h
@@ -0,0 +1,149 @@
+#ifndef QURT_ERROR_H
+#define QURT_ERROR_H
+
+/**
+  @file qurt_error.h
+  Error results -- QuRT defines a set of standard symbols for the error result values. This file lists the
+  symbols and their corresponding values.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021-2022, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_except.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup chapter_error
+@{ */
+
+/*=====================================================================
+Constants and macros
+======================================================================*/
+#define QURT_EOK 0 /**< Operation successfully performed. */
+#define QURT_EVAL 1 /**< Wrong values for the parameters. The specified page does not exist.
*/ +#define QURT_EMEM 2 /**< Not enough memory to perform the operation.*/ + +#define QURT_EINVALID 4 /**< Invalid argument value; invalid key. */ +/** @cond */ +#define QURT_EUNKNOWN 6 /**< Defined but never used in QuRT. */ +#define QURT_ENOMSGS 7 /**< Message queue is empty. */ +#define QURT_EBADF 9 /**< Bad message queue descriptor. */ +/** @endcond */ +#define QURT_EFAILED 12 /**< Operation failed. */ + +#define QURT_ENOTALLOWED 13 /**< Operation not allowed. */ + +/** @cond */ +#define QURT_EDUPCLSID 14 /*< Duplicate class ID. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_ENOREGISTERED 20 /**< No registered interrupts.*/ +/** @endcond */ + + +/** @cond */ +#define QURT_EISDB 21 /*< Power collapse failed due to ISDB being enabled. */ +#define QURT_ESTM 22 /*< Power collapse failed in a Single-threaded mode check. */ +/** @endcond */ + + +/** @cond rest_reg_dist */ +#define QURT_ETLSAVAIL 23 /**< No free TLS key is available. */ +#define QURT_ETLSENTRY 24 /**< TLS key is not already free. */ +/** @endcond */ + +#define QURT_EINT 26 /**< Invalid interrupt number (not registered). */ +/** @cond rest_reg_dist */ +#define QURT_ESIG 27 /**< Invalid signal bitmask (cannot set more than one signal at a time). */ +/** @endcond */ + +/** @cond */ +#define QURT_EHEAP 28 /**< No heap space is available. */ +#define QURT_ENOSPC 28 /**< No space to create another queue in the system. */ +#define QURT_EMEMMAP 29 /**< Physical address layout is not supported by the kernel. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_ENOTHREAD 30 /**< Thread no longer exists. */ +/** @endcond */ +/** @cond */ +#define QURT_EL2CACHE 31 /**< L2cachable is not supported in kernel invalidate/cleaninv. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_EALIGN 32 /**< Not aligned. */ +#define QURT_EDEREGISTERED 33 /**< Interrupt is already deregistered.*/ +/** @endcond */ + +/** @cond internal_only */ + +#define QURT_ETLBCREATESIZE 34 /**< TLB create error -- Incorrect size.*/ +#define QURT_ETLBCREATEUNALIGNED 35 /**< TLB create error -- Unaligned address.*/ +/** @endcond */ +/** @cond rest_reg_dist*/ +#define QURT_EEXISTS 35 /**< File or message queue already exists. */ +#define QURT_ENAMETOOLONG 36 /**< Name too long for message queue creation. */ +#define QURT_EPRIVILEGE 36 /**< Caller does not have privilege for this operation.*/ + +#define QURT_ECANCEL 37 /**< A cancellable request was canceled because the associated process was asked to exit.*/ +/** @endcond */ + +/** @cond */ +#define QURT_EISLANDTRAP 38 /*< Unsupported TRAP is called in Island mode.*/ + +#define QURT_ERMUTEXUNLOCKNONHOLDER 39 /*< Rmutex unlock by a non-holder.*/ +#define QURT_ERMUTEXUNLOCKFATAL 40 /*< Rmutex unlock error, all except the non-holder error.*/ +#define QURT_EMUTEXUNLOCKNONHOLDER 41 /*< Mutex unlock by a non-holder.*/ +#define QURT_EMUTEXUNLOCKFATAL 42 /*< Mutex unlock error, all except the non-holder error.*/ +#define QURT_EINVALIDPOWERCOLLAPSE 43 /*< Invalid power collapse mode requested. */ +/** @endcond */ +#define QURT_EISLANDUSEREXIT 44 /**< User call has resulted in island exit.*/ +#define QURT_ENOISLANDENTRY 45 /**< Island mode had not yet been entered.*/ +#define QURT_EISLANDINVALIDINT 46 /**< Exited Island mode due to an invalid island interrupt.*/ +/** @cond rest_reg_dist */ +#define QURT_ETIMEDOUT 47 /**< Operation timed-out. */ +#define QURT_EALREADY 48 /**< Operation already in progress. */ +/** @endcond */ + +#define QURT_ERETRY 49 /*< Retry the operation. 
*/
+#define QURT_EDISABLED 50 /*< Resource disabled. */
+#define QURT_EDUPLICATE 51 /*< Duplicate resource. */
+#define QURT_EBADR 53 /*< Invalid request descriptor. */
+#define QURT_ETLB 54 /*< Exceeded maximum allowed TLBs. */
+#define QURT_ENOTSUPPORTED 55 /*< Operation not supported. */
+/** @cond rest_reg_dist */
+#define QURT_ENORESOURCE 56 /**< No resource. */
+/** @endcond */
+
+#define QURT_EDTINIT 57 /**< Problem with device tree initialization. */
+#define QURT_EBUFLOCK 58 /*< Buffer lock failed because it was already locked many times. */
+#define QURT_ELOCKED 59 /**< Current operation failed as the buffer is locked. */
+#define QURT_EMSGSIZE 90 /*< Message queue msg_len is greater than the mq_msgsize attribute of the message queue. */
+
+
+#define QURT_ENOTCONFIGURED 91 /*< Interrupt is NOT configured. */
+
+#define QURT_EBANDWIDTHLIMIT 92 /*< Message queue send exceeds the bandwidth limit. */
+
+#define QURT_ECFIVIOLATION 93 /*< CFI violation detected. */
+
+#define QURT_EDESTROY 94 /**< A destroy request was made to waiting threads.*/
+
+#define QURT_EHMXNOTAVAIL 95 /**< HMX is not available to the target thread.*/
+#define QURT_EHMXNOTDETACHABLE 96 /**< HMX is not detachable from the target thread.*/
+
+#define QURT_EFATAL -1 /**< Fatal error. */
+
+/** @} */ /* end_addtogroup chapter_error */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ERROR_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_event.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_event.h
new file mode 100755
index 0000000000000..987f0fe79f227
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_event.h
@@ -0,0 +1,452 @@
+#ifndef QURT_EVENT_H
+#define QURT_EVENT_H
+/**
+  @file qurt_event.h
+  @brief Prototypes of kernel event API functions.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2018-2021, 2023 Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#include "qurt_consts.h"
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * System environment object type.
+ */
+/**@addtogroup sys_env_types
+@{ */
+/** QuRT swap pool information type. */
+typedef struct qurt_sysenv_swap_pools {
+   /** @cond */
+   unsigned int spoolsize; /* Swap pool size.*/
+   unsigned int spooladdr; /* Swap pool start address.*/
+   /** @endcond */
+}qurt_sysenv_swap_pools_t;
+
+/**QuRT application heap information type. */
+typedef struct qurt_sysenv_app_heap {
+   /** @cond */
+   unsigned int heap_base; /* Heap base address.*/
+   unsigned int heap_limit; /* Heap end address.*/
+   /** @endcond */
+} qurt_sysenv_app_heap_t ;
+
+/** QuRT architecture version information type. */
+typedef struct qurt_sysenv_arch_version {
+   /** @cond */
+   unsigned int arch_version; /*Architecture version.*/
+   /** @endcond */
+}qurt_arch_version_t;
+
+/** QuRT maximum hardware threads information type. */
+typedef struct qurt_sysenv_max_hthreads {
+   /** @cond */
+   unsigned int max_hthreads; /*Maximum number of hardware threads.*/
+   /** @endcond */
+}qurt_sysenv_max_hthreads_t;
+
+/** QuRT active hardware threads information type.
*/
+typedef struct qurt_sysenv_hthreads {
+   /** @cond */
+   unsigned int hthreads; /*Number of hardware threads initialized by QuRT.*/
+   /** @endcond */
+}qurt_sysenv_hthreads_t;
+
+/** QuRT maximum PI priority information type. */
+typedef struct qurt_sysenv_max_pi_prio {
+   /** @cond */
+   unsigned int max_pi_prio; /*Maximum priority-inheritance (PI) priority.*/
+   /** @endcond */
+}qurt_sysenv_max_pi_prio_t;
+
+/** QuRT process name information type. */
+typedef struct qurt_sysenv_procname {
+   /** @cond */
+   union {
+      unsigned int asid; /*Address space ID.*/
+      unsigned int pid; /*Process ID.*/
+   };
+   char name[QURT_MAX_NAME_LEN]; /* Process name.*/
+   /** @endcond */
+}qurt_sysenv_procname_t;
+
+/** QuRT stack profile count information type. */
+typedef struct qurt_sysenv_stack_profile_count {
+   /** @cond */
+   unsigned int count; /*Stack profile count for usage.*/
+   unsigned int count_watermark; /*Stack profile count for watermark.*/
+   /** @endcond */
+}qurt_sysenv_stack_profile_count_t;
+
+/**
+  QuRT system error event type.
+ */
+typedef struct _qurt_sysevent_error_t
+{
+    unsigned int thread_id; /**< Thread ID. */
+    unsigned int fault_pc;  /**< Fault PC. */
+    unsigned int sp;        /**< Stack pointer. */
+    unsigned int badva;     /**< Virtual data address where the exception occurred. */
+    unsigned int cause;     /**< QuRT error result. */
+    unsigned int ssr;       /**< Supervisor status register. */
+    unsigned int fp;        /**< Frame pointer. */
+    unsigned int lr;        /**< Link register. */
+    unsigned int pid;       /**< PID of the process to which this thread belongs.*/
+ } qurt_sysevent_error_t ;
+
+typedef struct _qurt_sysevent_error_1_t
+{
+    unsigned int thread_id; /**< Thread ID. */
+    unsigned int fault_pc;  /**< Fault PC. */
+    unsigned int sp;        /**< Stack pointer. */
+    unsigned int badva;     /**< Virtual data address where the exception occurred. */
+    unsigned int cause;     /**< QuRT error result. */
+    unsigned int ssr;       /**< Supervisor status register. */
+    unsigned int fp;        /**< Frame pointer. */
+    unsigned int lr;        /**< Link register. */
+    unsigned int pid;       /**< PID of the process to which this thread belongs.*/
+    unsigned int fkey;      /**< Framekey.*/
+    unsigned int reserved1; /**< Reserved.*/
+    unsigned int reserved2; /**< Reserved.*/
+    unsigned int reserved3; /**< Reserved.*/
+ } qurt_sysevent_error_1_t ;
+
+/** QuRT page fault error event information type. */
+typedef struct qurt_sysevent_pagefault {
+    qurt_thread_t thread_id; /**< Thread ID of the page fault thread. */
+    unsigned int fault_addr; /**< Accessed address that caused the page fault. */
+    unsigned int ssr_cause;  /**< SSR cause code for the page fault. */
+} qurt_sysevent_pagefault_t ;
+/** @} */ /* @endaddtogroup sys_env_types */
+/*=============================================================================
+                                 FUNCTIONS
+=============================================================================*/
+
+/*======================================================================*/
+/**
+  Gets the environment swap pool 0 information from the kernel.
+
+  @datatypes
+  #qurt_sysenv_swap_pools_t
+
+  @param[out] pools Pointer to the pools information.
+
+  @return
+  #QURT_EOK -- Success.
+
+  @dependencies
+  None.
+*/
+int qurt_sysenv_get_swap_spool0 (qurt_sysenv_swap_pools_t *pools );
+
+/*
+  Gets the environment swap pool 1 information from the kernel.
+
+  @datatypes
+  #qurt_sysenv_swap_pools_t
+
+  @param[out] pools Pointer to the pools information.
+
+  @return
+  #QURT_EOK -- Success.
+
+  @dependencies
+  None.
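+
+  A minimal usage sketch (hypothetical):
+  @code
+  qurt_sysenv_swap_pools_t pool;
+  if (qurt_sysenv_get_swap_spool1(&pool) == QURT_EOK) {
+      // pool.spooladdr and pool.spoolsize describe swap pool 1.
+  }
+  @endcode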
+*/ +int qurt_sysenv_get_swap_spool1(qurt_sysenv_swap_pools_t *pools ); + +/**@ingroup func_qurt_sysenv_get_app_heap + Gets information on the program heap from the kernel. + + @datatypes + #qurt_sysenv_app_heap_t + + @param[out] aheap Pointer to information on the program heap. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_app_heap(qurt_sysenv_app_heap_t *aheap ); + +/**@ingroup func_qurt_sysenv_get_arch_version + Gets the Hexagon processor architecture version from the kernel. + + @datatypes + #qurt_arch_version_t + + @param[out] vers Pointer to the Hexagon processor architecture version. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter + + @dependencies + None. +*/ +int qurt_sysenv_get_arch_version(qurt_arch_version_t *vers); + +/**@ingroup func_qurt_sysenv_get_max_hw_threads + Gets the maximum number of hardware threads supported in the Hexagon processor. + The API includes the disabled hardware threads to reflect the maximum + hardware thread count. + For example, if the image is configured for four hardware threads and hthread_mask is set to 0x5 in + cust_config.xml, only HW0 and HW2 are initialized by QuRT. + HW1 and HW3 are not used at all. Under such a scenario, + qurt_sysenv_get_max_hw_threads() still returns four. + + @datatypes + #qurt_sysenv_max_hthreads_t + + @param[out] mhwt Pointer to the maximum number of hardware threads supported in the Hexagon processor. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_max_hw_threads(qurt_sysenv_max_hthreads_t *mhwt ); + +/**@ingroup func_qurt_sysenv_get_hw_threads + Gets the number of hardware threads initialized by QuRT in Hexagon processor. + For example, if the image is configured for four hardware threads and hthread_mask is set to 0x5 in + cust_config.xml, QuRT only initializes HW0 and HW2. + HW1 and HW3 are not used. In this scenario, qurt_sysenv_get_hw_threads() returns 2. + + @datatypes + #qurt_sysenv_hthreads_t + + @param[out] mhwt Pointer to the number of hardware threads active in the Hexagon processor. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_hw_threads(qurt_sysenv_hthreads_t *mhwt ); + +/**@ingroup func_qurt_sysenv_get_max_pi_prio + Gets the maximum priority inheritance mutex priority from the kernel. + + @datatypes + #qurt_sysenv_max_pi_prio_t + + @param[out] mpip Pointer to the maximum priority inheritance mutex priority. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_max_pi_prio(qurt_sysenv_max_pi_prio_t *mpip ); + +/**@ingroup func_qurt_sysenv_get_process_name2 + Gets information on the system environment process names based on the client_handle argument. + + @datatypes + #qurt_sysenv_procname_t + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[out] pname Pointer to information on the process names in the system. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_process_name2(int client_handle, qurt_sysenv_procname_t *pname ); + +/**@ingroup func_qurt_sysenv_get_process_name + Gets information on the system environment process names from the kernel. 
+ + @datatypes + #qurt_sysenv_procname_t + + @param[out] pname Pointer to information on the process names in the system. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_process_name(qurt_sysenv_procname_t *pname ); + +/**@ingroup func_qurt_sysenv_get_stack_profile_count + Gets information on the stack profile count from the kernel. + + @datatypes + #qurt_sysenv_stack_profile_count_t + + @param[out] count Pointer to information on the stack profile count. + + @return + #QURT_EOK -- Success. + + @dependencies + None. +*/ +int qurt_sysenv_get_stack_profile_count(qurt_sysenv_stack_profile_count_t *count ); + +/**@ingroup func_qurt_exception_wait + Registers the program exception handler. + This function assigns the current thread as the QuRT program exception handler and suspends the + thread until a program exception occurs. + + When a program exception occurs, the thread is awakened with error information + assigned to the parameters of this operation. + + @note1hang If no program exception handler is registered, or if the registered handler + calls exit, QuRT raises a kernel exception. + If a thread runs in Supervisor mode, any errors are treated as kernel + exceptions. + + @param[out] ip Pointer to the instruction memory address where the exception occurred. + @param[out] sp Stack pointer. + @param[out] badva Pointer to the virtual data address where the exception occurred. + @param[out] cause Pointer to the QuRT error result code. + + @return + Registry status: \n + Thread identifier -- Handler successfully registered. \n + #QURT_EFATAL -- Registration failed. + + @dependencies + None. +*/ +unsigned int qurt_exception_wait (unsigned int *ip, unsigned int *sp, + unsigned int *badva, unsigned int *cause); + +unsigned int qurt_exception_wait_ext (qurt_sysevent_error_t * sys_err); + +/**@ingroup func_qurt_exception_wait3 + Registers the current thread as the QuRT program exception handler, and suspends the thread until a + program exception occurs. + When a program exception occurs, the thread is awakened with error information assigned to the specified + error event record. + If a program exception is raised when no handler is registered (or when a handler is registered, but it calls + exit), the exception is treated as fatal.\n + @note1hang If a thread runs in Monitor mode, all exceptions are treated as kernel exceptions.\n + @note1cont This function differs from qurt_exception_wait() by returning the error information in a data + structure rather than as individual variables. It also returns additional information (for example, SSR, FP, and LR). + + @param[out] sys_err Pointer to the qurt_sysevent_error_1_t type structure. + @param[in] sys_err_size Size of the qurt_sysevent_error_1_t structure. + + @return + Registry status: \n + - #QURT_EFATAL -- Failure. \n + - Thread ID -- Success. + + @dependencies + None. +*/ + +unsigned int qurt_exception_wait3(void * sys_err, unsigned int sys_err_size); + +/**@ingroup func_qurt_exception_raise_nonfatal + Raises a nonfatal program exception in the QuRT program system. + + For more information on program exceptions, see Section @xref{dox:exception_handling}. + + This operation never returns -- the program exception handler is assumed to perform all + exception handling before terminating or reloading the QuRT program system. + + @note1hang The C library function abort() calls this operation to indicate software + errors. 
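+
+  A minimal usage sketch (hypothetical; #QURT_EFAILED used only as an
+  illustrative error result code):
+  @code
+  // Hand an unrecoverable software error to the registered program
+  // exception handler; this call does not return.
+  qurt_exception_raise_nonfatal(QURT_EFAILED);
+  @endcode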
+
+  @param[in] error QuRT error result code (Section @xref{dox:error_results}).
+
+  @return
+  Integer -- Unused.
+
+  @dependencies
+  None.
+*/
+int qurt_exception_raise_nonfatal (int error) __attribute__((noreturn));
+
+
+/**@ingroup func_qurt_exception_raise_fatal
+  Raises a fatal program exception in the QuRT system.
+
+  Fatal program exceptions terminate the execution of the QuRT system without invoking
+  the program exception handler.
+
+  For more information on fatal program exceptions, see Section @xref{dox:exception_handling}.
+
+  This operation always returns, so the calling program can perform the necessary shutdown
+  operations (data logging, and so on).
+
+  @note1hang Context switches do not work after this operation has been called.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_exception_raise_fatal (void);
+
+unsigned int qurt_enable_floating_point_exception(unsigned int mask);
+
+/**@ingroup func_qurt_exception_enable_fp_exceptions
+  Enables the specified floating point exceptions as QuRT program exceptions.
+
+  The exceptions are enabled by setting the corresponding bits in the Hexagon
+  control user status register (USR).
+
+  The mask argument specifies a mask value identifying the individual floating
+  point exceptions to set. The exceptions are represented as defined symbols
+  that map into bits 0 through 31 of the 32-bit flag value.
+  Multiple floating point exceptions are specified by OR'ing together the individual
+  exception symbols.\n
+  @note1hang This function must be called before performing any floating point operations.
+
+  @param[in] mask Floating point exception types. Values: \n
+          - #QURT_FP_EXCEPTION_ALL \n
+          - #QURT_FP_EXCEPTION_INEXACT \n
+          - #QURT_FP_EXCEPTION_UNDERFLOW \n
+          - #QURT_FP_EXCEPTION_OVERFLOW \n
+          - #QURT_FP_EXCEPTION_DIVIDE0 \n
+          - #QURT_FP_EXCEPTION_INVALID @tablebulletend
+
+  @return
+  Updated contents of the USR.
+
+  @dependencies
+  None.
+*/
+
+static inline unsigned int qurt_exception_enable_fp_exceptions(unsigned int mask)
+{
+    return qurt_enable_floating_point_exception(mask);
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_EVENT_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_except.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_except.h
new file mode 100755
index 0000000000000..e1684c80e3d50
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_except.h
@@ -0,0 +1,185 @@
+#ifndef QURT_EXCEPT_H
+#define QURT_EXCEPT_H
+
+/**
+  @file qurt_except.h
+  @brief Defines Cause and Cause2 codes for error handling.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+  QuRT supports error handling to handle CPU-detected exceptions and software errors.
+  QuRT treats all errors as either fatal errors or nonfatal errors.
+
+  @section sec1 Fatal errors
+  All supervisor mode exceptions are treated as fatal errors.
+  If a registered exception handler calls qurt_exit(), it is treated as a fatal error.
+  Fatal errors result in saving the context of the primary hardware thread to QURT_error_info and the rest of the thread contexts to the corresponding TCBs.
+  All hardware threads are eventually stopped and the cache is flushed.
+  The NMI exception is treated a little differently from other fatal errors: QuRT saves the contexts of all the hardware threads into QURT_error_info.\n
+
+  @subsection subsection1 Debugging fatal errors
+  - QURT_error_info.status.status -- Indicates that an error occurred.
+  - QURT_error_info.status.cause -- Cause code for the fatal error; Cause and Cause2 details are listed below.
+  - QURT_error_info.status.cause2 -- Cause2 code for the fatal error; Cause and Cause2 details are listed below.
+  - QURT_error_info.status.fatal -- Indicates whether a fatal error occurred. A user error can result in a fatal error if the exception handler is not registered.
+  - QURT_error_info.status.hw_tnum -- Indicates the index of QURT_error_info.local_regs[], where the context is saved when the error is a fatal error.
+  - QURT_error_info.global_regs -- Contains the values of the global registers of the Q6.
+  - QURT_error_info.local_regs[QURT_error_info.status.hw_tnum] -- Provides the CPU context when the error is a supervisor error.
+
+
+
+  @subsection subsection2 Debugging nonfatal errors
+  - QURT_error_info.user_errors -- All user errors are logged here.
+  - QURT_error_info.user_errors.counter -- Index to the last logged error.
+  - QURT_error_info.user_errors.entry[0...counter] -- Structure for the logged error.
+  - QURT_error_info.user_errors.entry[0...counter].error_tcb -- TCB for the user error.
+  - QURT_error_info.user_errors.entry[0...counter].error_tcb.error -- Information about the error; Cause, Cause2, Badva and hardware thread ID.
+  - QURT_error_info.user_errors.entry[0...counter].error_code -- ((cause2 << 8) 'Logical OR' (cause)); Cause and Cause2 details are listed below.
+  - QURT_error_info.user_errors.entry[0...counter].hw_thread -- Hardware thread ID for the error.
+  - QURT_error_info.user_errors.entry[0...counter].pcycle -- Pcycle for the error.
+
+@note
+  Important usage note:
+  Cause and Cause2 are error codes used to distinguish multiple errors.
+  SSR and BADVA are inconclusive without the vector number.
+  Cause and Cause2 can each range from 1 to 255, and every cause can have 1 to 255 error codes.
+  Hence the system can have up to 255 * 255 unique error codes.
+  The combination is represented as ((cause2 << 8) 'Logical OR' (cause)).
+  Some Cause2 codes are statically defined, whereas some are obtained from the SSR[7:0] cause codes, depending on the cause code.
+  SSR cause codes are defined in the Hexagon reference manual.
+  All possible combinations are listed below.
+*/
+/** @addtogroup chapter_error
+@{ */
+/* cause - error type - 8-bits*/
+#define QURT_EXCEPT_PRECISE 0x01U /**< Precise exception occurred. For this cause code, Cause2 is SSR[7:0].*/
+#define QURT_EXCEPT_NMI 0x02U /**< NMI occurred; Cause2 is not defined. */
+#define QURT_EXCEPT_TLBMISS 0x03U /**< TLBMISS RW occurred; for this cause code, Cause2 is SSR[7:0]. */
+#define QURT_EXCEPT_RSVD_VECTOR 0x04U /**< Interrupt raised on a reserved vector, which must never occur. Cause2 is not defined. */
+#define QURT_EXCEPT_ASSERT 0x05U /**< Kernel assert. Cause2 QURT_ABORT_* are listed below. */
+#define QURT_EXCEPT_BADTRAP 0x06U /**< trap0(num) called with unsupported num. Cause2 is 0. */
+#define QURT_EXCEPT_UNDEF_TRAP1 0x07U /**< Trap1 is not supported. Using Trap1 causes this error. Cause2 is not defined. */
+#define QURT_EXCEPT_EXIT 0x08U /**< Application called qurt_exit() or qurt_exception_raise_nonfatal(). Can be called from the C library.
Cause2 is "[Argument passed to qurt_exception_raise_nonfatal() & 0xFF]". */ +#define QURT_EXCEPT_TLBMISS_X 0x0AU /**< TLBMISS X (execution) occurred. Cause2 is not defined. */ +#define QURT_EXCEPT_STOPPED 0x0BU /**< Running thread stopped due to fatal error on other hardware thread. Cause2 is not defined. */ +#define QURT_EXCEPT_FATAL_EXIT 0x0CU /**< Application called qurt_fatal_exit(). Cause2 is not defined. */ +#define QURT_EXCEPT_INVALID_INT 0x0DU /**< Kernel received an invalid L1 interrupt. Cause2 is not defined. */ +#define QURT_EXCEPT_FLOATING_POINT 0x0EU /**< Kernel received an floating point error. Cause2 is not defined. */ +#define QURT_EXCEPT_DBG_SINGLE_STEP 0x0FU /**< Cause2 is not defined. */ +#define QURT_EXCEPT_TLBMISS_RW_ISLAND 0x10U /**< Read write miss in Island mode. Cause2 QURT_TLB_MISS_RW_MEM* are listed below. */ +#define QURT_EXCEPT_TLBMISS_X_ISLAND 0x11U /**< Execute miss in Island mode. For this cause code, Cause2 is SSR[7:0]. */ +#define QURT_EXCEPT_SYNTHETIC_FAULT 0x12U /**< Synthetic fault with user request that kernel detected. Cause2 QURT_SYNTH_* are listed below. */ +#define QURT_EXCEPT_INVALID_ISLAND_TRAP 0x13U /**< Invalid trap in Island mode. Cause2 is trap number. */ +#define QURT_EXCEPT_UNDEF_TRAP0 0x14U /**< trap0(num) was called with unsupported num. Cause2 is trap number. */ +#define QURT_EXCEPT_PRECISE_DMA_ERROR 0x28U /**< Precise DMA error. Cause2 is DM4[15:8]. Badva is DM5 register. */ + +#define QURT_ECODE_UPPER_LIBC (0U << 16) /**< Upper 16 bits is 0 for libc. */ +#define QURT_ECODE_UPPER_QURT (0U << 16) /**< Upper 16 bits is 0 for QuRT. */ +#define QURT_ECODE_UPPER_ERR_SERVICES (2U << 16) /**< Upper 16 bits is 2 for error service. */ +/** @cond */ +#define QURT_ECODE_ISLAND_INVALID_QDI 3U /**< Passing invalid QDI method in island. */ +/** @endcond */ + +/* Cause2 for QURT_EXCEPT_SYNTHETIC_FAULT cause- 8bits */ +#define QURT_SYNTH_ERR 0x01U /**< */ +#define QURT_SYNTH_INVALID_OP 0x02U /**< */ +#define QURT_SYNTH_DATA_ALIGNMENT_FAULT 0x03U /**< */ +#define QURT_SYNTH_FUTEX_INUSE 0x04U /**< */ +#define QURT_SYNTH_FUTEX_BOGUS 0x05U /**< */ +#define QURT_SYNTH_FUTEX_ISLAND 0x06U /**< */ +#define QURT_SYNTH_FUTEX_DESTROYED 0x07U /**< */ +#define QURT_SYNTH_PRIVILEGE_ERR 0x08U /**< */ + +/* Cause2 - Abort cause reason - 8 bits */ +/* ERR_ASSERT cause */ +#define QURT_ABORT_FUTEX_WAKE_MULTIPLE 0x01U /**< Abort cause - futex wake multiple. */ +#define QURT_ABORT_WAIT_WAKEUP_SINGLE_MODE 0x02U /**< Abort cause - thread waiting to wake up in Single Threaded mode. */ +#define QURT_ABORT_TCXO_SHUTDOWN_NOEXIT 0x03U /**< Abort cause - call TCXO shutdown without exit. */ +#define QURT_ABORT_FUTEX_ALLOC_QUEUE_FAIL 0x04U /**< Abort cause - futex allocation queue failure - QURTK_futexhash_lifo empty. */ +#define QURT_ABORT_INVALID_CALL_QURTK_WARM_INIT 0x05U /**< Abort cause - invalid call QURTK_warm_init() in NONE CONFIG_POWER_MGMT mode. */ +#define QURT_ABORT_THREAD_SCHEDULE_SANITY 0x06U /**< Abort cause - sanity schedule thread is not supposed to run on the current hardware thread. */ +#define QURT_ABORT_REMAP 0x07U /**< Remap in the page table; the correct behavior must remove mapping if necessary. */ +#define QURT_ABORT_NOMAP 0x08U /**< No mapping in page table when removing a user mapping. */ +#define QURT_ABORT_OUT_OF_SPACES 0x09U +#define QURT_ABORT_INVALID_MEM_MAPPING_TYPE 0x0AU /**< Invalid memory mapping type when creating qmemory. */ +#define QURT_ABORT_NOPOOL 0x0BU /**< No pool available to attach. 
*/
+#define QURT_ABORT_LIFO_REMOVE_NON_EXIST_ITEM 0x0CU /**< Cannot allocate more futex waiting queues. */
+#define QURT_ABORT_ARG_ERROR 0x0DU
+#define QURT_ABORT_ASSERT 0x0EU /**< Assert abort. */
+#define QURT_ABORT_FATAL 0x0FU /**< Fatal error; must never occur. */
+#define QURT_ABORT_FUTEX_RESUME_INVALID_QUEUE 0x10U /**< Abort cause - invalid queue ID in futex resume. */
+#define QURT_ABORT_FUTEX_WAIT_INVALID_QUEUE 0x11U /**< Abort cause - invalid queue ID in futex wait. */
+#define QURT_ABORT_FUTEX_RESUME_INVALID_FUTEX 0x12U /**< Abort cause - invalid futex object in the hashtable. */
+#define QURT_ABORT_NO_ERHNDLR 0x13U /**< No registered error handler. */
+#define QURT_ABORT_ERR_REAPER 0x14U /**< Exception in the reaper thread. */
+#define QURT_ABORT_FREEZE_UNKNOWN_CAUSE 0x15U /**< Abort in a thread freeze operation. */
+#define QURT_ABORT_FUTEX_WAIT_WRITE_FAILURE 0x16U /**< During futex wait processing, could not perform a necessary write operation to userland data; most likely due to a DLPager eviction. */
+#define QURT_ABORT_ERR_ISLAND_EXP_HANDLER 0x17U /**< Exception in the Island exception handler task. */
+#define QURT_ABORT_L2_TAG_DATA_CHECK_FAIL 0x18U /**< Detected an error in the L2 tag/data during warm boot. The L2 tag/data check is done when CONFIG_DEBUG_L2_POWER_COLLAPSE is enabled. */
+#define QURT_ABORT_ERR_SECURE_PROCESS 0x19U /**< Abort error in a secure process. */
+#define QURT_ABORT_ERR_EXP_HANDLER 0x20U /**< No exception handler, or the handler caused an exception. */
+#define QURT_ABORT_ERR_NO_PCB 0x21U /**< PCB of the thread context failed initialization; PCB was NULL. */
+#define QURT_ABORT_NO_PHYS_ADDR 0x22U /**< Unable to find the physical address for the virtual address. */
+#define QURT_ABORT_OUT_OF_FASTINT_CONTEXTS 0x23U /**< Fast interrupt contexts exhausted. */
+#define QURT_ABORT_CLADE_ERR 0x24U /**< Fatal error seen with the CLADE interrupt. */
+#define QURT_ABORT_ETM_ERR 0x25U /**< Fatal error seen with the ETM interrupt. */
+#define QURT_ABORT_ECC_DED_ASSERT 0x26U /**< ECC two-bit DED error. */
+#define QURT_ABORT_VTLB_ERR 0x27U /**< Fatal error in the VTLB layer. */
+#define QURT_ABORT_TLB_ENCODE_DECODE_FAILURE 0x28U /**< Failure during the TLB encode or decode operation. */
+#define QURT_ABORT_VTLB_WALKOBJS_BOUND_FAILURE 0x29U /**< Failure to look up an entry in the page table. */
+#define QURT_ABORT_PHY_MEMORY_OWNERSHIP_FAILURE 0x30U /**< Failure to claim physical memory ownership. */
+#define QURT_ABORT_JTLB_SIZE_CHECK_FAIL 0x31U /**< JTLB size configured is more than the actual size in hardware. */
+#define QURT_ABORT_AUTOSTACK_ASSERT 0x32U /**< Error while handling a stack flimit exception. */
+
+/* Cause2 - TLB-miss_X - 8 bits */
+#define QURT_TLB_MISS_X_FETCH_PC_PAGE 0x60U /**< */
+#define QURT_TLB_MISS_X_2ND_PAGE 0x61U /**< */
+#define QURT_TLB_MISS_X_ICINVA 0x62U /**< */
+
+/* Cause2 - TLB-miss_RW - 8 bits */
+#define QURT_TLB_MISS_RW_MEM_READ 0x70U /**< */
+#define QURT_TLB_MISS_RW_MEM_WRITE 0x71U /**< */
+
+/** @cond rest_reg_dist */
+/* Cause2 - Floating point exception - 8 bits */
+#define QURT_FLOATING_POINT_EXEC_ERR 0xBFU /**< Execute floating-point.
*/
+/** @endcond */
+
+/** Cause2 - autostackv2 - 8 bits */
+#define QURT_AUTOSTACKV2_CANARY_NOT_MATCH 0xC1U
+#define QURT_AUTOSTACKV2_POOL_IDX_OFF_RANGE 0xC2U
+
+/** Cause2 - CFI violation - 8 bits */
+#define QURT_CFI_VIOLATION 0xC3U
+
+/** @cond rest_reg_dist*/
+/* Enable floating point exceptions */
+#define QURT_FP_EXCEPTION_ALL 0x1FU << 25 /**< */
+#define QURT_FP_EXCEPTION_INEXACT 0x1U << 29 /**< */
+#define QURT_FP_EXCEPTION_UNDERFLOW 0x1U << 28 /**< */
+#define QURT_FP_EXCEPTION_OVERFLOW 0x1U << 27 /**< */
+#define QURT_FP_EXCEPTION_DIVIDE0 0x1U << 26 /**< */
+#define QURT_FP_EXCEPTION_INVALID 0x1U << 25 /**< */
+
+/** @endcond */
+/** @} */ /* end_addtogroup chapter_error */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_EXCEPT_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_fastint.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_fastint.h
new file mode 100755
index 0000000000000..ea65dc0917fc0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_fastint.h
@@ -0,0 +1,71 @@
+#ifndef QURT_FASTINT_H
+#define QURT_FASTINT_H
+
+/**
+  @file qurt_fastint.h
+  @brief QuRT fast interrupt functions
+
+  Copyright (c) 2013-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+ ======================================================================*/
+
+/*======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_fastint_register
+  Registers the fast interrupt callback function.
+
+  The fast interrupt callback should be designed to perform the minimal necessary
+  actions for the interrupt, and/or perform some operations, such as signaling
+  another regular software thread to start any additional processing.
+  The callback should be a fast and short function. When a fast interrupt callback
+  is running, the corresponding interrupt cannot be re-enabled until the callback
+  returns.
+
+  The fast interrupt callback must not use any system blocking calls, such as
+  mutex lock or signal wait. Otherwise, it results in errors.
+
+  The fast interrupt callback function takes a single integer argument and does
+  not return a value. The argument value passed in is the interrupt
+  number, and therefore a single callback function can handle
+  multiple fast interrupts.
+
+  @param[in] intno Interrupt number to register.
+  @param[in] fn    Interrupt callback function.
+
+  @return
+  #QURT_EOK -- Fast interrupt registration is successful. \n
+  #QURT_EINVALID -- Interrupt is already registered. \n
+  #QURT_EINT -- Invalid interrupt number.
+*/
+/* ======================================================================*/
+unsigned int qurt_fastint_register(int intno, void (*fn)(int));
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_fastint_deregister
+  Deregisters the fast interrupt callback function.
+
+  @param[in] intno Level-one interrupt number to deregister. Valid range is 1 and 10 through 31
+                   (simulator only).
+
+  @return
+  #QURT_EOK -- Interrupt deregistration is successful. \n
+  #QURT_EINT -- Invalid interrupt number (not registered). \n
+  #QURT_EINVALID -- Invalid interrupt number (already deregistered).
+
+  @dependencies
+  None.
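+
+  A minimal usage sketch (hypothetical; interrupt number 23 is
+  illustrative only):
+  @code
+  static void my_fastint_cb(int intno)
+  {
+      // Keep this short: no blocking calls (mutex lock, signal wait)
+      // are allowed inside a fast interrupt callback.
+  }
+
+  void fastint_example(void)
+  {
+      if (qurt_fastint_register(23, my_fastint_cb) == QURT_EOK) {
+          // ... interrupt in service ...
+          (void)qurt_fastint_deregister(23);
+      }
+  }
+  @endcode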
+*/
+/* ======================================================================*/
+unsigned int qurt_fastint_deregister(int intno);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FASTINT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_fs_hub.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_fs_hub.h
new file mode 100755
index 0000000000000..aaa050a6c838b
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_fs_hub.h
@@ -0,0 +1,58 @@
+#ifndef QURT_FS_HUB_H
+#define QURT_FS_HUB_H
+
+/**
+  @file qurt_fs_hub.h
+  @brief Definitions, macros, and prototypes used when writing a
+         QDI driver that provides file-system functionality.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+  This structure tracks a file designator for an FS-hub QDI driver.
+  A file system's QDI interface should use this object to encapsulate the
+  true file descriptor and return a QDI handle. This QDI handle
+  is then used as the file descriptor by the file-system hub.
+ */
+
+typedef struct qurt_qdi_fs_obj
+{
+    qurt_qdi_obj_t qdi_obj;
+    int client_handle;
+    int fd;
+}qurt_qdi_fs_obj_t;
+
+
+/**@ingroup fs_hub_support_functions
+  This function allows a file system to register its QDI interface with the file-system hub.
+  Once registered, all file open operations for any filenames containing the mount point will
+  be forwarded to the QDI interface.
+
+  The mount point string must be enclosed in two forward slashes, e.g., "/mountpoint/".
+
+  @param mtpoint Mount point for the file system being registered.
+  @param opener  Opener structure for the QDI driver interface.
+
+  @return
+  QURT_EOK -- Successfully registered the QDI driver with the file-system hub.
+  Negative error code -- Failed to register with the file-system hub.
+ */
+int qurt_fs_hub_mtpoint_register(const char *mtpoint, qurt_qdi_obj_t *opener);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_futex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_futex.h
new file mode 100755
index 0000000000000..1fdcc79a43f01
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_futex.h
@@ -0,0 +1,82 @@
+#ifndef QURT_FUTEX_H
+#define QURT_FUTEX_H
+/**
+  @file qurt_futex.h
+
+  @brief Prototypes of QuRT futex API functions
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2020-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+
+/**@ingroup func_qurt_futex_wait
+  Moves the caller thread into the waiting state when a memory object address
+  contains a value that is the same as a specified value.
+
+  @param[in] lock Pointer to the object memory.
+  @param[in] val  Value to check against the object content.
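+
+  A minimal wait/wake sketch (hypothetical; a plain int used as the
+  futex word):
+  @code
+  static int futex_word = 0;
+
+  // Waiter: blocks only while futex_word still reads 0.
+  void waiter(void) { (void)qurt_futex_wait(&futex_word, 0); }
+
+  // Waker: publishes the new value, then wakes one waiting thread.
+  void waker(void) { futex_word = 1; (void)qurt_futex_wake(&futex_word, 1); }
+  @endcode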
+
+  @return
+  #QURT_EOK -- Success \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wait(void *lock, int val);
+
+
+/**@ingroup func_qurt_futex_wait_cancellable
+  If a memory object address contains a value that is the same as a specified
+  value, moves the caller thread into the waiting state.
+  The kernel can cancel the waiting state when there is a special need.
+
+  @param[in] lock Pointer to the object memory.
+  @param[in] val  Value to check against the object content.
+
+  @return
+  #QURT_EOK -- Success \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wait_cancellable(void *lock, int val);
+
+
+/**@ingroup func_qurt_futex_wake
+  Wakes up a specified number of threads that have been waiting
+  for the object change with qurt_futex_wait().
+
+  @param[in] lock      Pointer to the object memory.
+  @param[in] n_to_wake Maximum number of threads to wake up.
+
+  @return
+  Number of threads woken up by this function.
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wake(void *lock, int n_to_wake);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FUTEX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_hmx.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_hmx.h
new file mode 100755
index 0000000000000..e4037dbeae514
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_hmx.h
@@ -0,0 +1,226 @@
+#ifndef QURT_HMX_H
+#define QURT_HMX_H
+/**
+  @file qurt_hmx.h
+  @brief Prototypes of the QuRT HMX API.
+
+Copyright (c) 2019-2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                                 TYPEDEFS
+=============================================================================*/
+
+
+/** @addtogroup hmx_types
+@{ */
+/* HMX locking type */
+#define QURT_HMX_NON_SHARED_LOCK 0U /**< HMX locking type.*/
+#define QURT_HMX_SHARED_LOCK 1U /**< HMX locking type.*/
+
+/* HMX unlocking type */
+#define QURT_HMX_NON_SHARED_UNLOCK 0U /**< HMX unlocking type.*/
+#define QURT_HMX_SHARED_UNLOCK 1U /**< HMX unlocking type.*/
+
+/* HMX hardware context */
+#define QURT_HMX_UNIT_0 0U /**< HMX hardware context #0 */
+#define QURT_HMX_UNIT_1 1U /**< HMX hardware context #1 */
+/** @} */ /* end_addtogroup hmx_types */
+
+
+/*=============================================================================
+                                 FUNCTIONS
+=============================================================================*/
+
+
+/**@ingroup func_qurt_hmx_lock2
+  Locks a HMX unit with the specified locking type.
+
+  #QURT_HMX_NON_SHARED_LOCK:
+  - If a HMX unit is available, lock the unit and return success of #QURT_EOK.
+  - If the HMX unit is already locked by another thread, the caller thread is suspended
+    until the HMX is available and gets locked by this function.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  #QURT_HMX_SHARED_LOCK:
+  - If a HMX unit is available, enables HMX access for the caller thread, and returns
+    success of #QURT_EOK.
+  - If the HMX is enabled on the caller thread, return #QURT_EFAILED.
+  - If the HMX is locked by another thread in the same user process as the caller
+    thread with locking type #QURT_HMX_SHARED_LOCK, enable HMX access for the caller
+    thread, and return success of #QURT_EOK.
+  - If the HMX is locked by another thread in the same user process as the caller
+    thread with locking type #QURT_HMX_NON_SHARED_LOCK, return #QURT_EFAILED.
+  - If the HMX is locked by a thread from another user process different from the
+    user process of the caller thread, return #QURT_EFAILED.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  @param[in] type Locking type.
+
+  @return
+  #QURT_EOK -- HMX lock successful.\n
+  #QURT_EFAILED -- Failure due to wrong locking condition.\n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+
+ */
+int qurt_hmx_lock2(unsigned int type);
+
+
+/**@ingroup func_qurt_hmx_unlock2
+  Unlocks a HMX unit with the specified unlocking type.
+
+  #QURT_HMX_NON_SHARED_UNLOCK:
+  - If there is a HMX unit locked by the caller thread, unlock the HMX unit and clear the
+    HMX accumulators (assuming a fixed point type).
+  - If there is no HMX unit locked by the caller thread, return #QURT_EFAILED.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  #QURT_HMX_SHARED_UNLOCK:
+  - If the caller thread has locked HMX with type #QURT_HMX_SHARED_LOCK, disable the
+    HMX access on the caller thread, and return success of #QURT_EOK.
+    Note: If the caller thread is the last thread that unlocks for #QURT_HMX_SHARED_LOCK
+    in its user process, the unlock function clears the HMX accumulators.
+  - If the caller thread has locked HMX with type #QURT_HMX_NON_SHARED_LOCK, return
+    failure of #QURT_EFAILED.
+  - If the caller thread has not locked HMX, return failure of #QURT_EFAILED.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  @param[in] type Unlocking type.
+
+  @return
+  #QURT_EOK -- HMX is unlocked successfully. \n
+  #QURT_EFAILED -- Failure due to wrong unlocking condition. \n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+
+ */
+int qurt_hmx_unlock2(unsigned int type);
+
+
+/**@ingroup func_qurt_hmx_lock
+  Locks a HMX unit.
+  If a HMX unit is available, this function locks the unit and returns right away.
+  If there is no HMX unit available, the caller is blocked until a HMX unit is available
+  and is locked by the function.
+
+  @return
+  #QURT_EOK -- HMX lock successful. \n
+  #QURT_EFAILED -- Failure due to wrong locking condition. \n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_lock(void);
+
+
+/**@ingroup func_qurt_hmx_unlock
+  Unlocks a HMX unit.
+  If a HMX unit is locked by the caller thread, unlocks the HMX unit and clears its
+  accumulators (assuming a fixed point type).
+  If there is no HMX unit locked by the caller thread, returns failure.
+
+  @return
+  #QURT_EOK -- HMX unlock successful. \n
+  #QURT_EFAILED -- Failure due to wrong unlocking condition. \n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_unlock(void);
+
+
+/**@ingroup func_qurt_hmx_try_lock
+  Tries to lock a HMX unit.
+  If a HMX unit is available, this function locks the unit and returns right away;
+  if there is no HMX unit available, the function returns failure without blocking the caller.
+
+  @return
+  #QURT_EOK -- HMX lock successful \n
+  #QURT_EFAILED -- Failure due to wrong locking condition.\n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
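+
+  A minimal usage sketch (hypothetical):
+  @code
+  if (qurt_hmx_try_lock() == QURT_EOK) {
+      // ... issue HMX workload ...
+      (void)qurt_hmx_unlock();
+  } else {
+      // HMX busy in another thread; fall back without blocking.
+  }
+  @endcode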
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_try_lock(void);
+
+
+/**@ingroup func_qurt_hmx_assign
+  Assigns a HMX unit to a target thread specified by its thread identifier.
+  The HMX unit (HMX hardware context) is specified by hmx_unit.
+  The caller of this function is limited to the SRM process.
+  If the requested hmx_unit is already assigned to another thread with QURT_HMX_NON_SHARED_LOCK,
+  the kernel detaches it from that thread and re-assigns it to the target thread.
+  If the target thread has HVX enabled, it cannot have HMX enabled.
+
+  Locking type
+  #QURT_HMX_NON_SHARED_LOCK:
+  - If the HMX unit is available, lock the HMX unit and return success of #QURT_EOK.
+  - If the HMX unit is already enabled on the target thread, return #QURT_EOK.
+  - If the HMX unit is already locked by another thread, detach the HMX from that thread,
+    re-assign the HMX unit to the target thread, and return #QURT_EOK.
+
+  @param[in] thread_id Thread identifier.
+  @param[in] type      Locking type.
+                       #QURT_HMX_NON_SHARED_LOCK -- non-shared lock
+  @param[in] hmx_unit  HMX hardware context number.
+                       #QURT_HMX_UNIT_0
+                       #QURT_HMX_UNIT_1
+
+  @return
+  #QURT_EOK -- The HMX is assigned successfully. This includes the case that \n
+               the target thread already has HMX assigned. \n
+  #QURT_EFAILED -- Failure due to wrong assigning conditions. \n
+  #QURT_EINVALID -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_assign ( unsigned int thread_id, unsigned int type, unsigned int hmx_unit );
+
+
+/**@ingroup func_qurt_hmx_release
+  Releases a HMX unit from a target thread specified by its thread identifier.
+  The HMX unit (HMX hardware context) is specified by hmx_unit.
+  The caller of this function is limited to the SRM process.
+
+  QuRT detaches the specified HMX unit from the target thread and returns success of
+  #QURT_EOK. If the HMX unit is already released from the target thread, it returns #QURT_EOK.
+
+  @param[in] thread_id Thread identifier.
+  @param[in] hmx_unit  HMX hardware context number.
+                       #QURT_HMX_UNIT_0
+                       #QURT_HMX_UNIT_1
+
+  @return
+  #QURT_EOK -- The HMX is released successfully. This includes the case that \n
+               the target thread already has the HMX released. \n
+  #QURT_EFAILED -- Failure due to wrong assigning condition. \n
+  #QURT_EINVALID -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_release ( unsigned int thread_id, unsigned int hmx_unit );
+
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_HMX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_hvx.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_hvx.h
new file mode 100755
index 0000000000000..13c213d49ac84
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_hvx.h
@@ -0,0 +1,421 @@
+#ifndef QURT_HVX_H
+#define QURT_HVX_H
+/**
+  @file qurt_hvx.h
+  @brief Prototypes of the QuRT HVX API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2021-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ TYPEDEFS
+=============================================================================*/
+/** @cond */
+
+typedef enum {
+    QURT_HVX_MODE_64B  = 0,  /**< HVX mode of 64 bytes */
+    QURT_HVX_MODE_128B = 1   /**< HVX mode of 128 bytes */
+} qurt_hvx_mode_t;
+/** @endcond */
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+/** @cond internal_only*/
+/** @addtogroup hvx_macros
+@{ */
+#define QURT_HVX_HW_UNITS_2X128B_4X64B     0x00000204  /**< Bits 15 through 8 are for the number of 128B units. */
+                                                       /**< Bits 7 through 0 are for the number of 64B units. */
+#define QURT_HVX_HW_UNITS_4X128B_0X64B     0x00000400
+#define QURT_HVX_HW_UNITS_6X128B_0X64B     0x00000600
+
+/* HVX locking status */
+
+#define QURT_HVX_UNLOCKED                  (0)   /* Has not locked HVX unit */
+#define QURT_HVX_LOCKED                    (1)   /* Has locked HVX unit */
+#define QURT_HVX_ERROR                     (-1)  /* Error, no HVX support */
+
+/* Input value for HVX reservation */
+
+#define QURT_HVX_RESERVE_ALL               (4)    /* All the HVX units in terms of 64B_MODE are requested to be reserved */
+#define QURT_HVX_RESERVE_ALL_AVAILABLE     (0xff) /* All remaining unlocked HVX units in terms of 64B_MODE are requested to be reserved */
+
+/* Return values for HVX reservation */
+
+#define QURT_HVX_RESERVE_NOT_SUPPORTED     (-1)  /* There is no HVX hardware, or fewer units in the hardware than requested */
+#define QURT_HVX_RESERVE_NOT_SUCCESSFUL    (-2)  /* Some HVX units are already locked/reserved by other PD, thus not enough units left for the reservation. */
+#define QURT_HVX_RESERVE_ALREADY_MADE      (-3)  /* There is already a HVX reservation made. */
+#define QURT_HVX_RESERVE_CANCEL_ERR        (-4)  /* The action of canceling the reservation fails because this protection domain has no reservation made before. */
+
+// HVX set requests
+
+#define QURT_HVX_64B                 0 /**< */
+#define QURT_HVX_128B                1 /**< */
+#define QURT_HVX_NO_USE              2 /**< */
+#define QURT_HVX_RELEASE_CONTEXT     3 /**< */
+#define QURT_HVX_IMMEDIATE_USE       4 /**< */
+
+// HVX set masks
+
+#define QURT_HVX_64B_PREFERRED       (1<<(QURT_HVX_64B + 8))/**< */
+#define QURT_HVX_128B_PREFERRED      (1<<(QURT_HVX_128B + 8))/**< */
+#define QURT_HVX_64B_ACCEPTABLE      (1<<(QURT_HVX_64B + 12))/**< */
+#define QURT_HVX_128B_ACCEPTABLE     (1<<(QURT_HVX_128B + 12))/**< */
+
+// HVX set return "result"
+
+#define QURT_EOK                     0 /**< */
+#define QURT_HVX_SET_ERROR           0xFF /**< */
+
+// hvx_mode_assigned for QURT_HVX_IMMEDIATE_USE
+#define QURT_HVX_64B_ASSIGNED        (1<<(QURT_HVX_64B + 8)) /**< */
+#define QURT_HVX_128B_ASSIGNED       (1<<(QURT_HVX_128B + 8)) /**< */
+
+// Sizes of HVX dump buffer
+
+#define QURT_HVX_V65_64B_VSIZE       2084U  /**< 64 x 32 + 8 x 4 + 4 (version). */
+#define QURT_HVX_V65_128B_VSIZE      4164U  /**< 128 x 32 + 16 x 4 + 4 (version). */
+#define QURT_HVX_V66_128B_VSIZE      4420U  /**< 128 x (32 +2) + 16 x 4 + 4 (version). */
+#define QURT_HVX_V68_128B_VSIZE      4164U  /**< 128 x 32 + 16 x 4 + 4 (version). */
+#define QURT_HVX_V79_128B_VSIZE      4740U  /**< 128 x (32+4+1) + 4 (version).
 */
+#define QURT_HVX_VREG_BUF_SIZE       QURT_HVX_V79_128B_VSIZE /**< */
+
+// HVX dump versions
+
+#define QURT_HVX_DUMP_V65_64B        1U /**< */
+#define QURT_HVX_DUMP_V65_128B       2U /**< */
+#define QURT_HVX_DUMP_V66_128B       3U /**< */
+#define QURT_HVX_DUMP_V68_128B       4U /**< */
+#define QURT_HVX_DUMP_V79_128B       5U /**< */
+/** @} */ /* end_addtogroup hvx_macros */
+/** @endcond */
+/** @cond */
+// QuRT data struct for hvx_set input
+typedef struct qurt_hvx_set_struct_ {
+    unsigned char set_req;    // LSB
+    struct {
+        unsigned char preferred_mask:4;
+        unsigned char acceptable_mask:4;
+    };
+    unsigned short resvd;     // MSB
+} qurt_hvx_set_struct_t;      // 4 bytes
+
+
+// QuRT data struct for hvx_set return
+typedef struct qurt_hvx_set_return_str_ {
+    unsigned char result;     // LSB
+    unsigned char hvx_mode_assigned;
+    unsigned short resvd;     // MSB
+} qurt_hvx_set_return_struct_t;  // 4 bytes
+/** @endcond */
+
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_hvx_lock
+  Locks one HVX unit specified by the HVX mode.
+
+  @note1hang Input variable can be 128B_MODE or 64B_MODE. If an HVX unit in this mode
+             is available, this function locks the unit and returns right away.
+             If the current HVX mode is different from the requested mode, the current
+             thread is blocked. When all HVX units become idle, QuRT changes
+             the mode, locks the HVX unit, and returns.
+
+             Starting from Q6v65 with HVX context switch support, qurt_hvx_lock() is
+             mapped as qurt_hvx_set(64_BYTE or 128_BYTE).
+
+  @datatypes
+  #qurt_hvx_mode_t
+
+  @param[in] lock_mode #QURT_HVX_MODE_64B or #QURT_HVX_MODE_128B.
+
+  @return
+  #QURT_EOK -- Success \n
+  Other value -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_lock(qurt_hvx_mode_t lock_mode);
+
+/**@ingroup func_qurt_hvx_unlock
+  Unlocks the HVX unit held by this software thread.
+
+  @note1hang Starting from Q6v65 with HVX context switch support, qurt_hvx_unlock()
+             maps as qurt_hvx_set(QURT_HVX_RELEASE_CONTEXT).
+
+  @return
+  #QURT_EOK -- Successful return \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_unlock(void);
+
+/**@ingroup func_qurt_hvx_try_lock
+  Tries to lock one HVX unit specified by the HVX mode.
+
+  @note1hang Input variable can be 128B_MODE or 64B_MODE. If an HVX unit in this mode
+             is available, this function locks the unit and returns #QURT_EOK; otherwise,
+             the function returns a failure, but does not block the current software
+             thread to wait for the HVX unit.
+             Starting from Q6v65 with HVX context switch support, qurt_hvx_try_lock()
+             maps to qurt_hvx_set(QURT_HVX_IMMEDIATE_USE | preferred_mask | acceptable_mask);
+
+  @datatypes
+  #qurt_hvx_mode_t
+
+  @param[in] lock_mode #QURT_HVX_MODE_64B or #QURT_HVX_MODE_128B.
+
+  @return
+  #QURT_EOK -- Successful return \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_try_lock(qurt_hvx_mode_t lock_mode);
+
+/**@ingroup func_qurt_hvx_get_mode
+  Gets the current HVX mode configured by QuRT.
+
+  @note1hang Returns #QURT_HVX_MODE_128B or #QURT_HVX_MODE_64B, based on
+             the current HVX configuration.
+
+  @param[out]
+  None.
+
+  @return
+  #QURT_HVX_MODE_128B \n
+  #QURT_HVX_MODE_64B \n
+  -1 -- Not available.
+
+  @dependencies
+  None.
+ */
+int qurt_hvx_get_mode(void);
+
+
+/**@ingroup func_qurt_hvx_get_units
+  Gets the HVX hardware configuration that the chipset supports.
+
+  @note1hang The function returns the HVX hardware configuration supported by the chipset.
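+
+  For example (an illustrative sketch; the bit layout follows the
+  QURT_HVX_HW_UNITS macros above), the unit counts can be decoded as:
+  @code
+  int units = qurt_hvx_get_units();
+  if (units > 0) {
+      unsigned num_128b = ((unsigned)units >> 8) & 0xFFU; // bits 15..8
+      unsigned num_64b  = (unsigned)units & 0xFFU;        // bits 7..0
+  }
+  @endcode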
+
+  @return
+  Bitmask of the units: 1X64, 2X64, 4X64, 1X128, 2X128, and so on.\n
+  - QURT_HVX_HW_UNITS_2X128B_4X64B -- V60, V62, or V65 HVX \n
+  - QURT_HVX_HW_UNITS_4X128B_0X64B -- V66 CDSP or newer \n
+  - 0 -- not available
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_get_units(void);
+
+
+/**@ingroup func_qurt_hvx_reserve
+  Reserves HVX units in terms of 64-byte mode for the protection domain (PD) of the caller.
+
+  @note1hang Only one HVX reservation in the system is supported.
+             If one HVX unit is already locked by the application in the same PD, the unit is
+             added to the returned count as one reserved unit for the PD.
+             Starting from Q6v65 with HVX context switch support, qurt_hvx_reserve()
+             only does basic sanity checks on HVX units.
+
+  @datatypes
+  None.
+
+  @param[in] num_units Number of HVX units in terms of 64B_MODE to reserve for the PD.
+                       QURT_HVX_RESERVE_ALL to reserve all the HVX units.
+                       QURT_HVX_RESERVE_ALL_AVAILABLE to reserve the remaining unlocked units.
+
+  @return
+  Number of units successfully reserved, including the units already locked in the same PD. \n
+  #QURT_HVX_RESERVE_NOT_SUPPORTED \n
+  #QURT_HVX_RESERVE_NOT_SUCCESSFUL \n
+  #QURT_HVX_RESERVE_ALREADY_MADE
+
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_reserve(int num_units);
+
+
+/**@ingroup func_qurt_hvx_cancel_reserve
+  Cancels the HVX reservation in the protection domain (PD) of the caller.
+
+  @note1hang Only one HVX reservation in the system is supported.
+
+  @return
+  0 -- Success \n
+  #QURT_HVX_RESERVE_CANCEL_ERR -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_cancel_reserve(void);
+
+
+/**@ingroup func_qurt_hvx_get_lock_val
+  Gets the HVX locking status value of the thread of the caller.
+
+  @note1hang Returns the status indicating whether the caller thread has already locked a HVX unit.
+
+  @datatypes
+  None.
+
+  @return
+  #QURT_HVX_UNLOCKED \n
+  #QURT_HVX_LOCKED \n
+  #QURT_HVX_ERROR
+
+  @dependencies
+  None.
+ */
+int qurt_hvx_get_lock_val(void);
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_hvx_set
+  Sets the HVX configuration for the software thread of the caller.
+
+  @datatypes
+  None.
+
+  @param[in] input_arg  Composed of set_request | hvx_preferred_mode_mask
+                        | hvx_acceptable_mode_mask where set_request can be set to: \n
+                        - #QURT_HVX_64B \n
+                        - #QURT_HVX_128B \n
+                        - #QURT_HVX_NO_USE \n
+                        - #QURT_HVX_RELEASE_CONTEXT \n
+                        - #QURT_HVX_IMMEDIATE_USE \n
+                        When set_request is QURT_HVX_IMMEDIATE_USE,
+                        hvx_preferred_mode_mask can be set to: \n
+                        - #QURT_HVX_64B_PREFERRED \n
+                        - #QURT_HVX_128B_PREFERRED
+                        When set_request is QURT_HVX_IMMEDIATE_USE,
+                        hvx_acceptable_mode_mask can be set to: \n
+                        - #QURT_HVX_64B_ACCEPTABLE \n
+                        - #QURT_HVX_128B_ACCEPTABLE @tablebulletend
+
+  @return
+  Result of the HVX setting in the least significant 8 bits of the returned data. \n
+  #QURT_EOK -- 0 \n
+  #QURT_HVX_SET_ERROR -- 0xFF \n
+  When #QURT_HVX_IMMEDIATE_USE has a result of #QURT_EOK,
+  bit 8 to bit 15 of the returned data contain hvx_mode_assigned:\n
+  - #QURT_HVX_64B_ASSIGNED \n
+  - #QURT_HVX_128B_ASSIGNED
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_hvx_set(unsigned int input_arg);
+
+
+/**@ingroup func_qurt_system_hvx_regs_get_maxsize
+  Returns the maximum buffer size for saving HVX registers.
+
+  @datatypes
+  None.
+
+  @return
+  0 -- No HVX supported in the target. \n
+  #QURT_HVX_VREG_BUF_SIZE -- Maximum buffer size for saving HVX registers.
+
+  @dependencies
+  None.
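+
+  A minimal allocation sketch (illustrative only; malloc() stands in for
+  whatever allocator the caller uses):
+  @code
+  unsigned int max = qurt_system_hvx_regs_get_maxsize();
+  if (max != 0U) {
+      // Worst-case register dump plus room for 256-byte alignment.
+      unsigned char *buf = malloc(max + 256U);
+  }
+  @endcode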
+ */
+unsigned int qurt_system_hvx_regs_get_maxsize(void);
+
+
+/**@ingroup func_qurt_system_hvx_regs_get_size
+  Returns the buffer size for saving HVX registers for a specified thread.
+
+  @param[in] thread_id  Thread ID of the target thread.
+
+  @return
+  0 -- No HVX assigned to the thread. \n
+  size -- Size of the buffer in bytes for saving HVX registers for the specified thread: \n
+      - #QURT_HVX_V65_64B_VSIZE -- 64 x 32 + 8 x 4 + 4 (version) \n
+      - #QURT_HVX_V65_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+      - #QURT_HVX_V66_128B_VSIZE -- 128 x (32 +2) + 16 x 4 + 4 (version) \n
+      - #QURT_HVX_V68_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+      - #QURT_HVX_V79_128B_VSIZE -- 128 x (32+4+1) + 4 (version)
+
+
+  @dependencies
+  None.
+
+ */
+unsigned int qurt_system_hvx_regs_get_size(unsigned int thread_id);
+
+
+
+/**@ingroup func_qurt_system_hvx_regs_get
+  Saves the HVX registers into the specified buffer.
+  Returns the size of the data saved into the buffer.
+  After calling this function for the first time on a specified thread_id, the QuRT kernel removes the internal HVX saving buffer
+  from the specified thread. When calling the function on the same thread_id for the second time, this function returns 0.
+
+  @param[in] thread_id  Thread ID of the target thread.
+  @param[in] pBuf       Pointer to the buffer for HVX register saving.
+                        The first four bytes of the buffer are for saving the HVX version. HVX registers are saved from
+                        the fifth byte of the buffer. The address of the fifth byte should be 256-byte aligned.
+                        For example, a buffer can be declared at first as: \n
+                        unsigned char vbuf[QURT_HVX_VREG_BUF_SIZE+256];\n
+                        unsigned char *pBuf; \n
+                        then align the buffer pointer to: \n
+                        pBuf = vbuf; \n
+                        pBuf += (256 - 4 - (unsigned)pBuf%256);
+  @param[in] size       Size of the buffer provided, which is pointed to by pBuf. The buffer size should not be smaller than that
+                        returned from qurt_system_hvx_regs_get_size(), and pBuf should be aligned as described above.
+  @param[out] pBuf      Buffer returned with the saved HVX registers (unsigned char hvx_regs[];), which are saved from the fifth
+                        byte of the buffer, and the HVX version (unsigned int hvx_version;), which in the first four bytes
+                        contain one of the HVX dump versions:\n
+                        - #QURT_HVX_DUMP_V65_64B \n
+                        - #QURT_HVX_DUMP_V65_128B \n
+                        - #QURT_HVX_DUMP_V66_128B \n
+                        - #QURT_HVX_DUMP_V68_128B \n
+                        - #QURT_HVX_DUMP_V79_128B \n
+                        @tablebulletend
+
+  @return
+  Total bytes of the data saved in the provided buffer. \n
+  0 -- No HVX assigned to the thread \n
+  #QURT_HVX_V65_64B_VSIZE -- 64 x 32 + 8 x 4 + 4 (version) \n
+  #QURT_HVX_V65_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V66_128B_VSIZE -- 128 x (32 +2) + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V68_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V79_128B_VSIZE -- 128 x (32+4+1) + 4 (version)
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_system_hvx_regs_get(unsigned int thread_id, void *pBuf, size_t size);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_HVX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_int.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_int.h
new file mode 100755
index 0000000000000..386aeda1051eb
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_int.h
@@ -0,0 +1,509 @@
+#ifndef QURT_INT_H
+#define QURT_INT_H
+/**
+  @file qurt_int.h
+  @brief QuRT interrupt functions.
+
+
+
+ Copyright (c) 2013-2021, 2023 Qualcomm Technologies, Inc.
+ All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+
+
+/** @cond rest_reg_dist */
+/** @addtogroup interrupts_constants
+@{ */
+#define SIG_INT_ABORT 0x80000000 /**< */
+#define QURT_INT_NON_DELAYED_ACK 0
+#define QURT_INT_DELAYED_ACK 1
+#define QURT_INT_ACK_DEFAULT QURT_INT_NON_DELAYED_ACK
+#define QURT_INT_DRV_DEFAULT 0
+#define QURT_INT_PRIORITY_DEFAULT 0xFF
+
+/** QuRT interrupt property. */
+#define QURT_INT_CONFIGID_POLARITY 0x1U /**< */
+#define QURT_INT_CONFIGID_LOCK     0x2U /**< */
+
+/** QuRT interrupt lock.*/
+#define QURT_INT_LOCK_DEFAULT 0x0 /**< Default. */
+#define QURT_INT_LOCK_DISABLE 0x0 /**< Interrupt can be enabled or disabled or deregistered. */
+#define QURT_INT_LOCK_ENABLE  0x1 /**< Interrupt is locked and cannot be enabled, disabled, or deregistered.*/
+/** @} */ /* end_addtogroup interrupts_constants */
+
+/** @addtogroup Qurt_interrupt_type
+@{ */
+/** Trigger type bit fields for a PDC interrupt:\n
+    @verbatim
+    Polarity Edge  Output\n
+    0        00    Level sensitive active low
+    0        01    Rising edge sensitive
+    0        10    Falling edge sensitive
+    0        11    Dual edge sensitive
+    1        00    Level sensitive active high
+    1        01    Falling edge sensitive
+    1        10    Rising edge sensitive
+    1        11    Dual edge sensitive
+    @endverbatim
+*/
+#define QURT_INT_TRIGGER_TYPE_SET(pol, edge)   ((((pol) & 0x01U) << 2) | ((edge) & 0x03U)) /**< */
+
+#define QURT_INT_TRIGGER_LEVEL_LOW      QURT_INT_TRIGGER_TYPE_SET(0U, 0x00U) /**< */
+#define QURT_INT_TRIGGER_LEVEL_HIGH     QURT_INT_TRIGGER_TYPE_SET(1U, 0x00U) /**< */
+#define QURT_INT_TRIGGER_RISING_EDGE    QURT_INT_TRIGGER_TYPE_SET(1U, 0x02U) /**< */
+#define QURT_INT_TRIGGER_FALLING_EDGE   QURT_INT_TRIGGER_TYPE_SET(0U, 0x02U) /**< */
+#define QURT_INT_TRIGGER_DUAL_EDGE      QURT_INT_TRIGGER_TYPE_SET(0U, 0x03U) /**< */
+#define QURT_INT_TRIGGER_USE_DEFAULT    0xffU /**< */
+/** @} */ /* end_addtogroup Qurt_interrupt_type */
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_interrupt_register
+  @xreflabel{sec:interrupt_register}
+  Registers the interrupt.\n
+  Enables the specified interrupt and associates it with the specified QuRT signal object and
+  signal mask.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+  indicates that a signal must be waited on, and 0 indicates not to wait.
+
+  When the interrupt occurs, the signal specified in the signal mask is set in the signal
+  object. An IST conventionally waits on that signal to
+  handle the interrupt. The thread that registers the interrupt is set as the IST.
+
+  Up to 31 separate interrupts can be registered to a single signal object, as determined by
+  the number of individual signals the object can store. QuRT reserves signal 31. Thus a
+  single IST can handle several different interrupts.
+
+  QuRT reserves some interrupts for internal use -- the remainder are available for use by
+  applications, and thus are valid interrupt numbers. If the specified interrupt number is
+  outside the valid range, the register operation returns the status value #QURT_EINT.
+
+  Only one thread can be registered at a time to a specific interrupt. Attempting to register
+  an already-registered interrupt returns the status value #QURT_EVAL.
+
+  Only one signal bit in a signal object can be registered at a time to a specific interrupt.
+  Attempting to register multiple signal bits to an interrupt returns the status value
+  #QURT_ESIG.
+
+  When the signal registers an interrupt, QuRT can only set its signal bits
+  when receiving the interrupt. The QuRT signal API from another
+  software thread cannot set the signal even for unused signal bits.
+
+  @note1hang The valid range for an interrupt number can differ on target execution
+             environments other than the simulator. For more information, see the
+             appropriate hardware document.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] int_num      L2VIC interrupt to register; valid range is 0 to 1023.
+  @param[in] int_signal   Any-signal object to wait on (Section @xref{dox:any_signals}).
+  @param[in] signal_mask  Signal mask value indicating signal to receive the interrupt.
+
+  @return
+  #QURT_EOK -- Interrupt successfully registered.\n
+  #QURT_EINT -- Invalid interrupt number. \n
+  #QURT_ESIG -- Invalid signal bitmask (cannot set more than one
+                signal at a time). \n
+  #QURT_EVAL -- Interrupt already registered.
+
+  @dependencies
+  None.
+*/
+ unsigned int qurt_interrupt_register(int int_num, qurt_anysignal_t *int_signal, int signal_mask);
+
+/**@ingroup func_qurt_interrupt_register2
+  @xreflabel{sec:interrupt_register2}
+  Registers the interrupt.\n
+  Enables the specified interrupt, associates it with the specified QuRT signal object and
+  signal mask, and sets interrupt flags.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+  indicates that a signal must be waited on, and 0 indicates not to wait.
+
+  When the interrupt occurs, the signal specified in the signal mask is set in the signal
+  object. An IST conventionally waits on that signal to
+  handle the interrupt. The thread that registers the interrupt is set as the IST.
+
+  Up to 31 separate interrupts can be registered to a single signal object, as determined by
+  the number of individual signals that the object can store. QuRT reserves signal 31. Thus a
+  single IST can handle several different interrupts.
+
+  QuRT reserves some interrupts for internal use -- the remainder are available for use by
+  applications, and thus are valid interrupt numbers. If the specified interrupt number is
+  outside the valid range, the register operation returns the status value #QURT_EINT.
+
+  Only one thread can be registered at a time to a specific interrupt. Attempting to register
+  an already-registered interrupt returns the status value #QURT_EVAL.
+
+  Only one signal bit in a signal object can be registered at a time to a specific interrupt.
+  Attempting to register multiple signal bits to an interrupt returns the status value
+  #QURT_ESIG.
+
+  When the signal registers an interrupt, QuRT can only set its signal bits
+  when receiving the interrupt. The QuRT signal API from another
+  software thread cannot set the signal even for unused signal bits.
+
+  @note1hang The valid range for an interrupt number can differ on target execution
+             environments other than the simulator. For more information, see the
+             appropriate hardware document.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] int_num      L2VIC interrupt to register; valid range is 0 to 1023.
+  @param[in] int_signal   Any-signal object to wait on (Section @xref{dox:any_signals}).
+  @param[in] signal_mask  Signal mask value indicating signal to receive the interrupt.
+  @param[in] flags        Defines the interrupt property; the supported property is interrupt lock enable/disable.
+                          Possible values for flags: \n
+                          - #QURT_INT_LOCK_ENABLE
+                          - #QURT_INT_LOCK_DISABLE @tablebulletend
+
+  @return
+  #QURT_EOK -- Interrupt successfully registered.\n
+  #QURT_EINT -- Invalid interrupt number. \n
+  #QURT_ESIG -- Invalid signal bitmask (cannot set more than one
+                signal at a time). \n
+  #QURT_EVAL -- Interrupt already registered.
+
+  @dependencies
+  None.
+*/
+ unsigned int qurt_interrupt_register2(int int_num, qurt_anysignal_t *int_signal, int signal_mask, unsigned int flags);
+/*
+ * Waits for registered interrupt signal
+
+ * Suspends the current thread until one of its registered interrupts occurs. The second input mask
+ * contains the interrupt signals the IST expects to receive. The interrupt signals are registered
+ * with interrupts via the qurt_interrupt_register() API.
+ *
+ * The signals returned in the signal variable indicate which interrupts occurred. Use function
+ * qurt_anysignal_get to read the signals. The IST must locally maintain a table that maps a signal to
+ * a specific interrupt. The IST also checks if signal #SIG_INT_ABORT is received. If so, the IST
+ * must quit the interrupt receiving loop.
+ *
+ * For detailed information on this API, see QuRT User Manual Section 4.2.5
+ *
+ * Prototype
+ *
+ * unsigned int qurt_anysignal_wait(qurt_anysignal_t *int_signal, unsigned int mask)
+ */
+
+/**@ingroup func_qurt_interrupt_acknowledge
+  Acknowledges an interrupt after it has been processed.\n
+  Re-enables an interrupt and clears its pending status. This is done after an interrupt is
+  processed by an IST.
+
+  Interrupts are automatically disabled after they occur. To re-enable an interrupt, an IST
+  performs the acknowledge operation after it has finished processing the interrupt and
+  just before suspending itself (such as by waiting on the interrupt signal).
+
+  @note1hang To prevent losing or reprocessing subsequent occurrences of the interrupt,
+             an IST must clear the interrupt signal (Section @xref{sec:anysignal_clear}) before
+             acknowledging the interrupt.
+
+  @param[in] int_num Interrupt that is being re-enabled.
+
+  @return
+  #QURT_EOK -- Interrupt acknowledge was successful. \n
+  #QURT_EDEREGISTERED -- Interrupt is already de-registered.
+
+  @dependencies
+  None.
+*/
+int qurt_interrupt_acknowledge(int int_num);
+
+/**@ingroup func_qurt_interrupt_deregister
+  Disables the specified interrupt and disassociates it from a QuRT signal object.
+  If the specified interrupt was never registered (Section @xref{sec:interrupt_register}), the deregister operation
+  returns the status value #QURT_EINT.
+
+  @note1hang If an interrupt is deregistered while an IST waits
+             to receive it, the IST might wait indefinitely for the interrupt to occur. To avoid
+             this problem, the QuRT kernel sends the signal #SIG_INT_ABORT to awaken an
+             IST after determining that it has no interrupts registered.
+
+  @param[in] int_num L2VIC interrupt to deregister; valid range is 0 to 1023.
+
+  @return
+  #QURT_EOK -- Success.\n
+  #QURT_EINT -- Invalid interrupt number (not registered).
+
+  @dependencies
+  None.
+
+*/
+unsigned int qurt_interrupt_deregister(int int_num);
+/** @endcond */
+
+/**@ingroup func_qurt_interrupt_disable
+  Disables an interrupt with its interrupt number.\n
+  The interrupt must be registered prior to calling this function.
+  After qurt_interrupt_disable() returns, the Hexagon subsystem
+  can no longer send the corresponding interrupt to the Hexagon
+  core, until qurt_interrupt_enable() is called
+  for the same interrupt.
+
+  Avoid calling qurt_interrupt_disable() and qurt_interrupt_enable() frequently within
+  a short period of time.\n
+  - A pending interrupt can already be in the Hexagon core when qurt_interrupt_disable()
+    is called. Therefore, some time later, the pending interrupt is received on a Hexagon
+    hardware thread.\n
+  - After the Hexagon subsystem sends an interrupt to the Hexagon core, the Hexagon
+    hardware automatically disables the interrupt until kernel software re-enables the interrupt
+    at the interrupt acknowledgement stage. If qurt_interrupt_enable() is called from a certain
+    thread at an earlier time, the interrupt is re-enabled earlier and can trigger
+    sending a new interrupt to the Hexagon core while kernel software is still processing
+    the previous interrupt.
+
+  @param[in] int_num Interrupt number.
+
+  @return
+  #QURT_EOK -- Interrupt successfully disabled.\n
+  #QURT_EINT -- Invalid interrupt number.\n
+  #QURT_ENOTALLOWED -- Interrupt is locked. \n
+  #QURT_EVAL -- Interrupt is not registered.
+
+  @dependencies
+  None.
+*/
+ unsigned int qurt_interrupt_disable(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_enable
+  Enables an interrupt with its interrupt number.\n
+  The interrupt must be registered prior to calling this function.
+
+  @param[in] int_num Interrupt number.
+
+  @return
+  #QURT_EOK -- Interrupt successfully enabled.\n
+  #QURT_EINT -- Invalid interrupt number.\n
+  #QURT_ENOTALLOWED -- Interrupt is locked. \n
+  #QURT_EVAL -- Interrupt is not registered.
+
+  @dependencies
+  None.
+
+*/
+ unsigned int qurt_interrupt_enable(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_status
+  Returns a value that indicates the pending status of the specified interrupt.
+
+  @param[in] int_num  Interrupt number that is being checked.
+  @param[out] status  Interrupt status; 1 indicates that an interrupt is
+                      pending, 0 indicates that an interrupt is not pending.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINT -- Failure; invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_status(int int_num, int *status);
+
+
+/**@ingroup func_qurt_interrupt_get_status
+  Gets the status of the specified interrupt in L2VIC.
+
+  @param[in] int_num      Interrupt number that is being checked.
+  @param[in] status_type  0 -- interrupt pending status \n
+                          1 -- interrupt enabling status
+  @param[out] status      0 -- OFF \n
+                          1 -- ON
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINT -- Failure; invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_get_status(int int_num, int status_type, int *status);
+
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_interrupt_clear
+  Clears the pending status of the specified interrupt.
+
+  @note1hang This operation is intended for system-level use, and must be used with care.
+
+  @param[in] int_num Interrupt that is being re-enabled.
+
+  @return
+  #QURT_EOK -- Success.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_clear(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_get_config
+  Gets the L2VIC interrupt configuration. \n
+  This function returns the type and polarity of the specified L2VIC interrupt.
+
+  @param[in] int_num        L2VIC interrupt that is being re-enabled.
+  @param[out] int_type      Pointer to an interrupt type.
\n
+                            0 -- Level-triggered interrupt \n
+                            1 -- Edge-triggered interrupt
+  @param[out] int_polarity  Pointer to interrupt polarity.\n
+                            0 -- Active-high interrupt \n
+                            1 -- Active-low interrupt.
+
+  @return
+  #QURT_EOK -- Configuration successfully returned.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_get_config(unsigned int int_num, unsigned int *int_type, unsigned int *int_polarity);
+
+/**@ingroup func_qurt_interrupt_set_config
+  Sets the type and polarity of the specified L2VIC interrupt.
+
+  @note1hang Deregister L2VIC interrupts before reconfiguring them.
+
+  @param[in] int_num       L2VIC interrupt that is being re-enabled.
+  @param[in] int_type      Interrupt type. \n
+                           0 -- Level-triggered interrupt\n
+                           1 -- Edge-triggered interrupt
+  @param[in] int_polarity  Interrupt polarity. \n
+                           0 -- Active-high interrupt \n
+                           1 -- Active-low interrupt
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Not allowed; the interrupt is being registered.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_set_config(unsigned int int_num, unsigned int int_type, unsigned int int_polarity);
+
+/**@ingroup func_qurt_interrupt_set_config2
+  Sets the type and polarity of the specified L2VIC interrupt.
+
+  @note1hang L2VIC interrupts must be deregistered before they can be reconfigured.
+
+  @param[in] int_num   L2VIC interrupt that is being re-enabled.
+  @param[in] int_type  Interrupt type, notified to the hardware configuration callback function and used to
+                       modify the L2VIC type. Possible values: \n
+                       - #QURT_INT_TRIGGER_USE_DEFAULT \n
+                       - #QURT_INT_TRIGGER_LEVEL_HIGH \n
+                       - #QURT_INT_TRIGGER_LEVEL_LOW \n
+                       - #QURT_INT_TRIGGER_RISING_EDGE \n
+                       - #QURT_INT_TRIGGER_FALLING_EDGE \n
+                       - #QURT_INT_TRIGGER_DUAL_EDGE @tablebulletend
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Not allowed; the interrupt is being registered.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_set_config2(unsigned int int_num, unsigned int int_type);
+
+/**@ingroup func_qurt_interrupt_set_config3
+  Sets the specified configuration value for the specified property of the specified L2VIC interrupt.
+
+  @note1hang L2VIC interrupts must be deregistered before they can be reconfigured for polarity.
+
+  @param[in] int_num     L2VIC interrupt to re-enable.
+  @param[in] config_id   Property to configure: \n
+                         - #QURT_INT_CONFIGID_POLARITY \n
+                         - #QURT_INT_CONFIGID_LOCK @tablebulletend
+  @param[in] config_val  Dependent on the second argument config_id, specifies the value to set. \n
+                         Values for #QURT_INT_CONFIGID_POLARITY: \n
+                         - #QURT_INT_TRIGGER_USE_DEFAULT \n
+                         - #QURT_INT_TRIGGER_LEVEL_HIGH \n
+                         - #QURT_INT_TRIGGER_LEVEL_LOW \n
+                         - #QURT_INT_TRIGGER_RISING_EDGE \n
+                         - #QURT_INT_TRIGGER_FALLING_EDGE \n
+                         - #QURT_INT_TRIGGER_DUAL_EDGE \n
+
+                         Values for #QURT_INT_CONFIGID_LOCK: \n
+                         - #QURT_INT_LOCK_ENABLE\n
+                         - #QURT_INT_LOCK_DISABLE @tablebulletend
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Not allowed; the interrupt is being registered or is locked for enable/disable.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_interrupt_set_config3(unsigned int int_num, unsigned int config_id, unsigned int config_val);
+
+
+/**@ingroup func_qurt_interrupt_raise
+  Raises the interrupt. \n
+  This function triggers a level-triggered L2VIC
+  interrupt, and accepts interrupt numbers in the range of 0 to 1023.
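+
+  A minimal sketch (illustrative only; 45 is a hypothetical interrupt
+  number that is assumed to be valid on the target):
+  @code
+  if (qurt_interrupt_raise(45U) != QURT_EOK) {
+      // interrupt not supported on this target
+  }
+  @endcode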
+
+  @param[in] interrupt_num Interrupt number.
+
+  @return
+  #QURT_EOK -- Success \n
+  -1 -- Failure; the interrupt is not supported.
+
+  @dependencies
+  None.
+ */
+int qurt_interrupt_raise(unsigned int interrupt_num);
+
+/**@ingroup func_qurt_interrupt_raise2
+  Raises the interrupt and returns the current pcycle value.
+
+  @param[in] interrupt_num Interrupt number.
+
+  @return
+  0xFFFFFFFFFFFFFFFF -- Failure; the interrupt is not supported.\n
+  Other value -- pcycle count at the time the interrupt is raised.
+
+  @dependencies
+  None.
+ */
+unsigned long long qurt_interrupt_raise2(unsigned int interrupt_num);
+/** @endcond */
+
+/** @cond internal_only */
+/**@ingroup func_qurt_isr_subcall
+  Indicates whether the current function is called from a callback procedure (either short or long).
+
+  @return
+  #QURT_EOK -- TRUE \n
+  #QURT_EVAL -- FALSE.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_subcall(void);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_INT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_island.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_island.h
new file mode 100755
index 0000000000000..f0c8ee27cf8b0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_island.h
@@ -0,0 +1,122 @@
+#ifndef QURT_ISLAND_H
+#define QURT_ISLAND_H
+
+/**
+  @file qurt_island.h
+  @brief Prototypes of power API
+  The APIs allow entering and exiting island mode where the memory
+  accesses are limited to local memory.
+
+  EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018-2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+=============================================================================*/
+
+#include 
+#include 
+#include 
+#include 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_island_get_status
+  Gets Island mode status.
+
+  Returns a value that indicates whether the QuRT system executes in Island mode.
+
+  @return
+  0 - Normal mode. \n
+  1 - Island mode.
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_island_get_status (void);
+
+/**@ingroup func_qurt_island_get_status2
+  Gets Island mode status, especially to differentiate between island partial exit and complete exit.
+
+  Returns a value that indicates the current state.
+
+  @note1hang Transition from NORMAL mode to ISLAND mode happens in single-threaded
+             mode, whereas transitions from ISLAND mode to other modes
+             happen in multi-threaded mode. Therefore, a thread that reads the island mode
+             status as NORMAL can assume the status stays the same while it continues to
+             run. A thread that reads the island mode status as ISLAND should
+             assume that the status can change to EXITING or NORMAL while it
+             runs. A thread that reads the island mode status as EXITING should
+             assume that the status can change to NORMAL while it runs. If
+             the thread goes to a wait state after reading the status, it should get
+             the island mode state again and not assume the previous state.
+  @note2hang This API returns more intrinsic states than qurt_island_get_status();
+             when qurt_island_get_status() returns 0, this API could return
+             QURT_ISLAND_MODE_EXITING or QURT_ISLAND_MODE_ISLAND
+
+  @param[in,out] data Field reserved for future use. If a NULL pointer is passed,
+             the field is ignored. If a valid pointer is passed,
+             QuRT returns a bitmask that can be interpreted as follows:
+             data[31] - Valid bit.
Set to 1 to indicate data[30:0] are valid.
+             Otherwise set to 0.
+             data[30:0] – Reserved for future definition.
+
+  @return
+  QURT_ISLAND_MODE_NORMAL - Normal mode \n
+  QURT_ISLAND_MODE_ISLAND - Island mode \n
+  QURT_ISLAND_MODE_EXITING - Exiting Island mode \n
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_island_get_status2 (unsigned int *data);
+
+
+
+/**@ingroup func_qurt_island_get_exit_status
+  Gets the reason for the last Island mode exit.
+
+  @param[out] cause_code  Pointer that returns the cause code of the last
+                          island exit reason. \n
+         - #QURT_EISLANDUSEREXIT -- Island exit due to user call for island exit.\n
+         - #QURT_ENOISLANDENTRY -- API called before exiting island. \n
+         - #QURT_EISLANDINVALIDINT -- Island exit due to an invalid interrupt in Island mode. @tablebulletend
+
+  @param[out] int_num  Pointer that holds the invalid interrupt number that caused
+                       island exit when the cause code is #QURT_EISLANDINVALIDINT.
+                       For other cases, it is -1.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_island_get_exit_status(unsigned int *cause_code, int *int_num);
+
+/**@ingroup func_qurt_island_get_enter_timestamp
+  Gets the recent timestamp when the system exits STM during island enter.
+
+  @param[out] island_enter_timestamp  Returns a pointer to the recent timestamp
+              recorded after the system exits STM during island enter. If the system never
+              attempts to enter island, the island_enter_timestamp return pointer holds a value
+              of zero.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_island_get_enter_timestamp(unsigned long long *island_enter_timestamp);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ISLAND_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_isr.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_isr.h
new file mode 100755
index 0000000000000..db29ea2f265d7
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_isr.h
@@ -0,0 +1,177 @@
+#ifndef QURT_ISR_H
+#define QURT_ISR_H
+
+/*=====================================================================
+
+ @file  qurt_isr.h
+
+ @brief  Prototypes of QuRT ISR API functions
+
+ EXTERNALIZED FUNCTIONS
+ none
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ none
+
+ Copyright (c) 2017, 2021  by Qualcomm Technologies, Inc.  All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#include 
+#include 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        Functions
+=============================================================================*/
+
+
+/**@ingroup func_qurt_isr_set_hw_config_callback
+  Sets the callback function for the configuration related to interrupt hardware.
+  In a process, the callback function can be set only once.
+
+  @param[in] cb_addr  Address of the callback function.
+
+  @return
+  #QURT_EOK -- The callback function is set successfully. \n
+  #QURT_EFAILED -- Failure. The callback function has been set before.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_set_hw_config_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_set_hw_enable_callback
+  Sets the callback function for enabling the configuration related to interrupt hardware.
+  In a process, the callback function can be set only once.
+
+  @param[in] cb_addr  Address of the callback function.
+
+  @return
+  #QURT_EOK -- The callback function is set successfully. \n
+  #QURT_EFAILED -- Failure. The callback function has been set before.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_set_hw_enable_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_set_hw_disable_callback
+  Sets the callback function for disabling the configuration related to interrupt hardware.
+  In a process, the callback function can be set only once.
+
+  @param[in] cb_addr  Address of the callback function.
+
+  @return
+  #QURT_EOK -- The callback function is set successfully. \n
+  #QURT_EFAILED -- Failure. The callback function has been set before.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_set_hw_disable_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_create
+  Creates an ISR thread with the specified attributes, and makes it executable.
+
+  @datatypes
+  #qurt_thread_t \n
+  #qurt_thread_attr_t
+
+  @param[out] thread_id  Returns a pointer to the thread identifier if the thread was
+                         successfully created.
+  @param[in]  pAttr      Pointer to the initialized thread attribute structure that specifies
+                         the attributes of the created thread.
+
+  @return
+  #QURT_EVAL -- Invalid arguments. \n
+  #QURT_EOK -- Thread created. \n
+  #QURT_EFAILED -- Thread not created.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_create (qurt_thread_t *thread_id, qurt_thread_attr_t *pAttr);
+
+/**@ingroup func_qurt_isr_register2
+  Registers an interrupt service routine (ISR) callback with the specified attributes to an ISR thread.
+  The interrupt is enabled when this function returns success.
+
+  @datatypes
+  qurt_thread_t
+
+  @param[in] isr_thread_id  ISR thread ID, returned from qurt_isr_create()
+  @param[in] int_num        The interrupt number
+  @param[in] prio           Priority of the ISR
+  @param[in] flags          Defines the ACK type. Values: \n
+                            QURT_INT_NON_DELAYED_ACK - The ISR is acknowledged by the interrupt handling routine
+                            in the kernel.
+                            QURT_INT_DELAYED_ACK - The client chooses to acknowledge.
+  @param[in] int_type       Interrupt trigger type, notified to the registered function. Values: \n
+                            - QURT_INT_TRIGGER_USE_DEFAULT
+                            - QURT_INT_TRIGGER_LEVEL_HIGH
+                            - QURT_INT_TRIGGER_LEVEL_LOW
+                            - QURT_INT_TRIGGER_RISING_EDGE
+                            - QURT_INT_TRIGGER_FALLING_EDGE
+                            - QURT_INT_TRIGGER_DUAL_EDGE
+  @param[in] isr            Interrupt service routine with prototype void isr (void *arg, int int_num)
+  @param[in] arg            First argument passed to the ISR when it is called to service the interrupt
+
+  @return
+  QURT_EOK -- Successfully registered the ISR for the interrupt
+  QURT_EINT -- Interrupt not configured
+  QURT_EINVALID -- Invalid Thread ID
+  QURT_EDISABLED -- The feature is disabled
+  QURT_EDUPLICATE -- Interrupt is already registered
+
+  @dependencies
+  Thread ID should be created using qurt_isr_create()
+ */
+int qurt_isr_register2 (qurt_thread_t isr_thread_id, int int_num, unsigned short prio, unsigned short flags, unsigned int int_type, void (*isr) (void *, int), void *arg);
+
+/**@ingroup func_qurt_isr_deregister2
+  De-registers the ISR for the specified interrupt.
+  The interrupt is disabled when this function returns success.
+
+  @param[in] int_num  The interrupt number
+
+  @return
+  QURT_EOK -- ISR deregistered successfully
+  QURT_ENOREGISTERED -- Interrupt with int_num is not registered
+
+  @dependencies
+  None.
+ */
+int qurt_isr_deregister2 (int int_num);
+
+/**@ingroup func_qurt_isr_delete
+  Causes the ISR thread to exit and releases its kernel resources.
+
+  @note1hang The ISR thread must not be actively processing interrupts;
+             otherwise the call fails and returns an error.
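+
+  A minimal lifecycle sketch (illustrative only; my_isr and interrupt
+  number 45 are hypothetical, error checks are elided, and the thread
+  attributes are assumed to be initialized with the usual
+  qurt_thread_attr_init() initializer):
+  @code
+  extern void my_isr(void *arg, int int_num); // hypothetical handler
+
+  qurt_thread_t tid;
+  qurt_thread_attr_t attr;
+  qurt_thread_attr_init(&attr);
+  qurt_isr_create(&tid, &attr);
+  qurt_isr_register2(tid, 45, 100, QURT_INT_NON_DELAYED_ACK,
+                     QURT_INT_TRIGGER_USE_DEFAULT, my_isr, NULL);
+  // ... my_isr(arg, 45) runs on the ISR thread when the interrupt fires ...
+  qurt_isr_deregister2(45);
+  qurt_isr_delete(tid);
+  @endcode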
+
+  @param[in] isr_tid  Thread ID of the ISR thread to delete.
+
+  @return
+  QURT_ENOTALLOWED -- ISR thread is processing an interrupt
+  QURT_EINVALID -- Invalid ISR thread ID
+  QURT_EOK -- Success
+
+  @dependencies
+  Thread ID should be created using qurt_isr_create()
+ */
+int qurt_isr_delete (qurt_thread_t isr_tid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ISR_H */
+
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_l2cfg.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_l2cfg.h
new file mode 100755
index 0000000000000..7e26b30a580d9
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_l2cfg.h
@@ -0,0 +1,98 @@
+#ifndef QURT_L2CFG_H
+#define QURT_L2CFG_H
+/**
+  @file qurt_l2cfg.h
+  @brief QuRT APIs for L2 configuration and system configuration
+
+EXTERNAL FUNCTIONS
+   qurt_l2cfg_set
+   qurt_l2cfg_get
+   qurt_system_config_get
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+
+/* Definition for system configuration */
+/** @addtogroup l2cfg_macros
+@{ */
+#define QURT_CORE_CFG_HMX_INT8_SPATIAL  0x78 /**< HMX fixed-point spatial size */
+#define QURT_CORE_CFG_HMX_INT8_DEPTH    0x7C /**< HMX fixed-point output depth */
+/** @} */ /* end_addtogroup l2cfg_macros */
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_l2cfg_set
+  Sets the value of a L2 configuration register. A register can be set *IFF* its
+  initial value is configured.
+
+  @param[in] offset  Offset of L2 configuration register; must be multiple of 4.
+  @param[in] value   Value to set the register to.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EFAILED -- Internal mapping that covers L2CFG register file absent; likely
+                   a configuration problem. \n
+  #QURT_EINVALID -- Argument error. \n
+  #QURT_ENOTALLOWED -- Setting this register is prohibited.
+
+  @dependencies
+  None.
+ */
+int qurt_l2cfg_set (unsigned short offset, unsigned int value);
+
+/**@ingroup func_qurt_l2cfg_get
+  Gets the value of a L2 configuration register.
+
+  @param[in] offset  Offset of L2 configuration register; must be multiple of 4.
+  @param[out] value  Pointer to value of the register.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EFAILED -- Internal mapping that covers L2CFG register file absent;
+                   likely a configuration problem. \n
+  #QURT_EINVALID -- Argument error.
+
+  @dependencies
+  None.
+
+ */
+int qurt_l2cfg_get (unsigned short offset, unsigned int * value);
+
+
+/**@ingroup func_qurt_system_config_get
+  Gets the system configuration information.
+
+  @param[in] index  Index to system configuration. Values:\n
+                    - #QURT_CORE_CFG_HMX_INT8_SPATIAL \n
+                    - #QURT_CORE_CFG_HMX_INT8_DEPTH @tablebulletend
+
+  @param[out] data  Pointer to a word for returned data.
+
+  @return
+  #QURT_EOK -- Configuration data retrieved successfully. \n
+  Other values -- Failure (no such configuration available).
+
+  @dependencies
+  None.
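+
+  A minimal usage sketch (illustrative only):
+  @code
+  unsigned int spatial = 0;
+  if (qurt_system_config_get(QURT_CORE_CFG_HMX_INT8_SPATIAL, &spatial) == QURT_EOK) {
+      // spatial now holds the HMX fixed-point spatial size
+  }
+  @endcode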
+
+ */
+int qurt_system_config_get(int index, unsigned int *data);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_L2CFG_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_lifo.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_lifo.h
new file mode 100755
index 0000000000000..dc399fccc5f0f
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_lifo.h
@@ -0,0 +1,71 @@
+#ifndef QURT_LIFO_H
+#define QURT_LIFO_H
+/**
+  @file qurt_lifo.h
+
+  @brief
+  Provides a lock-free last-in-first-out (LIFO) algorithm, which can be used in a
+  variety of situations to allocate and free fixed-size buffers.
+  This implementation touches the first word of a FREED buffer. Although it
+  does not matter how the buffer is used while it is allocated, be careful
+  not to place a MAGIC number in the first field, because that field will
+  not hold the magic value while the buffer is freed.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2013, 2021  by Qualcomm Technologies, Inc.  All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /*=====================================================================
+  Functions
+ ======================================================================*/
+
+/*======================================================================*/
+/**
+  Pops an element out of the LIFO.
+
+  @param[in] freelist  Pointer to the head of your list.
+
+  @return
+  Top object from the list.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void * qurt_lifo_pop(void *freelist);
+
+
+/*======================================================================*/
+/**
+  Pushes an element into the LIFO.
+
+  @param[in] freelist  Pointer to the head of your list.
+  @param[in] buf       Pointer to your buffer to push into the list.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void qurt_lifo_push(void *freelist, void *buf);
+
+/* Removes the specified buffer from the LIFO free list. */
+void qurt_lifo_remove(void *freelist, void *buf);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_LIFO_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mailbox.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mailbox.h
new file mode 100755
index 0000000000000..a6cd91c611782
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mailbox.h
@@ -0,0 +1,176 @@
+#ifndef QURT_MAILBOX_H
+#define QURT_MAILBOX_H
+
+/**
+  @file qurt_mailbox.h
+  @brief Definitions, macros, and prototypes used for the QuRT mailbox
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2015, 2021-2023  by Qualcomm Technologies, Inc.  All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/* Definitions on typedef and return values */ + +#define QURT_MAILBOX_ID_NULL 0 +#define QURT_MAILBOX_ERROR -1 +#define QURT_MAILBOX_ID_ERROR -2 +#define QURT_MAILBOX_NON_VALID_DATA -3 +#define QURT_MAILBOX_FULL -4 +#define QURT_MAILBOX_DELETED -5 +#define QURT_MAILBOX_RECEIVE_HALTED -6 +#define QURT_MAILBOX_BANDWIDTH_LIMIT -7 + + +/*============================================================================= + FORWARD DECLARATIONS & TYPEDEFS +=============================================================================*/ + +#define QURT_MAILBOX_AT_QURTOS 0U // Receiver is QurtOS +#define QURT_MAILBOX_AT_ROOTPD 1U // Receiver is RootPD (ASID=0) +#define QURT_MAILBOX_AT_USERPD 2U // Receiver is User PD (ASID!=0) +#define QURT_MAILBOX_AT_SECUREPD 3U // Receiver is Secure PD + +typedef unsigned char qurt_mailbox_receiver_cfg_t; + +#define QURT_MAILBOX_SEND_OVERWRITE 0U // When there is already valid content, overwrite it +#define QURT_MAILBOX_SEND_NON_OVERWRITE 1U // When there is already valid content, return failure + +typedef unsigned char qurt_mailbox_send_option_t; + + +#define QURT_MAILBOX_RECV_WAITING 0U // When there is no valid content, wait for it +#define QURT_MAILBOX_RECV_NON_WAITING 1U // When there is no valid content, return failure immediately +#define QURT_MAILBOX_RECV_PEEK_NON_WAITING 2U // Read the content, but doesn't remove it from the mailbox. No waiting. + +typedef unsigned char qurt_mailbox_recv_option_t; + + +/*============================================================================= + EXTERNS & FUNCTIONS +=============================================================================*/ +/* Function prototype */ + +/**@ingroup qurt_mailbox_create + Creates a QuRT mailbox. + + @param name Mailbox name up to 8 characters. + @param recv_opt Configuration on the receiver process. + + @return + Mailbox ID -- Mailbox Identifier \n + #QURT_MAILBOX_ID_NULL -- NULL, failure at creating mailbox + + @dependencies + None. +*/ +unsigned long long qurt_mailbox_create(char *name, qurt_mailbox_receiver_cfg_t recv_opt); + + +/**@ingroup qurt_mailbox_get_id + Gets a QuRT mailbox identifier. + + @param name Mailbox name up to 8 characters. + + @return + Mailbox ID -- Mailbox identifier \n + #QURT_MAILBOX_ID_NULL -- NULL, failure at getting mailbox ID + + @dependencies + None. +*/ +unsigned long long qurt_mailbox_get_id(char *name); + + +/**@ingroup qurt_mailbox_send + Sends data to a QuRT mailbox. + + @param mailbox_id Mailbox identifier. + @param send_opt Option for mailbox send. + @param data Data to send. + + + @return + #QURT_EOK Success \n + #QURT_MAILBOX_ID_ERROR Mailbox ID error.\n + #QURT_MAILBOX_ERROR Other errors.\n + #QURT_MAILBOX_FULL Valid data already exists, non-overwriting.\n + #QURT_MAILBOX_BANDWIDTH_LIMIT Reached the bandwidth limitation. + + @dependencies + None. 
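+
+  A minimal send sketch (illustrative only; "navmbx" is a hypothetical
+  mailbox name):
+  @code
+  unsigned long long id = qurt_mailbox_get_id("navmbx");
+  if (id != QURT_MAILBOX_ID_NULL) {
+      int rc = qurt_mailbox_send(id, QURT_MAILBOX_SEND_NON_OVERWRITE, 0x1234ULL);
+      if (rc == QURT_MAILBOX_FULL) {
+          // valid data already present; retry later or overwrite
+      }
+  }
+  @endcode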
+*/
+int qurt_mailbox_send(unsigned long long mailbox_id, qurt_mailbox_send_option_t send_opt, unsigned long long data);
+
+
+/**@ingroup qurt_mailbox_receive
+  Receives data from a QuRT mailbox.
+
+  @param mailbox_id  Mailbox identifier
+  @param recv_opt    Option for mailbox receiving
+  @param data        Pointer to data buffer for receiving
+
+  @return
+  #QURT_EOK                     Success \n
+  #QURT_MAILBOX_ID_ERROR        Mailbox ID error. \n
+  #QURT_MAILBOX_ERROR           Other errors. \n
+  #QURT_MAILBOX_NON_VALID_DATA  No current valid data; the previous content is placed in the buffer. \n
+  #QURT_MAILBOX_RECEIVE_HALTED  Receive halted, the waiting thread is woken up. \n
+  #QURT_MAILBOX_DELETED         Mailbox is deleted, and the waiting thread is woken up.
+
+  @dependencies
+  None.
+*/
+int qurt_mailbox_receive(unsigned long long mailbox_id, qurt_mailbox_recv_option_t recv_opt, unsigned long long *data);
+
+
+/**@ingroup qurt_mailbox_delete
+  Deletes a QuRT mailbox.
+
+  A mailbox can only be deleted from the process that created the mailbox.
+
+  @param mailbox_id  Mailbox identifier.
+
+  @return
+  #QURT_EOK               Success. \n
+  #QURT_MAILBOX_ID_ERROR  Mailbox ID error. \n
+  #QURT_MAILBOX_ERROR     Other errors.
+
+  @dependencies
+  None.
+*/
+int qurt_mailbox_delete(unsigned long long mailbox_id);
+
+
+/**@ingroup qurt_mailbox_receive_halt
+  Halts receiving on a QuRT mailbox and wakes up waiting threads.
+
+  @param mailbox_id  Mailbox identifier.
+
+  @return
+  #QURT_EOK               Success. \n
+  #QURT_MAILBOX_ID_ERROR  Mailbox ID error.\n
+  #QURT_MAILBOX_ERROR     Other errors.
+
+  @dependencies
+  None.
+*/
+int qurt_mailbox_receive_halt(unsigned long long mailbox_id);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif // QURT_MAILBOX_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_memory.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_memory.h
new file mode 100755
index 0000000000000..90ce2586fec50
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_memory.h
@@ -0,0 +1,1487 @@
+#ifndef QURT_MEMORY_H
+#define QURT_MEMORY_H
+/**
+  @file qurt_memory.h
+  @brief Prototypes of kernel memory API functions.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) Qualcomm Technologies, Inc.
+  All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+
+#include 
+#include 
+//#include 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup memory_management_macros
+@{ */
+#define QURT_SYSTEM_ALLOC_VIRTUAL 1 /**< Allocates available virtual memory in the address space of all
+                                         processes.*/
+/** @} */ /* end_addtogroup memory_management_macros */
+/**@cond rest_reg_dist */
+/** @addtogroup memory_management_types
+@{ */
+/** @xreflabel{hdr:qurt_mem_default_pool} */
+extern qurt_mem_pool_t qurt_mem_default_pool __attribute__((section(".data"))); /**< Memory pool object.*/
+/** @} */ /* end_addtogroup memory_management_types */
+
+/** @cond rest_reg_dist */
+/** Mapping attribute information*/
+typedef struct{
+    qurt_paddr_64_t paddr;
+    qurt_size_t size ;
+    qurt_mem_cache_mode_t cache_mode;
+    qurt_perm_t perms ;
+}qurt_mapping_attr_t;
+/** @endcond */
+/** @} */ /* end_addtogroup mapping_attribute_types*/
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_mem_cache_clean
+  Performs a cache clean operation on the data stored in the specified memory area.
+  Performs a syncht on all the data cache operations when the Hexagon processor version is V60 or greater.
+
+  @note1hang Perform the flush all operation only on the data cache.
+
+  @note1cont This operation flushes and invalidates the contents of all cache lines from start address
+             to end address (start address + size). The contents of the adjoining buffer can be
+             flushed and invalidated if it falls in any of the cache lines.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_size_t \n
+  #qurt_mem_cache_op_t \n
+  #qurt_mem_cache_type_t
+
+  @param[in] addr    Address of data to flush.
+  @param[in] size    Size (in bytes) of data to flush.
+  @param[in] opcode  Type of cache clean operation. Values:
+                     - #QURT_MEM_CACHE_FLUSH
+                     - #QURT_MEM_CACHE_INVALIDATE
+                     - #QURT_MEM_CACHE_FLUSH_INVALIDATE
+                     - #QURT_MEM_CACHE_FLUSH_ALL\n
+                     @note1 #QURT_MEM_CACHE_FLUSH_ALL is valid only when the type is #QURT_MEM_DCACHE @tablebulletend
+  @param[in] type    Cache type. Values:
+                     - #QURT_MEM_ICACHE
+                     - #QURT_MEM_DCACHE @tablebulletend
+
+  @return
+  #QURT_EOK -- Cache operation performed successfully.\n
+  #QURT_EVAL -- Invalid cache type.\n
+
+  @dependencies
+  None.
+*/
+int qurt_mem_cache_clean(qurt_addr_t addr, qurt_size_t size, qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type);
+
+/**@ingroup func_qurt_mem_cache_clean2
+  Performs a data cache clean operation on the data stored in the specified memory area.
+
+  This API only performs the following data cache operations:\n
+  - #QURT_MEM_CACHE_FLUSH\n
+  - #QURT_MEM_CACHE_INVALIDATE\n
+  - #QURT_MEM_CACHE_FLUSH_INVALIDATE -- flushes/invalidates the contents of all cache lines from start address
+    to end address (start address + size). The contents of the adjoining buffer can be
+    flushed/invalidated if it falls in any of the cache lines.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_size_t \n
+  #qurt_mem_cache_op_t \n
+  #qurt_mem_cache_type_t
+
+  @param[in] addr    Address of data to flush.
+  @param[in] size    Size (in bytes) of data to flush.
+  @param[in] opcode  Type of cache clean operation. Values:\n #QURT_MEM_CACHE_FLUSH\n #QURT_MEM_CACHE_INVALIDATE\n
+                     #QURT_MEM_CACHE_FLUSH_INVALIDATE
+  @param[in] type    Cache type. Values: \n #QURT_MEM_DCACHE
+
+  @return
+  #QURT_EOK -- Cache operation performed successfully.\n
+  #QURT_EVAL -- Invalid cache type.
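+
+  A minimal flush sketch (illustrative only; buf is a hypothetical
+  cacheable buffer):
+  @code
+  char buf[256];
+  // ... producer fills buf ...
+  qurt_mem_cache_clean2((qurt_addr_t)buf, sizeof(buf),
+                        QURT_MEM_CACHE_FLUSH, QURT_MEM_DCACHE);
+  @endcode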
+
+  @dependencies
+  None.
+*/
+int qurt_mem_cache_clean2(qurt_addr_t addr, qurt_size_t size, qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type);
+
+/**@ingroup func_qurt_mem_cache_phys_clean
+  Performs a cache clean operation on the data stored in the specified memory area based on an address match and mask.
+  Operates on a cache line when (LINE.PhysicalPageNumber & mask) == addrmatch.
+
+  @note1hang The addrmatch value should be the upper 24-bit physical address to match against.
+
+  @datatypes
+  #qurt_mem_cache_op_t \n
+
+  @param[in] mask      24-bit address mask.
+  @param[in] addrmatch Physical page number (24 bits) of memory to use as an address match.
+  @param[in] opcode    Type of cache clean operation. Values:
+             - #QURT_MEM_CACHE_FLUSH
+             - #QURT_MEM_CACHE_INVALIDATE @tablebulletend
+
+  @return
+  #QURT_EOK -- Cache operation performed successfully.\n
+  #QURT_EVAL -- Invalid operation.
+
+  @dependencies
+  None.
+*/
+
+int qurt_mem_cache_phys_clean(unsigned int mask, unsigned int addrmatch, qurt_mem_cache_op_t opcode);
+
+/**@ingroup func_qurt_mem_l2cache_line_lock
+  Performs an L2 cache line locking operation. This function locks selective lines in the L2 cache memory.
+
+  @note1hang Perform the line lock operation only on a 32-byte aligned size and address.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_size_t
+
+  @param[in] addr Address of the L2 cache memory line to lock; the address must be 32-byte aligned.
+  @param[in] size Size (in bytes) of L2 cache memory to line lock; size must be a multiple of 32 bytes.
+
+  @return
+  #QURT_EOK -- Success.\n
+  #QURT_EALIGN -- Data alignment or address failure. \n
+  #QURT_EINVALID -- Improper addr and size passed (for example, integer overflow due to addr + size). \n
+  #QURT_EFAILED -- Failed to lock the cache line because all the ways were locked for the corresponding set of an address
+                   in the range of addr and addr+size, or the address range is not L2 cacheable.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_l2cache_line_lock(qurt_addr_t addr, qurt_size_t size);
+
+/**@ingroup func_qurt_mem_l2cache_line_unlock
+  Performs an L2 cache line unlocking operation. This function unlocks selective lines in the L2 cache memory.
+
+  @note1hang Perform the line unlock operation only on a 32-byte aligned size and address.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_size_t
+
+  @param[in] addr Address of the L2 cache memory line to unlock; the address must be 32-byte aligned.
+  @param[in] size Size (in bytes) of the L2 cache memory line to unlock; size must be a multiple of 32 bytes.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EALIGN -- Data alignment or address failure. \n
+  #QURT_EFAILED -- Operation failed; cannot find the matching tag.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_l2cache_line_unlock(qurt_addr_t addr, qurt_size_t size);
+
+/**@ingroup func_qurt_mem_region_attr_init
+  @xreflabel{sec:qurt_mem_region_attr_init}
+  Initializes the specified memory region attribute structure with default attribute values: \n
+  - Mapping -- #QURT_MEM_MAPPING_VIRTUAL \n
+  - Cache mode -- #QURT_MEM_CACHE_WRITEBACK \n
+  - Physical address -- -1 \n
+  - Virtual address -- -1 \n
+  - Memory type -- #QURT_MEM_REGION_LOCAL \n
+  - Size -- -1
+
+  @note1hang The memory physical address attribute must be explicitly set by calling the
+             qurt_mem_region_attr_set_physaddr() function. The size and pool attributes are set directly
+             as parameters in the memory region create operation.
+
+  @datatypes
+  #qurt_mem_region_attr_t
+
+  @param[in,out] attr Pointer to the destination structure for the memory region attributes.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_mem_region_attr_init(qurt_mem_region_attr_t *attr);
+
+/**@ingroup func_qurt_mem_pool_attach
+  Initializes a memory pool object to attach to a pool predefined in the system
+  configuration file.
+
+  Memory pool objects assign memory regions to physical memory in different
+  Hexagon memory units. They are specified in memory region create operations
+  (Section @xref{sec:mem_region_create}).
+
+  @note1hang QuRT predefines the memory pool object #qurt_mem_default_pool
+             (Section @xref{dox:mem_management}) for allocating memory regions in SMI memory. The pool attach
+             operation is necessary only when allocating memory regions in nonstandard
+             memory units such as TCM.
+
+  @datatypes
+  #qurt_mem_pool_t
+
+  @param[in] name Pointer to the memory pool name.
+  @param[out] pool Pointer to the memory pool object.
+
+  @return
+  #QURT_EOK -- Attach operation successful.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_pool_attach(char *name, qurt_mem_pool_t *pool);
+
+/**@ingroup func_qurt_mem_pool_attach2
+  Gets the identifier that corresponds to a pool object created specifically for a client, for example, HLOS_PHYSPOOL.
+  The client_handle is used to look up the client-specific pool.
+
+  Memory pool objects assign memory regions to physical memory in different
+  Hexagon memory units. Memory pool objects are specified during mapping creation operations
+  (qurt_mem_mmap() and qurt_mem_region_create()).
+
+  @note1hang QuRT predefines the memory pool object #qurt_mem_default_pool
+             (Section @xref{dox:mem_management}) for allocating memory regions in SMI memory. The pool_attach2
+             operation is necessary only when allocating memory regions in memory units specific to the client.
+
+  @datatypes
+  #qurt_mem_pool_t
+
+  @param[in] client_handle Client identifier used by the OS to look up the identifier
+                           for the client-specific pool.
+  @param[in] name Pointer to the memory pool name.
+  @param[out] pool Pointer to the memory pool object.
+
+  @return
+  #QURT_EOK -- Attach operation successful.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_pool_attach2(int client_handle, char *name, qurt_mem_pool_t *pool);
+
+/**@ingroup func_qurt_mem_pool_create
+  @xreflabel{hdr:qurt_mem_pool_create}
+  Dynamically creates a memory pool object from a physical address range.
+
+  The pool is assigned a single memory region with the specified base address and size.
+
+  The base address and size values passed to this function must be aligned to 4K byte
+  boundaries, and must be expressed as the actual base address and size values divided by 4K.
+
+  For example, the function call:
+  @code
+  qurt_mem_pool_create ("TCM_PHYSPOOL", 0xd8020, 0x20, &pool)
+  @endcode
+  ... is equivalent to the following static pool definition in the QuRT system configuration file
+  (reconstructed here from the call above; the element names follow the documented
+  configuration format):
+  @code
+  <physical_pool name="TCM_PHYSPOOL">
+      <region base="0xd8020000" size="0x20000" />
+  </physical_pool>
+  @endcode
+
+  @cond rest_dist For more information on the system configuration file, see @xhyperref{80VB41979,80-VB419-79}. @endcond
+
+  @note1hang Dynamically created pools are not identical to static pools. In particular,
+             qurt_mem_pool_attr_get() is not valid with dynamically created pools.
+
+  @note1cont Dynamic pool creation permanently consumes system resources, and cannot be undone.
+
+  @datatypes
+  #qurt_mem_pool_t
+
+  @param[in] name Pointer to the memory pool name.
+  @param[in] base Base address of the memory region (divided by 4K).
+  @param[in] size Size (in bytes) of the memory region (divided by 4K).
+  @param[out] pool Pointer to the memory pool object.
+
+  @return
+  #QURT_EOK -- Success.
+ + @dependencies + None. +*/ +int qurt_mem_pool_create(char *name, unsigned base, unsigned size, qurt_mem_pool_t *pool); + +/**@ingroup func_qurt_mem_pool_add_pages + Adds a physical address range to the specified memory pool object.\n + + @note1hang Call this operation only with root privileges (guest OS mode). + + @datatypes + #qurt_mem_pool_t + + @param[in] pool Memory pool object. + @param[in] first_pageno First page number of the physical address range (equivalent to address >> 12) + @param[in] size_in_pages Number of pages in the physical address range (equivalent to size >> 12) + + @return + #QURT_EOK -- Pages successfully added. + + @dependencies + None. +*/ +int qurt_mem_pool_add_pages(qurt_mem_pool_t pool, + unsigned first_pageno, + unsigned size_in_pages); + +/**@ingroup func_qurt_mem_pool_remove_pages + Removes a physical address range from the specified memory pool object. + + If any part of the address range is in use, this operation returns an + error without changing the state. + + @note1hang Call this operation only with root privileges (guest-OS mode). + + @note1cont In the future, this operation will support (via the flags parameter) the + removal of a physical address range when part of the range is in use. + + @datatypes + #qurt_mem_pool_t + + @param[in] pool Memory pool object. + @param[in] first_pageno First page number of the physical address range (equivalent to address >> 12) + @param[in] size_in_pages Number of pages in the physical address range (equivalent to size >> 12) + @param[in] flags Remove options. Values: \n + - 0 -- Skip holes in the range that are not part of the pool (default) \n + - #QURT_POOL_REMOVE_ALL_OR_NONE -- Pages are removed only if the specified + physical address range is entirely contained (with no holes) in the + pool free space. @tablebulletend + @param[in] callback Callback procedure called when pages were successfully removed. + Not called if the operation failed. Passing 0 as the parameter + value causes the callback to not be called. + @param[in] arg Value passed as an argument to the callback procedure. + + @return + #QURT_EOK -- Pages successfully removed. + + @dependencies + None. +*/ +int qurt_mem_pool_remove_pages(qurt_mem_pool_t pool, + unsigned first_pageno, + unsigned size_in_pages, + unsigned flags, + void (*callback)(void *), + void *arg); +/**@ingroup memory_management_types*/ +#define QURT_POOL_REMOVE_ALL_OR_NONE 1 /**< */ + +/**@ingroup func_qurt_mem_pool_attr_get + Gets the memory pool attributes. \n + Retrieves pool configurations based on the pool handle, and fills in + the attribute structure with configuration values. + + @datatypes + #qurt_mem_pool_t \n + #qurt_mem_pool_attr_t + + @param[in] pool Pool handle obtained from qurt_mem_pool_attach(). + @param[out] attr Pointer to the memory region attribute structure. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Corrupt handle; pool handle is invalid. +*/ +int qurt_mem_pool_attr_get (qurt_mem_pool_t pool, qurt_mem_pool_attr_t *attr); + +/**@ingroup func_qurt_mem_pool_attr_get_size + Gets the size of the specified memory pool range. + + @datatypes + #qurt_mem_pool_attr_t \n + #qurt_size_t + + @param[in] attr Pointer to the memory pool attribute structure. + @param[in] range_id Memory pool range key. + @param[out] size Pointer to the destination variable for the range size. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Range is invalid. + + @dependencies + None. 
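+
+  For illustration, a hedged sketch that walks every range of an attached
+  pool (the pool name is a placeholder, not part of the SDK):
+  @code
+  qurt_mem_pool_t pool;
+  qurt_mem_pool_attr_t pattr;
+  qurt_size_t range_size;
+  if (qurt_mem_pool_attach("DEFAULT_PHYSPOOL", &pool) == QURT_EOK &&
+      qurt_mem_pool_attr_get(pool, &pattr) == 0) {
+      for (int id = 0; id < MAX_POOL_RANGES; id++) {
+          if (qurt_mem_pool_attr_get_size(&pattr, id, &range_size) == QURT_EOK) {
+              // range_size holds the size (in bytes) of range 'id'
+          }
+      }
+  }
+  @endcode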
+*/
+static inline int qurt_mem_pool_attr_get_size (qurt_mem_pool_attr_t *attr, int range_id, qurt_size_t *size){
+    if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+        (*size) = 0;
+        return QURT_EINVALID;
+    }
+    else {
+        (*size) = attr->ranges[range_id].size;
+    }
+    return QURT_EOK;
+}
+
+/**@ingroup func_qurt_mem_pool_attr_get_addr
+  Gets the start address of the specified memory pool range.
+
+  @datatypes
+  #qurt_mem_pool_attr_t \n
+  #qurt_addr_t
+
+  @param[in] attr Pointer to the memory pool attribute structure.
+  @param[in] range_id Memory pool range key.
+  @param[out] addr Pointer to the destination variable for the range start address.
+
+  @return
+  0 -- Success. \n
+  #QURT_EINVALID -- Range is invalid.
+
+  @dependencies
+  None.
+*/
+static inline int qurt_mem_pool_attr_get_addr (qurt_mem_pool_attr_t *attr, int range_id, qurt_addr_t *addr){
+    if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+        (*addr) = 0;
+        return QURT_EINVALID;
+    }
+    else {
+        (*addr) = (attr->ranges[range_id].start)<<12;
+    }
+    return QURT_EOK;
+}
+
+/**@ingroup func_qurt_mem_pool_attr_get_addr_64
+  Gets the 64-bit start address of the specified memory pool range.
+
+  @datatypes
+  #qurt_mem_pool_attr_t \n
+  #qurt_addr_64_t
+
+  @param[in] attr Pointer to the memory pool attribute structure.
+  @param[in] range_id Memory pool range key.
+  @param[out] addr Pointer to the destination variable for the range start address.
+
+  @return
+  0 -- Success. \n
+  #QURT_EINVALID -- Range is invalid.
+
+  @dependencies
+  None.
+*/
+static inline int qurt_mem_pool_attr_get_addr_64 (qurt_mem_pool_attr_t *attr, int range_id, qurt_addr_64_t *addr){
+    if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+        (*addr) = 0;
+        return QURT_EINVALID;
+    }
+    else {
+        (*addr) = ((qurt_addr_64_t)attr->ranges[range_id].start)<<12;
+    }
+    return QURT_EOK;
+}
+
+
+/**@ingroup func_qurt_mem_pool_status_get
+  Gets the memory pool status. \n
+  Based on the pool handle, retrieves the largest contiguous free memory,
+  total free memory, and total memory declared for the pool in bytes. Fills in
+  the memory status structure with the values.
+
+  @datatypes
+  #qurt_mem_pool_t \n
+  #qurt_mem_pool_status_t
+
+  @param[in] pool Pool handle.
+  @param[out] status Pointer to the memory pool status structure.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINVALID -- Corrupt handle; pool handle is invalid.
+*/
+int qurt_mem_pool_status_get (qurt_mem_pool_t pool, qurt_mem_pool_status_t *status);
+
+
+/**@ingroup func_qurt_mem_pool_is_available
+  Checks whether the number of pages that the page_count argument indicates
+  can be allocated from the specified pool.
+
+  @datatypes
+  #qurt_mem_pool_t \n
+  #qurt_mem_mapping_t \n
+
+  @param[in] pool Pool handle obtained from qurt_mem_pool_attach().
+  @param[in] page_count Number of 4K pages.
+  @param[in] mapping_type Variable of type qurt_mem_mapping_t.
+
+  @return
+  0 -- Success. \n
+  #QURT_EINVALID -- Mapping_type is invalid. \n
+  #QURT_EMEM -- Specified pages cannot be allocated from the pool.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_pool_is_available(qurt_mem_pool_t pool, int page_count, qurt_mem_mapping_t mapping_type);
+
+
+/**@ingroup func_qurt_mem_region_create
+  @xreflabel{sec:mem_region_create}
+  Creates a memory region with the specified attributes.
+
+  The application initializes the memory region attribute structure with
+  qurt_mem_region_attr_init() and qurt_mem_region_attr_set_bus_attr().
+
+  If the virtual address attribute is set to its default value
+  (Section @xref{sec:qurt_mem_region_attr_init}), the virtual address of the memory region is
+  automatically assigned any available virtual address value.
+
+  If the memory mapping attribute is set to virtual mapping, the physical address of the memory region
+  is also automatically assigned.\n
+
+  @note1hang The physical address attribute is explicitly set in the attribute structure only
+             for memory regions with physical-contiguous-mapped mapping.
+
+  Memory regions are always assigned to memory pools. The pool value specifies the memory pool
+  that the memory region is assigned to.
+
+  @note1hang If attr is specified as NULL, the memory region is created with default
+             attribute values (Section @xref{sec:qurt_mem_region_attr_init}).
+             QuRT predefines the memory pool object #qurt_mem_default_pool
+             (Section @xref{dox:mem_management}), which allocates memory regions in SMI memory.
+
+  @datatypes
+  #qurt_mem_region_t \n
+  #qurt_size_t \n
+  #qurt_mem_pool_t \n
+  #qurt_mem_region_attr_t
+
+  @param[out] region Pointer to the memory region object.
+  @param[in] size Memory region size (in bytes). If size is not an integral multiple of 4K,
+                  it is rounded up to a 4K boundary.
+  @param[in] pool Memory pool of the region.
+  @param[in] attr Pointer to the memory region attribute structure.
+
+  @return
+  #QURT_EOK -- Memory region successfully created.\n
+  #QURT_EMEM -- Not enough memory to create region. \n
+  #QURT_EINVALID -- Invalid cache attributes / permissions provided in attribute.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_region_create(qurt_mem_region_t *region, qurt_size_t size, qurt_mem_pool_t pool, qurt_mem_region_attr_t *attr);
+
+/**@ingroup func_qurt_mem_region_delete
+  Deletes the specified memory region.
+
+  If the caller application created the memory region, it is removed and the system reclaims its
+  assigned memory.
+
+  If a different application created the memory region (and shared it with the caller
+  application), only the local memory mapping to the region is removed; the system does
+  not reclaim the memory.
+
+  @datatypes
+  #qurt_mem_region_t
+
+  @param[in] region Memory region object.
+
+  @return
+  #QURT_EOK -- Region successfully deleted. \n
+  #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_region_delete(qurt_mem_region_t region);
+
+
+/**@ingroup func_qurt_mem_region_attr_get
+  @xreflabel{sec:mem_region_attr_get}
+  Gets the memory attributes of the specified memory region.
+  After a memory region is created, its attributes cannot be changed.
+
+  @datatypes
+  #qurt_mem_region_t \n
+  #qurt_mem_region_attr_t
+
+  @param[in] region Memory region object.
+  @param[out] attr Pointer to the destination structure for memory region attributes.
+
+  @return
+  #QURT_EOK -- Operation successfully performed. \n
+  Error code -- Failure.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_region_attr_get(qurt_mem_region_t region, qurt_mem_region_attr_t *attr);
+
+
+/**@ingroup func_qurt_mem_region_attr_set_type
+  Sets the memory type in the specified memory region attribute structure.
+
+  The type indicates whether the memory region is local to an application or shared between
+  applications.
+  @cond rest_dist For more information, see @xhyperref{80VB41992,80-VB419-92}. @endcond
+
+  @datatypes
+  #qurt_mem_region_attr_t \n
+  #qurt_mem_region_type_t
+
+  @param[in,out] attr Pointer to memory region attribute structure.
+  @param[in] type Memory type.
Values: \n + - #QURT_MEM_REGION_LOCAL \n + - #QURT_MEM_REGION_SHARED @tablebulletend + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_type(qurt_mem_region_attr_t *attr, qurt_mem_region_type_t type){ + attr->type = type; +} + +/**@ingroup func_qurt_mem_region_attr_get_size + Gets the memory region size from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_size_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] size Pointer to the destination variable for memory region size. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_size(qurt_mem_region_attr_t *attr, qurt_size_t *size){ + (*size) = attr->size; +} + +/**@ingroup func_qurt_mem_region_attr_get_type + Gets the memory type from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_region_type_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] type Pointer to the destination variable for the memory type. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_type(qurt_mem_region_attr_t *attr, qurt_mem_region_type_t *type){ + (*type) = attr->type; +} + +/**@ingroup func_qurt_mem_region_attr_set_physaddr + Sets the memory region 32-bit physical address in the specified memory attribute structure. + + @note1hang The physical address attribute is explicitly set only for memory regions with + physical contiguous mapping. Otherwise QuRT automatically sets it + when the memory region is created. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_paddr_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr Memory region physical address. + + @return + None. + */ +static inline void qurt_mem_region_attr_set_physaddr(qurt_mem_region_attr_t *attr, qurt_paddr_t addr){ + attr->ppn = (unsigned)(((unsigned)(addr))>>12); +} + +/**@ingroup func_qurt_mem_region_attr_get_physaddr + Gets the memory region physical address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr Pointer to the destination variable for memory region physical address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_physaddr(qurt_mem_region_attr_t *attr, unsigned int *addr){ + (*addr) = (unsigned)(((unsigned) (attr->ppn))<<12); +} + +/**@ingroup func_qurt_mem_region_attr_set_virtaddr + Sets the memory region virtual address in the specified memory attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_addr_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr Memory region virtual address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_virtaddr(qurt_mem_region_attr_t *attr, qurt_addr_t addr){ + attr->virtaddr = addr; +} + +/**@ingroup func_qurt_mem_region_attr_get_virtaddr + Gets the memory region virtual address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr Pointer to the destination variable for the memory region virtual address. + + @return + None. + + @dependencies + None. 
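+
+  A short hedged usage sketch (the 4 KB size is a placeholder value):
+  @code
+  qurt_mem_region_t region;
+  qurt_mem_region_attr_t rattr;
+  unsigned int va;
+  qurt_mem_region_attr_init(&rattr);
+  if (qurt_mem_region_create(&region, 0x1000, qurt_mem_default_pool, &rattr) == QURT_EOK) {
+      qurt_mem_region_attr_get(region, &rattr);
+      qurt_mem_region_attr_get_virtaddr(&rattr, &va);
+      // va now holds the virtual address that was assigned to the region
+  }
+  @endcode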
+ */ +static inline void qurt_mem_region_attr_get_virtaddr(qurt_mem_region_attr_t *attr, unsigned int *addr){ + (*addr) = (unsigned int)(attr->virtaddr); +} + +/**@ingroup func_qurt_mem_region_attr_set_mapping + Sets the memory mapping in the specified memory region attribute structure. + + The mapping value indicates how the memory region is mapped in virtual memory. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_mapping_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] mapping Mapping. Values: + - #QURT_MEM_MAPPING_VIRTUAL + - #QURT_MEM_MAPPING_PHYS_CONTIGUOUS + - #QURT_MEM_MAPPING_IDEMPOTENT + - #QURT_MEM_MAPPING_VIRTUAL_FIXED + - #QURT_MEM_MAPPING_NONE + - #QURT_MEM_MAPPING_VIRTUAL_RANDOM + - #QURT_MEM_MAPPING_INVALID @tablebulletend + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_mapping(qurt_mem_region_attr_t *attr, qurt_mem_mapping_t mapping){ + attr->mapping_type = mapping; +} + +/**@ingroup func_qurt_mem_region_attr_get_mapping + Gets the memory mapping from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_mapping_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] mapping Pointer to the destination variable for memory mapping. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_mapping(qurt_mem_region_attr_t *attr, qurt_mem_mapping_t *mapping){ + (*mapping) = attr->mapping_type; +} + +/**@ingroup func_qurt_mem_region_attr_set_cache_mode + Sets the cache operation mode in the specified memory region attribute structure. + + @cond rest_dist For more information on the cache, see @xhyperref{80VB41992,80-VB419-92}.@endcond + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_cache_mode_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] mode Cache mode. Values: \n + - #QURT_MEM_CACHE_WRITEBACK \n + - #QURT_MEM_CACHE_WRITETHROUGH\n + - #QURT_MEM_CACHE_WRITEBACK_NONL2CACHEABLE\n + - #QURT_MEM_CACHE_WRITETHROUGH_NONL2CACHEABLE\n + - #QURT_MEM_CACHE_WRITEBACK_L2CACHEABLE\n + - #QURT_MEM_CACHE_WRITETHROUGH_L2CACHEABLE\n + - #QURT_MEM_CACHE_NONE @tablebulletend + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_cache_mode(qurt_mem_region_attr_t *attr, qurt_mem_cache_mode_t mode){ + QURT_PGATTR_C_SET(attr->pga, (unsigned)mode); +} + +/**@ingroup func_qurt_mem_region_attr_get_cache_mode + Gets the cache operation mode from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_cache_mode_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] mode Pointer to the destination variable for cache mode. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_cache_mode(qurt_mem_region_attr_t *attr, qurt_mem_cache_mode_t *mode){ + unsigned int mode_temp = QURT_PGATTR_C_GET(attr->pga); + (*mode) = (qurt_mem_cache_mode_t)mode_temp; +} + +/**@ingroup func_qurt_mem_region_attr_set_bus_attr + Sets the (A1, A0) bus attribute bits in the specified memory region attribute structure. + + @cond rest_dist For more information on the bus attribute bits, see the @xhyperref{80VB41992,80-VB419-92}. @endcond + + @datatypes + #qurt_mem_region_attr_t + + @param[in,out] attr Pointer to the memory region attribute structure. 
+ @param[in] abits The (A1, A0) bits to use with the memory region, expressed as a 2-bit binary number. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_bus_attr(qurt_mem_region_attr_t *attr, unsigned abits){ + QURT_PGATTR_A_SET(attr->pga, abits); +} + +/**@ingroup func_qurt_mem_region_attr_get_bus_attr + Gets the (A1, A0) bus attribute bits from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] pbits Pointer to an unsigned integer that is filled in with + the (A1, A0) bits from the memory region attribute structure, expressed as a 2-bit binary number. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_bus_attr(qurt_mem_region_attr_t *attr, unsigned *pbits){ + (*pbits) = QURT_PGATTR_A_GET(attr->pga); +} + +void qurt_mem_region_attr_set_owner(qurt_mem_region_attr_t *attr, int handle); +void qurt_mem_region_attr_get_owner(qurt_mem_region_attr_t *attr, int *p_handle); +void qurt_mem_region_attr_set_perms(qurt_mem_region_attr_t *attr, unsigned perms); +void qurt_mem_region_attr_get_perms(qurt_mem_region_attr_t *attr, unsigned *p_perms); + +/**@ingroup func_qurt_mem_map_static_query + Determines whether a memory page is statically mapped. + Pages are specified by the following attributes: physical address, page size, cache mode, + and memory permissions. \n + - If the specified page is statically mapped, vaddr returns the virtual + address of the page. \n + - If the page is not statically mapped (or if it does not exist as specified), vaddr + returns -1 as the virtual address value.\n + The system configuration file defines QuRT memory maps. + + @datatypes + #qurt_addr_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[out] vaddr Virtual address corresponding to paddr. + @param[in] paddr Physical address. + @param[in] page_size Size of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Specified page is statically mapped, vaddr returns the virtual address. \n + #QURT_EMEM -- Specified page is not statically mapped, vaddr returns -1. \n + #QURT_EVAL -- Specified page does not exist. + + @dependencies + None. + */ +int qurt_mem_map_static_query(qurt_addr_t *vaddr, qurt_addr_t paddr, unsigned int page_size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + + +/**@ingroup func_qurt_mem_region_query + Queries a memory region. \n + This function determines whether a dynamically-created memory region (Section @xref{sec:mem_region_create}) exists for the + specified virtual or physical address. + When a memory region has been determined to exist, its attributes are + accessible (Section @xref{sec:mem_region_attr_get}). + + @note1hang This function returns #QURT_EFATAL if #QURT_EINVALID is passed to both + vaddr and paddr (or to neither). + + @datatypes + #qurt_mem_region_t \n + #qurt_paddr_t + + @param[out] region_handle Pointer to the memory region object (if it exists). + @param[in] vaddr Virtual address to query; if vaddr is specified, paddr must be set to + the value #QURT_EINVALID. + @param[in] paddr Physical address to query; if paddr is specified, vaddr must be set to + the value #QURT_EINVALID. + + @return + #QURT_EOK -- Query successfully performed. \n + #QURT_EMEM -- Region not found for the specified address. \n + #QURT_EFATAL -- Invalid input parameters. 
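+
+  For illustration, a hedged lookup-by-virtual-address sketch (the address is
+  a placeholder for one obtained elsewhere):
+  @code
+  qurt_addr_t vaddr = 0x20000000;    // placeholder: an address to look up
+  qurt_mem_region_t handle;
+  if (qurt_mem_region_query(&handle, vaddr, QURT_EINVALID) == QURT_EOK) {
+      // handle refers to the region containing vaddr; its attributes can
+      // now be read with qurt_mem_region_attr_get().
+  }
+  @endcode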
+
+  @dependencies
+  None.
+ */
+int qurt_mem_region_query(qurt_mem_region_t *region_handle, qurt_addr_t vaddr, qurt_paddr_t paddr);
+
+
+/**@ingroup func_qurt_mapping_create
+  @xreflabel{hdr:qurt_mapping_create}
+  Creates a memory mapping in the page table.
+  Not supported when called from a user process; it always returns #QURT_EMEM.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_size_t \n
+  #qurt_mem_cache_mode_t \n
+  #qurt_perm_t
+
+  @param[in] vaddr Virtual address.
+  @param[in] paddr Physical address.
+  @param[in] size Size (4K-aligned) of the mapped memory page.
+  @param[in] cache_attribs Cache mode (writeback, and so on).
+  @param[in] perm Access permissions.
+
+  @return
+  #QURT_EOK -- Mapping created. \n
+  #QURT_EMEM -- Failed to create mapping. \n
+  #QURT_EINVALID -- Invalid cache attributes / permissions provided.
+
+  @dependencies
+  None.
+*/
+int qurt_mapping_create(qurt_addr_t vaddr, qurt_addr_t paddr, qurt_size_t size,
+                        qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm);
+
+/**@ingroup func_qurt_mapping_remove
+  @xreflabel{hdr:qurt_mapping_remove}
+  Deletes the specified memory mapping from the page table.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_size_t
+
+  @param[in] vaddr Virtual address.
+  @param[in] paddr Physical address.
+  @param[in] size Size of the mapped memory page (4K-aligned).
+
+  @return
+  #QURT_EOK -- Mapping removed. \n
+  #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+
+  @dependencies
+  None.
+
+ */
+int qurt_mapping_remove(qurt_addr_t vaddr, qurt_addr_t paddr, qurt_size_t size);
+
+/**@ingroup func_qurt_lookup_physaddr
+  Translates a virtual memory address to the physical memory address to which it maps. \n
+  The lookup occurs in the process of the caller. Use qurt_lookup_physaddr2() to look up the
+  physical address of another process.
+
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_paddr_t
+
+  @param[in] vaddr Virtual address.
+
+  @return
+  Nonzero -- Physical address to which the virtual address is mapped.\n
+  0 -- Virtual address not mapped.
+
+  @dependencies
+  None.
+*/
+qurt_paddr_t qurt_lookup_physaddr (qurt_addr_t vaddr);
+
+/**@ingroup func_qurt_mem_region_attr_set_physaddr_64
+  Sets the memory region 64-bit physical address in the specified memory attribute structure.
+
+  @note1hang The physical address attribute is explicitly set only for memory regions with
+             physical contiguous mapping. Otherwise it is automatically set by
+             QuRT when the memory region is created.
+
+  @datatypes
+  #qurt_mem_region_attr_t \n
+  #qurt_paddr_64_t
+
+  @param[in,out] attr Pointer to the memory region attribute structure.
+  @param[in] addr_64 Memory region 64-bit physical address.
+
+  @return
+  None.
+ */
+static inline void qurt_mem_region_attr_set_physaddr_64(qurt_mem_region_attr_t *attr, qurt_paddr_64_t addr_64){
+    attr->ppn = (unsigned)(((unsigned long long)(addr_64))>>12);
+}
+
+/**@ingroup func_qurt_mem_region_attr_get_physaddr_64
+  Gets the memory region 64-bit physical address from the specified memory region attribute structure.
+
+  @datatypes
+  #qurt_mem_region_attr_t \n
+  #qurt_paddr_64_t
+
+  @param[in] attr Pointer to the memory region attribute structure.
+  @param[out] addr_64 Pointer to the destination variable for the memory region 64-bit physical address.
+
+  @return
+  None.
+
+  @dependencies
+  None.
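+
+  A minimal hedged sketch of the set/get pair (the 64-bit address is a
+  placeholder value):
+  @code
+  qurt_mem_region_attr_t rattr;
+  qurt_paddr_64_t pa64;
+  qurt_mem_region_attr_init(&rattr);
+  qurt_mem_region_attr_set_physaddr_64(&rattr, 0x100000000ULL);
+  qurt_mem_region_attr_get_physaddr_64(&rattr, &pa64);  // pa64 == 0x100000000ULL
+  @endcode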
+ */
+static inline void qurt_mem_region_attr_get_physaddr_64(qurt_mem_region_attr_t *attr, qurt_paddr_64_t *addr_64){
+    (*addr_64) = (unsigned long long)(((unsigned long long)(attr->ppn))<<12);
+}
+
+/**@ingroup func_qurt_mem_map_static_query_64
+  Determines whether a memory page is statically mapped.
+  The following attributes specify pages: 64-bit physical address, page size, cache mode,
+  and memory permissions. \n
+  If the specified page is statically mapped, vaddr returns the virtual
+  address of the page.
+  If the page is not statically mapped (or if it does not exist as specified), vaddr
+  returns -1 as the virtual address value.\n
+  QuRT memory maps are defined in the system configuration file.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_paddr_64_t \n
+  #qurt_mem_cache_mode_t \n
+  #qurt_perm_t
+
+  @param[out] vaddr Virtual address corresponding to paddr.
+  @param[in] paddr_64 64-bit physical address.
+  @param[in] page_size Size of the mapped memory page.
+  @param[in] cache_attribs Cache mode (writeback, and so on).
+  @param[in] perm Access permissions.
+
+  @return
+  #QURT_EOK -- Specified page is statically mapped; a virtual address is returned in vaddr. \n
+  #QURT_EMEM -- Specified page is not statically mapped; -1 is returned in vaddr. \n
+  #QURT_EVAL -- Specified page does not exist.
+
+  @dependencies
+  None.
+ */
+int qurt_mem_map_static_query_64(qurt_addr_t *vaddr, qurt_paddr_64_t paddr_64, unsigned int page_size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm);
+
+/**@ingroup func_qurt_mem_region_query_64
+  Determines whether a dynamically created memory region (Section @xref{sec:mem_region_create}) exists for the
+  specified virtual or physical address. When a memory region has been determined to exist, its attributes are
+  accessible (Section @xref{sec:mem_region_attr_get}).
+
+  @note1hang This function returns #QURT_EFATAL if #QURT_EINVALID is passed to both
+             vaddr and paddr (or to neither).
+
+  @datatypes
+  #qurt_mem_region_t \n
+  #qurt_addr_t \n
+  #qurt_paddr_64_t
+
+  @param[out] region_handle Pointer to the memory region object (if it exists).
+  @param[in] vaddr Virtual address to query; if vaddr is specified, paddr must be set to
+                   the value #QURT_EINVALID.
+  @param[in] paddr_64 64-bit physical address to query; if paddr is specified, vaddr must be set to
+                      the value #QURT_EINVALID.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EMEM -- Region not found for the specified address. \n
+  #QURT_EFATAL -- Invalid input parameters.
+
+  @dependencies
+  None.
+ */
+int qurt_mem_region_query_64(qurt_mem_region_t *region_handle, qurt_addr_t vaddr, qurt_paddr_64_t paddr_64);
+
+/**@ingroup func_qurt_mapping_create_64
+  @xreflabel{hdr:qurt_mapping_create_64}
+  Creates a memory mapping in the page table.
+  Not supported when called from a user process; it always returns #QURT_EMEM.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_paddr_64_t \n
+  #qurt_size_t \n
+  #qurt_mem_cache_mode_t \n
+  #qurt_perm_t
+
+  @param[in] vaddr Virtual address.
+  @param[in] paddr_64 64-bit physical address.
+  @param[in] size Size (4K-aligned) of the mapped memory page.
+  @param[in] cache_attribs Cache mode (writeback, and so on).
+  @param[in] perm Access permissions.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EMEM -- Failure. \n
+  #QURT_EINVALID -- Invalid cache attributes / permissions provided.
+
+  @dependencies
+  None.
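+
+  For illustration, a hedged sketch pairing creation with removal (all
+  addresses and the size are placeholder values):
+  @code
+  qurt_addr_t va = 0x20000000;
+  qurt_paddr_64_t pa64 = 0x100000000ULL;
+  if (qurt_mapping_create_64(va, pa64, 0x1000,
+                             QURT_MEM_CACHE_WRITEBACK, QURT_PERM_READ) == QURT_EOK) {
+      // ... use the mapping ...
+      qurt_mapping_remove_64(va, pa64, 0x1000);
+  }
+  @endcode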
+*/ +int qurt_mapping_create_64(qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size, + qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + +/**@ingroup func_qurt_mapping_remove_64 + @xreflabel{hdr:qurt_mapping_remove_64} + Deletes the specified memory mapping from the page table. + + @datatypes + #qurt_addr_t \n + #qurt_paddr_64_t \n + #qurt_size_t + + @param[in] vaddr Virtual address. + @param[in] paddr_64 64-bit physical address. + @param[in] size Size of the mapped memory page (4K-aligned). + + @return + #QURT_EOK -- Success. + #QURT_ELOCKED -- Buffer is locked. Mapping delete failed. + + @dependencies + None. + + */ +int qurt_mapping_remove_64(qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size); + +/**@ingroup func_qurt_lookup_physaddr_64 + Translates a virtual memory address to the 64-bit physical memory address it is mapped to. \n + The lookup occurs in the process of the caller. Use qurt_lookup_physaddr2() to lookup the physical + address of another process. + + @datatypes + #qurt_paddr_64_t \n + #qurt_addr_t + + @param[in] vaddr Virtual address. + + @return + Nonzero -- 64-bit physical address to which the virtual address is mapped. \n + 0 -- Virtual address has not been mapped. + + @dependencies + None. +*/ +qurt_paddr_64_t qurt_lookup_physaddr_64 (qurt_addr_t vaddr); +/** @endcond */ + +/** @cond internal_only */ +/**@ingroup func_qurt_mapping_reclaim + Deallocates all QuRT resources associated with the specified virtual + memory area, making it available for user memory management:\n + - The associated physical memory areas are freed and added to the + specified physical pool.\n + - The associated TLB entries are deleted and made available for TLB + management.\n + - The virtual memory area is not freed -- it is left in + place as allocated, but unmapped virtual memory. Access to this + memory area generates an exception.\n + + The virtual memory area must be statically allocated. + If no pool is specified, the freed physical memory is not added to any pool. + + @note1hang The virtual memory area is restricted to being filled with locked + TLB entries that are contiguous within the memory area, and contained by it. + + @datatypes + #qurt_addr_t \n + #qurt_size_t \n + #qurt_mem_pool_t + + @param[in] vaddr Virtual address of the memory area to free. + @param[in] vsize Size (in bytes) of the memory area to free. + @param[in] pool Handle to the physical pool where freed physical memory is added. + If set to 0, freed physical memory is not added to any pool. + + @return + 0 -- Success. \n + Nonzero -- Failure that indicates a partial success, or that the request was malformed. \n @note1hang The expected behavior is that + QuRT logs messages related to the failure, and callers are free to ignore the return value. + + @dependencies + None. +*/ +int qurt_mapping_reclaim(qurt_addr_t vaddr, qurt_size_t vsize, qurt_mem_pool_t pool); +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_mem_configure_cache_partition + Configures the Hexagon cache partition at the system level. + + A partition size value of #SEVEN_EIGHTHS_SIZE is applicable only to the L2 cache. + + The L1 cache partition is not supported in Hexagon processor version V60 or greater. + + @note1hang Call this operation only with QuRT OS privilege. + + @datatypes + #qurt_cache_type_t \n + #qurt_cache_partition_size_t + + @param[in] cache_type Cache type for partition configuration. 
Values: \n
+             - #HEXAGON_L1_I_CACHE \n
+             - #HEXAGON_L1_D_CACHE \n
+             - #HEXAGON_L2_CACHE @tablebulletend
+
+  @param[in] partition_size Cache partition size. Values: \n
+             - #FULL_SIZE \n
+             - #HALF_SIZE \n
+             - #THREE_QUARTER_SIZE \n
+             - #SEVEN_EIGHTHS_SIZE @tablebulletend
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EVAL -- Error.
+
+  @dependencies
+  None.
+ */
+int qurt_mem_configure_cache_partition(qurt_cache_type_t cache_type, qurt_cache_partition_size_t partition_size);
+
+
+/**@ingroup func_qurt_mem_syncht
+  @xreflabel{hdr:qurt_mem_syncht}
+  Performs heavy-weight synchronization of memory transactions.
+
+  This operation does not return until all previous memory transactions (cached and uncached load/store,
+  mem_locked, and so on) that originated from the current thread are complete and globally observable.
+
+  @note1hang This operation is implemented as a wrapper for the Hexagon syncht instruction.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_mem_syncht(void){
+    #ifdef __HEXAGON_ARCH__
+    __asm__ __volatile__ (" SYNCHT \n");
+    #endif
+}
+
+/**@ingroup func_qurt_mem_barrier
+  @xreflabel{hdr:qurt_mem_barrier}
+  Creates a barrier for memory transactions.
+
+  This operation ensures that all previous memory transactions are globally observable before any
+  future memory transactions are globally observable.
+
+  @note1hang This operation is implemented as a wrapper for the Hexagon barrier instruction.
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_mem_barrier(void){
+    #ifdef __HEXAGON_ARCH__
+    __asm__ __volatile__ (" BARRIER \n");
+    #endif
+}
+/** @endcond */
+
+/** @cond internal_only */
+/**@ingroup func_qurt_system_mem_alloc
+  Requests that the kernel allocate memory from the kernel-owned pool.
+
+  @param[in] size Size in bytes (aligned to 4K) to allocate.
+  @param[in] align Any alignment that must be considered for the allocation.
+  @param[in] flags Supports the #QURT_SYSTEM_ALLOC_VIRTUAL flag; allocates
+                   available virtual memory in the address space of all processes.
+
+  @return
+  #QURT_EFATAL -- Allocation failed. \n
+  Otherwise -- Start address of the successful allocation.
+
+  @dependencies
+  None.
+*/
+unsigned qurt_system_mem_alloc(unsigned size, unsigned align, unsigned flags);
+/** @endcond */
+/** @cond rest_reg_dist*/
+/**@ingroup func_qurt_lookup_physaddr2
+  Translates the virtual memory address of the specified process to the 64-bit
+  physical memory address to which it is mapped.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_paddr_64_t
+
+  @param[in] vaddr Virtual address.
+  @param[in] pid PID.
+
+  @return
+  Nonzero -- 64-bit physical address to which the virtual address is mapped. \n
+  0 -- Virtual address is not mapped.
+
+  @dependencies
+  None.
+*/
+qurt_paddr_64_t qurt_lookup_physaddr2(qurt_addr_t vaddr, unsigned int pid);
+/** @endcond */
+
+/**@ingroup func_qurt_mapping_attr_get
+  Gets the mapping attributes for a given virtual address and PID.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_mapping_attr_t
+
+  @param[in] vaddr Virtual address for which the attributes are required.
+  @param[in] pid Process ID of the target process.
+  @param[out] attr Pointer to the mapping attribute structure.
+
+  @return
+  0 -- Success. \n
+  #QURT_EINVALID -- Incorrect virtual address or PID.
+*/
+int qurt_mapping_attr_get(qurt_addr_t vaddr, unsigned int pid, qurt_mapping_attr_t *attr);
+
+
+/**@ingroup func_qurt_mapping_attr_get_cache_mode
+  Gets the cache operation mode in the specified memory mapping attribute structure.
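+
+  For illustration, a hedged sketch (the address and process ID are
+  placeholder values):
+  @code
+  qurt_addr_t va_q = 0x20000000;   // placeholder: a mapped virtual address
+  unsigned int pid = 0;            // placeholder: target process ID
+  qurt_mapping_attr_t mattr;
+  qurt_mem_cache_mode_t cmode;
+  if (qurt_mapping_attr_get(va_q, pid, &mattr) == 0) {
+      qurt_mapping_attr_get_cache_mode(&mattr, &cmode);
+  }
+  @endcode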
+
+
+  @datatypes
+  #qurt_mapping_attr_t \n
+  #qurt_mem_cache_mode_t
+
+  @param[in] attr Pointer to the memory mapping attribute structure.
+  @param[out] cache_mode Pointer to the destination variable for cache mode.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_mapping_attr_get_cache_mode(qurt_mapping_attr_t *attr, qurt_mem_cache_mode_t *cache_mode)
+{
+    (*cache_mode) = attr->cache_mode;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_physaddr
+  Gets the physical memory address in the specified memory mapping attribute structure.
+
+
+  @datatypes
+  #qurt_mapping_attr_t \n
+  #qurt_paddr_64_t
+
+  @param[in] attr Pointer to the memory mapping attribute structure.
+  @param[out] physaddr Pointer to the destination variable for physical address.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_mapping_attr_get_physaddr(qurt_mapping_attr_t *attr, qurt_paddr_64_t *physaddr)
+{
+    (*physaddr) = attr->paddr;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_perms
+  Gets the permissions in the specified memory mapping attribute structure.
+
+
+  @datatypes
+  #qurt_mapping_attr_t \n
+  #qurt_perm_t
+
+  @param[in] attr Pointer to the memory mapping attribute structure.
+  @param[out] perms Pointer to the destination variable for permissions.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_mapping_attr_get_perms(qurt_mapping_attr_t *attr, qurt_perm_t *perms)
+{
+    (*perms) = attr->perms;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_size
+  Gets the size in the specified memory mapping attribute structure. This represents the size of the
+  TLB entry that covers the virtual address.
+
+
+  @datatypes
+  #qurt_mapping_attr_t \n
+  #unsigned int
+
+  @param[in] attr Pointer to the memory mapping attribute structure.
+  @param[out] size Pointer to the destination variable for size.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_mapping_attr_get_size(qurt_mapping_attr_t *attr, unsigned int *size)
+{
+    (*size) = attr->size;
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_MEMORY_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mmap.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mmap.h
new file mode 100755
index 0000000000000..c3bd875910af7
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mmap.h
@@ -0,0 +1,359 @@
+#ifndef QURT_MMAP_H
+#define QURT_MMAP_H
+/**
+  @file qurt_mmap.h
+  @brief Prototypes of memory mapping/unmapping APIs.
+         The APIs allow the user to map, unmap, and change permissions
+         on memory regions.
+
+  EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018-2021, 2022, 2023 Qualcomm Technologies, Inc.
+All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_mem_mmap
+  Creates a memory mapping with the specified attributes.
+  This API allows the root process caller to create a mapping on behalf of a user
+  process. If the client_handle belongs to a valid user process, the resulting
+  mapping is created for that process.
+  If -1 is passed in place of client_handle, the API creates the mapping
+  for the underlying process of the caller.
+
+  @note1hang If the specified attributes are not valid, an error result is returned.
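+
+  For illustration only, a hedged sketch of an anonymous 4 KB allocation in
+  the caller's own process (the pool, fd, offset, and size arguments are
+  placeholder values following the defaults described below):
+  @code
+  void *va = qurt_mem_mmap(-1,                    // caller's own process
+                           (qurt_mem_pool_t)0,    // default pool (NULL default)
+                           NULL, NULL, 0x1000,
+                           QURT_PROT_READ | QURT_PROT_WRITE,
+                           QURT_MAP_ANON, -1, 0ULL);
+  if (va == QURT_MAP_FAILED) {
+      // mapping creation failed
+  }
+  @endcode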
+
+  @param[in] client_handle Client handle to use for this mapping (optional).
+  @param[in] pool Optional argument that specifies a pool handle
+                  if the user wants to allocate memory from a specific pool.
+                  The default value for this argument is NULL.
+  @param[in] pRegion Map region. This argument is unused, and the default value is NULL.
+  @param[in] addr Virtual memory address.
+  @param[in] length Size of mapping in bytes.
+  @param[in] prot Mapping access permissions (R/W/X).
+  @param[in] flags Mapping modes.\n
+             - #QURT_MAP_NAMED_MEMSECTION
+             - #QURT_MAP_FIXED \n
+             - #QURT_MAP_NONPROCESS_VPOOL \n
+             - #QURT_MAP_TRYFIXED \n
+             - #QURT_MAP_ANON \n
+             - #QURT_MAP_PHYSADDR \n
+             - #QURT_MAP_VA_ONLY @tablebulletend
+  @param[in] fd File designator.
+  @param[in] offset Offset in file.
+
+  @return
+  Valid virtual address -- Success.\n
+  #QURT_MAP_FAILED -- Mapping creation failed.
+ */
+void *qurt_mem_mmap(int client_handle,
+                    qurt_mem_pool_t pool,
+                    qurt_mem_region_t *pRegion,
+                    void *addr,
+                    size_t length,
+                    int prot,
+                    int flags,
+                    int fd,
+                    unsigned long long offset);
+
+/**@ingroup func_qurt_mem_mmap2
+  Creates a memory mapping with the specified attributes. Returns a more descriptive
+  error code in case of failure.
+  This API allows the root process caller to create a mapping on behalf of a user
+  process. If the client_handle belongs to a valid user process, the resulting
+  mapping is created for that process.
+  If -1 is passed in place of client_handle, the API creates the mapping
+  for the underlying process of the caller.
+
+  @note1hang If the specified attributes are not valid, an error result is returned.
+
+  @param[in] client_handle Client handle to use for this mapping (optional).
+  @param[in] pool Optional argument that allows the user to specify a pool handle
+                  when the user wants to allocate memory from a specific pool.
+                  The default value for this argument is NULL.
+  @param[in] pRegion Map region (unused argument); default value is NULL.
+  @param[in] addr Virtual memory address.
+  @param[in] length Size of mapping in bytes.
+  @param[in] prot Mapping access permissions (R/W/X),
+                  cache attributes, bus attributes, and user mode.
+  @param[in] flags Mapping modes:
+                   Shared, Private, or Anonymous.
+  @param[in] fd File designator.
+  @param[in] offset Offset in file.
+
+  @return
+  Valid virtual address -- Success.\n
+  #QURT_EMEM -- Physical address is not available. \n
+  #QURT_EFAILED -- VA is not available or mapping failed.\n
+  #QURT_EINVALID -- Invalid argument was passed (for example, an unaligned VA/PA).
+ */
+void *qurt_mem_mmap2(int client_handle,
+                     qurt_mem_pool_t pool,
+                     qurt_mem_region_t *pRegion,
+                     void *addr,
+                     size_t length,
+                     int prot,
+                     int flags,
+                     int fd,
+                     unsigned long long offset);
+
+/**@ingroup func_qurt_mem_mmap_by_name
+  Creates a memory mapping for a named memsection using the specified attributes.
+  The named memsection should be specified in cust_config.xml.
+
+  @note1hang If the specified attributes are not valid or the named memsection is not found,
+             an error result is returned.
+
+  @param[in] name Name of the memsection in cust_config.xml that specifies
+                  this mapping. Should be less than 25 characters.
+  @param[in] addr Virtual memory address.
+  @param[in] length Size of mapping in bytes.
+  @param[in] prot Mapping access permissions (R/W/X),
+                  cache attributes, bus attributes, and user mode.
+  @param[in] flags Mapping modes, such as
+                   Shared, Private, or Anonymous.
+  @param[in] offset Offset relative to the physical address range specified in the memsection.
+                    If offset + length exceeds the size of the memsection, failure is
+                    returned.
+  @return
+  Valid virtual address -- Success.\n
+  #QURT_MAP_FAILED -- Mapping creation failed.
+ */
+void *qurt_mem_mmap_by_name(const char* name,
+                            void *addr,
+                            size_t length,
+                            int prot,
+                            int flags,
+                            unsigned long long offset);
+
+/**@ingroup func_qurt_mem_mprotect2
+  Changes access permissions and attributes on an existing mapping based on the client_handle argument.
+
+  @note1hang If the specified virtual address is not found or invalid attributes are passed,
+             an error code is returned.
+
+  @note2 When an error is returned, it is possible that attributes/permissions were changed for some part of the
+         mapping while they are unchanged for the remainder. Clients should not use these mappings further.
+
+  @param[in] client_handle Obtained from the current invocation function (Section 3.4.1).
+  @param[in] addr Virtual memory address.
+  @param[in] length Size of mapping in bytes.
+  @param[in] prot Mapping access permissions (R/W/X),
+                  cache attributes, bus attributes, and user mode.
+  @return
+  #QURT_EOK -- Successfully changed permissions on the mapping.\n
+  #QURT_EFATAL -- Failed to change permissions on the mapping. \n
+  #QURT_EINVALID -- Attributes / permissions requested are invalid.
+ */
+int qurt_mem_mprotect2(int client_handle, const void *addr,
+                       size_t length,
+                       int prot);
+
+/**@ingroup func_qurt_mem_mprotect
+  Changes access permissions and attributes on an existing mapping.
+
+  @note1hang If the specified virtual address is not found or invalid attributes are passed,
+             an error code is returned.\n
+
+  @note2 When an error is returned, it is possible that attributes/permissions were changed for some part of the
+         mapping while they are unchanged for the remainder. Clients should not use these mappings further.
+
+  @param[in] addr Virtual memory address.
+  @param[in] length Size of mapping in bytes.
+  @param[in] prot Mapping access permissions (R/W/X),
+                  cache attributes, bus attributes, and user mode.
+  @return
+  #QURT_EOK -- Successfully changed permissions on the mapping. \n
+  #QURT_EFATAL -- Failed to change permissions on the mapping. \n
+  #QURT_EINVALID -- Attributes / permissions requested are invalid.
+ */
+int qurt_mem_mprotect(const void *addr,
+                      size_t length,
+                      int prot);
+
+/**@ingroup func_qurt_mem_munmap
+  Removes an existing mapping.
+
+  @note1hang If the specified mapping is not found in the context of the caller process
+             or invalid attributes are passed, an error code is returned.
+
+  @param[in] addr Virtual memory address.
+  @param[in] length Size of mapping in bytes.
+
+  @return
+  #QURT_EOK -- Successfully removed the mapping. \n
+  #QURT_EFATAL -- Failed to remove the mapping. \n
+  #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+ */
+int qurt_mem_munmap(void *addr,
+                    size_t length);
+
+/**@ingroup func_qurt_mem_munmap2
+  Removes an existing mapping for a specified process.
+
+  @note1hang This API allows a root process entity, such as a driver, to remove a mapping
+             that was created for a user process. If the specified mapping is not found in the context
+             of the client handle or invalid attributes are passed, an error code is returned.
+
+  @param[in] client_handle Client handle of the user process that owns this mapping.
+  @param[in] addr Virtual memory address.
+  @param[in] length Size of mapping in bytes.
+
+  @return
+  #QURT_EOK -- Successfully removed the mapping. \n
+  #QURT_EFATAL -- Failed to remove the mapping. \n
+  #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+ */
+int qurt_mem_munmap2(int client_handle,
+                     void *addr,
+                     size_t length);
+
+/**@ingroup func_qurt_mem_munmap3
+  Removes an existing mapping or reservation for a specified process.
+
+  @param[in] client_handle Client handle of the user process that owns this mapping.
+  @param[in] addr Pointer to a virtual memory address.
+  @param[in] length Size of mapping in bytes.
+  @param[in] flags Specifies the unmap flags.
+
+  @return
+  #QURT_EOK -- Successfully removed the mapping. \n
+  #QURT_EFATAL -- Failed to remove the mapping. \n
+  #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+ */
+int qurt_mem_munmap3(int client_handle,
+                     void *addr,
+                     size_t length,
+                     int flags);
+
+/*
+|| The macros here follow the style of the standard mmap() macros, but with
+|| QURT_ prepended to avoid name conflicts, and to avoid having a dependency
+|| on sys/mman.h.
+||
+|| Wherever possible, any values here that are also present in sys/mman.h
+|| should have the same value in both places so that we can accept "mmap"
+|| calls without having to remap parameters to new values.
+||
+|| In the future, it would be desirable to have a regression test that
+|| checks, for instance, that these macros match. Example:
+||
+||    assert(QURT_MAP_FAILED == MAP_FAILED);
+||    ... repeat as needed ...
+*/
+
+/** @addtogroup memory_mapping_macros
+@{ */
+/** @cond */
+#define QURT_PROT_NONE   0x00U  /**< */
+#define QURT_PROT_READ   0x01U  /**< */
+#define QURT_PROT_WRITE  0x02U  /**< */
+#define QURT_PROT_EXEC   0x04U  /**< */
+#define QURT_PROT_NODUMP 0x08U  /**< Skip dumping the mapping. During PD dump, must skip
+                                     some mappings on host memory to avoid a race condition
+                                     where the memory is removed from the host and the DSP process
+                                     crashes before the mapping is removed.*/
+#define QURT_PROT_ISLAND 0x10U  /**< Island mapping. */
+
+#define QURT_MAP_SHARED  0x0001U /**< Shared. */
+#define QURT_MAP_PRIVATE 0x0002U /**< Private. */
+/** @endcond */
+#define QURT_MAP_NAMED_MEMSECTION 0x0004U /**< Named memsection. */
+#define QURT_MAP_FIXED     0x0010U /**< Fixed virtual address. */
+#define QURT_MAP_RENAME    0x0020U /**< Rename. */
+#define QURT_MAP_NORESERVE 0x0040U /**< No reserve. */
+#define QURT_MAP_INHERIT   0x0080U /**< Inherit. */
+#define QURT_MAP_NONPROCESS_VPOOL 0x0100U /**< Use a virtual address outside of the default range of the
+                                               processes. This option is only supported in the root process
+                                               and only when virtual memory split is enabled in the XML.
+                                               The root process can use this flag to create mapping for a
+                                               user process, for example, if the virtual address is configured
+                                               for a 3G/1G split, the root process can use this flag to create
+                                               mapping in the top 1 GB area for the user process or the
+                                               lower 3 GB area for the root process. This is useful for
+                                               shared buffer use cases. */
+#define QURT_MAP_HASSEMAPHORE 0x0200U /**< Has semaphore. */
+#define QURT_MAP_TRYFIXED  0x0400U /**< Try to create a mapping for a virtual address that was passed.
+                                        If the passed virtual address fails, use a random virtual address. */
+#define QURT_MAP_WIRED     0x0800U /**< Wired. */
+#define QURT_MAP_FILE      0x0000U /**< File. */
+#define QURT_MAP_ANON      0x1000U /**< Allocate physical memory from the pool that was passed.
+                                        By default, memory is allocated from the default physpool. */
+#define QURT_MAP_VA_ONLY   0x2000U /**< Reserve a virtual address without
+                                        mapping it.
*/ + +/** @cond */ +#define QURT_MAP_ALIGNED(n) ((n) << QURT_MAP_ALIGNMENT_SHIFT) +#define QURT_MAP_ALIGNMENT_SHIFT 24 + + +#define QURT_MAP_ALIGNMENT_MASK QURT_MAP_ALIGNED(0xff) /**< */ +#define QURT_MAP_ALIGNMENT_64KB QURT_MAP_ALIGNED(16) /**< */ +#define QURT_MAP_ALIGNMENT_16MB QURT_MAP_ALIGNED(24) /**< */ +#define QURT_MAP_ALIGNMENT_4GB QURT_MAP_ALIGNED(32) /**< */ +#define QURT_MAP_ALIGNMENT_1TB QURT_MAP_ALIGNED(40) /**< */ +#define QURT_MAP_ALIGNMENT_256TB QURT_MAP_ALIGNED(48) /**< */ +#define QURT_MAP_ALIGNMENT_64PB QURT_MAP_ALIGNED(56) /**< */ +/** @endcond */ +#define QURT_MAP_FAILED ((void *) -1) /**< Mapping creation failed. */ + +/* +|| The macros below are extensions beyond the standard mmap flags, but follow +|| the style of the mmap flags. +*/ +/** @cond */ +// Describe bitfields in (prot) +#define QURT_PROT_CACHE_BOUNDS 16U,19U,7U /**< Bits 16 through 19 are cache attribute, default is 0. */ +#define QURT_PROT_BUS_BOUNDS 20U,21U,0U /**< Bits 20 through 21 are bus attributes, default is 0. */ +#define QURT_PROT_USER_BOUNDS 22U,23U,3U /**< Bits 22 through 23 are user mode, default is 3; + default of 3 means to derive user mode setting from the + default mode of the client. */ + +// Describe bitfields in (flags) +#define QURT_MAP_PHYSADDR_BOUNDS 15U,15U,0U /**< Bits 15 through 15 are physaddr, default is 0. */ +#define QURT_MAP_TYPE_BOUNDS 16U,19U,0U /**< Bits 16 through 19 are mapping type, default is 0. */ +#define QURT_MAP_REGION_BOUNDS 20U,23U,0U /**< Bits 20 through 23 are region type, default is 0. */ +/** @endcond */ + +// These macros get OR'ed into (prot) +#define QURT_PROT_CACHE_MODE(n) QURT_MMAP_BUILD(QURT_PROT_CACHE_BOUNDS,(n)) /**< */ +#define QURT_PROT_BUS_ATTR(n) QURT_MMAP_BUILD(QURT_PROT_BUS_BOUNDS,(n)) /**< */ +#define QURT_PROT_USER_MODE(n) QURT_MMAP_BUILD(QURT_PROT_USER_BOUNDS,(n)) /**< */ +// These macros get OR'ed into (flags) + +#define QURT_MAP_PHYSADDR QURT_MMAP_BUILD(QURT_MAP_PHYSADDR_BOUNDS,1U) /**< Use the physical address that was passed in offset field. + This is allowed only for root process. */ +#define QURT_MAP_TYPE(n) QURT_MMAP_BUILD(QURT_MAP_TYPE_BOUNDS,(n)) /**< */ +#define QURT_MAP_REGION(n) QURT_MMAP_BUILD(QURT_MAP_REGION_BOUNDS,(n)) /**< */ +/** @} */ /* end_addtogroup memory_mapping_macros */ +/** @cond */ +// These macros extract fields from (prot) +#define QURT_PROT_GET_CACHE_MODE(n) QURT_MMAP_EXTRACT(QURT_PROT_CACHE_BOUNDS,(n)) /**< */ +#define QURT_PROT_GET_BUS_ATTR(n) QURT_MMAP_EXTRACT(QURT_PROT_BUS_BOUNDS,(n)) /**< */ +#define QURT_PROT_GET_USER_MODE(n) QURT_MMAP_EXTRACT(QURT_PROT_USER_BOUNDS,(n)) /**< */ + +// These macros extract fields from (flags) +#define QURT_MAP_GET_TYPE(n) QURT_MMAP_EXTRACT(QURT_MAP_TYPE_BOUNDS,(n)) /**< */ +#define QURT_MAP_GET_REGION(n) QURT_MMAP_EXTRACT(QURT_MAP_REGION_BOUNDS,(n)) /**< */ + +// Macros for bitfield insertion and extraction +#define QURT_MMAP_MASK(lo,hi) (~((~0u) << ((hi)-(lo)+1U))) /**< Mask of same size as [lo..hi]. 
*/ +#define QURT_MMAP_BUILD_(lo,hi,def,n) ((((n)^(def))&QURT_MMAP_MASK((lo),(hi)))<<(lo)) /**< */ +#define QURT_MMAP_EXTRACT_(lo,hi,def,n) ((((n)>>(lo))&QURT_MMAP_MASK((lo),(hi)))^(def)) /**< */ +#define QURT_MMAP_BUILD(a,b) QURT_MMAP_BUILD_(a,b) /**< */ +#define QURT_MMAP_EXTRACT(a,b) QURT_MMAP_EXTRACT_(a,b) /**< */ +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mq.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mq.h new file mode 100755 index 0000000000000..580c83d3de41a --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mq.h @@ -0,0 +1,458 @@ +#ifndef QURT_MQ_H +#define QURT_MQ_H +/** + @file qurt_mq.h + + @brief Prototypes of secure message queues API functions. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2019-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. +======================================================================*/ +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define QURT_MQ_NAME_MAXLEN 16U /**< Maximum name length. */ + + +/*============================================================================= + FORWARD DECLARATIONS & TYPEDEFS +=============================================================================*/ +/* This enum must be generated in accordance to process class class numbers. + For now it is made to match generated version, do not change this unless + there is a corresponding change in the process_class.py, indicies start from 0 + basically: QURT_MQ_SECURITY_SCOPE_ = (1 << QURTK_process_class_index_) +*/ +typedef enum { + QURT_MQ_SECURITY_SCOPE_KERNEL = ( 1U << 0 ), + QURT_MQ_SECURITY_SCOPE_SRM = ( 1U << 1 ), + QURT_MQ_SECURITY_SCOPE_SECURE = ( 1U << 2 ), + QURT_MQ_SECURITY_SCOPE_CPZ = ( 1U << 3 ), + QURT_MQ_SECURITY_SCOPE_ROOT = ( 1U << 4 ), + QURT_MQ_SECURITY_SCOPE_SIGNED = ( 1U << 5 ), + QURT_MQ_SECURITY_SCOPE_UNSIGNED = ( 1U << 6 ), + QURT_MQ_SECURITY_SCOPE_SECURE_ROOT = ( 1U << 7 ) +} qurt_mq_security_scope_t; + +typedef enum { + QURT_MQ_CARDINALITY_PTP = (1U << 0), + QURT_MQ_CARDINALITY_MTO = (1U << 1) +}qurt_mq_cardinality_t; + +typedef unsigned int qurt_mqd_t; + +typedef union{ + struct { + unsigned int perms:2; + unsigned int cardinality:1; + unsigned int blocking:1; + + qurt_mq_security_scope_t creator_scope: 8; + qurt_mq_security_scope_t allowed_scope: 8; //can be a bitmask in case of MTO + unsigned int queue_closed: 1; + unsigned int reserved: 11; + }; //try to do anonymous struct + unsigned int raw; +} qurt_mq_flags_t; + + +/* permissions are from qurt_types.h , block X though */ +#if 0 +/** Memory access permission. */ +typedef enum { + QURT_PERM_READ=0x1U, /**< */ + QURT_PERM_WRITE=0x2U, /**< */ + QURT_PERM_EXECUTE=0x4U, /**< */ + QURT_PERM_FULL=QURT_PERM_READ|QURT_PERM_WRITE|QURT_PERM_EXECUTE, /**< */ +} qurt_perm_t; +#endif + +struct qurt_mq_attr { + unsigned flags; /**< Configured flags. Only meaningful with get_attr(), only used for qurt_mq_flags_t.perms. */ + unsigned mq_maxmsg; /**< Maximum number of messages. Used with create() and get_attr. 
 */
+  unsigned short mq_send_msgsize;    /**< Maximum size (bytes) of a message in the receiver-facing queue,
+                                          from sender to receiver. */
+  unsigned short mq_recv_msgsize;    /**< Maximum size (bytes) of a message in the sender-facing queue,
+                                          from receiver to sender. */
+  unsigned client_pid;               /**< Process ID of the client that is allowed to open the message queue
+                                          that was created using qurt_mq_create(). */
+  qurt_mq_cardinality_t cardinality; /**< Cardinality of the message queue connection, see below. */
+  qurt_mq_security_scope_t scope;    /**< Security scope of the senders to the queue. */
+};
+
+
+/*=============================================================================
+  EXTERNS & FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_mq_attr_init
+  Initializes attributes to the default values used for creating the queue.
+
+  The initialize operation sets the following default attribute values: \n
+  - flags - QURT_PERM_READ | QURT_PERM_WRITE \n
+  - maxmsg - 1 \n
+  - mq_send_msgsize - 8 \n
+  - mq_recv_msgsize - 8 \n
+  - client_pid - -1 \n
+  - cardinality - QURT_MQ_CARDINALITY_PTP \n
+  - scope - QURT_MQ_SECURITY_SCOPE_SIGNED \n
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr  Pointer to the message queue attribute object to initialize.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_init(struct qurt_mq_attr * attr);
+
+/**@ingroup func_qurt_mq_attr_set_send_msgsize
+  Sets the message size in bytes that the sender can send.
+  The maximum message length is configurable using the XML configuration; however, it is
+  limited to a maximum value of 62 bytes.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr  Pointer to the message queue object.
+  @param[in]     len   Length of message in bytes.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_send_msgsize (struct qurt_mq_attr *attr, size_t len);
+
+/**@ingroup func_qurt_mq_attr_set_recv_msgsize
+  Sets the message size in bytes that the receiver can read.
+  The maximum message length is configurable using the XML configuration; however, it is
+  limited to a maximum value of 62 bytes.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr  Pointer to the message queue object.
+  @param[in]     len   Length of message in bytes.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_recv_msgsize (struct qurt_mq_attr *attr, size_t len);
+
+/**@ingroup func_qurt_mq_attr_set_maxmsg
+  Sets the maximum number of messages that can be queued in the message queue.
+  The message depth is configurable using the XML configuration.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr   Pointer to the message queue object.
+  @param[in]     depth  Maximum number of messages that can be queued.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_maxmsg (struct qurt_mq_attr *attr, unsigned int depth);
+
+/**@ingroup func_qurt_mq_attr_set_scope
+  Sets the scope of the message queue. A message queue created with a security
+  scope allows only a process class of that scope to open the message queue.
+
+  @datatypes
+  #qurt_mq_attr \n
+  #qurt_mq_security_scope_t
+
+  @param[in,out] attr   Pointer to the message queue object.
+  @param[in]     scope  Scope of the message queue: \n
+                        #QURT_MQ_SECURITY_SCOPE_KERNEL \n
+                        #QURT_MQ_SECURITY_SCOPE_SRM \n
+                        #QURT_MQ_SECURITY_SCOPE_SECURE \n
+                        #QURT_MQ_SECURITY_SCOPE_CPZ \n
+                        #QURT_MQ_SECURITY_SCOPE_ROOT \n
+                        #QURT_MQ_SECURITY_SCOPE_SIGNED \n
+                        #QURT_MQ_SECURITY_SCOPE_UNSIGNED
+
+  @return
+  None.
+
+  @dependencies
+  None.
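+
+  A hedged setup sketch (not from the SDK docs; the sizes, depth, and
+  scope value are arbitrary assumptions):
+  @code
+  struct qurt_mq_attr attr;
+  qurt_mq_attr_init(&attr);
+  qurt_mq_attr_set_send_msgsize(&attr, 32);  // within the 62-byte limit
+  qurt_mq_attr_set_recv_msgsize(&attr, 32);
+  qurt_mq_attr_set_maxmsg(&attr, 4);
+  qurt_mq_attr_set_scope(&attr, QURT_MQ_SECURITY_SCOPE_SIGNED);
+  @endcode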
+*/
+void qurt_mq_attr_set_scope (struct qurt_mq_attr *attr, qurt_mq_security_scope_t scope);
+
+
+/**@ingroup func_qurt_mq_attr_set_client_pid
+  Sets the client PID that can open this message queue.
+  If client_pid is set, the allowed scope is not considered when the message queue is opened.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr        Pointer to the message queue object.
+  @param[in]     client_pid  Valid PID of the client process.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_client_pid (struct qurt_mq_attr *attr, unsigned client_pid);
+
+/**@ingroup func_qurt_mq_attr_set_flags
+  Sets the properties of the message queue.
+  The current implementation uses the flags attribute only to set the permissions of the message queue.
+  The default is #QURT_PERM_READ | #QURT_PERM_WRITE; explicit permissions are not implemented.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr   Pointer to the message queue object.
+  @param[in]     flags  Permissions for the message queue.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_flags (struct qurt_mq_attr *attr, unsigned int flags);
+
+/**@ingroup func_qurt_mq_create
+  Creates a message queue with the provided name and attributes.
+  The calling process becomes the owner of the queue.
+  The name of the message queue is limited to 16 characters, including the NULL terminator.
+
+  @datatypes
+  #qurt_mq_attr \n
+  #qurt_mqd_t
+
+  @param[out] mqd   Returns a pointer to the message queue identifier if
+                    the message queue was successfully created.
+  @param[in]  name  String identifier of the message queue.
+  @param[in]  attr  Pointer to the initialized message queue attribute
+                    structure that specifies the attributes of the created message queue.
+
+  @return
+  #QURT_EOK -- Message queue created. \n
+  #QURT_EINVALID -- Invalid arguments. \n
+  #QURT_ENOSPC -- Maximum number of queues in the system is exceeded.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_create(qurt_mqd_t *mqd, const char *name, struct qurt_mq_attr *attr);
+
+/**@ingroup func_qurt_mq_open
+  Opens a message queue connection between a process and a created message queue.
+
+  @datatypes
+  #qurt_mq_attr \n
+  #qurt_mqd_t
+
+  @param[out] mqd    Returns a pointer to the message queue
+                     identifier if the message queue was successfully opened.
+  @param[in]  name   String identifier of the message queue.
+  @param[in]  flags  Flags that define the behavior of the message queue connection.
+                     Permissions:\n
+                     #QURT_PERM_READ \n
+                     #QURT_PERM_WRITE \n
+                     #QURT_PERM_READ | QURT_PERM_WRITE @tablebulletend
+                     The default is QURT_PERM_READ | QURT_PERM_WRITE; explicit permissions are not implemented. \n
+                     Cardinality: \n
+                     #QURT_MQ_CARDINALITY_PTP (default) \n
+                     #QURT_MQ_CARDINALITY_MTO (not implemented) \n
+                     Blocking: suspend the thread until the message queue with the specified name is created. \n
+                     Scope: security boundary to which the message queue and its users are constrained;
+                     it is coupled with the process privilege level/scope.\n
+                     #QURT_MQ_SECURITY_SCOPE_KERNEL \n
+                     #QURT_MQ_SECURITY_SCOPE_SRM \n
+                     #QURT_MQ_SECURITY_SCOPE_SECURE \n
+                     #QURT_MQ_SECURITY_SCOPE_CPZ \n
+                     #QURT_MQ_SECURITY_SCOPE_ROOT \n
+                     #QURT_MQ_SECURITY_SCOPE_SIGNED \n
+                     #QURT_MQ_SECURITY_SCOPE_UNSIGNED @tablebulletend
+
+  @return
+  #QURT_EOK -- Message queue connection successfully opened. \n
+  #QURT_EFAILED -- Message queue connection failed (non-blocking message queue). \n
+  #QURT_ENOTALLOWED -- Open failed due to security scope mismatch.
+
+  @dependencies
+  None.
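+
+  A hedged open sketch (the queue name and the raw-flags initialization
+  pattern are assumptions for illustration):
+  @code
+  qurt_mqd_t mqd;
+  qurt_mq_flags_t flags;
+  flags.raw = 0;                                   // start from a zeroed flag set
+  flags.perms = QURT_PERM_READ | QURT_PERM_WRITE;  // default permissions
+  if (qurt_mq_open(&mqd, "myq", flags) == QURT_EOK) {
+      /* ... mqd is ready for qurt_mq_send()/qurt_mq_recv() ... */
+  }
+  @endcode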
+*/
+int qurt_mq_open (qurt_mqd_t *mqd, const char *name, qurt_mq_flags_t flags);
+
+/**@ingroup func_qurt_mq_send
+  Sends a message over a message queue.\n
+  - If the message queue is full, the calling thread shall be
+    suspended until space becomes available to enqueue the message. \n
+  - If there exists a thread suspended on an empty queue
+    to receive a message, qurt_mq_send shall resume that thread.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd      Message queue descriptor.
+  @param[in] msg_ptr  Pointer to the message buffer.
+  @param[in] msg_len  Length of the message buffer in bytes.
+
+  @return
+  #QURT_EOK -- Message queue send was successful.\n
+  #QURT_EMSGSIZE -- Message size in the msg_len field is greater than the max_message_len specified during queue creation.\n
+  #QURT_ENOTALLOWED -- Send failed due to security scope mismatch.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_send(qurt_mqd_t mqd, const char *msg_ptr, size_t msg_len);
+
+/**@ingroup func_qurt_mq_send_timed
+  Sends a message over a message queue.\n
+  - If the message queue is full, the calling thread shall be
+    suspended until space becomes available to enqueue the message or until the timeout is reached. \n
+  - If there exists a thread suspended on an empty queue
+    to receive a message, qurt_mq_send_timed shall resume that thread.\n
+  - If the timeout is reached, qurt_mq_send_timed shall return #QURT_ETIMEDOUT.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd       Message queue descriptor.
+  @param[in] msg_ptr   Pointer to the message buffer.
+  @param[in] duration  Interval (in microseconds); the duration value must be
+                       between #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+  @param[in] msg_len   Length of the message buffer in bytes.
+
+  @return
+  #QURT_EOK -- Message queue send was successful. \n
+  #QURT_EMSGSIZE -- Message size in the msg_len field is greater than the max_message_len specified during queue creation.\n
+  #QURT_ENOTALLOWED -- Send failed due to security scope mismatch. \n
+  #QURT_ETIMEDOUT -- Timeout.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_send_timed(qurt_mqd_t mqd, const char *msg_ptr, unsigned long long int duration, size_t msg_len);
+
+/**@ingroup func_qurt_mq_recv
+  Receives a message from the message queue. \n
+  - If the message queue is empty, the calling thread shall be
+    suspended until a message is enqueued in the message queue. \n
+  - If there exists a thread suspended on a full queue to
+    send a message, qurt_mq_recv shall resume that thread.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in]     mqd      Message queue descriptor.
+  @param[out]    msg_ptr  Pointer to the message buffer.
+  @param[in,out] msg_len  Pointer to the length of the message buffer.
+
+  @return
+  #QURT_EOK -- Message successfully received.\n
+  #QURT_EINVALID -- Message pointer or msg_len pointer is NULL. \n
+  #QURT_EBADR -- Message queue descriptor (mqd) is invalid. \n
+  #QURT_EBADF -- Sender closed the message queue.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_recv(qurt_mqd_t mqd, unsigned char *msg_ptr, size_t *msg_len);
+
+/**@ingroup func_qurt_mq_recv_timed
+  Receives a message from the message queue. \n
+  - If the message queue is empty, the calling thread shall be
+    suspended until a message is enqueued in the message queue or until the timeout is reached.\n
+  - If there exists a thread suspended on a full queue to
+    send a message, qurt_mq_recv_timed shall resume that thread.\n
+  - If the timeout is reached, qurt_mq_recv_timed shall return #QURT_ETIMEDOUT.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in]     mqd       Message queue descriptor.
+  @param[out]    msg_ptr   Pointer to the message buffer.
+  @param[in]     duration  Interval (in microseconds); the duration value must be
+                           between #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+  @param[in,out] msg_len   Pointer to the length of the message buffer.
+
+  @return
+  #QURT_EOK -- Message successfully received.\n
+  #QURT_EINVALID -- Message pointer or msg_len pointer is NULL. \n
+  #QURT_EBADR -- Message queue descriptor (mqd) is invalid.\n
+  #QURT_EBADF -- Sender closed the message queue. \n
+  #QURT_ETIMEDOUT -- Timeout.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_recv_timed(qurt_mqd_t mqd, unsigned char *msg_ptr, unsigned long long int duration, size_t *msg_len);
+
+/**@ingroup func_qurt_mq_close
+  Closes the message queue and disassociates the calling process (client) from the message queue
+  under this descriptor. Marks the queue as closed for the receiver.
+  This function is expected to be called from the client side. If called
+  from the server side, the function reduces to a no-op and returns success.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd  Message queue descriptor.
+
+  @return
+  #QURT_EOK -- Message queue closed successfully.\n
+  #QURT_EBADR -- Invalid descriptor.\n
+  #QURT_ENOTALLOWED -- Message queue close was not called from the client side.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_close(qurt_mqd_t mqd);
+
+/**@ingroup func_qurt_mq_destroy
+  Destroys the message queue. This function must be
+  called from the process that called qurt_mq_create().
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd  Message queue descriptor.
+
+  @return
+  #QURT_EOK -- Message queue destroyed successfully.\n
+  #QURT_EBADR -- Invalid descriptor.\n
+  #QURT_ENOTALLOWED -- Message queue destroy was not called from the creating process.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_destroy(qurt_mqd_t mqd);
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+#endif //QURT_MQ_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mutex.h
new file mode 100755
index 0000000000000..4ad6b270cdde6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_mutex.h
@@ -0,0 +1,211 @@
+#ifndef QURT_MUTEX_H
+#define QURT_MUTEX_H
+/**
+  @file qurt_mutex.h
+  @brief Prototypes of the mutex API.
+  This is mostly a user-space mutex, but it calls the
+  kernel to block if the mutex is taken.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup mutex_types
+@{ */
+/*=============================================================================
+  TYPEDEFS
+=============================================================================*/
+
+/** QuRT mutex type.
+
+    Both non-recursive mutex lock and unlock, and recursive
+    mutex lock and unlock can be applied to this type.
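+
+    A minimal hedged lifecycle sketch (assumed usage pattern, not from
+    the SDK docs):
+    @code
+    qurt_mutex_t m = QURT_MUTEX_INIT;  // or qurt_mutex_init(&m);
+    qurt_mutex_lock(&m);
+    /* ... critical section ... */
+    qurt_mutex_unlock(&m);
+    qurt_mutex_destroy(&m);            // prevents kernel resource leaks
+    @endcode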
+ */ +typedef union qurt_mutex_aligned8{ + /** @cond */ + struct { + unsigned int holder; + unsigned int count; + unsigned int queue; + unsigned int wait_count; + }; + unsigned long long int raw; + /** @endcond */ +} qurt_mutex_t; +/** @} */ /* end_addtogroup mutex_types */ +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/* @addtogroup mutex_const_macros +@{ */ +#define MUTEX_MAGIC 0xfe /**< */ +#define QURTK_FUTEX_FREE_MAGIC 0x1F // 11111 /**< */ +#define QURT_MUTEX_INIT {{MUTEX_MAGIC, 0, QURTK_FUTEX_FREE_MAGIC,0}} /**< Suitable as an initializer for a + variable of type qurt_mutex_t. */ +/* @} */ /* end_addtogroup mutex_const_macros */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_mutex_init + Initializes a mutex object. + The mutex is initially unlocked. + + @note1hang Each mutex-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_mutex_destroy() + when this object is not used anymore + @datatypes + #qurt_mutex_t + + @param[out] lock Pointer to the mutex object. Returns the initialized object. + + @return + None. + + @dependencies + None. + + */ +void qurt_mutex_init(qurt_mutex_t *lock); + +/**@ingroup func_qurt_mutex_destroy + Destroys the specified mutex. + + @note1hang Mutexes must be destroyed when they are no longer in use. Failure to do this + causes resource leaks in the QuRT kernel.\n + @note1cont Mutexes must not be destroyed while they are still in use. If this occurs, the + behavior of QuRT is undefined. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object to destroy. + + @return + None. + + @dependencies + None. + + */ +void qurt_mutex_destroy(qurt_mutex_t *lock); + +/**@ingroup func_qurt_mutex_lock + Locks the specified mutex. + If a thread performs a lock operation on a mutex that is not in use, the thread gains + access to the shared resource that is protected by the mutex, and continues executing. + + If a thread performs a lock operation on a mutex that is already in use by another + thread, the thread is suspended. When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared + resource. + + @note1hang A thread is suspended indefinitely if it locks a mutex that it has already + locked. Avoid this by using recursive mutexes (Section @xref{dox:recursive_mutexes}). + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object. Specifies the mutex to lock. + + @return + None. + + @dependencies + None. + */ +void qurt_mutex_lock(qurt_mutex_t *lock); /* blocking */ + +/**@ingroup func_qurt_mutex_lock_timed + Locks the specified mutex. + When a thread performs a lock operation on a mutex that is not in use, the thread gains + access to the shared resource that is protected by the mutex, and continues executing. + + When a thread performs a lock operation on a mutex that is already in use by another + thread, the thread is suspended. When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared + resource. If the duration of suspension exceeds the timeout duration, wait is + terminated and no access to mutex is granted. 
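+
+   A hedged sketch (the 1000-microsecond timeout is an arbitrary value,
+   assumed to lie within the allowed timer range):
+   @code
+   if (qurt_mutex_lock_timed(&m, 1000uLL) == QURT_EOK) {
+       /* ... critical section ... */
+       qurt_mutex_unlock(&m);
+   } /* else: QURT_ETIMEDOUT, the lock was not acquired */
+   @endcode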
+ + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object; specifies the mutex to lock. + @param[in] duration Interval (in microseconds) that the duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION + + @return + #QURT_EOK -- Success \n + #QURT_ETIMEDOUT -- Timeout + + @dependencies + None. + */ +int qurt_mutex_lock_timed (qurt_mutex_t * lock, unsigned long long int duration); + +/**@ingroup func_qurt_mutex_unlock + Unlocks the specified mutex. \n + More than one thread can be suspended on a mutex. When the mutex is unlocked, only the + highest-priority thread waiting on the mutex is awakened. If the awakened thread has + higher priority than the current thread, a context switch occurs. + + @note1hang The behavior of QuRT is undefined if a thread unlocks a mutex it did not first + lock. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object. Specifies the mutex to unlock. + + @return + None. + + @dependencies + None. + */ +void qurt_mutex_unlock(qurt_mutex_t *lock); /* unlock */ + +/**@ingroup func_qurt_mutex_try_lock + @xreflabel{hdr:qurt_mutex_try_lock} + Attempts to lock the specified mutex. + If a thread performs a try_lock operation on a mutex that is not in use, the thread gains + access to the shared resource that is protected by the mutex, and continues executing. + + @note1hang If a thread performs a try_lock operation on a mutex that it has already locked + or is in use by another thread, qurt_mutex_try_lock immediately returns with a + nonzero result value. + + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object. Specifies the mutex to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + @dependencies + None. + */ +int qurt_mutex_try_lock(qurt_mutex_t *lock); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_MUTEX_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_os_services.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_os_services.h new file mode 100755 index 0000000000000..cbc4c239e9620 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_os_services.h @@ -0,0 +1,24 @@ +/*============================================================================= + + qurt_os_services.c + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved. 
+=============================================================================*/
+
+#define QURT_OS_SERVICE_THREAD     "/os/thread"     /**< Thread service */
+#define QURT_OS_SERVICE_FS_HUB     "/os/fs_hub"     /**< File-system hub */
+#define QURT_OS_SERVICE_CALLBACK   "/os/callback"   /**< QDI callback service */
+#define QURT_OS_SERVICE_INTERRUPTS "/os/interrupt"  /**< Interrupt service */
+#define QURT_OS_SERVICE_PROXY      "/os/proxy"      /**< QDI proxy service */
+#define QURT_OS_SERVICE_MEMORY     "/os/memory"     /**< Memory management service */
+#define QURT_OS_SERVICE_MEMPOOL    "/os/mempool"    /**< Pool management service */
+#define QURT_OS_SERVICE_PROCESS    "/os/process"    /**< Process management service */
+#define QURT_OS_SERVICE_MMAP       "/os/mem_mapper" /**< Memory mapper service */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pimutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pimutex.h
new file mode 100755
index 0000000000000..61aee5cba7ce8
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pimutex.h
@@ -0,0 +1,200 @@
+#ifndef QURT_PIMUTEX_H
+#define QURT_PIMUTEX_H 1
+/**
+  @file qurt_pimutex.h
+  @brief Prototypes of the qurt_pimutex API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+  FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_pimutex_init
+  Initializes a priority inheritance mutex object.
+  The priority inheritance mutex is initially unlocked.
+
+  This function works the same as qurt_mutex_init().
+
+  @note1hang Each pimutex-based object has one or more kernel resources associated with it;
+             to prevent resource leaks, call qurt_pimutex_destroy()
+             when this object is no longer in use.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[out] lock  Pointer to the priority inheritance mutex object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_destroy
+  Destroys the specified priority inheritance mutex.
+
+  @note1hang Priority inheritance mutexes must be destroyed when they are no longer in
+             use. Failure to do this causes resource leaks in the QuRT kernel.\n
+  @note1cont Priority inheritance mutexes must not be destroyed while they are still in use.
+             If this occurs, the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock  Pointer to the priority inheritance mutex object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_lock
+  Requests access to a shared resource. If a thread performs a lock operation on a mutex
+  that is not in use, the thread gains access to the shared resource that the mutex protects,
+  and continues executing.
+
+  If a thread performs a lock operation on a mutex that is already in use by another
+  thread, the thread is suspended. When the mutex becomes available again (because the
+  other thread has unlocked it), the thread is awakened and given access to the shared resource.
+ + If a thread is suspended on a priority inheritance mutex, and the priority of the suspended + thread is higher than the priority of the thread that has locked the mutex, the thread + with the mutex acquires the higher priority of the suspended thread. The locker thread blocks + until the lock is available. + + @note1hang A thread is not suspended if it locks a priority inheritance mutex that it has + already locked . However, the mutex does not become available to other + threads until the thread performs a balanced number of unlocks on the mutex.\n + @note1cont When multiple threads compete for a mutex, the lock operation for a priority + inheritance mutex is slower than it is for a recursive mutex. + In particular, it is about 10 times slower when the mutex is available for locking, + and slower (with greatly varying times) when the mutex is already locked. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the priority inheritance mutex object to lock. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex_lock(qurt_mutex_t *lock); + + +/**@ingroup func_qurt_pimutex_lock_timed + Locks a priority inheritance mutex with timeout. + + A thread can lock a priority inheritance mutex for multiple times. The mutex is not + available to other threads until the thread performs the same number of mutex unlock + operations. + + If a thread performs a lock operation on a mutex that is already locked by another thread, + the thread is moved to waiting state. When the mutex becomes available again (because the + other thread has unlocked the mutex), the thread is awakened and tries to lock the mutex. + + If a thread is waiting on a priority inheritance mutex, and the priority of the waiting thread + is higher than the priority of the thread that has locked the mutex, the priority of the thread + that has locked the mutex is raised to the same priority of the waiting thread. + + If the duration of waiting exceeds the timeout duration, the waiting is terminated, and + the function returns QURT_ETIMEDOUT as a failure of the mutex lock. + + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object to lock. + @param[in] duration Duration (in microseconds) to wait. The duration value must be between + #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION. + + @return + #QURT_EOK -- Success \n + #QURT_ETIMEDOUT -- Timeout + #QURT_EINVALID -- Duration is out of range + + @dependencies + None. + + */ +int qurt_pimutex_lock_timed(qurt_mutex_t *lock, unsigned long long int duration); + + +/**@ingroup func_qurt_pimutex_unlock + Releases access to a shared resource; unlocks the specified priority inheritance mutex. \n + More than one thread can be suspended on a priority inheritance mutex. When the mutex + is unlocked, only the highest-priority thread waiting on the mutex is awakened. If the + awakened thread has higher priority than the current thread, a context switch occurs. + + When a thread unlocks a priority inheritance mutex, its thread priority is restored to its + original value from any higher priority value that it acquired from another thread + suspended on the mutex. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the priority inheritance mutex object to unlock. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex_unlock(qurt_mutex_t *lock); + +/**@ingroup func_qurt_pimutex_try_lock + Request access to a shared resource (without suspend). 
Attempts to lock the specified priority inheritance mutex.\n + If a thread performs a try_lock operation on a priority inheritance mutex that is not in + use, the thread gains access to the shared resource that is protected by the mutex, and + continues executing. + If a thread performs a try_lock operation on a priority inheritance mutex that is already + in use by another thread, qurt_pimutex_try_lock immediately returns with a + nonzero result value. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the priority inheritance mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + @dependencies + None. + */ +int qurt_pimutex_try_lock(qurt_mutex_t *lock); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_PIMUTEX_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pimutex2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pimutex2.h new file mode 100755 index 0000000000000..b809f163cbfd2 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pimutex2.h @@ -0,0 +1,162 @@ +#ifndef QURT_PIMUTEX2_H +#define QURT_PIMUTEX2_H +/** + @file qurt_pimutex2.h + @brief Prototypes of pimutex2 API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +#include +#include +#include + +/*============================================================================= + FUNCTIONS +=============================================================================*/ +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_pimutex2_init + Initializes a recursive mutex object. + + @deprecated use #qurt_pimutex_init instead. + + The recursive mutex is initially unlocked. + + Objects of type pimutex2 solve a potential race condition between + unlock() and destroy() operations. + + @datatypes + #qurt_rmutex2_t + + @param[out] lock Pointer to the recursive mutex object. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex2_init(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_pimutex2_destroy + + @deprecated use #qurt_pimutex_destroy instead. + + Destroys the specified recursive mutex. \n + @note1cont Recursive mutexes must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + @note1cont In general, application code should destroy an pimutex2 object prior to + deallocating it; calling qurt_pimutex2_destroy() before deallocating it ensures + that all qurt_pimutex2_unlock() calls complete. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to destroy. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex2_destroy(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_pimutex2_lock + + @deprecated use #qurt_pimutex_lock instead. + + Locks the specified recursive mutex. \n + + If a thread performs a lock operation on a recursive mutex that is not being used, the + thread gains access to the shared resource that is protected by the mutex, and continues + executing. + + If a thread performs a lock operation on a recursive mutex that is already being used by + another thread, the thread is suspended. 
When the mutex becomes available again
+  (because the other thread has unlocked it), the thread is awakened and given access to the
+  shared resource.
+
+  @note1hang A thread is not suspended if it locks a recursive mutex that it has already
+             locked, but the mutex does not become available until the thread performs a
+             balanced number of unlocks on the mutex.
+
+  @datatypes
+  #qurt_rmutex2_t
+
+  @param[in] lock  Pointer to the recursive mutex object to lock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex2_lock(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_pimutex2_unlock
+
+  @deprecated use #qurt_pimutex_unlock instead.
+
+  Unlocks the specified recursive mutex. \n
+  More than one thread can be suspended on a recursive mutex. When the mutex is
+  unlocked, only the highest-priority thread waiting on the mutex is awakened. If the
+  awakened thread has higher priority than the current thread, a context switch occurs.
+
+  @datatypes
+  #qurt_rmutex2_t
+
+  @param[in] lock  Pointer to the recursive mutex object to unlock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex2_unlock(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_pimutex2_try_lock
+
+  @deprecated use #qurt_pimutex_try_lock instead.
+
+  Attempts to lock the specified recursive mutex.\n
+
+  Non-blocking version of qurt_pimutex2_lock(). If a call to qurt_pimutex2_lock() would
+  succeed immediately, this function behaves similarly, and returns 0 for success.
+  If a call to qurt_pimutex2_lock() would not succeed immediately, this function has
+  no effect and returns non-zero for failure.
+
+  @datatypes
+  #qurt_rmutex2_t
+
+  @param[in] lock  Pointer to the recursive mutex object to lock.
+
+  @return
+  0 -- Success. \n
+  Nonzero -- Failure.
+
+ */
+int qurt_pimutex2_try_lock(qurt_rmutex2_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PIMUTEX2_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pipe.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pipe.h
new file mode 100755
index 0000000000000..6bdaa044f8640
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pipe.h
@@ -0,0 +1,479 @@
+#ifndef QURT_PIPE_H
+#define QURT_PIPE_H
+/**
+  @file qurt_pipe.h
+  @brief Prototypes of the pipe interface API.
+  This is a pipe or message queue.
+  It blocks when too full (send) or empty (receive).
+  Unless using a nonblocking option, all datagrams are 64 bits.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup pipe_types
+@{ */
+/*=============================================================================
+  CONSTANTS AND MACROS
+=============================================================================*/
+#define QURT_PIPE_MAGIC 0xF1FEF1FE /**< Magic. */
+#define QURT_PIPE_ATTR_MEM_PARTITION_RAM 0 /**< RAM. */
+#define QURT_PIPE_ATTR_MEM_PARTITION_TCM 1 /**< TCM. */
+
+/*=============================================================================
+  TYPEDEFS
+=============================================================================*/
+/** QuRT pipe data values type.
*/ +typedef unsigned long long int qurt_pipe_data_t; + +/** QuRT pipe type.*/ +typedef struct { + /** @cond */ + qurt_mutex_t pipe_lock; + qurt_sem_t senders; + qurt_sem_t receiver; + unsigned int size; + unsigned int sendidx; + unsigned int recvidx; + void (*lock_func)(qurt_mutex_t *); + void (*unlock_func)(qurt_mutex_t *); + int (*try_lock_func)(qurt_mutex_t *); + void (*destroy_lock_func)(qurt_mutex_t *); + unsigned int magic; + qurt_pipe_data_t *data; + /** @endcond */ +} qurt_pipe_t; + +/** QuRT pipe attributes type. */ +typedef struct { + /** @cond */ + qurt_pipe_data_t *buffer; + unsigned int elements; + unsigned char mem_partition; + /** @endcond */ +} qurt_pipe_attr_t; + +/** @} */ /* end_addtogroup pipe_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_pipe_attr_init + @xreflabel{hdr:qurt_pipe_attr_init} + Initializes the structure that sets the pipe attributes when a pipe is created. + + After an attribute structure is initialized, the individual attributes in the structure are + explicitly set using the pipe attribute operations. + + The attribute structure is assigned the following default values: \n + - buffer -- 0 \n + - elements -- 0 \n + - mem_partition -- #QURT_PIPE_ATTR_MEM_PARTITION_RAM + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_init(qurt_pipe_attr_t *attr) +{ + attr->buffer = NULL; + attr->elements = 0; + attr->mem_partition = QURT_PIPE_ATTR_MEM_PARTITION_RAM; +} + +/**@ingroup func_qurt_pipe_attr_set_buffer + @xreflabel{sec:qurt_pipe_attr_set_buffer} + Sets the pipe buffer address attribute.\n + Specifies the base address of the memory area to use for the data buffer of a pipe. + + The base address and size (Section @xref{sec:qurt_pipe_attr_set_elements}) specify the + memory area used as a pipe data buffer. The user is responsible for allocating the + memory area used for the buffer. + + @datatypes + #qurt_pipe_attr_t \n + #qurt_pipe_data_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] buffer Pointer to the buffer base address. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_buffer(qurt_pipe_attr_t *attr, qurt_pipe_data_t *buffer) +{ + attr->buffer = buffer; +} + +/**@ingroup func_qurt_pipe_attr_set_elements + @xreflabel{sec:qurt_pipe_attr_set_elements} + Specifies the length of the memory area to use for the data buffer of a pipe. + + The length is expressed in terms of the number of 64-bit data elements that + can be stored in the buffer. + + The base address (Section @xref{sec:qurt_pipe_attr_set_buffer}) and size specify + the memory area used as a pipe data buffer. The user is responsible for + allocating the memory area used for the buffer. + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] elements Pipe length (64-bit elements). + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_elements(qurt_pipe_attr_t *attr, unsigned int elements) +{ + attr->elements = elements; +} + +/**@ingroup func_qurt_pipe_attr_set_buffer_partition + @xreflabel{sec:qurt_pipe_attr_set_buffer_partition} + Specifies the memory type where a pipe's buffer is allocated. + Allocate pipes in RAM or TCM/LPM. 
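+
+   A hedged configuration sketch (the buffer size and element count are
+   arbitrary assumptions):
+   @code
+   static qurt_pipe_data_t buf[16];
+   qurt_pipe_attr_t attr;
+   qurt_pipe_attr_init(&attr);
+   qurt_pipe_attr_set_buffer(&attr, buf);
+   qurt_pipe_attr_set_elements(&attr, 16);
+   qurt_pipe_attr_set_buffer_partition(&attr, QURT_PIPE_ATTR_MEM_PARTITION_RAM);
+   @endcode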
+
+   @note1hang If a pipe is specified as allocated in TCM/LPM, it must be created
+              with the qurt_pipe_init() operation. The qurt_pipe_create() operation results in an error.
+
+   @datatypes
+   #qurt_pipe_attr_t
+
+   @param[in,out] attr           Pointer to the pipe attribute structure.
+   @param[in]     mem_partition  Pipe memory partition. Values: \n
+                  - #QURT_PIPE_ATTR_MEM_PARTITION_RAM -- Pipe resides in RAM \n
+                  - #QURT_PIPE_ATTR_MEM_PARTITION_TCM -- Pipe resides in TCM/LPM @tablebulletend
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline void qurt_pipe_attr_set_buffer_partition(qurt_pipe_attr_t *attr, unsigned char mem_partition)
+{
+   attr->mem_partition = mem_partition;
+}
+
+/**@ingroup func_qurt_pipe_create
+   Creates a pipe.\n
+   Allocates a pipe object and its associated data buffer, and initializes the pipe object.
+
+   @note1hang The buffer address and size stored in the attribute structure specify how the
+              pipe data buffer is allocated.
+
+   @note1cont If a pipe is specified as allocated in TCM/LPM, it must be created
+              using the qurt_pipe_init() operation. The qurt_pipe_create() operation results in an error.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_attr_t
+
+   @param[out] pipe  Pointer to the created pipe object.
+   @param[in]  attr  Pointer to the attribute structure used to create the pipe.
+
+   @return
+   #QURT_EOK -- Pipe created. \n
+   #QURT_EFAILED -- Pipe not created. \n
+   #QURT_ENOTALLOWED -- Pipe cannot be created in TCM/LPM.
+
+   @dependencies
+   None.
+  */
+int qurt_pipe_create(qurt_pipe_t **pipe, qurt_pipe_attr_t *attr);
+
+/**@ingroup func_qurt_pipe_init
+   Initializes a pipe object using an existing data buffer.
+
+   @note1hang The buffer address and size stored in the attribute structure must
+              specify a data buffer that the user has already allocated.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_attr_t
+
+   @param[out] pipe  Pointer to the pipe object to initialize.
+   @param[in]  attr  Pointer to the pipe attribute structure used to initialize the pipe.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EFAILED -- Failure.
+
+   @dependencies
+   None.
+  */
+int qurt_pipe_init(qurt_pipe_t *pipe, qurt_pipe_attr_t *attr);
+
+/**@ingroup func_qurt_pipe_destroy
+   @xreflabel{sec:qurt_pipe_destroy}
+   Destroys the specified pipe.
+
+   @note1hang Pipes must be destroyed when they are no longer in use. Failure
+              to do this causes resource leaks in the QuRT kernel.
+              Pipes must not be destroyed while they are still in use. If this
+              occurs, the behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_pipe_t
+
+   @param[in] pipe  Pointer to the pipe object to destroy.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+  */
+void qurt_pipe_destroy(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_delete
+   Deletes the pipe.\n
+   Destroys the specified pipe (Section @xref{sec:qurt_pipe_destroy}) and deallocates the pipe object and its
+   associated data buffer.
+
+   @note1hang Delete pipes only if they were created using qurt_pipe_create()
+              (and not qurt_pipe_init()). Otherwise, the behavior of QuRT is undefined. \n
+   @note1cont Pipes must be deleted when they are no longer in use. Failure to do this
+              causes resource leaks in the QuRT kernel.\n
+   @note1cont Pipes must not be deleted while they are still in use. If this occurs, the
+              behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_pipe_t
+
+   @param[in] pipe  Pointer to the pipe object to delete.
+
+   @return
+   None.
+
+   @dependencies
+   None.
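+
+   A hedged lifecycle sketch (attr is assumed to have been initialized
+   with the attribute calls shown earlier):
+   @code
+   qurt_pipe_t *p;
+   if (qurt_pipe_create(&p, &attr) == QURT_EOK) {
+       /* ... send/receive on p ... */
+       qurt_pipe_delete(p);  // created with qurt_pipe_create(), so delete, not destroy
+   }
+   @endcode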
+ */
+void qurt_pipe_delete(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_send
+   Writes a data item to the specified pipe. \n
+   If a thread writes to a full pipe, it is suspended on the pipe. When another thread reads
+   from the pipe, the suspended thread is awakened and can then write data to the pipe.
+
+   Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+              pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_data_t
+
+   @param[in] pipe  Pointer to the pipe object to write to.
+   @param[in] data  Data item to write.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_pipe_send(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_receive
+   Reads a data item from the specified pipe.
+
+   If a thread reads from an empty pipe, it is suspended on the pipe. When another thread
+   writes to the pipe, the suspended thread is awakened and can then read data from the pipe.
+   Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+              pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t
+
+   @param[in] pipe  Pointer to the pipe object to read from.
+
+   @return
+   Integer containing the 64-bit data item from the pipe.
+
+   @dependencies
+   None.
+*/
+qurt_pipe_data_t qurt_pipe_receive(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_try_send
+   Writes a data item to the specified pipe (without suspending the thread if the pipe is full).\n
+
+   If a thread writes to a full pipe, the operation returns -1 immediately.
+   Otherwise, it returns 0 to indicate a successful write operation.
+
+   Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+              pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_data_t
+
+   @param[in] pipe  Pointer to the pipe object to write to.
+   @param[in] data  Data item to write.
+
+   @return
+   0 -- Success. \n
+   -1 -- Failure (pipe full).
+
+   @dependencies
+   None.
+*/
+int qurt_pipe_try_send(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_try_receive
+   Reads a data item from the specified pipe (without suspending the thread if the pipe is
+   empty).\n
+   If a thread reads from an empty pipe, the operation returns immediately with success set
+   to -1. Otherwise, success is always set to 0 to indicate a successful read operation.\n
+
+   Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+              pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t
+
+   @param[in]  pipe     Pointer to the pipe object to read from.
+   @param[out] success  Pointer to the operation status result.
+
+   @return
+   Integer containing a 64-bit data item from the pipe.
+
+   @dependencies
+   None.
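+
+   A hedged usage sketch:
+   @code
+   int ok;
+   qurt_pipe_data_t d = qurt_pipe_try_receive(p, &ok);
+   if (ok == 0) {
+       /* d holds a valid 64-bit item */
+   } /* else: pipe was empty, nothing was read */
+   @endcode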
+*/
+qurt_pipe_data_t qurt_pipe_try_receive(qurt_pipe_t *pipe, int *success);
+
+/**@ingroup func_qurt_pipe_receive_cancellable
+   Reads a data item from the specified pipe (with suspend), cancellable.
+
+   If a thread reads from an empty pipe, it is suspended on the pipe. When another thread
+   writes to the pipe, the suspended thread is awakened and can then read data from the pipe.
+   The operation is canceled if the user process of the calling thread is killed,
+   or if the calling thread must finish its current QDI invocation and return to user space.
+   A root PD thread can use this API to wait on the pipe for receiving; it is resumed with
+   #QURT_EDESTROY if the pipe is destroyed.
+   Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+              pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_data_t
+
+   @param[in]  pipe    Pointer to the pipe object to read from.
+   @param[out] result  Pointer to the integer that receives the 64-bit data item from the pipe.
+
+   @return
+   #QURT_EOK -- Receive completed. \n
+   #QURT_ECANCEL -- Receive canceled. \n
+   #QURT_EDESTROY -- Pipe was destroyed. \n
+   #QURT_ENOTALLOWED -- Pipe is not initialized.
+
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_pipe_receive_cancellable(qurt_pipe_t *pipe, qurt_pipe_data_t *result);
+
+/**@ingroup func_qurt_pipe_send_cancellable
+   @xreflabel{hdr:qurt_pipe_send_cancellable}
+   Writes a data item to the specified pipe (with suspend), cancellable. \n
+   If a thread writes to a full pipe, it is suspended on the pipe. When another thread reads
+   from the pipe, the suspended thread is awakened and can then write data to the pipe.
+   The operation is canceled if the user process of the calling thread is killed, or if the
+   calling thread must finish its current QDI invocation and return to user space.
+   A root PD thread can use this API to wait on the pipe for sending; it is resumed with
+   #QURT_EDESTROY if the pipe is destroyed.
+
+   Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+              pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_data_t
+
+   @param[in] pipe  Pointer to the pipe object to write to.
+   @param[in] data  Data item to write.
+
+   @return
+   #QURT_EOK -- Send completed. \n
+   #QURT_ECANCEL -- Send canceled. \n
+   #QURT_EDESTROY -- Pipe was destroyed. \n
+   #QURT_ENOTALLOWED -- Pipe is not initialized.
+
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_pipe_send_cancellable(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_is_empty
+   Returns a value indicating whether the specified pipe contains any data.
+
+   @datatypes
+   #qurt_pipe_t
+
+   @param[in] pipe  Pointer to the pipe object to check.
+
+   @return
+   1 -- Pipe contains no data. \n
+   0 -- Pipe contains data.
+
+   @dependencies
+   None.
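+
+   A hedged polling sketch combining qurt_pipe_is_empty() with
+   qurt_pipe_try_receive() (the race between the two calls is why the
+   try-receive status is still checked):
+   @code
+   while (!qurt_pipe_is_empty(p)) {
+       int ok;
+       qurt_pipe_data_t d = qurt_pipe_try_receive(p, &ok);
+       if (ok != 0) break;  // another reader drained the pipe first
+       /* ... consume d ... */
+   }
+   @endcode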
+*/
+int qurt_pipe_is_empty(qurt_pipe_t *pipe);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PIPE_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pmem_manager.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pmem_manager.h
new file mode 100755
index 0000000000000..8c8da985228b9
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pmem_manager.h
@@ -0,0 +1,82 @@
+#ifndef QURT_PMEM_MANAGER_H
+#define QURT_PMEM_MANAGER_H
+/**
+  @file qurt_pmem_manager.h
+  Prototypes of the kernel physical memory manager APIs.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Constants and macros
+ ======================================================================*/
+
+/* Physical memory API return error codes */
+#define QURT_PMEM_SUCCESS             0
+#define QURT_PMEM_NO_PRIV             1
+#define QURT_PMEM_RETRY               2
+#define QURT_PMEM_OVERLAP             3
+#define QURT_PMEM_NOT_EXIST           4
+#define QURT_PMEM_INIT_FAILURE        5
+#define QURT_PMEM_OUTSTANDING_MAPPING 6
+#define QURT_PMEM_GENERIC_FAILURE     7
+#define QURT_PMEM_ENTRY_FOUND         8
+#define QURT_PMEM_REACH_END           9
+#define QURT_PMEM_UNCLAIMED           10
+#define QURT_PMEM_ALREADY_CLAIMED     11
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_pmem_acquire
+  Acquires the ownership of a specific physical memory region.
+
+  @note1hang The caller becomes the owner.
+
+  @param[in] ppage  Starting physical page number.
+  @param[in] pnum   Number of physical pages.
+
+  @return
+  #QURT_PMEM_NO_PRIV -- No privilege to claim the ownership. \n
+  #QURT_PMEM_OVERLAP -- All or part of the range is already owned. \n
+  #QURT_PMEM_SUCCESS -- Successfully claimed ownership.
+
+  @dependencies
+  None.
+*/
+int qurt_pmem_acquire(unsigned int ppage, unsigned int pnum);
+
+/**@ingroup func_qurt_pmem_release
+  Releases the ownership of a specific physical memory region.
+
+  @param[in] ppage  Starting physical page number.
+  @param[in] pnum   Number of physical pages.
+
+  @return
+  #QURT_PMEM_NO_PRIV -- No privilege to release the ownership. \n
+  #QURT_PMEM_NOT_EXIST -- The physical memory range is not usable. \n
+  #QURT_PMEM_OUTSTANDING_MAPPING -- There is an outstanding mapping in this range. \n
+  #QURT_PMEM_SUCCESS -- Successfully released ownership.
+
+  @dependencies
+  None.
+ */
+int qurt_pmem_release(unsigned int ppage, unsigned int pnum);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PMEM_MANAGER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pmu.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pmu.h
new file mode 100755
index 0000000000000..73ea8eba04abf
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_pmu.h
@@ -0,0 +1,121 @@
+#ifndef QURT_PMU_H
+#define QURT_PMU_H
+/**
+  @file qurt_pmu.h
+  Prototypes of the PMU (performance monitor unit) API.
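+
+  A hedged profiling sketch (the event configuration value evtcfg is
+  hypothetical; consult the Hexagon PMU event list for real values):
+  @code
+  qurt_pmu_set(QURT_PMUEVTCFG, evtcfg);  // also clears PMUCNT0..PMUCNT3
+  qurt_pmu_enable(1);                    // start counting
+  /* ... workload under measurement ... */
+  qurt_pmu_enable(0);                    // stop counting
+  unsigned count = qurt_pmu_get(QURT_PMUCNT0);
+  @endcode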
+
+  EXTERNAL FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021 Qualcomm Technologies, Inc.
+  All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+  FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_pmu_set
+  Sets the value of the specified PMU register.
+
+  @note1hang Setting PMUEVTCFG automatically clears the PMU registers PMUCNT0
+             through PMUCNT3.
+
+  @param[in] reg_id  PMU register. Values:
+                     - #QURT_PMUCNT0
+                     - #QURT_PMUCNT1
+                     - #QURT_PMUCNT2
+                     - #QURT_PMUCNT3
+                     - #QURT_PMUCFG
+                     - #QURT_PMUEVTCFG
+                     - #QURT_PMUCNT4
+                     - #QURT_PMUCNT5
+                     - #QURT_PMUCNT6
+                     - #QURT_PMUCNT7
+                     - #QURT_PMUEVTCFG1 @tablebulletend
+
+  @param[in] reg_value  Register value.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_pmu_set (int reg_id, unsigned int reg_value);
+
+/**@ingroup func_qurt_pmu_get
+  Gets the PMU register.\n
+  Returns the current value of the specified PMU register.
+
+  @param[in] reg_id  PMU register. Values:
+                     - #QURT_PMUCNT0
+                     - #QURT_PMUCNT1
+                     - #QURT_PMUCNT2
+                     - #QURT_PMUCNT3
+                     - #QURT_PMUCFG
+                     - #QURT_PMUEVTCFG
+                     - #QURT_PMUCNT4
+                     - #QURT_PMUCNT5
+                     - #QURT_PMUCNT6
+                     - #QURT_PMUCNT7
+                     - #QURT_PMUEVTCFG1 @tablebulletend
+
+  @return
+  Integer -- Current value of the specified PMU register.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_pmu_get (int reg_id);
+
+/**@ingroup func_qurt_pmu_enable
+  Enables or disables the Hexagon processor PMU.
+  Profiling is disabled by default.
+
+  @note1hang Enabling profiling does not automatically reset the count registers -- this must
+             be done explicitly before starting event counting.
+
+  @param[in] enable  Performance monitor. Values: \n
+                     - 0 -- Disable performance monitor \n
+                     - 1 -- Enable performance monitor @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_pmu_enable (int enable);
+
+/**@ingroup func_qurt_pmu_get_pmucnt
+  Reads the PMU counters in a single trap.
+
+  @param[out] buf  Pointer to a buffer to save the values read from the PMU counters.
+                   Buffer size must be at least 32 bytes to read all eight PMU counters.
+
+  @return
+  #QURT_EOK -- Successful read.\n
+  #QURT_EFATAL -- Failure.
+
+  @dependencies
+  None.
+ */
+int qurt_pmu_get_pmucnt (void * buf);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PMU_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_power.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_power.h
new file mode 100755
index 0000000000000..2ee4d29a73976
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_power.h
@@ -0,0 +1,140 @@
+#ifndef QURT_POWER_H
+#define QURT_POWER_H
+/**
+  @file qurt_power.h
+  @brief Prototypes of the power API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_power.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_power.h
new file mode 100755
index 0000000000000..2ee4d29a73976
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_power.h
@@ -0,0 +1,140 @@
+#ifndef QURT_POWER_H
+#define QURT_POWER_H
+/**
+ @file qurt_power.h
+ @brief Prototypes of power API
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2018-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+/*=============================================================================
+
+ EDIT HISTORY FOR MODULE
+
+ This section contains comments describing changes made to the module.
+ Notice that changes are listed in reverse chronological order.
+
+
+when who what, where, why
+-------- --- ------------------------------------------------------------
+03/03/11 op Add header file
+12/12/12 cm (Tech Pubs) Edited/added Doxygen comments and markup.
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @cond */
+/**@ingroup func_qurt_power_shutdown_fail_exit
+ Returns from Power Collapse mode when power collapse cannot proceed.
+
+ This function unmasks the global interrupt. This operation is used only when the thread is
+ recovering from a failed power collapse operation (Section @xref{sec:powerShutdownEnter}).
+
+ @return
+ #QURT_EOK -- Operation was successfully performed.
+
+ @dependencies
+ None.
+ */
+#define qurt_power_shutdown_fail_exit qurt_power_exit
+
+/**@ingroup func_qurt_power_shutdown_exit
+ Undoes the state changes made in preparation for power collapse.\n
+ This function unmasks the global interrupts.
+
+ @return
+ #QURT_EOK -- Operation was successfully performed.
+
+ @dependencies
+ None.
+ */
+#define qurt_power_shutdown_exit qurt_power_exit
+/**@endcond */
+
+/**@ingroup func_qurt_system_ipend_get
+ Gets the IPEND register.\n
+
+ @note1hang Returns the current value of the Hexagon processor IPEND register. The return value
+ is a mask value that identifies the individual interrupts that are pending. \n
+
+ @note1hang The bit order of the mask value is identical to the order defined for the IPEND register. A
+ mask bit value of 1 indicates that the corresponding interrupt is pending, and 0 indicates that the
+ corresponding interrupt is not pending. \n
+
+ @return
+ Returns the IPEND register value.
+
+ @dependencies
+ None.
+ */
+unsigned int qurt_system_ipend_get (void);
+
+
+/**@ingroup func_qurt_system_vid_get
+ Gets the VID register. \n
+
+ @note1hang Returns the current value of the Hexagon processor VID register. The return value is
+ the vector number of a second-level interrupt that has been accepted by the Hexagon
+ processor core.\n
+
+ @return
+ Returns the VID register value, that is, the L2 VIC interrupt number accepted by the processor.
+ The valid range is 0 to 1023.
+
+ @dependencies
+ None.
+ */
+unsigned int qurt_system_vid_get(void);
+
+/**@ingroup func_qurt_power_shutdown_get_pcycles
+ Gets the number of power collapses and the processor cycles for entering and exiting the most
+ recent power collapse.
+
+ @note1hang If no power collapse has occurred yet, the processor cycle numbers are zero.
+
+ @param[out] enter_pcycles Number of processor cycles for entering the most
+ recent power collapse.
+ @param[out] exit_pcycles Number of processor cycles for exiting the most
+ recent power collapse.
+ @return
+ Zero -- No power collapses have occurred. \n
+ Nonzero -- Number of power collapses that have occurred since
+ the processor was reset.
+
+ @dependencies
+ None.
+ */
+int qurt_power_shutdown_get_pcycles( unsigned long long *enter_pcycles, unsigned long long *exit_pcycles );
+
+/**@ingroup func_qurt_system_tcm_set_size
+ Sets the size of TCM to save during full power collapse.
+
+ @note1hang The size aligns to 32 bytes. If the size passed is greater than the maximum size defined in
+ XML, the size is truncated to the size defined in XML.
+
+ @param[in] new_size Size of TCM to save.
+
+ @return
+ Zero -- Size successfully set \n
+ -1 -- Size of 0 passed
+
+ @dependencies
+ None.
+ */
+int qurt_system_tcm_set_size(unsigned int new_size);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_POWER_H */
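An illustrative reader for the power-collapse statistics API above; purely a sketch (the report format is made up):

    #include "qurt_power.h"
    #include "qurt_printf.h"

    static void report_power_collapse_stats(void)
    {
        unsigned long long enter_pcycles = 0, exit_pcycles = 0;
        int collapses;

        collapses = qurt_power_shutdown_get_pcycles(&enter_pcycles, &exit_pcycles);
        if (collapses == 0) {
            qurt_printf("no power collapse has occurred yet\n");
            return;
        }
        qurt_printf("collapses=%d enter=%llu exit=%llu pcycles\n",
                    collapses, enter_pcycles, exit_pcycles);
    }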
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_printf.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_printf.h
new file mode 100755
index 0000000000000..a775d8a815918
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_printf.h
@@ -0,0 +1,44 @@
+#ifndef QURT_PRINTF_H
+#define QURT_PRINTF_H
+
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ @file qurt_printf.h
+ Prototypes of printf API.
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+/** @addtogroup chapter_function_tracing
+@{ */
+
+int qurt_printf(const char* format, ...);
+
+int qurt_vprintf(const char* format, va_list args);
+
+/** @} */ /* end_addtogroup chapter_function_tracing */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PRINTF_H */
+
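A sketch of the usual va_list forwarding pattern with the printf API above; the log_msg wrapper and its "[app]" tag are hypothetical:

    #include <stdarg.h>
    #include "qurt_printf.h"

    /* Prefix each message with a hypothetical tag, then forward the
       caller's variadic arguments to qurt_vprintf. */
    static int log_msg(const char *format, ...)
    {
        va_list args;
        int n;

        qurt_printf("[app] ");
        va_start(args, format);
        n = qurt_vprintf(format, args);
        va_end(args);
        return n;
    }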
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_process.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_process.h
new file mode 100755
index 0000000000000..0df9ddc2d4a70
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_process.h
@@ -0,0 +1,995 @@
+#ifndef QURT_PROCESS_H
+#define QURT_PROCESS_H
+/**
+ @file qurt_process.h
+ @brief Prototypes of QuRT process control APIs.
+
+ EXTERNALIZED FUNCTIONS
+ None
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None
+
+ Copyright (c) 2009-2013, 2021-2023 Qualcomm Technologies, Inc.
+ All rights reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_callback.h"
+#include "qurt_consts.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup process_types
+@{ */
+#define QURT_PROCESS_ATTR_NAME_MAXLEN QURT_MAX_NAME_LEN /**< Maximum length of the process name. */
+#define QURT_PROCESS_ATTR_BIN_PATH_MAXLEN 128 /**< Maximum length of the path of the binary/ELF for this process. */
+#define QURT_PROCESS_ATTR_CAP_MAXLEN 128 /**< Maximum length of a resource name. */
+
+/** QuRT process capability wildcard strings */
+#define QURT_PROCESS_ATTR_CAP_ALLOW_ALL "ALLOW_ALL" /**< Capability wildcard for full access */
+#define QURT_PROCESS_ATTR_CAP_ALLOW_NONE "ALLOW_NONE" /**< Capability wildcard for no access */
+
+/** QuRT process capability states */
+#define QURT_PROCESS_ATTR_CAP_ENABLED 0x1 /**< Capability enabled */
+#define QURT_PROCESS_ATTR_CAP_DISABLED 0x0 /**< Capability disabled */
+
+/* QuRT process thread attributes. */
+#define QURT_PROCESS_DEFAULT_CEILING_PRIO 0 /**< Default ceiling priority of the threads in the new process. */
+#define QURT_PROCESS_DEFAULT_MAX_THREADS -1 /**< Default number of threads in the new process.
+ -1 indicates that the limit is set to the maximum supported by the system. */
+
+/* QuRT process flags. */
+#define QURT_PROCESS_SUSPEND_ON_STARTUP (1U) /**< Suspends the new process just before calling main(). */
+#define QURT_PROCESS_NON_SYSTEM_CRITICAL (1u << 1) /**< Starts the new process as non-system-critical. */
+#define QURT_PROCESS_ISLAND_RESIDENT (1u << 2) /**< Process is island resident. */
+#define QURT_PROCESS_RESTARTABLE (1u << 3) /**< Indicates that the process is restartable. */
+#define QURT_PROCESS_UNTRUSTED (1u << 7) /**< Starts the new process as an unsigned process. */
+
+/* QuRT process debugging session status.*/
+#define QURT_DEBUG_NOT_START 0 /**< Debug is not started. */
+#define QURT_DEBUG_START 1 /**< Debug has started. */
+
+/** Process suspend options */
+#define QURT_PROCESS_SUSPEND_DEFAULT 0
+
+/** Process resume options */
+#define QURT_PROCESS_RESUME_DEFAULT 0
+
+
+/* QuRT process types. */
+typedef enum {
+ QURT_PROCESS_TYPE_RESERVED, /**< Process type is reserved. \n */
+ QURT_PROCESS_TYPE_KERNEL, /**< Kernel process. \n*/
+ QURT_PROCESS_TYPE_SRM, /**< SRM process. \n*/
+ QURT_PROCESS_TYPE_SECURE, /**< Secure process. \n*/
+ QURT_PROCESS_TYPE_ROOT, /**< Root process. \n*/
+ QURT_PROCESS_TYPE_USER, /**< User process. */
+}qurt_process_type_t;
+
+/** QuRT process callback types. */
+typedef enum {
+ QURT_PROCESS_DUMP_CB_ROOT, /**< Register the callback that executes in the
+ root process context. \n */
+ QURT_PROCESS_DUMP_CB_ERROR, /**< Register the user process callback that is
+ called after threads in the process are frozen. \n */
+ QURT_PROCESS_DUMP_CB_PRESTM, /**< Register the user process callback that is
+ called before threads in the process are frozen. \n*/
+ QURT_PROCESS_DUMP_CB_MAX /**< Reserved for error checking. */
+}qurt_process_dump_cb_type_t;
+
+/** QuRT process dump attributes. */
+typedef struct _qurt_pd_dump_attr{
+ /** @cond */
+ unsigned int enabled; /**< Process dump is enabled. */
+ const char *path; /**< Process dump path. */
+ unsigned int path_len; /**< Length of the process dump path. */
+ /** @endcond */
+}qurt_pd_dump_attr_t;
+
+/** QuRT process capability resource type */
+enum qurt_process_cap_type_t {
+ QURT_PROCESS_CAP_TYPE_NUM_ENTRIES=0, /**< Number of entries in the capability structure */
+ QURT_PROCESS_CAP_TYPE_DRIVER=1, /**< Driver resource */
+ QURT_PROCESS_CAP_TYPE_MAX /**< Maximum identifier */
+};
+
+/** QuRT process capability structure */
+typedef struct _qurt_capability {
+ enum qurt_process_cap_type_t type; /**< Resource type */
+ char name[QURT_PROCESS_ATTR_CAP_MAXLEN]; /**< Resource name */
+ unsigned long long cap; /**< Capabilities allowed for this resource */
+}qurt_capability_t;
+
+/** QuRT process attributes. */
+typedef struct _qurt_process_attr {
+ /** @cond */
+ char name[QURT_PROCESS_ATTR_NAME_MAXLEN]; /**< Name of the new process. */
+ char path[QURT_PROCESS_ATTR_BIN_PATH_MAXLEN]; /**< Path of the binary for the new process. */
+ char dtb_path[QURT_PROCESS_ATTR_BIN_PATH_MAXLEN]; /**< Path of the DTB ELF for the new process. */
+ int flags; /**< Flags as indicated by the QuRT process flags. */
+ unsigned int sw_id; /**< Software ID of the process to be loaded. */
+ unsigned sid; /**< Stream ID of the process being spawned. */
+ unsigned max_threads; /**< Maximum number of threads that the new process can create. */
+ unsigned short ceiling_prio; /**< Maximum priority at which threads can be
+ created by the new process. */
+ qurt_process_type_t type; /**< Process type as indicated by
+ #qurt_process_type_t. */
+ qurt_pd_dump_attr_t dump_attr; /**< Process dump attributes for the new process
+ as indicated by #qurt_pd_dump_attr_t. */
+ qurt_capability_t *capabilities; /**< Pointer to an array of structures of type
+ qurt_capability_t */
+ /** @endcond */
+} qurt_process_attr_t;
+
+/** @} */ /* end_addtogroup process_types */
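+
+/*
+ Usage sketch for the process-creation APIs declared below; the ELF name
+ "myapp.elf" and the thread limit are hypothetical.
+
+   qurt_process_attr_t attr;
+   int pid;
+
+   qurt_process_attr_init(&attr);
+   qurt_process_attr_set_executable(&attr, "myapp.elf");
+   qurt_process_attr_set_max_threads(&attr, 16);
+   pid = qurt_process_create(&attr);
+   if (pid < 0) {
+       // A negative return value is one of the QuRT error codes.
+   }
+*/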
+
+/*=============================================================================
+FUNCTIONS
+=============================================================================*/
+ /** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_create
+ Creates a process with the specified attributes, and starts the process.
+
+ The process executes the code in the specified executable ELF file.
+
+ @datatypes
+ #qurt_process_attr_t
+
+ @param[in] attr Accepts an initialized process attribute structure, which specifies
+ the attributes of the created process.
+
+ @return
+ Positive return value -- Process ID. \n
+ Negative return value -- Indicates one of the following errors: \n
+ #-QURT_EPRIVILEGE -- Caller does not have privilege for this operation \n
+ #-QURT_EMEM -- Not enough memory to perform the operation \n
+ #-QURT_EFAILED -- Operation failed \n
+ #-QURT_ENOTALLOWED -- Operation not allowed \n
+ #-QURT_ENOREGISTERED -- Not registered \n
+ #-QURT_ENORESOURCE -- Resource exhaustion \n
+ #-QURT_EINVALID -- Invalid argument value \n
+ #QURT_EFATAL -- attr is NULL
+
+ @dependencies
+ None.
+*/
+int qurt_process_create (qurt_process_attr_t *attr);
+
+/**@ingroup func_qurt_process_get_id
+ Returns the process identifier for the current thread.
+
+ @return
+ Process identifier for the current thread.
+
+ @dependencies
+ None.
+*/
+int qurt_process_get_id (void);
+/** @endcond */
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_get_uid
+ Returns the user identifier for the current thread.
+
+ @return
+ User identifier for the current thread.
+
+ @dependencies
+ None.
+*/
+int qurt_process_get_uid (void);
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_attr_init
+ Initializes the structure that sets the process attributes when a process is created.
+
+ After an attribute structure is initialized, the individual attributes in the structure can
+ be explicitly set using the process attribute operations.
+
+ Table @xref{tbl:processAttrDefaults} lists the default attribute values set by the initialize
+ operation.
+
+ @inputov{table_process_attribute_defaults}
+
+ @datatypes
+ #qurt_process_attr_t
+
+ @param[out] attr Pointer to the structure to initialize.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_process_attr_init (qurt_process_attr_t *attr)
+{
+ attr->name[0] = '\0';
+ attr->path[0] = '\0';
+ attr->dtb_path[0] = '\0';
+ attr->flags = 0;
+ attr->sw_id = 0;
+ attr->sid = 0;
+ attr->max_threads = (unsigned)QURT_PROCESS_DEFAULT_MAX_THREADS;
+ attr->ceiling_prio = QURT_PROCESS_DEFAULT_CEILING_PRIO;
+ attr->type = QURT_PROCESS_TYPE_RESERVED;
+ attr->dump_attr.enabled = 0;
+ attr->dump_attr.path = NULL;
+ attr->dump_attr.path_len = 0;
+ attr->capabilities = NULL;
+}
+
+/**@ingroup func_qurt_process_attr_set_executable
+ Sets the process name in the specified process attribute structure.
+
+ Process names identify process objects that are already
+ loaded in memory as part of the QuRT system.
+
+ @note1hang Process objects are incorporated into the QuRT system at build time.
+
+ @note1hang The maximum length of the name string is limited to QURT_PROCESS_ATTR_NAME_MAXLEN - 1.
+ + @datatypes + #qurt_process_attr_t + + @param[in] attr Pointer to the process attribute structure. + @param[in] name Pointer to the process name. + + @return + None. + + @dependencies + None. +*/ +void qurt_process_attr_set_executable (qurt_process_attr_t *attr, const char *name); + +/**@ingroup func_qurt_process_attr_set_binary_path + Sets the binary path for the process loading in the specified process attribute structure. + + Path specifies the binary to load for this process. + + @note1hang Max length of path string is limited to QURT_PROCESS_ATTR_BIN_PATH_MAXLEN-1. + + @datatypes + #qurt_process_attr_t + + @param[in] attr Pointer to the process attribute structure. + @param[in] path Pointer to the binary path. + + @return + None. + + @dependencies + None. +*/ +void qurt_process_attr_set_binary_path(qurt_process_attr_t *attr, char *path); + +/**@ingroup func_qurt_process_attr_set_dtb_path + Sets the DTB binary path for the process loading in the specified process attribute structure. + + Path specifies the DTB binary to load for this process. + + @note1hang Max length of path string is limited to QURT_PROCESS_ATTR_BIN_PATH_MAXLEN-1. + + @datatypes + #qurt_process_attr_t + + @param[in] attr Pointer to the process attribute structure. + @param[in] path Pointer to the binary path. + + @return + None. + + @dependencies + None. +*/ +void qurt_process_attr_set_dtb_path(qurt_process_attr_t *attr, char *path); + +/**@ingroup func_qurt_process_attr_set_flags +Sets the process properties in the specified process attribute structure. +Process properties are represented as defined symbols that map into bits +0 through 31 of the 32-bit flag value. Multiple properties are specified by OR'ing +together the individual property symbols. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] flags QURT_PROCESS_NON_SYSTEM_CRITICAL Process is considered as non system-critical. + This attribute will be used by error services, + to decide whether to kill user pd or whole subsystem. + QURT_PROCESS_ISLAND_RESIDENT Process will be marked as island resident. + QURT_PROCESS_RESTARTABLE Process will be marked as restartable. + QURT_PROCESS_UNTRUSTED Process will be marked as unsigned process. +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_flags (qurt_process_attr_t *attr, int flags) +{ + attr->flags = flags; +} +/** @endcond */ +/** @cond internal_only*/ +/**@ingroup func_qurt_process_attr_set_sid +Sets the process streamID in the specified process attribute structure. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] sid streamID to set for this process. + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_sid (qurt_process_attr_t *attr, unsigned sid) +{ + attr->sid = sid; +} +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_process_attr_set_max_threads +Sets the maximum number of threads allowed in the specified process attribute structure. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] max_threads Maximum number of threads allowed for this process. + +@return +None. + +@dependencies +None. 
+*/ +static inline void qurt_process_attr_set_max_threads (qurt_process_attr_t *attr, unsigned max_threads) +{ + attr->max_threads = max_threads; +} + +/**@ingroup func_qurt_process_attr_set_sw_id +Sets the software ID of the process to load in the specified process attribute structure. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] sw_id Software ID of the process, used in authentication. + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_sw_id(qurt_process_attr_t *attr, unsigned int sw_id) +{ + attr->sw_id = sw_id; +} + +/**@ingroup func_qurt_process_attr_set_ceiling_prio +Sets the highest thread priority allowed in the specified process attribute structure. +Refer qurt_thread.h for priority ranges. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] prio Priority. + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_ceiling_prio (qurt_process_attr_t *attr, unsigned short prio) +{ + attr->ceiling_prio = prio; +} +/** @endcond */ + +/** @cond internal_only*/ +/**@ingroup func_qurt_process_attr_set_dump_status +Sets the process domain dump-enabled field in the process domain dump attributes. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] enabled 1 -- Process domain dump is collected \n + 0 -- Process domain dump is not collected + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_dump_status(qurt_process_attr_t *attr, unsigned int enabled) +{ + attr->dump_attr.enabled = enabled; +} + +/**@ingroup func_qurt_process_attr_set_dump_path +Sets the process domain dump path and type. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] path Path where the process domain dumps must be saved. +@param[in] path_len Length of the path string. + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_dump_path(qurt_process_attr_t *attr, const char *path, int path_len) +{ + attr->dump_attr.path = path; + attr->dump_attr.path_len = (unsigned int)path_len; +} + +/**@ingroup func_qurt_process_attr_set_capabilities +Sets list of capabilities available to this process. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] capabilities Pointer to array of structures of type qurt_capability_t defining + resources and capabilites + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_capabilities(qurt_process_attr_t *attr, qurt_capability_t *capabilities) +{ + attr->capabilities = capabilities; +} + +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_process_cmdline_get +Gets the command line string associated with the current process. +The Hexagon simulator command line arguments are retrieved using +this function as long as the call is made +in the process of the QuRT installation, and with the +requirement that the program runs in a simulation environment. + +If the function modifies the provided buffer, it zero-terminates +the string. It is possible that the function does not modify the +provided buffer, so the caller must set buf[0] to a NULL +byte before making the call. A truncated command line is returned when +the command line is longer than the provided buffer. 
+ +@param[in] buf Pointer to a character buffer that must be filled in. +@param[in] buf_siz Size (in bytes) of the buffer pointed to by the buf argument. + +@return +None. + +@dependencies +None. +*/ +void qurt_process_cmdline_get(char *buf, unsigned buf_siz); + +/**@ingroup func_qurt_process_get_thread_count +Gets the number of threads present in the process indicated by the PID. + +@param[in] pid PID of the process for which the information is required. + +@return +Number of threads in the process indicated by PID, if positive value is obtained +Negative error code if failed include: + QURT_EFATAL - Invalid PID + -QURT_ENOTALLOWED - Current process doesnt have access to target process indicated by PID + +@dependencies +None. +*/ +int qurt_process_get_thread_count(unsigned int pid); + +/**@ingroup func_qurt_process_get_thread_ids +Gets the thread IDs for a process indicated by PID. + +@param[in] pid PID of the process for which the information is required. +@param[in] ptr Pointer to a user passed buffer that must be filled in with thread IDs. +@param[in] thread_num Number of thread IDs requested. + +@return +#QURT_EOK - Success +#QURT_EFATAL - Failed, ptr is NULL + +@dependencies +None. + */ +int qurt_process_get_thread_ids(unsigned int pid, unsigned int *ptr, unsigned thread_num); +/** @endcond */ +/** @cond internal_only*/ +/**@ingroup func_qurt_process_dump_get_mem_mappings_count +Gets the number of mappings present in the process indicated by the PID. + +@param[in] pid PID of the process for which the information is required. + +@return +Number of mappings for the process indicated by the PID. + +@dependencies +None. +*/ +int qurt_process_dump_get_mem_mappings_count(unsigned int pid); + +/**@ingroup func_qurt_process_dump_get_mappings +Gets the mappings for a specified PID. + +@note1hang This API skips device type mappings or mappings created by setting the #QURT_PERM_NODUMP attribute. + +@param[in] pid PID of the process for which the information is required. +@param[in] ptr Pointer to a buffer that must be filled in with mappings. +@param[in] count Count of mappings requested. + +@return +Number of mappings filled in the buffer passed by the user. + +@dependencies +None. +*/ +int qurt_process_dump_get_mappings(unsigned int pid, unsigned int *ptr, unsigned count); +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_process_attr_get +Gets the attributes of the process with which it was created. + +@datatypes +#qurt_process_attr_t + +@param[in] pid PID of the process for which the information is required. +@param[in,out] attr Pointer to the user allocated attribute structure. + +@return +#QURT_EOK - Success +#QURT_INVALID - Invalid PID +#QURT_EFATAL - attr is NULL + +@dependencies +None. +*/ +int qurt_process_attr_get(unsigned int pid, qurt_process_attr_t *attr); + +/**@ingroup func_qurt_process_dump_register_cb +Registers the process domain dump callback. + +@datatypes +#qurt_cb_data_t \n +#qurt_process_dump_cb_type_t + +@param[in] cb_data Pointer to the callback information. +@param[in] type Callback type; these callbacks are called in the context of the user process domain: \n + #QURT_PROCESS_DUMP_CB_PRESTM -- Before threads of the exiting process are frozen. \n + #QURT_PROCESS_DUMP_CB_ERROR -- After threads are frozen and captured. \n + #QURT_PROCESS_DUMP_CB_ROOT -- After threads are frozen and captured, and CB_ERROR type of callbacks + are called. +@param[in] priority Priority. 
+ +@return +#QURT_EOK -- Success \n +Other values -- Failure + QURT_EFATAL if cb_data is NULL + QURT_EINVALID If invalid cb_type + QURT_EFAILED If invalid cb_data + +@dependencies +None. +*/ +int qurt_process_dump_register_cb(qurt_cb_data_t *cb_data, qurt_process_dump_cb_type_t type, unsigned short priority); + +/**@ingroup func_qurt_process_dump_deregister_cb +Deregisters the process domain dump callback. + +@datatypes +#qurt_cb_data_t \n +#qurt_process_dump_cb_type_t + +@param[in] cb_data Pointer to the callback information to deregister. +@param[in] type Callback type. + +@return +#QURT_EOK -- Success.\n +Other values -- Failure. + QURT_EFATAL if cb_data is NULL + QURT_EINVALID If invalid cb_type + QURT_EFAILED If invalid cb_data + +@dependencies +None. +*/ +int qurt_process_dump_deregister_cb(qurt_cb_data_t *cb_data,qurt_process_dump_cb_type_t type); + +/** @endcond */ +/** @cond internal_only*/ +/**@ingroup func_qurt_process_set_rtld_debug +Sets rtld_debug for a process. + +@param[in] pid PID of the process for which rtld_debug must be set. +@param[in] address rtld_debug address. + +@return +#QURT_EOK - Success +#QURT_EINVALID - Invalid PID +#QURT_EFATAL - Invalid address + +@dependencies +None. +*/ +int qurt_process_set_rtld_debug(unsigned int pid,unsigned int address); + +/**@ingroup func_qurt_process_get_rtld_debug +Gets rtld_debug for a process. + +@param[in] pid PID of the process for which rtld_debug must be set. +@param[in,out] address Pointer to the user passed address in which the rtld_debug address must be returned. + +@return +#QURT_EOK - Success +#QURT_EINVALID - Invalid PID +#QURT_EFATAL - Invalid address + +@dependencies +None. +*/ +int qurt_process_get_rtld_debug(unsigned int pid,unsigned int *address); +/** @endcond */ +/**@ingroup func_qurt_process_exit +Exits the current user process with an exit code. + +@param[in] exitcode Exit code. + +@return +#QURT_EFATAL -- No client found with the specified PID value \n +#QURT_EINVALID -- Invalid client \n +#QURT_ENOTALLOWED -- User does not have permission to perform this operation \n +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_process_exit(int exitcode); + +/**@ingroup func_qurt_process_kill +Kills the process represented by the PID with the exit code. + +@param[in] pid PID of the process to kill. +@param[in] exitcode Exit code. + +@return +#QURT_EFATAL -- No client found with the specified PID value \n +#QURT_EINVALID -- Invalid client \n +#QURT_ENOTALLOWED -- User does not have permission to perform this operation \n +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_process_kill(int pid, int exitcode); + + +/**@ingroup func_qurt_debugger_register_process +Registers the process indicated by the PID with the debug monitor. + +@param[in] pid PID of the process. +@param[in] adr Address. + +@return +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_debugger_register_process(int pid, unsigned int adr); + + +/**@ingroup func_qurt_debugger_deregister_process +Deregister the process indicated by the PID with the debug monitor. + +@param[in] pid PID of the process. + +@return +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_debugger_deregister_process(int pid); + +/**@ingroup func_qurt_process_exec_callback +Executes callbacks in the user process as indicated by the client_handle argument. + +@param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1). +@param[in] callback_fn Callback function to execute. 
+@param[in] stack_base Stack address to use.
+@param[in] stack_size Stack size.
+
+@return
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_process_exec_callback(int client_handle,
+ unsigned callback_fn,
+ unsigned stack_base,
+ unsigned stack_size);
+
+/**@ingroup func_qurt_process_get_pid
+Gets the process ID of the process that the client_handle argument represents.
+
+@note1hang This API is not supported for an unsigned PD; for an unsigned PD, use qurt_process_get_id().
+
+@param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1).
+@param[in] pid Pointer to the address to store the PID.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EFATAL -- pid pointer passed as NULL
+
+@dependencies
+None.
+*/
+int qurt_process_get_pid(int client_handle, int * pid);
+
+/**@ingroup func_qurt_process_get_dm_status
+Gets the debugging session status on the process represented by the pid argument.
+
+@param[in] pid Process ID
+@param[in,out] status Address to store the status: \n
+ #QURT_DEBUG_NOT_START \n
+ #QURT_DEBUG_START
+
+@return
+#QURT_EOK - Success \n
+#QURT_EINVALID - Error
+
+@dependencies
+None.
+*/
+int qurt_process_get_dm_status( unsigned int pid, unsigned int *status);
+
+
+/**@ingroup func_qurt_process_suspend_threads
+ Suspends the user threads in a user process, identified by its process identifier.
+ The target user process can be a signed user process or an unsigned user process.
+ The caller is a thread in the guest OS/root process.
+ After the user threads in the target user process are suspended, they cannot be scheduled to run by the kernel
+ until they are resumed later.
+
+ This function has one optional argument with one default option.
+ #QURT_PROCESS_SUSPEND_DEFAULT suspends the user threads in the target user process.
+
+ This function call is a synchronous call; it returns after the relevant threads are
+ completely suspended.
+
+ If some user threads in the target user process are set as non-suspendable, this function call does
+ not suspend those threads.
+
+ If the target user process is already suspended, this function call returns success as
+ confirmation that the user process is suspended.
+
+ QuRT debugger monitor threads in the target user process are non-suspendable; this function call does
+ not suspend those threads.
+
+ If the target user process is a secure user process or a CPZ process, this function call returns an error
+ without suspending the target user process.
+
+ If a user thread in the target user process runs in the guest OS/root process via a QDI call, this function call
+ does not suspend the thread in the guest OS, but instead marks the thread as pending-suspend. The thread is suspended
+ when it exits the guest OS, before executing the first instruction in the user process.
+ In this case, the function returns success while the user thread can be running in the guest OS, and is suspended
+ when exiting the guest OS.
+
+ @param[in] process_id Process identifier.
+ @param[in] option Default option #QURT_PROCESS_SUSPEND_DEFAULT suspends the user threads in the target user process.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of invalid process_id input \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, on a secure process/CPZ process.
+
+ @dependencies
+ None.
+ */
+int qurt_process_suspend_threads (unsigned int process_id, unsigned int option);
+
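+
+/*
+ Usage sketch, assuming a root-process caller and a pid obtained elsewhere:
+ freeze the target process, inspect it, then thaw it with the resume call
+ declared below.
+
+   if (qurt_process_suspend_threads(pid, QURT_PROCESS_SUSPEND_DEFAULT) == QURT_EOK) {
+       // ... inspect the frozen process ...
+       (void)qurt_process_resume_threads(pid, QURT_PROCESS_RESUME_DEFAULT);
+   }
+*/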
+/**@ingroup func_qurt_process_resume_threads
+ Resumes a user process, identified by its process identifier.
+ The target user process can be a signed user process or an unsigned user process.
+ The caller is a thread in the guest OS/root process.
+ After the user threads in the target user process resume, the kernel scheduler
+ can schedule the user threads to run based on their thread priorities.
+
+ This function has an optional argument, #QURT_PROCESS_RESUME_DEFAULT, which
+ resumes the user threads in the target user process.
+
+ This is an asynchronous function; it returns after the kernel moves the user threads from
+ the suspended state to the runnable state. The threads are scheduled to run based on their thread priorities.
+
+ This function call does not resume threads in the target user process that have been set as non-resumable.
+
+ If the target user process has already resumed, this function call returns success as
+ confirmation that the user process is resumed.
+
+ If the target user process is a secure user process or a CPZ process, this function call returns an error without
+ performing the resume operation.
+
+ If user threads in the target user process run in the guest OS/root process via a QDI call, this function
+ call clears the suspend-pending mark on these threads, so that the threads are not suspended when they exit
+ the guest OS.
+
+ @param[in] process_id Process identifier.
+ @param[in] option Default option #QURT_PROCESS_RESUME_DEFAULT resumes the user threads in the target user process.
+
+ @return
+ #QURT_EOK -- Success
+ #QURT_EINVALID -- Failure because of invalid process_id input.
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, on a secure process/CPZ process.
+
+ @dependencies
+ None.
+ */
+int qurt_process_resume_threads (unsigned int process_id, unsigned int option);
+
+/**@ingroup func_qurt_process_vtcm_window_set
+ Sets a VTCM access window for a process.
+ The caller thread must be in the SRM process.
+
+ This is a synchronous function; it ensures that all running threads of the process have the requested
+ window in effect. The requested view for all non-running threads takes effect when they are
+ scheduled.
+
+ @param[in] pid Process identifier.
+ @param[in] enable QURT_VTCM_WINDOW_ENABLE enforces the VTCM access window defined by the high and low offsets.
+ QURT_VTCM_WINDOW_DISABLE ignores the high and low offsets; VTCM access is fully
+ disabled for the process.
+ @param[in] high_offset Specifies the high window offset, in 4K increments, from the base address of the VTCM.
+ QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT restores the high offset to its reset value.
+ @param[in] low_offset Specifies the low window offset, in 4K increments, from the base address of the VTCM.
+ QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT restores the low offset to its reset value.
+
+ @note1hang
+ When high_offset is set to QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT and low_offset is set to
+ QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT, the full VTCM range is accessible. Access to VTCM is controlled
+ via the MMU mapping for the process.
+
+ @return
+ #QURT_EOK -- Success
+ #QURT_EVAL -- Failure because of invalid inputs.
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+ #QURT_ENOTSUPPORTED -- Failure because the operation is not supported due to limitations in hardware capabilities
+
+ @dependencies
+ None.
+ */
+int qurt_process_vtcm_window_set(int pid, unsigned int enable, unsigned int high_offset, unsigned int low_offset);
+
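+
+/*
+ Usage sketch, called from the SRM process: restrict a process to a
+ hypothetical 64 KB window at the base of VTCM. Offsets are in 4K
+ increments, so the values below are illustrative only.
+
+   int rc = qurt_process_vtcm_window_set(pid, QURT_VTCM_WINDOW_ENABLE,
+                                         16 /* high: 16 * 4K */,
+                                         0  /* low: VTCM base */);
+*/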
+/**@ingroup func_qurt_process_vtcm_window_get
+ Gets the VTCM access window for a process.
+ The caller thread must be in the SRM process.
+
+
+ @param[in] pid Process identifier.
+ @param[out] enable Address to store the enable status, if set.
+ @param[out] high_offset Address to return the high window offset, in 4K increments, from the base address of the VTCM.
+ @param[out] low_offset Address to return the low window offset, in 4K increments, from the base address of the VTCM.
+
+ @note1hang
+ The user must first check the returned enable value before checking the high and low offsets.
+
+ @return
+ #QURT_EOK -- Success
+ #QURT_EVAL -- Failure because of invalid inputs.
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+ #QURT_ENOTSUPPORTED -- Failure because the operation is not supported due to limitations in hardware capabilities
+
+ @dependencies
+ None.
+ */
+int qurt_process_vtcm_window_get(int pid, unsigned int *enable, unsigned int *high_offset, unsigned int *low_offset);
+
+/**@ingroup func_qurt_process_set_group_config
+ Enables thread groups in the process, with the specified ceiling priorities.
+
+ @param[in] process_id Process identifier.
+ @param[in] group_bitmask 64-bit mask of active thread groups.
+ @param[in] ceiling_priorities Array of ceiling priorities, one per thread group.
+
+ @note1hang
+ This API can only be called by the root PD and can only be called once for each process; otherwise it is
+ rejected. Group 0 must be enabled in group_bitmask, otherwise QuRT returns an error. After this API, all
+ existing threads are moved to group 0, and if any thread's priority is higher than the ceiling
+ priority of group 0, it is lowered to the ceiling value.
+ Example 1:
+ group_bitmask = 0xD7; //'b11010111
+ ceiling_priorities[] = {100, 128, 200, 0, 196, 0, 240, 20}; // 0 - do not care
+ Example 2:
+ group_mask = 0x5; //'b101
+ ceiling_priorities[] = {240, 0, 20}; // 0 - do not care
+
+
+ @return
+ #QURT_EOK -- Success.
+ #QURT_EVAL -- Failure because of invalid inputs.
+ #QURT_ENOTALLOWED -- The group has been configured already.
+
+ @dependencies
+ None.
+ */
+int qurt_process_set_group_config(unsigned int process_id, unsigned long long group_bitmask,
+ unsigned char *ceiling_priorities);
+
+
+/**@ingroup func_qurt_process_stid_set
+ Sets the specified stid for a process or for a thread group within a process.
+
+ @param[in] pid Process identifier.
+ @param[in] group_id Group identifier.
+ @param[in] stid stid to set.
+
+ @note1hang
+ The user can pass the default group_id (QURT_THREAD_DEFAULT_GROUP_ID) if the stid must be set at the process level.
+ All threads within a process that have the default stid (QURT_STID_DEFAULT) inherit the stid set for the process.
+ When a non-default group_id is specified, the stid is set only for that thread group.
+
+ @return
+ #QURT_EOK -- Success
+ #QURT_EFATAL -- Invalid PID
+ #QURT_EVAL -- Failure because of invalid inputs.
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+
+ @dependencies
+ None.
+ */
+int qurt_process_stid_set(unsigned int pid, unsigned int group_id , unsigned int stid);
+
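+
+/*
+ Usage sketch for the stid set/get pair in this header; the pid and the
+ stid value of 3 are hypothetical.
+
+   if (qurt_process_stid_set(pid, QURT_THREAD_DEFAULT_GROUP_ID, 3) == QURT_EOK) {
+       unsigned int stid;
+       (void)qurt_process_stid_get(pid, QURT_THREAD_DEFAULT_GROUP_ID, &stid);
+   }
+*/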
+/**@ingroup func_qurt_process_stid_get
+ Gets the stid for a process or for a thread group within a process.
+
+ @param[in] pid Process identifier.
+ @param[in] group_id Group identifier.
+ @param[out] stid Pointer to a variable in which to return the stid.
+
+ @note1hang
+ The user can pass the default group_id (QURT_THREAD_DEFAULT_GROUP_ID) to return the process-level stid.
+ When a non-default group_id is specified, the stid is returned only for that thread group.
+
+ @return
+ #QURT_EOK -- Success
+ #QURT_EFATAL -- Invalid PID
+ #QURT_EVAL -- Failure because of invalid inputs.
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+
+ @dependencies
+ None.
+ */
+int qurt_process_stid_get(unsigned int pid, unsigned int group_id , unsigned int *stid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_profile.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_profile.h
new file mode 100755
index 0000000000000..2a50c461440f6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_profile.h
@@ -0,0 +1,98 @@
+#ifndef QURT_PROFILE_H
+#define QURT_PROFILE_H
+/**
+ @file qurt_profile.h
+ QuRT profiling support.
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2018, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup profiling_macros
+@{ */
+#define QURT_PROFILE_DISABLE 0 /**< Disable profiling. */
+#define QURT_PROFILE_ENABLE 1 /**< Enable profiling. */
+
+typedef unsigned int qurt_profile_param_t;
+
+#define QURT_PROFILE_PARAM_THREAD_READY_TIME 0U /**< Profile thread ready time. */
+
+/** @} */ /* end_addtogroup profiling_macros */
+
+/** @addtogroup profiling_types
+ @{ */
+/** Profiling results. */
+typedef union
+{
+ /** Result associated with #QURT_PROFILE_PARAM_THREAD_READY_TIME. */
+ struct
+ {
+ unsigned int ticks; /**< Cumulative ticks the thread was ready. */
+ } thread_ready_time;
+
+} qurt_profile_result_t;
+/** @} */ /* end_addtogroup profiling_types */
+
+/**@ingroup func_qurt_profile_enable2
+ * Starts profiling of a specific parameter on a specific thread (as applicable).
+ *
+ * @param[in] param Profiling parameter.
+ * @param[in] thread_id ID of the thread (if applicable) for which the specified
+ * parameter must be profiled.
+ * @param[in] enable #QURT_PROFILE_DISABLE -- disable \n #QURT_PROFILE_ENABLE --
+ * enable
+ *
+ * @return
+ * #QURT_EOK -- Success \n
+ * #QURT_EALREADY -- Measurement already in progress or already stopped \n
+ * #QURT_ENOTHREAD -- Thread does not exist \n
+ * #QURT_EINVALID -- Invalid profiling parameter \n
+ *
+ * @dependencies
+ * None.
+ */
+extern int qurt_profile_enable2 (
+ qurt_profile_param_t param,
+ qurt_thread_t thread_id,
+ int enable
+);
+
+/**@ingroup func_qurt_profile_get
+ * Gets the value of the profiling parameter that was previously enabled.
+ *
+ * @param[in] param Profiling parameter.
+ * @param[in] thread_id ID of the thread (if applicable) for which the specified
+ * profiling parameter must be retrieved.
+ * @param [out] result Profiling result associated with the parameter for the specified
+ * thread (if applicable).
+ *
+ * @return
+ * #QURT_EOK -- Success \n
+ * #QURT_EFAILED -- Operation failed; profiling was not enabled \n
+ * #QURT_ENOTHREAD -- Thread does not exist \n
+ * #QURT_EINVALID -- Invalid profiling parameter \n
+ *
+ * @dependencies
+ * None.
+ */ +extern int qurt_profile_get ( + qurt_profile_param_t param, + qurt_thread_t thread_id, + qurt_profile_result_t * result +); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_ptrace.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_ptrace.h new file mode 100755 index 0000000000000..622304dd92865 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_ptrace.h @@ -0,0 +1,37 @@ +/*============================================================================= + + qurt_ptrace.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013 by Qualcomm Technologies, Inc. All Rights Reserved. +=============================================================================*/ +#ifndef __SYS_PTRACE_H__ +#define __SYS_PTRACE_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +enum __ptrace_request +{ + /** + Indicates that the process making this request is requesting to be traced. + */ + PTRACE_TRACEME = 0, + PTRACE_EXT_IS_DEBUG_PERMITTED = 500 +}; + +long ptrace(enum __ptrace_request request, unsigned int pid, void*addr, void *data); + +#ifdef __cplusplus +} +#endif + +#endif //__SYS_PTRACE_H__ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi.h new file mode 100755 index 0000000000000..705408e5cfc6f --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi.h @@ -0,0 +1,185 @@ +#ifndef QDI_H +#define QDI_H + +/** + @file qurt_qdi.h + @brief Prototypes of QuRT Driver Invocation API functions + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + + +#include "qurt_qdi_constants.h" +#include "qurt_qdi_imacros.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_qdi_open + Opens the specified driver for subsequent operations. + qurt_qdi_open() is the primary mechanism by which a driver user can + obtain a QDI handle. The user provides the name of the driver to the + qurt_qdi_open call, and gets back a handle referencing + the named driver. \n + @note1hang For reasons related to the Hexagon standard for varargs functions, the + qurt_qdi_open function prototype is not actually defined as a varargs. + + + @param[in] p Driver name. + @param[in] ... Up to nine additional device-specific arguments can be passed as parameters, + and should follow the POSIX open() convention. \n + - flags -- Optional second parameter (POSIX flags), the handle + access requested (read-only, write-only, or read-write, + for instance) and other flags such as whether the call + should create a new device or only open an existing + device. \n + - mode -- Optional third parameter (POSIX mode); permissions to + configure when a new device is created. @tablebulletend + + @return + Negative value -- Error. \n + Non-negative value -- Success, this result value serves as a handle to the + opened driver. + @dependencies + None. + */ +// int qurt_qdi_open(); +#define qurt_qdi_open(p,...) 
\ + qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,QDI_OPEN,(p),##__VA_ARGS__) + +#define qurt_qdi_open_dt(p,q,...) \ + qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,QDI_OPEN_FROM_DT,(p),(q),##__VA_ARGS__) + +/**@ingroup func_qurt_qdi_handle_invoke + Performs a generic driver operation, which (depending on the specified operation) can be + either be one of the predefined operations listed in @xhyperref{tbl:functionMapping,QDI function mapping} + or a driver-specific operation. + The user provides a QDI handle and an integer + method number, along with 0 to 8 optional 32-bit arguments. + The device driver invocation function is invoked with the + same method number and 0 to 8 optional arguments. The + return value from the invocation function is passed back to + the user as the return value of qurt_qdi_handle_invoke. + + @note1hang For reasons related to the Hexagon standard for varargs functions, the + qurt_qdi_handle_invoke() function prototype is not actually defined as a + varargs function (and would break if it were defined this way). + + @param[in] h Driver handle. + @param[in] m Integer number for the operation to perform. + @param[in] ... Up to eight optional arguments can be passed to the device driver as operation-specific parameters: \n + arg1 -- First parameter \n + arg2 -- Second parameter \n + arg3 -- Third parameter \n + arg4 -- Fourth parameter \n + arg5 -- Fifth parameter \n + arg6 -- Sixth parameter \n + arg7 -- Seventh parameter \n + arg8 -- Eighth parameter + + @return + Integer value defined by the device driver. \n + -1 -- Error. + + @dependencies + None. + */ +// int qurt_qdi_handle_invoke(); +#define qurt_qdi_handle_invoke(h,m,...) \ + _QDMPASTE(_QDMHI,_QDMCNT(QDI_HANDLE_LOCAL_CLIENT,h,m,##__VA_ARGS__))(QDI_HANDLE_LOCAL_CLIENT,h,m,##__VA_ARGS__) +#define _QDMHI3(a,b,c) qurt_qdi_qhi3(0,b,c) +#define _QDMHI4(a,b,c,d) qurt_qdi_qhi4(0,b,c,(int)(d)) +#define _QDMHI5(a,b,c,d,e) qurt_qdi_qhi5(0,b,c,(int)(d),(int)(e)) +#define _QDMHI6(a,b,c,d,e,f) qurt_qdi_qhi6(0,b,c,(int)(d),(int)(e),(int)(f)) +#define _QDMHI7(a,b,c,d,e,f,g) qurt_qdi_qhi7(8,b,c,(int)(d),(int)(e),(int)(f),(int)(g)) +#define _QDMHI8(a,b,c,d,e,f,g,h) qurt_qdi_qhi8(8,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h)) +#define _QDMHI9(a,b,c,d,e,f,g,h,i) qurt_qdi_qhi9(16,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i)) +#define _QDMHI10(a,b,c,d,e,f,g,h,i,j) qurt_qdi_qhi10(16,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j)) +#define _QDMHI11(a,b,c,d,e,f,g,h,i,j,k) qurt_qdi_qhi11(24,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k)) +#define _QDMHI12(a,b,c,d,e,f,g,h,i,j,k,l) qurt_qdi_qhi12(24,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k),(int)(l)) +int qurt_qdi_qhi3(int,int,int); +int qurt_qdi_qhi4(int,int,int,int); +int qurt_qdi_qhi5(int,int,int,int,int); +int qurt_qdi_qhi6(int,int,int,int,int,int); +int qurt_qdi_qhi7(int,int,int,int,int,int,int); +int qurt_qdi_qhi8(int,int,int,int,int,int,int,int); +int qurt_qdi_qhi9(int,int,int,int,int,int,int,int,int); +int qurt_qdi_qhi10(int,int,int,int,int,int,int,int,int,int); +int qurt_qdi_qhi11(int,int,int,int,int,int,int,int,int,int,int); +int qurt_qdi_qhi12(int,int,int,int,int,int,int,int,int,int,int,int); + +/**@ingroup func_qurt_qdi_write + Writes data to the specified driver. + A predefined invocation routine for drivers that + support a POSIX-like write functionality. 
+ qurt_qdi_write(handle, buf, len) is equivalent to:
+ qurt_qdi_handle_invoke(handle, QDI_WRITE, handle, buf, len);
+
+ @param[in] handle Driver handle.
+ @param[in] buf Pointer to the memory address where the data to write is stored.
+ @param[in] len Number of bytes of data to write.
+
+ @return
+ Non-negative integer -- Number of bytes written. \n
+ Negative error code -- Write could not take place.
+
+ @dependencies
+ None.
+ */
+int qurt_qdi_write(int handle, const void *buf, unsigned len);
+
+/**@ingroup func_qurt_qdi_read
+ User-visible API to read data from a QDI handle.
+ A predefined invocation routine for drivers that
+ support a POSIX-like read functionality.
+ qurt_qdi_read(handle, buf, len) is equivalent to:
+ qurt_qdi_handle_invoke(handle, QDI_READ, handle, buf, len);
+
+ @param[in] handle Driver handle.
+ @param[in] buf Pointer to the memory address where the data read is stored.
+ @param[in] len Number of bytes of data to read.
+
+ @return
+ Non-negative integer number -- Bytes read. \n
+ Negative error code -- Read could not take place.
+
+ @dependencies
+ None.
+ */
+int qurt_qdi_read(int handle, void *buf, unsigned len);
+
+/**@ingroup func_qurt_qdi_close
+ Closes the specified driver, releasing any resources associated with the open driver.
+ User-visible API to close a QDI handle.
+
+ This API should be called when the user is done using a
+ QDI-based handle. When this function is called, the driver can release
+ any resources held and perform other necessary cleanup
+ operations. qurt_qdi_close(handle) is equivalent to:
+ qurt_qdi_handle_invoke(handle, QDI_CLOSE, handle)
+
+ @param[in] handle Driver handle.
+
+ @return
+ 0 -- Success.\n
+ Negative error code -- Failure.
+
+ @dependencies
+ None.
+ */
+int qurt_qdi_close(int handle);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_constants.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_constants.h
new file mode 100755
index 0000000000000..4866fada067f0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_constants.h
@@ -0,0 +1,193 @@
+#ifndef QDI_CONSTANTS_H
+#define QDI_CONSTANTS_H
+
+/**
+ @file qurt_qdi_constants.h
+ @brief Predefined invocation methods for drivers.
+
+ EXTERNALIZED FUNCTIONS
+ None
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None
+
+ Copyright (c) 2013-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+|| Method numbers used for QDI.
+||
+|| Intended grouping of method numbers for QDI,
+|| including future usage:
+||
+|| Method 0 should always be unused and not responded to by
+|| any driver.
+|| Methods 1 and 2 are reserved for name registration and
+|| name lookup.
+|| Methods 3 through 31 are reserved for POSIX-type operations
+|| on open handles.
+|| Methods 32 through 127 are reserved for the QDI infrastructure
+|| and may be extended in the future to provide standard
+|| driver debug services, management services, and system
+|| notifications.
+|| Methods 128 through 255 are reserved for the use of automatically
+|| generated methods such as might be generated by an IDL (interface
+|| definition language).
The infrastructure may be extended to +|| perform services on these methods based on information provided +|| by the IDL, such as automatic buffer validation, etc. These +|| method numbers should not be used for any "ad hoc" methods. +|| Methods with number >= 256 are "private" method numbers that are +|| outside the scope of the QDI infrastructure. Drivers that want +|| to generate and consume their own "ad hoc" methods are free to +|| use these method numbers as they wish. The infrastructure does +|| not generate these method numbers or respond to them, but +|| passes them on unmolested. +|| +|| All driver implementations *should* return a value of +|| -1 when called with an unsupported method. The standard error +|| return value for POSIX APIs is -1, so we emulate that behavior +|| here. +*/ +/** @cond */ +#define QDI_UNUSED 0 +#define QDI_DEVNAME_REGISTER 1 +#define QDI_OPEN 2 +#define QDI_CLOSE 3 +#define QDI_READ 4 +#define QDI_WRITE 5 +#define QDI_IOCTL 6 +#define QDI_MMAP 7 +#define QDI_OS_FILEOPEN 8 +#define QDI_FLEN 9 +#define QDI_UNLINK 10 +#define QDI_FTELL 22 +#define QDI_SEEK 23 +#define QDI_FSTAT 24 + +#define QDI_FSNAME_REGISTER 150 +#define QDI_FS_OPEN 151 +#define QDI_MMAP2 153 +#define QDI_MPROTECT2 154 +#define QDI_MUNMAP2 155 + +#define QDI_CLIENT_HANDLE_OBJREF_GET 10 + +#define QDI_OS_PROCESS_LOAD 12 +#define QDI_OS_PROCESS_CHOOSE_ASID 13 + +#define QDI_OS_SET_GP 26 +#define QDI_CLIENT_HANDLE_CALLBACK 27 + +#define QDI_CLIENT_HANDLE_ISLAND_HANDLE_CREATE_FROM_OBJ_T 19 //reused +#define QDI_CLIENT_HANDLE_HANDLE_CREATE_FROM_OBJ_T 80 +#define QDI_CLIENT_HANDLE_HANDLE_RELEASE 81 +#define QDI_CLIENT_HANDLE_COPY_FROM_USER 82 +#define QDI_CLIENT_HANDLE_COPY_TO_USER 83 +#define QDI_CLIENT_HANDLE_SIGNAL_GROUP_CREATE 86 +#define QDI_CLIENT_HANDLE_SAFE_CACHE_OPS 87 + +#define QDI_CLIENT_HANDLE_BUFFER_LOCK 41 +#define QDI_CLIENT_HLOSPOOL_INFO_GET 90 +#define QDI_CLIENT_HLOSPOOL2_INFO_GET 96 + +#define QDI_CLIENT_PID 44 +#define QDI_CLIENT_ASID QDI_CLIENT_PID + +#define QDI_OS_CLIENT_INFO_GET 48 + +#define QDI_OS_MEM_LOOKUP_PHYSADDR 57 + +#define QDI_OS_THREAD_ITERATOR_CREATE 68 +#define QDI_OS_THREAD_ITERATOR_NEXT 69 + +#define QDI_OS_SYSENV 78 + +#define QDI_REGION_USERMALLOC_INIT 180 // This method is for generic handle + + +#define QDI_CLIENT_HANDLE_USER_MALLOC 84 +#define QDI_CLIENT_HANDLE_USER_FREE 85 + +#define QDI_SIGNAL_GROUP_SIGNAL_CREATE 96 +#define QDI_SIGNAL_GROUP_WAIT 98 +#define QDI_SIGNAL_GROUP_POLL 99 +#define QDI_SIGNAL_SET 96 +#define QDI_SIGNAL_CLEAR 97 +#define QDI_SIGNAL_WAIT 98 +#define QDI_SIGNAL_POLL 99 + +#define QDI_OS_WAIT_FOR_MAIN_REAPER 104 + +#define QDI_CLIENT_HANDLE_REFPROXY_INSTALL 105 +#define QDI_CLIENT_HANDLE_REFPROXY_ADD 106 +#define QDI_CLIENT_HANDLE_REFPROXY_REMOVE 107 + +#define QDI_CLIENT_HANDLE_DETACH 116 + +#define QDI_OS_RESERVED1 139 + +#define QDI_CLIENT_HANDLE_BUFFER_LOCK2 142 + +#define QDI_DT_REGISTER 158 +#define QDI_OPEN_DEVICE 159 +#define QDI_OPEN_FROM_DT 160 + +#define QDI_PRIVATE 256 /* Method numbers beginning at 256 + are private method numbers, which + are device-specific and available + for use by device implementors. */ +/* +|| Permission bitmasks for use with qurt_qdi_lock_buffer(). +|| +|| Make sure these match with permission values from qurt_perm_t. +*/ +/** @endcond */ + +/** @addtogroup driver_support_constants +@{ */ +#define QDI_PERM_W 2 /**< Write access. */ +#define QDI_PERM_R 1 /**< Read access. */ +#define QDI_PERM_RW (QDI_PERM_R | QDI_PERM_W) /**< Read/write access. 
*/ + +#define QDI_HANDLE_LOCAL_CLIENT 3 /**< Local client. */ +#define QDI_HANDLE_GENERIC 4 /**< Generic. */ + +#define QDI_REFCNT_BASE 0x510000 /**< */ +#define QDI_REFCNT_MAXED 0x51FFFD /**< */ +#define QDI_REFCNT_INIT 0x51FFFE /**< Driver object is temporary and is eventually deleted.*/ +#define QDI_REFCNT_PERM 0x51FFFF /**< Driver object is permanent and is never deleted. */ +/** @} */ /* end_addtogroup driver_support_constants */ + +/** @cond */ +/* +|| Flags used by process loaders. +*/ + +#define QDI_OS_PROCESS_FLAGS_ISLAND_RESIDENT 0x1 /* Set this flag to request the loaded process + to have island residency. */ +#define QDI_OS_PROCESS_FLAGS_ROOT_RESIDENT 0x2 /* Set this flag to request the loaded process + to have root residency, for example, DL Pager. */ +/* +|| Constants used for qurt_event register API, type field. +*/ + +#define QURT_PROCESS_EXIT 1 + +/* +|| Constants used by QDI extensions. +*/ + +#define QURT_QDI_SINGLETON_TYPE_TRUE 0 +#define QURT_QDI_SINGLETON_TYPE_FALSE 1 +#define QURT_QDI_SINGLETON_TYPE_PER_PROCESS 2 +/** @endcond */ +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QDI_CONSTANTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_driver.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_driver.h new file mode 100755 index 0000000000000..e044e25f1bb72 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_driver.h @@ -0,0 +1,868 @@ +#ifndef QURT_QDI_DRIVER_H +#define QURT_QDI_DRIVER_H + +/** + @file qurt_qdi_driver.h + @brief Definitions, macros, and prototypes used when writing a + QDI driver. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2018, 2019-2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include "stddef.h" +#include "qurt_qdi.h" +#include "qurt_types.h" +#include "qurt_callback.h" +#include "qurt_qdi_constants.h" +#include "qurt_qdi_imacros.h" +#include "qurt_mutex.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| This gives the canonical form for the arguments to a QDI +|| driver invocation function. The arguments are as follows: +|| +|| int client_handle (R0) QDI handle that represents the client +|| that made this QDI request. If the +|| client is remote, this is a +|| variable handle; if the client is local +|| (same thread and process), this is +|| set to QDI_HANDLE_LOCAL_CLIENT. +|| +|| qurt_qdi_obj_t *obj (R1) Points at the qdi_object_t structure +|| on which this QDI request is being made. +|| The qdi_object_t structure is usually +|| the first element of a larger structure +|| that contains state associated with the +|| object; because it is usually the first +|| element, the object pointers can be freely +|| interchanged through casts. +|| +|| int method (R2) Integer QDI method that represents +|| the request type. +|| +|| qurt_qdi_arg_t arg1 (R3) First three general purpose arguments +|| qurt_qdi_arg_t arg2 (R4) to the invocation function are passed in +|| qurt_qdi_arg_t arg3 (R5) these slots. +|| +|| qurt_qdi_arg_t arg4 (SP+0) Arguments beyond the first three are +|| qurt_qdi_arg_t arg5 (SP+4) passed on the stack. 
+|| qurt_qdi_arg_t arg6 (SP+8) +|| qurt_qdi_arg_t arg7 (SP+12) +|| qurt_qdi_arg_t arg8 (SP+16) +|| qurt_qdi_arg_t arg9 (SP+20) +|| +|| The canonical form of the invocation function takes a +|| total of 12 arguments, but not all of them are used. In general, +|| the QDI infrastructure only passes those arguments provided by +|| the caller; if the invocation function accesses additional +|| arguments beyond those provided by the caller, the values are not +|| useful. +*/ +/** @cond */ +#define QDI_INVOKE_ARGS \ + int, struct qdiobj *, int, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t + +#define QDI_EXT_INVOKE_ARGS \ + int, qurt_qdi_man_obj_t*, int, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t + +#define BUFFER_LOCK 1 +#define BUFFER_UNLOCK 0 + +struct qdiobj; +/** @endcond */ +/** @addtogroup driver_support_types +@{ */ +typedef union { + void *ptr; /**< Pointer to the driver handle. */ + int num; /**< Method number. */ +} qurt_qdi_arg_t; +/** @} */ /* end_addtogroup driver_support_types */ +/** @cond */ +/** QuRT QDI driver version */ +typedef union { + int num; + struct { + short major; /** Driver major version number. */ + short minor; /** Driver minor version number. */ + }; +} qurt_qdi_version_t; + +typedef int (*qurt_qdi_pfn_invoke_t)(QDI_INVOKE_ARGS); +typedef void (*qurt_qdi_pfn_release_t)(struct qdiobj *); +/** @endcond */ +/** @addtogroup driver_support_types +@{ */ +typedef struct qdiobj { + qurt_qdi_pfn_invoke_t invoke; /**< Invocation function that implements the driver methods.*/ + int refcnt; /**< Reference count, an integer value maintained by the QDI infrastructure that tracks the number of + references to a driver instance. 
*/ + qurt_qdi_pfn_release_t release; /**< Release function that performs details associated with deleting an instance + of the driver object.*/ +} qurt_qdi_obj_t; +/** @} */ /* end_addtogroup driver_support_types */ +/** @cond */ +/** QuRT QDI managed object */ +typedef struct qurt_qdi_man_obj +{ + qurt_qdi_obj_t qdi_obj; + union + { + struct qurt_qdi_ext_driver * opener_obj; + struct qurt_qdi_ext_device * device_obj; + }; +}qurt_qdi_man_obj_t; + +typedef int (*qurt_qdi_ext_pfn_create_t)(int client_id, const char *name, qurt_qdi_version_t version, qurt_qdi_man_obj_t **qdi_obj); +typedef int (*qurt_qdi_ext_pfn_create_device_t)(int client_id, const char *name, qurt_qdi_version_t version, struct qurt_qdi_ext_device * device, qurt_qdi_man_obj_t **qdi_obj); +typedef int (*qurt_qdi_ext_pfn_invoke_t)(QDI_EXT_INVOKE_ARGS); +typedef void (*qurt_qdi_ext_pfn_destroy_t)(qurt_qdi_man_obj_t *qdi_obj); +typedef int (*qurt_qdi_ext_pfn_probe_t)(void *handle, struct qurt_qdi_ext_device **device); + +typedef struct qurt_qdi_ext_obj_info{ + qurt_qdi_man_obj_t *obj; + int qdi_client_id; + struct qurt_qdi_ext_obj_info *next; +}qurt_qdi_ext_obj_info_t; +typedef struct qurt_qdi_ext_obj_info *qurt_qdi_ext_obj_info_ptr; + +/** QuRT QDI device */ +//temporarily add this back while there are still drivers who statically define this structure +struct qurt_qdi_device { + qurt_qdi_obj_t opener_obj; + const char* name; + char island_resident; + unsigned char singleton; + qurt_qdi_ext_pfn_create_t create; + qurt_qdi_ext_pfn_invoke_t invoke; + qurt_qdi_ext_pfn_destroy_t destroy; + qurt_mutex_t qurt_qdi_ext_list_lock; + qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head; +}; +typedef struct qurt_qdi_device qurt_qdi_man_device; + +struct qurt_qdi_ext_driver { + qurt_qdi_obj_t opener_obj; + const char* name; + char island_resident; + unsigned char singleton; + qurt_qdi_ext_pfn_create_t create; + qurt_qdi_ext_pfn_invoke_t invoke; + qurt_qdi_ext_pfn_destroy_t destroy; + qurt_mutex_t qurt_qdi_ext_list_lock; + qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head; + qurt_qdi_ext_pfn_create_device_t create_device; + qurt_qdi_version_t version; + qurt_qdi_ext_pfn_probe_t probe; + const char* compatible; + struct qurt_qdi_ext_device * device_list; + //qurt_qdi_ext_device_ptr device_list; +}; +typedef struct qurt_qdi_ext_driver qurt_qdi_ext_driver_t; +//above replaces qurt_qdi_man_device + +extern int qurt_qdi_obj_ref_inc(qurt_qdi_obj_t *); +extern int qurt_qdi_obj_ref_dec(qurt_qdi_obj_t *); + +extern int qurt_qdi_ext_opener (QDI_INVOKE_ARGS); +/** @endcond */ +/**@ingroup func_qurt_qdi_method_default + Processes a method that is unrecognized or unsupported in the driver invocation function. + All arguments passed to the current invocation function (Section @xref{sec:invocationFunction}) must be forwarded + to this function. + + @note1hang Invocation functions must process all unrecognized or unsupported methods + by calling this function. + + @return + None. + + @dependencies + None. +*/ +extern int qurt_qdi_method_default(QDI_INVOKE_ARGS); + +/**@ingroup func_qurt_qdi_handle_create_from_obj_t + Allocates a new device handle for use with the specified driver object. + + @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[out] obj Pointer to the driver object. + + @return + Non-negative integer -- Success; this value is the new handle. \n + Negative value -- Error. + + @dependencies + None. 
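+
+   A minimal sketch of the usual pattern (hypothetical driver code; my_device_t,
+   my_invoke(), and my_release() are placeholders, not part of this API):
+   @code
+   typedef struct {
+      qurt_qdi_obj_t qdiobj;  // First member, so object pointers can be cast freely.
+      int state;
+   } my_device_t;
+
+   // In the driver's QDI_OPEN handling, after allocating dev:
+   dev->qdiobj.invoke  = my_invoke;        // Method dispatch function.
+   dev->qdiobj.refcnt  = QDI_REFCNT_INIT;  // Temporary object, deleted when released.
+   dev->qdiobj.release = my_release;       // Cleanup when the last reference goes away.
+   return qurt_qdi_handle_create_from_obj_t(client_handle, &dev->qdiobj);
+   @endcode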
+*/ +static __inline int qurt_qdi_handle_create_from_obj_t(int client_handle, qurt_qdi_obj_t *obj) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_HANDLE_CREATE_FROM_OBJ_T, + obj); +} + +/**@ingroup func_qurt_qdi_handle_invoke + Allocates a new island device handle for use with the specified driver object. + + @param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1). + @param[in] obj Pointer. + + @return + Non-negative integer value that is the new handle -- Success. \n + Negative return value -- Error. + + @dependencies + None. +*/ +static __inline int qurt_qdi_island_handle_create_from_obj_t(int client_handle, qurt_qdi_obj_t *obj) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_ISLAND_HANDLE_CREATE_FROM_OBJ_T, + obj); +} + +/**@ingroup func_qurt_qdi_handle_release + Deallocates the specified device handle. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] handle_to_release Handle to release. + + @return + 0 -- Success. \n + Negative value -- Error. + + @dependencies + None. +*/ +static __inline int qurt_qdi_handle_release(int client_handle, int handle_to_release) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_HANDLE_RELEASE, + handle_to_release); +} + +static __inline qurt_qdi_obj_t * +qurt_qdi_objref_get_from_handle(int client_handle, int object_handle) +{ + qurt_qdi_obj_t *ret; + + ret = NULL; + + qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_OBJREF_GET, + object_handle, + &ret); + + return ret; +} + +/**@ingroup func_qurt_client_add_memory + Adds a physical address range to the HLOS physpool of the caller user PD. + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[in] phys_addr Starting address of the physical address range. + @param[in] size Size. + + @return + #QURT_EOK -- Pages successfully added. + + @dependencies + None. +*/ +int qurt_client_add_memory(int client_handle, qurt_addr_t phys_addr, qurt_size_t size); + +/**@ingroup func_qurt_client_add_memory2 + Adds a physical address range to the HLOS physpool of the caller user PD. + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[in] phys_addr Starting 36-bit address of the physical address range. + @param[in] size Size. + + @return + #QURT_EOK -- Pages successfully added. + + @dependencies + None. +*/ +int qurt_client_add_memory2(int user_client_handle, qurt_paddr_64_t phys_addr, qurt_size_t size); + +static __inline qurt_qdi_obj_t * +qurt_qdi_objref_get_from_pointer(qurt_qdi_obj_t *objptr) +{ + qurt_qdi_obj_t * ret = NULL; + + if (qurt_qdi_obj_ref_inc(objptr) < 0) { + ret = NULL; + } else { + ret = objptr; + } + + return ret; +} + +static __inline void +qurt_qdi_objref_release(qurt_qdi_obj_t *objptr) +{ + if (qurt_qdi_obj_ref_dec(objptr) == 1) { + (*objptr->release)(objptr); + } +} + +/**@ingroup func_qurt_qdi_copy_from_user + Copies the contents of a user memory buffer into the current driver. + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] dest Base address of the driver buffer. + @param[in] src Base address of the user buffer. + @param[in] len Number of bytes to copy. 
+ + @return + Negative value -- Indicates a privilege or security violation, the copy operation + has crossed a privilege boundary. + + @dependencies + None. +*/ +static __inline int qurt_qdi_copy_from_user(int client_handle, void *dest, const void *src, unsigned len) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_COPY_FROM_USER, + dest, src, len); +} + +/**@ingroup qurt_qdi_copy_string_from_user + Copies the contents of a user memory buffer into the current driver. + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param client_handle Obtained from the current invocation function (Section 3.4.1). + @param dest Base address of the driver buffer. + @param src Base address of the user buffer. + @param len Number of bytes to copy. NOTE: This is the destination buffer length. + + @return + Negative error result -- privilege or security violation, the copy operation + has crossed a privilege boundary. + + @dependencies + None. +*/ +int qurt_qdi_copy_string_from_user(int client_handle, char *dest, const char *src, unsigned len); + +/**@ingroup func_qurt_qdi_copy_to_user + Copies the contents of a driver memory buffer to user memory. + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] dest Base address of the user buffer. + @param[in] src Base address of the driver buffer. + @param[in] len Number of bytes to copy. + + @return + Negative value -- Indicates a privilege or security violation, the copy operation has crossed a + privilege boundary + + @dependencies + None. +*/ +static __inline int qurt_qdi_copy_to_user(int client_handle, void *dest, const void *src, unsigned len) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_COPY_TO_USER, + dest, src, len); +} + +/**@ingroup func_qurt_qdi_safe_cache_ops + Do cache operations on user memory + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] addr Base address of the user memory. + @param[in] size Size of the user memory. + @param[in] opcode Cache operations (QURT_MEM_CACHE_FLUSH, QURT_MEM_CACHE_INVALIDATE...) + @param[in] type Cache type (QURT_MEM_ICACHE, QURT_MEM_DCACHE) + + @return + Negative value -- Indicates a privilege or security violation, the copy operation has crossed a + privilege boundary + + @dependencies + None. +*/ +static __inline int qurt_qdi_safe_cache_ops(int client_handle, qurt_addr_t addr, qurt_size_t size, + qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_SAFE_CACHE_OPS, + addr, size, opcode, type); +} + + +/**@ingroup func_qurt_qdi_buffer_lock + Prepares for the direct manipulation of a potentially untrusted buffer provided by a QDI + client. + + This function is used to permit a trusted driver to safely access memory that is + provided by a potentially untrusted client. A driver calls this function to obtain a safe buffer + pointer for accessing the memory. + + This function performs the following security checks: \n + - Verifies that the entire buffer is accessible to the client. 
\n + - Ensures that the pointer remains valid for the remainder of the QDI driver + operation. \n + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] buf Pointer to the base address of the client buffer address. + @param[in] len Buffer length (in bytes). + @param[in] perms Bitmask value that specifies the read or write access to perform on the + client buffer: \n + - #QDI_PERM_R -- Read access \n + - #QDI_PERM_W -- Write access \n + - #QDI_PERM_RW -- Read/write access @tablebulletend + @param[out] obuf Pointer to the buffer address that the driver must use to access the buffer. + + @return + Negative value -- Error; the operation crosses a privilege boundary, indicating a privilege or security violation. \n + Nonzero value -- User passed a buffer that does not fulfill the requested read/write access permission. + In this case the QDI driver call must be terminated cleanly, with an appropriate error code + returned to the client. \n + Zero -- Success; when this occurs the QDI driver must use the pointer at *obuf to access memory, and not the + pointer passed in as buf -- even if the user process changes the mapping of memory at buf, + the mapping of memory at *obuf remains valid until the driver invocation completes. + + @dependencies + None. +*/ +static __inline int qurt_qdi_buffer_lock(int client_handle, void *buf, unsigned len, + unsigned perms, void **obuf) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_BUFFER_LOCK, + buf, len, perms, obuf); +} + +/**@ingroup func_qurt_qdi_buffer_lock2 + Prepares for the direct manipulation of a possibly-untrusted buffer provided by a QDI + client. + This API permits a trusted driver to safely access memory + provided by a possibly-untrusted client. A driver calls this function to obtain a safe buffer + pointer for accessing the memory. + This function performs the following security checks: \n + -- Entire buffer is accessible to the client. \n + -- Entire buffer is mapped with permissions passed in perms field \n + -- Entire buffer is physically contiguous \n + In addition to the security checks, the API also locks the client mapping such that the client + cannot remove the mapping while the physical memory is used by the trusted + driver. \n + + @note1 Drivers are responsible for calling qurt_qdi_buffer_unlock() at appropriate time. Not + pairing qurt_qdi_buffer_unlock() with this API leads to resource leakages and + process exit failures. Drivers can keep track of which buffers are locked for + a particular client. If the client exits abruptly, the buffers can be + unlocked on driver release invocation for the exiting client. + + @note2 This API is supported in limited capacity when called from Island mode. Safe buffer + unmapping or user buffer unlock is not supported in Island mode. + + @param client_handle Obtained from the current invocation function (Section 3.4.1). + @param buf Pointer to the base address of the client buffer address. + @param len Buffer length (in bytes). + @param perms Bitmask value that specifies the read or write access to perform on the + client buffer: \n + -- #QDI_PERM_R -- Read access \n + -- #QDI_PERM_W -- Write access \n + -- #QDI_PERM_RW -- Read/write access \n + @param obuf Optional parameter that returns a pointer to the buffer address that + the driver must use to access the buffer. 
If NULL is passed, the API
+                  only performs security checks and does not create a mapping to access the user buffer in
+                  a safe way.
+
+   @return
+   QURT_EINVALID -- Arguments passed to the API are invalid; the user buffer pointer is NULL or the
+                    length of the buffer is 0. \n
+   QURT_EPRIVILEGE -- One of the security checks on the user buffer failed. \n
+   QURT_EFAILED -- Mapping cannot be created for the trusted driver. \n
+   QURT_EOK -- Lock operation was successful. When this occurs, the QDI driver must use the
+               pointer at *obuf to perform its memory accesses, and not the
+               pointer passed in as buf.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_buffer_lock2(int client_handle, void *buf, unsigned len,
+                                          unsigned perms, void **obuf)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_BUFFER_LOCK2,
+                                 BUFFER_LOCK, buf, len, perms, obuf);
+}
+
+/**@ingroup func_qurt_qdi_buffer_unlock
+   Pairs with qurt_qdi_buffer_lock2(): removes the temporary overlapping mapping
+   created for the driver and unlocks the client mapping for the user buffer.
+
+   @note1 Drivers are responsible for pairing this with qurt_qdi_buffer_lock2(). Not
+          pairing qurt_qdi_buffer_lock2() with this API leads to resource leakages and
+          process exit failures. Drivers can keep track of which buffers are locked for
+          a particular client, and if the client exits abruptly, all the buffers can be
+          unlocked on driver release invocation for the exiting client.
+
+   @note2 This API is supported in limited capacity when called from Island mode. Actual
+          unmapping of driver accessible memory or unlocking of the buffer is not
+          supported in Island mode.
+
+   @param client_handle Obtained from the current invocation function (Section 3.4.1).
+   @param buf           Pointer to the base address of the client buffer.
+   @param len           Buffer length (in bytes).
+   @param obuf          Safe buffer address that was returned in the obuf field after calling
+                        qurt_qdi_buffer_lock2().
+
+   @return
+   QURT_EINVALID -- Arguments passed to the API are invalid; the user buffer pointer is NULL or the
+                    length of the buffer is 0. \n
+   QURT_EOK -- Unlock operation was successful; the safe buffer mapping at *obuf is no
+               longer valid and must not be used by the driver. \n
+   Other values -- Safe buffer unmapping or unlocking of the user buffer failed. \n
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_buffer_unlock(int client_handle, void *buf, unsigned len,
+                                           void *obuf)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_BUFFER_LOCK2,
+                                 BUFFER_UNLOCK, buf, len, obuf);
+}
+
+/**@ingroup func_qurt_qdi_user_malloc
+   Allocates a memory area in the QDI heap that is read/write accessible to both the driver and
+   the client. \n
+   @note1hang The QDI heap has a limited amount of memory available, and only the
+              device driver can free the allocated memory.
+
+   @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+   @param size          Size (in bytes) of the area to allocate.
+
+   @return
+   Non-zero -- Success; this returned value points to the allocated memory area. \n
+   Zero -- Error.
+
+   @dependencies
+   None.
+
+*/
+void *qurt_qdi_user_malloc(int client_handle, unsigned size);
+
+/**@ingroup func_qurt_qdi_user_free
+   Deallocates a memory area in the QDI heap.
+
+   @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+   @param ptr           Pointer to the memory area to deallocate, as returned by qurt_qdi_user_malloc().
+
+   @dependencies
+   None.
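+
+   A minimal pairing sketch (illustrative only; the 256-byte size and the use
+   inside a driver invocation function are assumptions, not requirements):
+   @code
+   char *area = (char *)qurt_qdi_user_malloc(client_handle, 256);
+   if (area != NULL) {
+      // ... share the area with the client while the object lives ...
+      qurt_qdi_user_free(client_handle, area);
+   }
+   @endcode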
+*/ +void qurt_qdi_user_free(int client_handle, void *ptr); + +/**@ingroup funct_qurt_qdi_client_detach + Detaches a client (a process), indicating that the client does not + participate in the qurt_wait() mechanism. This behavior + is opt-in and irrevocable. When a client is detached, it can + not be un-detached. + + @param client_handle Handle of the client to detach. + + @return + Zero -- Success. Detachable clients always return success. + Nonzero value -- client_handle did not refer to a + detachable user client. + + @dependencies + None. +*/ +static __inline int qurt_qdi_client_detach(int client_handle) +{ + return qurt_qdi_handle_invoke(client_handle, QDI_CLIENT_HANDLE_DETACH); +} + +/**@ingroup func_qurt_qdi_signal_group_create + Creates a new signal group for use in a device driver. + A QDI signal group contains up to 32 signals, which can be operated on either + individually (using the qurt_qdi_signal_* functions) or as a group (using the + qurt_qdi_signal_group_* functions). \n + @note1hang Driver implementation is responsible for using the proper signal group + handle in any given situation. \n + For more information on signals, see the Hexagon QuRT RTOS User Guide (80-VB419-78). + + @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param p_signal_group_handle_local Returns a handle intended for use by code that + resides in the same context and process as the created signal group + (for example, the device driver implementation that allocated the + signal group). + @param p_signal_group_handle_remote Returns a handle intended for use by code + that resides in a different context and process than the created signal group + (for example, the user-mode client of an OS driver). + + @return + Zero return value indicates success.\n + Negative return value indicates could not create signal group. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_group_create(int client_handle, + int *p_signal_group_handle_local, + int *p_signal_group_handle_remote) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_SIGNAL_GROUP_CREATE, + p_signal_group_handle_local, + p_signal_group_handle_remote); +} + +/**@ingroup func_qurt_qdi_signal_group_wait + Suspends the current thread until any of the signals are set in the specified signal group. + + If a signal is set in a signal group object, and a thread waits on the signal group object, + the thread is awakened. If the awakened thread has higher priority than the current + thread, a context switch can occur. + + @param signal_group_handle Handle of the signal group. + + @return + If the client is remote: + QURT_EOK -- Wait complete \n + QURT_ECANCEL -- Wait cancelled.\n + If the client is local, returns a 32-bit word with current signals. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_group_wait(int signal_group_handle) +{ + return qurt_qdi_handle_invoke(signal_group_handle, + QDI_SIGNAL_GROUP_WAIT); +} + +/**@ingroup func_qurt_qdi_signal_group_poll + Returns a value that indicates if any of the signals are set in the specified signal group. + + @param signal_group_handle Handle of the signal group. + + @return + 1 -- Indicates whether any of the signals are set in the signal group.\n + 0 -- Indicates that none of the signals are set. + + @dependencies + None. 
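+
+   A short sketch of the group APIs used together (illustrative only; error
+   checking is omitted, and client_handle comes from the invocation function):
+   @code
+   int grp_local, grp_remote;
+   int sig_local, sig_remote;
+
+   qurt_qdi_signal_group_create(client_handle, &grp_local, &grp_remote);
+   qurt_qdi_signal_create(grp_local, &sig_local, &sig_remote);
+
+   qurt_qdi_signal_set(sig_local);              // Driver-side set...
+   if (qurt_qdi_signal_group_poll(grp_local)) { // ...observed through the group.
+      qurt_qdi_signal_clear(sig_local);
+   }
+   @endcode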
+*/ +static __inline int qurt_qdi_signal_group_poll(int signal_group_handle) +{ + return qurt_qdi_handle_invoke(signal_group_handle, + QDI_SIGNAL_GROUP_POLL); +} + + +/**@ingroup func_qurt_qdi_signal_create + Creates a new signal in the specified signal group. + For more information on signals, see the Hexagon QuRT RTOS User Guide (80-VB419-78). + + @note1hang Driver implementation is responsible for using the proper signal handle in + any given situation. + + @param signal_group_handle Handle of an existing signal group. + @param p_signal_handle_local Returns a handle intended for use by code that resides in + the same context and process as the created signal (for example, + the device driver implementation that allocated the signal). + @param p_signal_handle_remote Returns a handle intended for use by code that resides in + a different context and process than the created signal (for + example, the user-mode client of an OS driver). + + @return + Nonzero value -- No more signals can be created in the specified + signal group. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_create(int signal_group_handle, + int *p_signal_handle_local, + int *p_signal_handle_remote) +{ + return qurt_qdi_handle_invoke(signal_group_handle, + QDI_SIGNAL_GROUP_SIGNAL_CREATE, + p_signal_handle_local, + p_signal_handle_remote); +} + +/**@ingroup func_qurt_qdi_signal_set + Sets the signal in the specified signal object. + + @param signal_handle Handle of the signal. + + @return + Always returns 0. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_set(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_SET); +} + +/**@ingroup func_qurt_qdi_signal_clear + Clears the signal in the specified signal object. + + @param signal_handle Handle of the signal. + + @return + Always returns 0. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_clear(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_CLEAR); +} + +/**@ingroup func_qurt_qdi_signal_wait + Suspends the current thread until the specified signal is set. + If a signal is set in a signal object, and a thread waits on the signal object, the + thread is awakened. If the awakened thread has higher priority than the current thread, a + context switch may occur. + + @param signal_handle Handle of the signal. + + @return + If client is remote: + QURT_EOK -- Wait complete. \n + QURT_ECANCEL -- Wait cancelled.\n + If client is local, return a 32-bit word with current signals. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_wait(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_WAIT); +} + +/**@ingroup func_qurt_qdi_signal_poll + Returns a value that indicates if the specified signal is set. + + @param signal_handle Handle of the signal. + + @return + 1 -- Signal is set. \n + 0 -- Signal is not set. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_poll(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_POLL); +} + +/**@ingroup func_qurt_qdi_devname_register + Registers a QDI device with the generic QDI object in the + current QDI context. + + This function registers an exact name or a directory prefix with a QDI opener object. + Future invocations of qurt_qdi_open() in the context of the caller invokes the + opener object if a match is detected. + + Directory prefix names are specified by ending the name with a forward slash character. 
+
+   Example of an exact name:
+   @code qurt_qdi_devname_register("/dev/foobar", foobar_opener);@endcode
+
+   Example of a directory prefix:
+   @code qurt_qdi_devname_register("/pipedev/", pipedev_opener);@endcode
+
+   Given the two registrations shown above, the only qurt_qdi_open() requests
+   directed to the foobar_opener object are those for the exact name
+   "/dev/foobar". Any request beginning with "/pipedev/" is directed to the
+   pipedev_opener object.
+
+   The pipedev invocation function presumably examines the name argument to
+   determine exactly how to handle the request. The name is passed to the invocation
+   function in the a1.ptr argument (Section @xref{sec:invocationFunction}).
+
+   @param name   Device name or device name prefix.
+   @param opener Pointer to the opener object for the device.
+
+   @return
+   0 -- Device was successfully registered. \n
+   Negative error code -- Device was not registered.
+
+   @dependencies
+   None.
+ */
+static __inline int qurt_qdi_devname_register(const char *name,
+                                              qurt_qdi_obj_t *opener)
+{
+   return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,
+                                 QDI_DEVNAME_REGISTER,
+                                 name,
+                                 opener);
+}
+
+// Macros for backward compatibility with deprecated APIs
+// (These will go away soon)
+
+#define qurt_qdi_register_devname(name, opener) \
+   qurt_qdi_devname_register((name), (void *)(opener))
+#define qurt_qdi_new_handle_from_obj_t(handle, obj) \
+   qurt_qdi_handle_create_from_obj_t((handle), (obj))
+#define qurt_qdi_release_handle(client_handle, handle) \
+   qurt_qdi_handle_release((client_handle), (handle))
+#define qurt_qdi_lock_buffer(handle, buf, len, perms, obuf) \
+   qurt_qdi_buffer_lock((handle), (buf), (len), (perms), (obuf))
+#define qurt_qdi_usermalloc(handle, size) \
+   qurt_qdi_user_malloc((handle), (size))
+#define qurt_qdi_userfree(handle, ptr) \
+   qurt_qdi_user_free((handle), (ptr))
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_ext.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_ext.h
new file mode 100755
index 0000000000000..383e1799a15d6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_ext.h
@@ -0,0 +1,58 @@
+#ifndef QURT_QDI_EXT_H
+#define QURT_QDI_EXT_H
+
+/**
+  @file qurt_qdi_ext.h
+  @brief Definitions, macros, and prototypes used when writing a
+  QDI driver
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2018, 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_qdi_driver.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct qurt_qdi_ext_device {
+   qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head;
+   struct qurt_qdi_ext_device * next;
+   char * instance;
+   fdt_node_handle context;
+};
+typedef struct qurt_qdi_ext_device *qurt_qdi_ext_device_ptr;
+
+/**@ingroup func_qurt_qdi_dt_register
+   Registers a QDI device with the generic QDI object in the current QDI context,
+   if and only if a compatible device node is found in the device tree. This
+   function serves as a device-tree-aware wrapper for qurt_qdi_devname_register().
+
+   @param name   Device name or device name prefix.
+   @param opener Pointer to the QDI ext specialized opener object for the driver.
+
+   @return
+   0 -- Device was successfully registered.
\n + Negative error code -- Device was not registered. +*/ +static __inline int qurt_qdi_dt_register(const char *name, qurt_qdi_obj_t *opener) +{ + return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, QDI_DT_REGISTER, name, opener); +} + +static inline void qurt_qdi_ext_deviceobj_set_name (struct qurt_qdi_ext_device * device, char * name) +{ + device->instance = name; +} + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_imacros.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_imacros.h new file mode 100755 index 0000000000000..c0a8448ac87f8 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_imacros.h @@ -0,0 +1,34 @@ +#ifndef QURT_QDI_IMACROS_H +#define QURT_QDI_IMACROS_H + +/** + @file qurt_qdi_imacros.h + @brief Internal macros used for QDI. Mostly consists of tricky (and ugly) + preprocessor hacks that permit us to do varargs function invocations + where we pass optional arguments in registers and where we can do + type casting and checking automatically. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define _QDMPASTE(a,b) _QDMPASTE_(a,b) +#define _QDMPASTE_(a,b) a##b +#define _QDMCNT(...) _QDMCNT_(__VA_ARGS__,12,11,10,9,8,7,6,5,4,3,2,1,0) +#define _QDMCNT_(a,b,c,d,e,f,g,h,i,j,k,l,cnt,...) cnt + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_proxy.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_proxy.h new file mode 100755 index 0000000000000..f1d8992ea8811 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_qdi_proxy.h @@ -0,0 +1,55 @@ +/*============================================================================= + + qurt_qdi_proxy.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. 
+=============================================================================*/
+#ifndef _QURT_QDI_PROXY_H
+#define _QURT_QDI_PROXY_H
+
+#include "qurt_qdi_driver.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* APIs that operate on the proxy object directly */
+int qurt_qdi_proxy_ref_create(void);
+
+/* APIs that operate on a proxy given a known proxy handle
+ * 1) using the QDI handle of the object
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_qdi_proxy_ref_add_by_handle(int proxy_handle, int qdi_handle);
+int qurt_qdi_proxy_ref_sub_by_handle(int proxy_handle, int qdi_handle);
+
+/* 2) using an object reference
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_qdi_proxy_ref_add_by_object(int proxy_handle, qurt_qdi_obj_t *obj_ptr);
+int qurt_qdi_proxy_ref_sub_by_object(int proxy_handle, qurt_qdi_obj_t *obj_ptr);
+
+/* API that associates a proxy object with a particular client given a client handle
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_client_proxy_ref_install (int client_handle, int proxy_handle);
+
+/* APIs that operate on the proxy object from a user client
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_client_proxy_ref_add(int qdi_handle);
+int qurt_client_proxy_ref_remove(int qdi_handle);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_QDI_PROXY_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_rmutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_rmutex.h
new file mode 100755
index 0000000000000..a013a0bbddb1d
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_rmutex.h
@@ -0,0 +1,200 @@
+#ifndef QURT_RMUTEX_H
+#define QURT_RMUTEX_H
+/**
+  @file qurt_rmutex.h
+  Prototypes of rmutex API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2013 - 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+#include
+
+/*=============================================================================
+                                 FUNCTIONS
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_rmutex_init
+   Initializes a recursive mutex object.
+   The recursive mutex is initialized in the unlocked state.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[out] lock Pointer to the recursive mutex object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_rmutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_destroy
+   Destroys the specified recursive mutex. \n
+   @note1hang Recursive mutexes must not be destroyed while they are still in use. If this
+              occurs, the behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock Pointer to the recursive mutex object to destroy.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_rmutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_lock
+   Locks the specified recursive mutex. \n
+
+   If a thread performs a lock operation on a mutex that is not in use, the thread
+   gains access to the shared resource that the mutex protects, and continues executing.
+
+   If a thread performs a lock operation on a mutex that is already in use by another
+   thread, the thread is suspended.
When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked. However, the mutex does not become available to other threads until the + thread performs a balanced number of unlocks on the mutex. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex_lock(qurt_mutex_t *lock); + +/**@ingroup func_qurt_rmutex_lock_timed + Locks the specified recursive mutex. The wait must be terminated when the specified timeout expires.\n + + If a thread performs a lock operation on a mutex that is not in use, the thread + gains access to the shared resource that the mutex is protecting, and continues executing. + + If a thread performs a lock operation on a mutex that is already in use by another + thread, the thread is suspended. When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked by itself. However, the mutex does not become available to other threads until the + thread performs a balanced number of unlocks on the mutex. + If timeout expires, this wait must be terminated and no access to the mutex is granted. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the recursive mutex object to lock. + @param[in] duration Interval (in microseconds) duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION + + @return + #QURT_EOK -- Success \n + #QURT_ETIMEDOUT -- Timeout + + @dependencies + None. + + */ +int qurt_rmutex_lock_timed(qurt_mutex_t *lock, unsigned long long int duration); + +/**@ingroup func_qurt_rmutex_unlock + Unlocks the specified recursive mutex. \n + More than one thread can be suspended on a mutex. When the mutex is + unlocked, the thread waiting on the mutex awakens. If the awakened + thread has higher priority than the current thread, a context switch occurs. + + @note1hang When a thread unlocks a recursive mutex, the mutex is not available until + the balanced number of locks and unlocks has been performed on the mutex. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the recursive mutex object to unlock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex_unlock(qurt_mutex_t *lock); + +/**@ingroup func_qurt_rmutex_try_lock + Attempts to lock the specified recursive mutex.\n + + If a thread performs a try_lock operation on a recursive mutex that is not in use, the + thread gains access to the shared resource that is protected by the mutex, and continues + executing.\n + If a thread performs a try_lock operation on a recursive mutex that another thread has + already locked, qurt_rmutex_try_lock immediately returns with a nonzero result + value. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + */ +int qurt_rmutex_try_lock(qurt_mutex_t *lock); + +/**@ingroup func_qurt_rmutex_try_lock_block_once + Attempts to lock a mutex object recursively. If the mutex is available, + it locks the mutex. If the mutex is held by the current thread, + it increases the internal counter and returns 0. If not, it returns a + nonzero value. 
+ If the mutex is already locked by another thread, the caller thread is + suspended. When the mutex becomes available again (because the other + thread has unlocked it), the caller thread is awakened and tries to lock + the mutex; and if it fails, this function returns failure with a nonzero + value. If it succeeds, this function returns success with zero. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the qurt_mutex_t object. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + @dependencies + None. + */ +int qurt_rmutex_try_lock_block_once(qurt_mutex_t *lock); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_RMUTEX_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_rmutex2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_rmutex2.h new file mode 100755 index 0000000000000..a37e7e4458c4b --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_rmutex2.h @@ -0,0 +1,183 @@ +#ifndef QURT_RMUTEX2_H +#define QURT_RMUTEX2_H +/** + @file qurt_rmutex2.h + @brief Prototypes of rmutex2 API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup mutex_types +@{ */ +/*============================================================================= + TYPEDEFS +=============================================================================*/ + +/** QuRT rmutex2 type. + Mutex type used with rmutex2 APIs. + */ +typedef struct { + /** @cond */ + unsigned int holder __attribute__((aligned(8))); /* UGP value of the mutex holder. */ + unsigned short waiters; /* Number of waiting threads. */ + unsigned short refs; /* Number of references to this mutex. */ + unsigned int queue; /* Kernel-maintained futex queuevalue. */ + unsigned int excess_locks; /* Number of excess times the holder has locked the mutex. */ + /** @endcond */ +} qurt_rmutex2_t; +/** @} */ /* end_addtogroup mutex_types */ +/** @cond internal_only*/ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_rmutex2_init + + @deprecated use #qurt_rmutex_init instead. + + Initializes a recursive mutex object. + + The recursive mutex is initially unlocked. + + Objects of type rmutex2 solve a potential race condition between + unlock() and destroy() operations. + + @datatypes + #qurt_rmutex2_t + + @param[out] lock Pointer to the recursive mutex object. + + @return + None. + + @dependencies + None. + */ +void qurt_rmutex2_init(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_destroy + + @deprecated use #qurt_rmutex_destroy instead. + + Destroys the specified recursive mutex. \n + @note1hang Recursive mutexes must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + @note1cont In general, application code must destroy an rmutex2 object prior to + deallocating it; calling qurt_rmutex2_destroy() before deallocating it ensures + that all qurt_rmutex2_unlock() calls complete. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to destroy. 
+ + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_destroy(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_lock + + @deprecated use #qurt_rmutex_lock instead. + + Locks the specified recursive mutex. \n + + If a thread performs a lock operation on a recursive mutex that is not in use, the + thread gains access to the shared resource that the mutex protects, and continues + to execute. + + If a thread performs a lock operation on a recursive mutex that another thread is using, + the thread is suspended. When the mutex becomes available again + (because the other thread has unlocked it), the thread is awakened and given access to the + shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked, but the mutex does not become available until the thread performs a + balanced number of unlocks on the mutex. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_lock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_unlock + + @deprecated use #qurt_rmutex_unlock instead. + + Unlocks the specified recursive mutex. \n + More than one thread can be suspended on a recursive mutex. When the mutex is + unlocked, only the highest-priority thread waiting on the mutex awakens. If the + awakened thread has higher priority than the current thread, a context switch occurs. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to unlock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_unlock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_try_lock + + @deprecated use #qurt_rmutex_try_lock instead. + + Attempts to lock the specified recursive mutex.\n + + Non-blocking version of qurt_rmutex2_lock(). When a call to qurt_rmutex2_lock() + succeeds immediately, this function behaves similarly, returning 0 for success. + When a call to qurt_rmutex2_lock() does not succeed immediately, this function has + no effect and returns nonzero for failure. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + */ +int qurt_rmutex2_try_lock(qurt_rmutex2_t *lock); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_RMUTEX2_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_sclk.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_sclk.h new file mode 100755 index 0000000000000..a83cf5f1db889 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_sclk.h @@ -0,0 +1,145 @@ +#ifndef QURT_SCLK_H +#define QURT_SCLK_H +/** + @file qurt_sclk.h + @brief Header file describing the APIs supported by QuRT system SCLK + feature. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2018-2021, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ +=============================================================================*/ + + + + +/*============================================================================= + + INCLUDE FILES + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + + +/** + Conversion from microseconds to sleep ticks. + */ +#define QURT_SYSCLOCK_TIMETICK_FROM_US(us) ((us) * 192ULL / 10UL) +#define qurt_sysclock_timetick_from_us(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us) + + +/** + Conversion from timer ticks to microseconds at the nominal frequency. +*/ +#define QURT_SYSCLOCK_TIMETICK_TO_US(ticks) qurt_timer_timetick_to_us(ticks) + +/** + Maximum microseconds value for Qtimer is 1,042,499 hours. +*/ +#define QURT_SYSCLOCK_MAX_DURATION (1042499uLL * 3600uLL * 1000uLL * 1000uLL) +#define qurt_sysclock_max_duration() QURT_SYSCLOCK_MAX_DURATION +/** + Timer clock for Qtimer is 19.2 MHz. +*/ +#define QURT_SYSCLOCK_MAX_DURATION_TICKS (1042499uLL * 3600uLL * 19200000uLL) +#define qurt_sysclock_max_duration_ticks() QURT_SYSCLOCK_MAX_DURATION_TICKS +/** + Sleep timer error margin for Qtimer is 192 ticks ~10 us. +*/ +#define QURT_SYSCLOCK_ERROR_MARGIN 192U //QURT_TIMER_MIN_DURATION*timer_freq; +#define qurt_sysclock_error_margin() QURT_SYSCLOCK_ERROR_MARGIN + +/*============================================================================= + + DATA DECLARATIONS + +=============================================================================*/ + +/**@ingroup func_qurt_sysclock_get_hw_ticks + @xreflabel{sec:qurt_sysclock_get_hw_ticks} + Gets the hardware tick count.\n + Returns the current value of a 64-bit hardware counter. The value wraps around to zero + when it exceeds the maximum value. + + @note1hang This operation must be used with care because of the wrap-around behavior. + + @return + Integer -- Current value of 64-bit hardware counter. + + @dependencies + None. + */ +unsigned long long qurt_sysclock_get_hw_ticks (void); + + +/**@ingroup func_qurt_sysclock_get_hw_ticks_32 + @xreflabel{sec:qurt_sysclock_get_hw_ticks_32} + Gets the hardware tick count in 32 bits.\n + Returns the current value of a 32-bit hardware counter. The value wraps around to zero + when it exceeds the maximum value. + + @note1hang This operation is implemented as an inline C function, and should be called from a C/C++ program. + The returned 32 bits are the lower 32 bits of the Qtimer counter. + + @return + Integer -- Current value of the 32-bit timer counter. + + @dependencies + None. + */ +static inline unsigned long qurt_sysclock_get_hw_ticks_32 (void) +{ + //Beginning with v61 there is a HW register that can be read directly. + unsigned long count; + __asm__ __volatile__ (" %0 = c30 " : "=r"(count)); + return count; +} + + +/**@ingroup func_qurt_sysclock_get_hw_ticks_16 + @xreflabel{sec:qurt_sysclock_get_hw_ticks_16} + Gets the hardware tick count in 16 bits.\n + Returns the current value of a 16-bit timer counter. The value wraps around to zero + when it exceeds the maximum value. + + @note1hang This operation is implemented as an inline C function, and should be called from a C/C++ program. + The returned 16 bits are based on the value of the lower 32 bits in Qtimer + counter, right shifted by 16 bits. + + @return + Integer -- Current value of the 16-bit timer counter, calculated from the lower 32 bits in the + Qtimer counter, right shifted by 16 bits. + + @dependencies + None. 
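+
+   For example, an elapsed interval can be measured with the 64-bit counter and
+   converted to microseconds (illustrative sketch; do_work() is a placeholder):
+   @code
+   unsigned long long t0 = qurt_sysclock_get_hw_ticks();
+   do_work();
+   unsigned long long ticks = qurt_sysclock_get_hw_ticks() - t0;
+   unsigned long long us = QURT_SYSCLOCK_TIMETICK_TO_US(ticks);
+   @endcode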
+ */ + + +static inline unsigned short qurt_sysclock_get_hw_ticks_16 (void) +{ + unsigned long ticks; + + //Beginning with v61 there is a HW register that can be read directly. + __asm__ __volatile__ (" %0 = c30 " : "=r"(ticks)); + __asm__ __volatile__ ( "%0 = lsr(%0, #16) \n" :"+r"(ticks)); + + return (unsigned short)ticks; +} +unsigned long long qurt_timer_timetick_to_us(unsigned long long ticks); +#define qurt_sysclock_timetick_to_us(ticks) qurt_timer_timetick_to_us(ticks) + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif /* __cplusplus */ + +#endif /* QURT_SCLK_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_secure_proc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_secure_proc.h new file mode 100755 index 0000000000000..f40c7deb9bca1 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_secure_proc.h @@ -0,0 +1,53 @@ +#ifndef QURT_SECURE_PROC_H +#define QURT_SECURE_PROC_H + +/** + @file qurt_secure_proc.h + @brief Definitions, macros, and prototypes used for handling secure process + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2015, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup qurt_process_migrate_secure_process + Migrate the user process to Qurt secure process + + @param secure_phy_address Physical starting address of secure memory + @param secure_memory_size Size of secure memory + @param entry Entry function to secure process + + @return + EOK + Negative return value -- Error. + + @dependencies + None. +*/ +int qurt_process_migrate_secure_process(unsigned long long secure_phy_address, unsigned int secure_memory_size, void entry(unsigned)); + +/**@ingroup qurt_process_get_migration_mem_size + get the size of all writable memory regions in a user PD. This is for preparation on secure process migration. + + @return + size of all writable memory regions in a user PD. + + @dependencies + None. +*/ +int qurt_process_get_migration_mem_size(void); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_sem.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_sem.h new file mode 100755 index 0000000000000..ee5ce4b2d94ab --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_sem.h @@ -0,0 +1,252 @@ +#ifndef QURT_SEM_H +#define QURT_SEM_H +/** + @file qurt_sem.h + Prototypes of semaphore API. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup semaphore_types +@{ */ + +/** QuRT semaphore type. 
*/ +typedef union { + /** @cond */ + unsigned int raw[2] __attribute__((aligned(8))); + struct { + unsigned short val; /**< */ + unsigned short n_waiting; /**< */ + unsigned int reserved1; /**< */ + unsigned int queue; /**< */ + unsigned int reserved2; /**< */ + }X; /** @endcond */ +} qurt_sem_t; +/** @} */ /* end_addtogroup semaphore_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_sem_add + Releases access to a shared resource (the specified amount increments the semaphore count value).\n + When a thread performs an add operation on a semaphore, the specified value increments the semaphore count. + The result depends on the number of threads waiting + on the semaphore: \n + - When no threads are waiting, the current thread releases access to the shared resource + and continues executing. \n + - When one or more threads are waiting and the semaphore count value is nonzero, + the kernel repeatedly awakens the highest-priority waiting thread and decrements + the semaphore count value until either no waiting threads remain or the + semaphore count value is zero. If any of the awakened threads has higher priority + than the current thread, a context switch can occur. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + @param[in] amt Amount to increment the semaphore count value. + + @return + Unused integer value. + + @dependencies + None. + + */ +int qurt_sem_add(qurt_sem_t *sem, unsigned int amt); + +/**@ingroup func_qurt_sem_up + Releases access to a shared resource. When a thread performs an up operation on a semaphore, + the semaphore count value increments. The result depends on the number of threads waiting + on the semaphore: \n + - When no threads are waiting, the current thread releases access to the shared resource + and continues executing.\n + - When one or more threads are waiting and the semaphore count value is nonzero, + the kernel awakens the highest-priority waiting thread and decrements the + semaphore count value. If the awakened thread has higher priority than the current + thread, a context switch can occur. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + Unused integer value. + + @dependencies + None. + */ +static inline int qurt_sem_up(qurt_sem_t *sem) { return qurt_sem_add(sem,1); } + +/**@ingroup func_qurt_sem_down + Requests access to a shared resource. When a thread performs a down operation on a + semaphore, the result depends on the semaphore count value: \n + - When the count value is nonzero, it is decremented, and the thread gains access to the + shared resource and continues executing.\n + - When the count value is zero, it is not decremented, and the thread is suspended on the + semaphore. When the count value becomes nonzero (because another thread + released the semaphore) it is decremented, and the suspended thread is awakened + and gains access to the shared resource. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + Unused integer value. + + @dependencies + None. 
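+
+   A typical down/up pairing (illustrative sketch; the semaphore guards one
+   unit of a hypothetical shared resource):
+   @code
+   qurt_sem_t sem;
+   qurt_sem_init_val(&sem, 1);  // One unit available.
+
+   qurt_sem_down(&sem);         // Acquire: count 1 -> 0.
+   // ... use the shared resource ...
+   qurt_sem_up(&sem);           // Release: count 0 -> 1.
+   @endcode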
+ */ +int qurt_sem_down(qurt_sem_t *sem); + +/**@ingroup func_qurt_sem_down_timed + When a thread performs a down operation on a semaphore, the result depends on the + semaphore count value: \n + - When the count value is nonzero, it is decremented, and the thread gains access to the + shared resource and continues executing.\n + - When the count value is zero, it is not decremented, and the thread is suspended on the + semaphore. When the count value becomes nonzero (because another thread + released the semaphore) it is decremented, and the suspended thread is awakened + and gains access to the shared resource. Terminate the wait when the specified timeout expires. + If timeout expires, terminate this wait and grant no access to the shared resource. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + @param[in] duration Interval (in microseconds) duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION + + @return + #QURT_EOK -- Success \n + #QURT_ETIMEDOUT -- Timeout + + @dependencies + None. + */ +int qurt_sem_down_timed(qurt_sem_t *sem, unsigned long long int duration); + +/**@ingroup func_qurt_sem_try_down + @xreflabel{hdr:qurt_sem_try_down} + Requests access to a shared resource (without suspend). When a thread performs a try down + operation on a semaphore, the result depends on the semaphore count value: \n + - The count value is decremented when it is nonzero. The down operation returns 0 as + the function result, and the thread gains access to the shared resource and is free to + continue executing.\n + - The count value is not decremented when it is zero. The down operation returns -1 + as the function result, and the thread does not gain access to the shared resource + and should not continue executing. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + 0 -- Success. \n + -1 -- Failure. + + @dependencies + None. + + */ +int qurt_sem_try_down(qurt_sem_t *sem); + +/**@ingroup func_qurt_sem_init + Initializes a semaphore object. + The default initial value of the semaphore count value is 1. + + @param[out] sem Pointer to the initialized semaphore object. + + @return + None. + + @dependencies + None. + + */ +void qurt_sem_init(qurt_sem_t *sem); + +/**@ingroup func_qurt_sem_destroy + Destroys the specified semaphore.\n + @note1hang Semaphores must be destroyed when they are no longer in use. Failure to do + this causes resource leaks in the QuRT kernel.\n + @note1cont Semaphores must not be destroyed while they are still in use. If this occur, + the behavior of QuRT is undefined. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to destroy. + + @return + None. + + @dependencies + None. + */ +void qurt_sem_destroy(qurt_sem_t *sem); + +/**@ingroup func_qurt_sem_init_val + Initializes a semaphore object with the specified value. + + @datatypes + #qurt_sem_t + + @param[out] sem Pointer to the initialized semaphore object. + @param[in] val Initial value of the semaphore count value. + + @return + None. + + @dependencies + None. + + */ +void qurt_sem_init_val(qurt_sem_t *sem, unsigned short val); + +/**@ingroup func_qurt_sem_get_val + Gets the semaphore count value.\n + Returns the current count value of the specified semaphore. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + Integer semaphore count value + + @dependencies + None. 
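+
+   For example (illustrative sketch, assuming an initialized qurt_sem_t sem):
+   @code
+   if (qurt_sem_try_down(&sem) == 0) {
+      // Acquired without blocking.
+      qurt_sem_up(&sem);
+   } else {
+      // No unit was available; qurt_sem_get_val(&sem) was 0 at the attempt.
+   }
+   @endcode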
+ */ +static inline unsigned short qurt_sem_get_val(qurt_sem_t *sem){ return sem->X.val; } +int qurt_sem_down_cancellable(qurt_sem_t *sem); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SEM_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_shmem.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_shmem.h new file mode 100755 index 0000000000000..980557323708a --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_shmem.h @@ -0,0 +1,89 @@ +#ifndef QURT_SHMEM_H +#define QURT_SHMEM_H + +/** + @file qurt_shmem.h + + @brief + Prototypes of QuRT inter-process shared memory APIs + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef MODE_T +#define MODE_T +typedef unsigned int mode_t; +#endif //MODE_T + +/** + * The shm_open() function establishes a connection between a shared memory object and a file descriptor. + * The file descriptor is used by other functions such as mmap() to refer to that shared memory object. + * + * + * @param name Pointer to a string naming a shared memory object. The name must start with "/shm/". + * @param oflag File status flags and file access modes of the open file description. The following + * flags are supported: + * O_RDONLY: Open for read access only + * O_RDWR: Open for read or write access + * O_CREAT: If the shared memory object does not exist, create one. + * @param mode Permission flags (currently ignored) + * + * @return File descriptor (positive number) if the operation is successful. + * Negative error code if failed + * +*/ + +int shm_open(const char * name, int oflag, mode_t mode); + +/** + * The shm_mmap() function creates a shared memory mapping in the virtual address space of + * the calling process. + * + * @param addr The starting address for the new mapping is specified in addr. + * @param len Specifies the length of the shared memory region. + * @param prot Describes the desired memory protection of the mapping. Same as the one in mmap of POSIX. + * @param flags Determines whether updates to the mapping are visible to other processes. Same as + * the one in mmap of POSIX. + * @param fd File descriptor of the shared memory object, as returned by shm_open(). + * @param offset Unused. + * + * @return The starting address for the new mapping is returned. + * Negative error code if failed + * +*/ + +void *shm_mmap(void *addr, unsigned int len, int prot, int flags, int fd, unsigned int offset); + +/** + * The shm_close() function removes a connection between a shared memory object and a file descriptor. + * When no file descriptor remains connected to the shared memory object, the shared memory object is + * deleted automatically. A shared memory object has the same virtual address in every process; this is + * a restriction of the single virtual address space. + * + * + * @param fd File descriptor of the shared memory object + * + * @return 0 if operation successful.
+ * negative error code if failed + * +*/ + + +int shm_close(int fd); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_signal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_signal.h new file mode 100755 index 0000000000000..3a89c53394ad5 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_signal.h @@ -0,0 +1,518 @@ +#ifndef QURT_SIGNAL_H +#define QURT_SIGNAL_H + +/** + @file qurt_signal.h + @brief Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup signals_types +@{ */ +#define QURT_SIGNAL_ATTR_WAIT_ANY 0x00000000 /**< Wait any. */ +#define QURT_SIGNAL_ATTR_WAIT_ALL 0x00000001 /**< Wait all. */ + +/*===================================================================== + Typedefs + ======================================================================*/ + + +/** QuRT signal type. + */ +typedef union { + /** @cond */ + unsigned long long int raw; + struct { + unsigned int signals; + unsigned int waiting; + unsigned int queue; + unsigned int attribute; + }X; + /** @endcond */ +} qurt_signal_t; + + +/** QuRT 64-bit signal type. + */ +typedef struct { + /** @cond */ + qurt_signal_t signal_sum; + unsigned long long signals; + unsigned long long waiting; + /** @endcond */ +} qurt_signal_64_t; +/** @} */ /* end_addtogroup signals_types */ +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_signal_init + Initializes a signal object. + Signal returns the initialized object. + The signal object is initially cleared. + + @note1hang Each signal-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_signal_destroy() + when this object is not used anymore + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the initialized object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_init(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_destroy + Destroys the specified signal object. + + @note1hang Signal objects must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_destroy(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_wait + @xreflabel{hdr:qurt_signal_wait} + Suspends the current thread until the specified signals are set. 
+ + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + waiting on a signal, and 0 indicates not waiting on the signal. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, + and one or more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, + and all of those signals are set in the signal object, the thread is awakened. + + The specified set of signals can be cleared when the signal is set. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread waits for any of the signals to be set, or for all of + them to be set. \n + @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + + @return + A 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal_wait(qurt_signal_t *signal, unsigned int mask, + unsigned int attribute); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_wait_timed + @xreflabel{hdr:qurt_signal_wait_timed} + Suspends the current thread until the specified signals are set or until the timeout expires. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + waiting on a signal, and 0 indicates not waiting. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, + and one or more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, + and all of those signals are set in the signal object, the thread is awakened. + + The specified set of signals can be cleared after the signal is set. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value that identifies the individual signals in the signal object to wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. \n + @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @param[out] signals Bitmask of the signals that are set. + @param[in] duration Duration (microseconds) to wait. Must be in the range + [#QURT_TIMER_MIN_DURATION ... #QURT_TIMER_MAX_DURATION]. + + @return + #QURT_EOK -- Success; one or more signals were set \n + #QURT_ETIMEDOUT -- Timed out \n + #QURT_EINVALID -- Duration out of range + + @dependencies + Timed-waiting support in the kernel.
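A sketch of the timed wait: block up to 10 ms for either of two event bits, then distinguish success from timeout (the bit assignments and the microsecond budget are illustrative):

#include "qurt_signal.h"

#define EVT_RX   (1u << 0)
#define EVT_STOP (1u << 1)

int wait_for_rx_or_stop(qurt_signal_t *sig) {
    unsigned int got = 0;
    int rc = qurt_signal_wait_timed(sig, EVT_RX | EVT_STOP,
                                    QURT_SIGNAL_ATTR_WAIT_ANY, &got, 10000ull);
    if (rc == QURT_EOK) {
        qurt_signal_clear(sig, got);   /* the wait does not auto-clear the bits */
        return (got & EVT_STOP) ? 1 : 0;
    }
    return -1;                         /* QURT_ETIMEDOUT (or QURT_EINVALID for a bad duration) */
}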
+*/ +/* ======================================================================*/ +int qurt_signal_wait_timed(qurt_signal_t *signal, unsigned int mask, + unsigned int attribute, unsigned int *signals, unsigned long long int duration); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_wait_any + Suspends the current thread until any of the specified signals are set. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + to wait on a signal, and 0 indicates not to wait on it. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, + and one or more of those signals is set in the signal object, the thread is awakened. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal_wait_any(qurt_signal_t *signal, unsigned int mask) +{ + return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal_wait_all + Suspends the current thread until all of the specified signals are set. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + to wait on a signal, and 0 indicates not to wait on it. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, + and all of those signals are set in the signal object, the thread is awakened. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + + @return + A 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal_wait_all(qurt_signal_t *signal, unsigned int mask) +{ + return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ALL); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal_set + Sets signals in the specified signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + to set the signal, and 0 indicates not to set it. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to set in the signal + object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_signal_set(qurt_signal_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_get + Gets a signal from a signal object. + + Returns the current signal values of the specified signal object. + + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the signal object to access.
+ + @return + A 32-bit word with current signals + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal_get(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_clear + Clear signals in the specified signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear it. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_clear(qurt_signal_t *signal, unsigned int mask); + +/**@ingroup func_qurt_signal_wait_cancellable + @xreflabel{hdr:qurt_signal_wait_cancellable} + Suspends the current thread until either the specified signals are set or the wait operation is cancelled. + The operation is cancelled if the user process of the calling thread is killed, or if the calling thread + must finish its current QDI invocation and return to user space. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be waited on, and 0 indicates not to wait on it. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, and one or + more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, and all of + those signals are set in the signal object, the thread is awakened. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @note1cont When the operation is cancelled, the caller must assume that the signal is never set. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @param[out] return_mask Pointer to the 32-bit mask value that was originally passed to the function. + + + @return + #QURT_EOK -- Wait completed. \n + #QURT_ECANCEL -- Wait cancelled. + + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_signal_wait_cancellable(qurt_signal_t *signal, unsigned int mask, + unsigned int attribute, + unsigned int *return_mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_init + Initializes a 64-bit signal object.\n + The signal argument returns the initialized object. + The signal object is initially cleared. + + @note1hang Each signal-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_signal_destroy() + when this object is not used anymore. + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the initialized object. 
+ + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_64_init(qurt_signal_64_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_destroy + Destroys the specified signal object. + + @note1hang 64-bit signal objects must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_64_destroy(qurt_signal_64_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_wait + Suspends the current thread until the specified signals are set. + + Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 indicates + that a signal must be waited on, and 0 indicates not to wait on it. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, + and all of those signals are set in the signal object, the thread is awakened. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value, which identifies the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. \n + @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + + @return + A 64-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned long long qurt_signal_64_wait(qurt_signal_64_t *signal, unsigned long long mask, + unsigned int attribute); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_set + Sets signals in the specified signal object. + + Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 indicates + that a signal must be set, and 0 indicates not to set it. + + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to set in the signal + object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_signal_64_set(qurt_signal_64_t *signal, unsigned long long mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_get + Gets a signal from a signal object. + + Returns the current signal values of the specified signal object. + + @datatypes + #qurt_signal_64_t + + @param[in] *signal Pointer to the signal object to access. + + @return + A 64-bit double word with current signals. + + @dependencies + None.
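The 64-bit variant follows the same set/wait/clear pattern with wider masks; a sketch (the bit choice is illustrative, and in practice the set would come from another thread):

#include "qurt_signal.h"

#define EVT_CH42 (1ull << 42)   /* bits 32..63 are only reachable with the 64-bit API */

void signal64_example(void) {
    qurt_signal_64_t sig;
    qurt_signal_64_init(&sig);

    qurt_signal_64_set(&sig, EVT_CH42);   /* typically done by a peer thread */
    unsigned long long got =
        qurt_signal_64_wait(&sig, EVT_CH42, QURT_SIGNAL_ATTR_WAIT_ANY);
    qurt_signal_64_clear(&sig, got);      /* the wait does not auto-clear the bits */

    qurt_signal_64_destroy(&sig);
}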
+*/ +/* ======================================================================*/ +unsigned long long qurt_signal_64_get(qurt_signal_64_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_clear + Clears signals in the specified signal object. + + Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear it. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_64_clear(qurt_signal_64_t *signal, unsigned long long mask); + +#ifdef __cplusplus +} +#endif + +#endif /* QURT_SIGNAL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_signal2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_signal2.h new file mode 100755 index 0000000000000..43975100cbf75 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_signal2.h @@ -0,0 +1,340 @@ +#ifndef QURT_SIGNAL2_H +#define QURT_SIGNAL2_H + +/** + @file qurt_signal2.h + @brief Prototypes of kernel signal2 API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define QURT_SIGNAL_ATTR_WAIT_ANY 0x00000000 +#define QURT_SIGNAL_ATTR_WAIT_ALL 0x00000001 + +/*===================================================================== + Typedefs + ======================================================================*/ + +/** @addtogroup signals2_types +@{ */ +/** qurt_signal2 type. + */ +typedef union { + /** @cond */ + struct{ + unsigned int cur_mask; /* Current set of signal bits that are set. */ + unsigned int sig_state; /* Current state. */ + /* Bit 0 -- in anysignal wait. */ + /* Bit 1 -- in allsignal wait. */ + /* Bit 2 -- in interrupt wait. */ + /* Bits 31-3 -- reference count field. */ + unsigned int queue; /* Kernel-maintained futex queue value. */ + unsigned int wait_mask; /* When sig_state indicates a waiter is present, this is the wait mask. */ + }; + unsigned long long int raw; + /** @endcond */ +} qurt_signal2_t; +/* @} */ /* end_addtogroup signals2_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_init + + @deprecated use #qurt_signal_init instead. + + Initializes a signal2 object. + Signal returns the initialized object. + The signal object is initially cleared. + + Objects of type signal2 solve a potential race condition between + set() and destroy() operations. + + @datatypes + #qurt_signal2_t + + @param[in] *signal Pointer to the initialized object. + + @return + None. 
+ + @dependencies + Each signal-based object has one or more associated + kernel resources; therefore, users must call qurt_signal2_destroy() + when this object is no longer in use. + */ +/* ======================================================================*/ +void qurt_signal2_init(qurt_signal2_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_destroy + + @deprecated use #qurt_signal_destroy instead. + + Destroys the specified signal object. + + @note1cont Signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + @note1cont Application code should destroy a signal2 object prior to deallocating it. + Calling qurt_signal2_destroy() before deallocating a + signal2 object ensures completion of all qurt_signal2_set() calls. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal2_destroy(qurt_signal2_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_wait + + @deprecated use #qurt_signal_wait instead. + + Suspends the current thread until the specified signals are set. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + a signal to wait on. + + If a thread calls this API with QURT_SIGNAL_ATTR_WAIT_ANY, the thread is awakened when + any of the signals specified in the mask are set. + + If a thread calls this API with QURT_SIGNAL_ATTR_WAIT_ALL, the thread is awakened only + when all the signals specified in the mask are set. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to wait on. + @param[in] attribute Specifies whether the thread waits for any of the signals to be set, or for all of + them to be set. Values:\n + - QURT_SIGNAL_ATTR_WAIT_ANY \n + - QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @return + A 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal2_wait(qurt_signal2_t *signal, unsigned int mask, + unsigned int attribute); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_wait_any + + @deprecated use #qurt_signal_wait_any instead. + + Suspends the current thread until any of the specified signals are set. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + a signal to wait on. + + The thread is awakened when any of the signals specified in the mask are set. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + + @return + 32-bit word with current signals. + + @dependencies + None.
+*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal2_wait_any(qurt_signal2_t *signal, unsigned int mask) +{ + return qurt_signal2_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_wait_all + + @deprecated use #qurt_signal_wait_all instead. + + Suspends the current thread until all of the specified signals are set. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + a signal to wait on. + + The thread will be awakened only when all the signals specified in the mask are set. + + @note1hang At most one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal2_wait_all(qurt_signal2_t *signal, unsigned int mask) +{ + return qurt_signal2_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ALL); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_set + + @deprecated use #qurt_signal_set instead. + + Sets signals in the specified signal object. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be set, and 0 indicates not to set the signal. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to set in the signal + object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_signal2_set(qurt_signal2_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_get + + @deprecated use #qurt_signal_get instead. + + Gets a signal from a signal object. + + Returns the current signal values of the specified signal object. + + @datatypes + #qurt_signal2_t + + @param[in] *signal Pointer to the signal object to access. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal2_get(qurt_signal2_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_clear + + @deprecated use #qurt_signal_clear instead. + + Clear signals in the specified signal object. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear the signal. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. 
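Every qurt_signal2_* entry point in this header is tagged deprecated in favor of a qurt_signal_* counterpart, so migration is essentially a type-and-prefix rename; a sketch (both headers come from this same diff):

#include "qurt_signal.h"
#include "qurt_signal2.h"

void old_style(void) {                      /* deprecated path */
    qurt_signal2_t s2;
    qurt_signal2_init(&s2);
    qurt_signal2_set(&s2, 0x1u);
    (void)qurt_signal2_wait_any(&s2, 0x1u);
    qurt_signal2_destroy(&s2);
}

void new_style(void) {                      /* same sequence on qurt_signal_t */
    qurt_signal_t s;
    qurt_signal_init(&s);
    qurt_signal_set(&s, 0x1u);
    (void)qurt_signal_wait_any(&s, 0x1u);
    qurt_signal_destroy(&s);
}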
+ */ +/* ======================================================================*/ +void qurt_signal2_clear(qurt_signal2_t *signal, unsigned int mask); + +/**@ingroup func_qurt_signal2_wait_cancellable + + @deprecated use #qurt_signal_wait_cancellable instead. + + Suspends the current thread until either the specified signals are set or the wait operation is cancelled. + The operation is cancelled if the user process of the calling thread is killed, or if the calling thread + must finish its current QDI invocation and return to user space. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be waited on, and 0 indicates not to wait on it. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, and one or + more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, and all of + those signals are set in the signal object, the thread is awakened. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @note1cont When the operation is cancelled, the caller must assume that the signal is never set. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @param[out] p_returnmask Pointer to the 32-bit mask value that was originally passed to the function. + + + @return + #QURT_EOK -- Wait completed. \n + #QURT_ECANCEL -- Wait cancelled. + + + @dependencies + None. +*/ +int qurt_signal2_wait_cancellable(qurt_signal2_t *signal, + unsigned int mask, + unsigned int attribute, + unsigned int *p_returnmask); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SIGNAL2_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_space.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_space.h new file mode 100755 index 0000000000000..2c3f9e4496697 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_space.h @@ -0,0 +1,230 @@ +#ifndef QURT_SPACE_H +#define QURT_SPACE_H +/** + @file qurt_space.h + @brief Prototypes of QuRT process control APIs + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** This flag is a request to the OS to suspend the process just before calling main(). +It is deprecated and replaced by QURT_PROCESS_SUSPEND_ON_STARTUP */ +#define SPAWNN_FLAG_SUSPEND_ON_STARTUP QURT_PROCESS_SUSPEND_ON_STARTUP + +/** + * Creates and starts a process from an ELF of the specified name. The slash symbols + * "/" or "\" are ignored. Do not include the directory name in the input. This function + * accepts the SPAWN flags. Multiple SPAWN flags can be specified by OR'ing the flags. + * + * @param name ELF name of the executable.
The name must not contain directories; + * use "dsp2.elf" instead of "/prj/qct/.../dsp2.elf". + * + * @return + Process ID -- Success \n + Negative error code -- Failure\n + #QURT_EPRIVILEGE -- Caller does not have enough privilege for this operation\n + #QURT_EMEM -- Not enough memory to perform the operation \n + #QURT_EFAILED -- Operation failed \n + #QURT_ENOTALLOWED -- Operation not allowed \n + #QURT_ENOREGISTERED -- Not registered \n + #QURT_ENORESOURCE -- Resource exhaustion \n + #QURT_EINVALID -- Invalid argument value +*/ + +int qurt_spawn_flags(const char * name, int flags); + +/** + Creates and starts a process from an ELF of the specified name. The slash symbols + "/" or "\" are ignored. Do not include the directory name in the input. + + @param name ELF name of the executable. The name must not contain directories; + use "dsp2.elf" instead of "/prj/qct/.../dsp2.elf". + + @return + Process ID -- Success. \n + Negative error code -- Failure. + +*/ +static inline int qurt_spawn(const char *name) +{ + return qurt_spawn_flags(name,0); +} + +/** + * Returns the process ID of the current process. + * + * @return + * Process ID + * +*/ +#define qurt_getpid qurt_process_get_id + +/** + * The qurt_wait() function waits for a status change in a child process. A parent process can + * use it to block until any child process terminates. + * + * This API returns an error if there are no user processes, or if all user processes are detached. + * + * @param status Pointer to the status variable. The variable receives the status value of the child + * process; the value comes from the exit() system call made by the child process. + * + * @return + Process ID of the child process that changed status -- Success \n + * Negative error code -- Failure + * +*/ + +int qurt_wait(int *status); + + +/** @cond */ +/* APIs that allow registering callbacks on spawn of user pd */ +typedef void (*QURT_SPAWN_PFN)(int client_handle, void *data_ptr); //no return, since we won't be error checking it in spawn +typedef int (*QURT_CB_PFN)(int client_handle, void *user_data, void *info); +typedef union { + QURT_SPAWN_PFN spawn_pfn; + QURT_CB_PFN cb_pfn; +} qurt_process_callback_pfn_t; +/** @endcond */ + +/** @cond internal_only */ + +/**@ingroup func_qurt_event_register +Sets the specified bits by mask in the signal passed by the caller. The signal is set +when the client handle indicated by value goes away (at process exit). Multiple clients can register for the signal +to be set. + +@datatypes + +@param[in] type QURT_PROCESS_EXIT is the only event that can be registered for. +@param[in] value Indicates the client handle of the process for which the event is registered. +@param[in] psig Pointer to the signal object to set when the event occurs. +@param[in] mask Mask bits to set in the signal. +@param[out] data Pointer to the variable that receives the exit code of the exiting process. +@param[in] data_size Size of the data variable. + +@return +#QURT_EOK -- Success \n +#QURT_EMEM -- Not enough memory to allocate resources \n +#QURT_EVAL -- Invalid values passed to the API + +@dependencies +None. +*/ +int qurt_event_register(int type, int value, qurt_signal_t *psig, unsigned int mask, void *data, unsigned int data_size); + +/**@ingroup func_qurt_callback_register_onspawn +Allows registering for a callback on spawn of any user process. + +@datatypes +#QURT_SPAWN_PFN + +@param[in] pFn Callback function to call when any user process is spawned. +@param[in] user_data Pointer to the argument that the callback must be called with.
+ + +@return A positive value is a handle to use when deregistering the callback. + Multiple clients can register for a callback on spawn, and some clients might choose to deregister. + + On failure, QURT_EFATAL is returned. + +@dependencies +None. +*/ +int qurt_callback_register_onspawn(QURT_SPAWN_PFN pFn, void *user_data); + +/**@ingroup func_qurt_callback_deregister_onspawn +Allows deregistering a callback on spawn. + +@param[in] callback_handle Handle returned by qurt_callback_register_onspawn. + +@return +#QURT_EOK -- Deregistering was successful + +@dependencies +None. +*/ +int qurt_callback_deregister_onspawn(int callback_handle); + +/**@ingroup func_qurt_process_callback_register +Allows registering for a callback during or after image loading. +Generic callback types: + Functions similarly to qurt_callback_register_onspawn(). The callback is called after the process is + loaded, before the process thread starts. The callback has no return value and no info provided + from the OS. + pFn - QURT_SPAWN_PFN + type - QURT_PROCESS_CB_GENERIC + arg1 - not used + arg2 - not used + arg3 - not used +Note callback types: + The callback is called during process loading: before segment loading (QURT_PROCESS_NOTE_CB_PRE_MAP), + or after segment loading (QURT_PROCESS_NOTE_CB_POST_MAP). The OS provides info to the callback: the info + argument is populated with a pointer to the mapped note corresponding to the callback. + The callback has a return value; the loader fails if the callback returns a value other than QURT_EOK. + pFn - QURT_CB_PFN + type - QURT_PROCESS_NOTE_CB_PRE_MAP or QURT_PROCESS_NOTE_CB_POST_MAP + arg1 - note type (ex: NOTE_TYPE_POOL_INFO, NOTE_TYPE_SEGMENT_INFO, NOTE_TYPE_ARB_INFO) + arg2 - note name + arg3 - not used + +@datatypes + +@param[in] pFn Callback function to call. +@param[in] type Callback type. +@param[in] user_data Pointer to the argument that the callback must be called with. +@param[in] arg1 Argument interpreted by the OS based on the callback type. +@param[in] arg2 Argument interpreted by the OS based on the callback type. +@param[in] arg3 Argument interpreted by the OS based on the callback type (currently not used). + + +@return A positive value is a handle to use when deregistering the callback. + Multiple clients can register for a callback on spawn, and some clients might choose to deregister. + + On failure, QURT_EFATAL is returned. + +@dependencies +None. +*/ +int qurt_process_callback_register(qurt_process_callback_pfn_t pFn, + qurt_process_cb_type_t type, + void *user_data, + qurt_process_callback_arg_t arg1, + qurt_process_callback_arg_t arg2, + qurt_process_callback_arg_t arg3); + + + +/**@ingroup func_qurt_process_callback_deregister +Allows deregistering a callback for image loading. +@param[in] callback_handle Handle returned by qurt_process_callback_register. + +@return +#QURT_EOK -- Deregistering was successful + +@dependencies +None.
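A sketch of the basic process-control flow above: spawn a child ELF suspended, then reap any child with qurt_wait() (the ELF name is illustrative, and QURT_PROCESS_SUSPEND_ON_STARTUP is assumed to come from the process headers included at the top of this file, per the SPAWNN_FLAG alias):

#include "qurt_space.h"

void spawn_and_reap(void) {
    int pid = qurt_spawn_flags("dsp2.elf", QURT_PROCESS_SUSPEND_ON_STARTUP);
    if (pid < 0) {
        return;                        /* negative value is an error code */
    }
    int status = 0;
    int exited = qurt_wait(&status);   /* blocks until any child changes status */
    (void)exited;
    (void)status;                      /* status carries the child's exit() value */
}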
+*/ +int qurt_process_callback_deregister(int callback_handle); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SPACE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_srm_consts.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_srm_consts.h new file mode 100755 index 0000000000000..48a8b6a38c402 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_srm_consts.h @@ -0,0 +1,32 @@ +#ifndef QURT_SRM_CONSTS_H +#define QURT_SRM_CONSTS_H +/** + @file qurt_srm_consts.h + @brief Type definitions for srm + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2020-2021, 2022 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @cond */ +#define QURT_SRM_WAKEUP_REQUEST 1U << 0 /**< Value = 1: Send wakeup request to the SRM server. */ +#define QURT_SRM_SET_HANDLE 1U << 1 /**< Value = 2: Set the client handle for a new SRM client. */ +#define QURT_SRM_ALLOC_KERNEL_PAGES 1U << 2 /**< Value = 4: Allocate pages from the kernel VA space. */ +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SRM_CONSTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_srm_driver.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_srm_driver.h new file mode 100755 index 0000000000000..5489e3dddbcca --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_srm_driver.h @@ -0,0 +1,140 @@ +#ifndef QURT_SRM_DRIVER_H +#define QURT_SRM_DRIVER_H +/** + @file qurt_srm_driver.h + @brief Definitions, macros, and prototypes used by SRM drivers. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + + =============================================================================*/ +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| Define qurt_srm_driver_t structure, which represents +|| the "registration" object for an SRM driver. +*/ +/** @cond internal_only */ +struct _qurt_srm_driver { + const char *name; + qurt_qdi_obj_t *obj; +}; + +typedef struct _qurt_srm_driver qurt_srm_driver_t; + +/* +|| qurt_srm_object_invoke() is an internal equivalent to qurt_qdi_handle_invoke(). +|| It behaves the same, but it takes a QDI object pointer instead of a handle. +*/ + +#define qurt_srm_object_invoke(o,m,...) 
\ + _QDMPASTE(_QDMSOI,_QDMCNT(QDI_HANDLE_LOCAL_CLIENT,o,m,##__VA_ARGS__))(QDI_HANDLE_LOCAL_CLIENT,o,m,##__VA_ARGS__) +#define _QDMSOI3(a,b,c) qurt_srm_oi3(a,b,c) +#define _QDMSOI4(a,b,c,d) qurt_srm_oi4(a,b,c,(int)(d)) +#define _QDMSOI5(a,b,c,d,e) qurt_srm_oi5(a,b,c,(int)(d),(int)(e)) +#define _QDMSOI6(a,b,c,d,e,f) qurt_srm_oi6(a,b,c,(int)(d),(int)(e),(int)(f)) +#define _QDMSOI7(a,b,c,d,e,f,g) qurt_srm_oi7(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g)) +#define _QDMSOI8(a,b,c,d,e,f,g,h) qurt_srm_oi8(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h)) +#define _QDMSOI9(a,b,c,d,e,f,g,h,i) qurt_srm_oi9(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i)) +#define _QDMSOI10(a,b,c,d,e,f,g,h,i,j) qurt_srm_oi10(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j)) +#define _QDMSOI11(a,b,c,d,e,f,g,h,i,j,k) qurt_srm_oi11(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k)) +#define _QDMSOI12(a,b,c,d,e,f,g,h,i,j,k,l) qurt_srm_oi12(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k),(int)(l)) + +int qurt_srm_oi3(int, qurt_qdi_obj_t *, int); +int qurt_srm_oi4(int, qurt_qdi_obj_t *, int, int); +int qurt_srm_oi5(int, qurt_qdi_obj_t *, int, int, int); +int qurt_srm_oi6(int, qurt_qdi_obj_t *, int, int, int, int); +int qurt_srm_oi7(int, qurt_qdi_obj_t *, int, int, int, int, int); +int qurt_srm_oi8(int, qurt_qdi_obj_t *, int, int, int, int, int, int); +int qurt_srm_oi9(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int); +int qurt_srm_oi10(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int); +int qurt_srm_oi11(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int, int); +int qurt_srm_oi12(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int, int, int); + +#define QDI_SRM_INIT 192 + +/* +|| QURT_SRM_DECLARE_DRIVER() declares an SRM driver to the SRM infrastructure. +|| +|| The three arguments are: +|| unique_id -- Unique C identifier, unused but must be a unique global symbol. +|| name -- Name of the driver by which an SRM client attempts to open it. +|| obj -- Pointer to the singleton object of the driver, which handles things such as +|| initialization and QDI_OPEN requests. +*/ + +#define QURT_SRM_DECLARE_DRIVER(unique_id, xname, xobj) \ + __attribute__((section(".srm.rodata.user.main.DECL"))) const qurt_srm_driver_t unique_id = \ + { .name = xname, .obj = xobj } + + +/**@ingroup func_qurt_srm_mapping_create + Creates a memory mapping in the page table with the specified attributes. + + @param[in] client_handle Client handle representing the process for which + the mapping is created. + @param[in] pageno_virt Pointer to the virtual page number. NULL indicates that SRM + chooses the virtual memory. + @param[in] pageno_phys Physical page to use for the mapping. + @param[in] page_count Number of 4 KB pages to map. + @param[in] cache_attr Cache attributes for the mapping. + @param[in] perm Permissions to use for the mapping. + + @return A value greater than 0 is a handle that can be passed to + qdi_close() to remove the mapping. A negative value indicates + an error. + + @dependencies + None. +*/ +int qurt_srm_mapping_create(int client_handle, + unsigned *pageno_virt, + unsigned pageno_phys, + unsigned page_count, + qurt_mem_cache_mode_t cache_attr, + qurt_perm_t perm); + + +/**@ingroup func_qurt_srm_get_pid + Gets the PID for the client_handle that is passed. + + @param[in] client_handle Client handle for which the PID is required.
+ + @return PID of the client. + A negative PID value of -1 is returned in case of error. + + @dependencies + None. +*/ +unsigned qurt_srm_get_pid(int client_handle); + + +/**@ingroup func_qurt_srm_get_thread_id + Gets the thread ID of the client requesting a service from SRM. + + @param[in] None. + + @return Thread ID of the client thread. + + @dependencies + None. +*/ +qurt_thread_t qurt_srm_get_client_thread_id(void); + +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SRM_DRIVER_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_stid.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_stid.h new file mode 100755 index 0000000000000..379f46aaa4b80 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_stid.h @@ -0,0 +1,73 @@ +#ifndef QURT_STID_H +#define QURT_STID_H +/** + @file qurt_stid.h + Prototypes of software thread identifier (stid) interface APIs. + An stid is an 8-bit identifier that can be assigned to a software thread. + The performance monitor logic uses the stid as a counting match criterion + for maskable events. The stid is also used by the hardware debugger + (ISDB) to match breakpoints. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2024 Qualcomm Technologies, Inc. + All rights reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_stid_alloc + Allocates a unique stid. + + @param[in] pid Process identifier + @param[out] stid Pointer to a variable in which to return the stid + + @return + QURT_EOK - Allocation success + QURT_ENORESOURCE - No stid available for allocation + QURT_EINVALID - Invalid input + + @dependencies + None. + */ +int qurt_stid_alloc(unsigned int pid, unsigned int *stid); + +/**@ingroup func_qurt_stid_release + Releases the stid. + + + @param[in] pid Process identifier + @param[in] stid STID to release + + @note1hang + The user must reset the released stid in the process or thread(s) + to the default value (QURT_STID_DEFAULT) before releasing that stid. + + @return + QURT_EOK - Release success + QURT_ENOTALLOWED - Operation not allowed for the pid + QURT_EINVALID - Invalid stid + + @dependencies + None. + */ +int qurt_stid_release(unsigned int pid, unsigned int stid); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_STID_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_thread.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_thread.h new file mode 100755 index 0000000000000..499699e7c72e2 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_thread.h @@ -0,0 +1,1260 @@ +#ifndef QURT_THREAD_H +#define QURT_THREAD_H +/** + @file qurt_thread.h + @brief Prototypes of Thread API + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2018, 2020-2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc.
+ +=============================================================================*/ + + +/* The following is for C code only */ +#ifndef __ASSEMBLER__ +#include +#include "qurt_pmu.h" +#include "qurt_api_version.h" +#endif /* __ASSEMBLER__ */ +#include "qurt_consts.h" +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + + +/* + Bitmask configuration to select DSP hardware threads. + To select all the hardware threads, use #QURT_THREAD_CFG_BITMASK_ALL; + the following then applies: \n + - For QDSP6 V2/V3, all six hardware threads are selected \n + - For QDSP6 V3L, all four hardware threads are selected \n + - For QDSP6 V4, all three hardware threads are selected + */ + +#define QURT_THREAD_CFG_BITMASK_HT0 0x00000001 /**< HT0. */ +#define QURT_THREAD_CFG_BITMASK_HT1 0x00000002 /**< HT1. */ +#define QURT_THREAD_CFG_BITMASK_HT2 0x00000004 /**< HT2. */ +#define QURT_THREAD_CFG_BITMASK_HT3 0x00000008 /**< HT3. */ +#define QURT_THREAD_CFG_BITMASK_HT4 0x00000010 /**< HT4. */ +#define QURT_THREAD_CFG_BITMASK_HT5 0x00000020 /**< HT5. */ +/** @cond rest_reg_dist */ +/** @addtogroup thread_macros +@{ */ +/** @xreflabel{sec:qurt_thread_cfg} */ + +#define QURT_THREAD_CFG_BITMASK_ALL 0x000000ffU /**< Select all the hardware threads. */ +/** @} */ /* end_addtogroup thread_macros */ +/** @endcond */ + +#define QURT_THREAD_CFG_USE_RAM 0x00000000 /**< Use RAM. */ +#define QURT_THREAD_CFG_USE_TCM 0x00000100 /**< Use TCM. */ +/** @cond rest_reg_dist */ +/** @addtogroup thread_macros +@{ */ +#define QURT_THREAD_BUS_PRIO_DISABLED 0 /**< Thread internal bus priority disabled. */ +#define QURT_THREAD_BUS_PRIO_ENABLED 1 /**< Thread internal bus priority enabled. */ +/** @} */ /* end_addtogroup thread_macros */ +/** @endcond */ + +#define QURT_THREAD_AUTOSTACK_DISABLED 0 /**< Thread has autostack v2 feature disabled. */ +#define QURT_THREAD_AUTOSTACK_ENABLED 1 /**< Thread has autostack v2 feature enabled. */ + +/* + Macros for QuRT thread attributes. + */ +#define QURT_HTHREAD_L1I_PREFETCH 0x1 /**< Enables hardware L1 instruction cache prefetching. */ +#define QURT_HTHREAD_L1D_PREFETCH 0x2 /**< Enables hardware L1 data cache prefetching. */ +#define QURT_HTHREAD_L2I_PREFETCH 0x4 /**< Enables hardware L2 instruction cache prefetching. */ +#define QURT_HTHREAD_L2D_PREFETCH 0x8 /**< Enables hardware L2 data cache prefetching. */ +#define QURT_HTHREAD_DCFETCH 0x10 /**< Enables DC fetch to the provided virtual address. + DC fetch indicates to the hardware that a data memory access is likely. + Instructions are dropped when there is high bus utilization. */ +/** @addtogroup thread_macros +@{ */ +/** @xreflabel{hdr:partition_tcm} */ +/* + The value below is used to create legacy QuRT threads by default. + If a thread has this as the detach_state, the thread can be joined + on until it exits. When the default behavior of all QuRT threads can be + changed to JOINABLE (the POSIX default), this legacy behavior can be + removed. +*/ +#define QURT_THREAD_ATTR_CREATE_LEGACY 0U /**< Create a legacy QuRT thread by default. If a thread has this as a detach state, the thread can be joined on until it exits. */ +#define QURT_THREAD_ATTR_CREATE_JOINABLE 1U /**< Create a joinable thread. */ +#define QURT_THREAD_ATTR_CREATE_DETACHED 2U /**< Create a detached thread.
*/ +/** @} */ /* end_addtogroup thread_macros */ + + +#define QURT_THREAD_ATTR_NAME_MAXLEN 16 /**< Maximum name length. */ +#define QURT_THREAD_ATTR_TCB_PARTITION_RAM 0 /**< Creates threads in RAM/DDR. */ +#define QURT_THREAD_ATTR_TCB_PARTITION_TCM 1 /**< Creates threads in TCM. */ +/** @cond rest_reg_dist */ +/** @addtogroup thread_macros +@{ */ +#define QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT QURT_THREAD_ATTR_TCB_PARTITION_RAM /**< Backward compatibility. */ +#define QURT_THREAD_ATTR_PRIORITY_DEFAULT 254 /**< Priority.*/ +#define QURT_THREAD_ATTR_ASID_DEFAULT 0 /**< ASID. */ +#define QURT_THREAD_ATTR_AFFINITY_DEFAULT (-1) /**< Affinity. */ +#define QURT_THREAD_ATTR_BUS_PRIO_DEFAULT 255 /**< Bus priority. */ +#define QURT_THREAD_ATTR_AUTOSTACK_DEFAULT 0 /**< Default autostack v2 disabled thread. */ +#define QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT (-2) /**< Timetest ID. */ +#define QURT_THREAD_ATTR_STID_DEFAULT QURT_STID_DEFAULT /**< STID. */ +#define QURT_THREAD_ATTR_STID_ENABLE 1 /**< Indicate to allocate STID during thread creation. */ + +#define QURT_PRIORITY_FLOOR_DEFAULT 255U /**< Default floor. */ +/** @} */ /* end_addtogroup thread_macros */ + +// Option for suspending thread +#define QURT_THREAD_SUSPEND_SYNCHRONOUS 0x0U // bit#0 +#define QURT_THREAD_SUSPEND_ASYNCHRONOUS 0x1U // bit#0 +#define QURT_THREAD_SUSPEND_KEEP_HMX 0x0U // bit#1 +#define QURT_THREAD_SUSPEND_DETACH_HMX 0x2U // bit#1 + +// Option for resuming thread +#define QURT_THREAD_RESUME_DEFAULT 0x0 + +// Thread property IDs +#define QURT_THREAD_PROPERTY_SUSPENDABLE 0x0U +#define QURT_THREAD_PROPERTY_RESUMABLE 0x1 + +// Thread group +#define QURT_THREAD_DEFAULT_GROUP_ID 0x0U +#define QURT_THREAD_GROUP_ID_MASK 0x3FU + +/** @endcond*/ + + +/* The followings are for C code only */ +#ifndef __ASSEMBLER__ +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup thread_types +@{ */ +/** @cond rest_reg_dist */ +typedef unsigned int qurt_cache_partition_t; /**< QuRT cache partition type. */ + +#define CCCC_PARTITION 0U /**< Use the CCCC page attribute bits to determine the main or auxiliary partition. */ +#define MAIN_PARTITION 1U /**< Use the main partition. */ +#define AUX_PARTITION 2U /**< Use the auxiliary partition. */ +#define MINIMUM_PARTITION 3U /**< Use the minimum. Allocates the least amount of cache (no-allocate policy possible) for this thread. */ +/** @endcond */ + +/** Thread ID type. */ +typedef unsigned int qurt_thread_t; + +/** @cond rest_reg_dist */ +/** Thread attributes. */ +typedef struct _qurt_thread_attr { + + char name[QURT_THREAD_ATTR_NAME_MAXLEN]; /**< Thread name. */ + unsigned char tcb_partition; /**< Indicates whether the thread TCB resides in RAM or + on chip memory (TCM). */ + unsigned char stid; /**< Software thread ID used to configure the stid register + for profiling purposes. */ + unsigned short priority; /**< Thread priority. */ + unsigned char autostack:1; /**< Autostack v2 enabled thread. */ + unsigned char group_id:6; /**< Group ID. */ + unsigned char reserved:1; /**< Reserved bits. */ + unsigned char bus_priority; /**< Internal bus priority. */ + unsigned short timetest_id; /**< Timetest ID. */ + unsigned int stack_size; /**< Thread stack size. */ + void *stack_addr; /**< Pointer to the stack address base. The range of the stack is + (stack_addr, stack_addr+stack_size-1). */ + unsigned short detach_state; /**< Detach state of the thread. 
*/
+
+} qurt_thread_attr_t;
+/** @endcond */
+
+/** @cond rest_reg_dist */
+/** Dynamic TLS attributes. */
+typedef struct qurt_tls_info {
+ unsigned int module_id; /**< Module ID of the loaded dynamically linked library. */
+ unsigned int tls_start; /**< Start address of the TLS data. */
+ unsigned int tls_data_end; /**< End address of the TLS RW data. */
+ unsigned int tls_end; /**< End address of the TLS data. */
+} qurt_tls_info;
+/** @endcond */
+
+/** @} */ /* end_addtogroup thread_types */
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_thread_attr_init
+ Initializes the structure used to set the thread attributes when a thread is created.
+ After an attribute structure is initialized, explicitly set the individual attributes in the structure
+ using the thread attribute operations.
+
+ The initialize operation sets the following default attribute values: \n
+ - Name -- NULL string \n
+ - TCB partition -- QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT \n
+ - Priority -- QURT_THREAD_ATTR_PRIORITY_DEFAULT \n
+ - Autostack -- QURT_THREAD_ATTR_AUTOSTACK_DEFAULT \n
+ - Bus priority -- QURT_THREAD_ATTR_BUS_PRIO_DEFAULT \n
+ - Timetest ID -- QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT \n
+ - stack_size -- 0 \n
+ - stack_addr -- NULL \n
+ - Detach state -- #QURT_THREAD_ATTR_CREATE_LEGACY \n
+ - STID -- #QURT_THREAD_ATTR_STID_DEFAULT
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_init (qurt_thread_attr_t *attr)
+{
+
+ attr->name[0] = '\0';
+ attr->tcb_partition = QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT;
+ attr->priority = QURT_THREAD_ATTR_PRIORITY_DEFAULT;
+ attr->autostack = QURT_THREAD_ATTR_AUTOSTACK_DEFAULT; /* Default attribute for autostack v2 */
+ attr->bus_priority = QURT_THREAD_ATTR_BUS_PRIO_DEFAULT;
+ attr->timetest_id = (unsigned short)QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT;
+ attr->stack_size = 0;
+ attr->stack_addr = NULL;
+ attr->detach_state = QURT_THREAD_ATTR_CREATE_LEGACY;
+ attr->stid = QURT_THREAD_ATTR_STID_DEFAULT;
+ attr->group_id = QURT_THREAD_DEFAULT_GROUP_ID;
+}
+
+/**@ingroup func_qurt_thread_attr_set_name
+ Sets the thread name attribute.\n
+ This function specifies the name used by a thread.
+ Thread names identify a thread during debugging or profiling.
+ The maximum name length is 16 characters. \n
+ @note1hang Thread names differ from the kernel-generated thread identifiers used to
+ specify threads in the API thread operations.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] name Pointer to the character string containing the thread name.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_name (qurt_thread_attr_t *attr, const char *name)
+{
+ strlcpy (attr->name, name, QURT_THREAD_ATTR_NAME_MAXLEN);
+ attr->name[QURT_THREAD_ATTR_NAME_MAXLEN - 1] = '\0';
+}
+
+
+/**@ingroup func_qurt_thread_attr_set_tcb_partition
+ Sets the thread TCB partition attribute.
+ Specifies the memory type where the TCB of a thread is allocated.
+ TCBs can be allocated in RAM or TCM/LPM.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] tcb_partition TCB partition.
Values:\n
+ - 0 -- TCB resides in RAM \n
+ - 1 -- TCB resides in TCM/LPM @tablebulletend
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_tcb_partition (qurt_thread_attr_t *attr, unsigned char tcb_partition)
+{
+ attr->tcb_partition = tcb_partition;
+}
+
+/**@ingroup func_qurt_thread_attr_set_priority
+ Sets the thread priority to be assigned to a thread.
+ Thread priorities are specified as numeric values in the range 1 to 254, with 1 representing
+ the highest priority.
+ Priorities 0 and 255 are used internally by the kernel for special purposes.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] priority Thread priority.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_priority (qurt_thread_attr_t *attr, unsigned short priority)
+{
+ attr->priority = priority;
+}
+
+/**@ingroup func_qurt_thread_attr_set_detachstate
+ Sets the detach state with which the thread is created.
+ The detach state is either joinable or detached, specified by the following values:
+ - #QURT_THREAD_ATTR_CREATE_JOINABLE \n
+ - #QURT_THREAD_ATTR_CREATE_DETACHED \n
+
+ When a detached thread is created (QURT_THREAD_ATTR_CREATE_DETACHED), its thread
+ ID and other resources are reclaimed as soon as the thread exits. When a joinable thread
+ is created (QURT_THREAD_ATTR_CREATE_JOINABLE), it is assumed that some
+ thread waits to join on it using a qurt_thread_join() call.
+ By default, the detach state is #QURT_THREAD_ATTR_CREATE_LEGACY. With this state,
+ another thread can join on the thread before it exits, but the exiting thread does not
+ wait for another thread to join.
+
+ @note1hang For a joinable thread (QURT_THREAD_ATTR_CREATE_JOINABLE), it is very
+ important that some thread joins on it after it terminates; otherwise
+ the resources of that thread are not reclaimed, causing memory leaks.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] detachstate Thread detach state.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_detachstate (qurt_thread_attr_t *attr, unsigned short detachstate)
+{
+ if(detachstate == QURT_THREAD_ATTR_CREATE_JOINABLE || detachstate == QURT_THREAD_ATTR_CREATE_DETACHED){
+ attr->detach_state = detachstate;
+ }
+}
+
+
+/**@ingroup func_qurt_thread_attr_set_timetest_id
+ Sets the thread timetest attribute.\n
+ Specifies the timetest identifier used by a thread.
+
+ Timetest identifiers identify a thread during debugging or profiling. \n
+ @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+ specify threads in the API thread operations.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] timetest_id Timetest identifier value.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+static inline void qurt_thread_attr_set_timetest_id (qurt_thread_attr_t *attr, unsigned short timetest_id)
+{
+ attr->timetest_id = timetest_id;
+}
+
+/**@ingroup func_qurt_thread_attr_set_stack_size
+ @xreflabel{sec:set_stack_size}
+ Sets the thread stack size attribute.\n
+ Specifies the size of the memory area to be used for the call stack of a thread.
+
+ The thread stack address (Section @xref{sec:set_stack_addr}) and stack size specify the memory area used as a
+ call stack for the thread.
The user is responsible for allocating the memory area used for
+ the stack.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] stack_size Size (in bytes) of the thread stack.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+
+static inline void qurt_thread_attr_set_stack_size (qurt_thread_attr_t *attr, unsigned int stack_size)
+{
+ attr->stack_size = stack_size;
+}
+
+/**@ingroup func_qurt_thread_attr_set_stack_size2
+ @xreflabel{sec:set_stack_size2}
+ Sets the thread stack size attribute for island threads that require a larger guest OS stack size than the stack size
+ defined in the configuration XML.\n
+ Specifies the size of the memory area to be used for the call stack of an island thread in User and Guest mode.
+
+ The thread stack address (Section @xref{sec:set_stack_addr}) and stack size specify the memory area used as a
+ call stack for the thread. The user is responsible for allocating the memory area used for
+ the stack.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] user_stack_size Size (in bytes) of the stack usage in User mode.
+ @param[in] root_stack_size Size (in bytes) of the stack usage in Guest mode.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_stack_size2 (qurt_thread_attr_t *attr, unsigned short user_stack_size, unsigned short root_stack_size)
+{
+ union qurt_thread_stack_info {
+ unsigned int raw_size;
+ struct {
+ unsigned short user_stack;
+ unsigned short root_stack;
+ };
+ } user_root_stack_size;
+
+ user_root_stack_size.user_stack = user_stack_size;
+ user_root_stack_size.root_stack = root_stack_size;
+
+ attr->stack_size = user_root_stack_size.raw_size;
+}
+
+/**@ingroup func_qurt_thread_attr_set_stack_addr
+ @xreflabel{sec:set_stack_addr}
+ Sets the thread stack address attribute. \n
+ Specifies the base address of the memory area to be used for the call stack of a thread.
+
+ stack_addr must contain an address value that is 8-byte aligned.
+
+ The thread stack address and stack size (Section @xref{sec:set_stack_size}) specify the memory area used as a
+ call stack for the thread. \n
+ @note1hang The user is responsible for allocating the memory area used for the thread
+ stack. The memory area must be large enough to contain the stack that the thread
+ creates.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] stack_addr Pointer to the 8-byte aligned address of the thread stack.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_stack_addr (qurt_thread_attr_t *attr, void *stack_addr)
+{
+ attr->stack_addr = stack_addr;
+}
+
+/**@ingroup func_qurt_thread_attr_set_bus_priority
+ Sets the internal bus priority state in the Hexagon core for this software thread attribute.
+ Memory requests generated by a thread with bus priority enabled are
+ given priority over requests generated by a thread with bus priority disabled.
+ Bus priority is disabled by default.
+
+ @note1hang Sets the internal bus priority for Hexagon processor version V60 or greater.
+ The priority is not propagated to the bus fabric.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in] attr Pointer to the thread attribute structure.
+
+ @param[in] bus_priority Enabling flag. Values: \n
+ - #QURT_THREAD_BUS_PRIO_DISABLED \n
+ - #QURT_THREAD_BUS_PRIO_ENABLED @tablebulletend
+
+ @return
+ None.
+
+ @dependencies
+ None.
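+
+ A minimal usage sketch (illustrative only):
+
+ @code
+ qurt_thread_attr_t attr;
+ qurt_thread_attr_init (&attr);
+ // Give this thread's memory requests internal bus priority.
+ qurt_thread_attr_set_bus_priority (&attr, QURT_THREAD_BUS_PRIO_ENABLED);
+ @endcode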
+*/
+static inline void qurt_thread_attr_set_bus_priority ( qurt_thread_attr_t *attr, unsigned short bus_priority)
+{
+ attr->bus_priority = (unsigned char)bus_priority;
+}
+
+/**@ingroup func_qurt_thread_attr_set_autostack
+ Enables the autostack v2 feature in the thread attributes.
+
+ When autostack is enabled by the subsystem and an autostack-enabled
+ thread takes a framelimit exception, the kernel allocates more stack
+ for the thread and returns to normal execution.
+
+ If autostack is not enabled by the subsystem, or is not enabled
+ for the thread, the framelimit exception is fatal.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in] attr Pointer to the thread attribute structure.
+ @param[in] autostack Autostack enable or disable flag. Values: \n
+ - #QURT_THREAD_AUTOSTACK_DISABLED \n
+ - #QURT_THREAD_AUTOSTACK_ENABLED @tablebulletend
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_autostack ( qurt_thread_attr_t *attr, unsigned short autostack)
+{
+ attr->autostack = (unsigned char)autostack;
+}
+/**@ingroup qurt_thread_attr_enable_stid
+ Sets the STID in the thread attributes.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in] attr Pointer to the thread attribute structure.
+ @param[in] enable_stid STID to set. Values: \n
+ - #QURT_THREAD_ATTR_STID_DEFAULT (0) -- Default STID. \n
+ - #QURT_THREAD_ATTR_STID_ENABLE (1) -- QuRT assigns an STID that is not already in use. \n
+ - 2 through 255 -- User-provided STID. @tablebulletend
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_enable_stid ( qurt_thread_attr_t *attr, char enable_stid)
+{
+ if (enable_stid != '\0') {
+ attr->stid = enable_stid;
+ }
+ else
+ {
+ attr->stid = QURT_THREAD_ATTR_STID_DEFAULT;
+ }
+}
+
+/**@ingroup func_qurt_thread_attr_set_stid
+ Sets the STID thread attribute.
+ The default STID value is #QURT_THREAD_ATTR_STID_DEFAULT.
+
+ @note1hang When a thread is created with a non-default STID,
+ the STID set in the thread attribute is assigned to the thread.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in] attr Pointer to the thread attribute structure.
+ @param[in] stid STID to set for a thread.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_stid( qurt_thread_attr_t *attr, unsigned int stid){
+ attr->stid = stid;
+}
+
+/**@ingroup func_qurt_thread_attr_set_group_id
+ Sets the group ID in the thread attributes.
+ The primordial (first) thread has group ID 0.
+ If a new thread is created without assigning a group_id, it
+ inherits the group ID of its parent thread.
+
+ @note1hang
+ 1) The group ID can only be set before creating a thread; it cannot be
+ changed after the thread is created.
+ 2) If a non-activated group_id is passed, thread creation fails.
+ 3) Only a thread with group ID 0 can set the group ID of its child threads.
+ 4) If a thread with a nonzero group ID sets the group ID of its child threads,
+ QuRT ignores the parameter and the child threads inherit the parent
+ thread's group ID. However, if the passed group ID is not activated, thread creation
+ still fails.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in] attr Pointer to the thread attribute structure.
+ @param[in] group_id Group identifier. The valid range is 0 to 63.
+
+ @return
+ None.
+
+ @dependencies
+ None.
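+
+ A minimal sketch (illustrative only; assumes group 5 has been activated):
+
+ @code
+ qurt_thread_attr_t attr;
+ qurt_thread_attr_init (&attr);
+ qurt_thread_attr_set_group_id (&attr, 5U);
+ @endcode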
+*/
+static inline void qurt_thread_attr_set_group_id(qurt_thread_attr_t *attr, unsigned int group_id)
+{
+ attr->group_id = group_id & QURT_THREAD_GROUP_ID_MASK;
+}
+
+/**@ingroup func_qurt_thread_set_autostack
+ Sets autostack enable in the TCB.
+
+ @param[in] ugp Pointer to the UGP.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+
+void qurt_thread_set_autostack(void *);
+
+
+/**@ingroup func_qurt_thread_get_name
+ Gets the thread name of the current thread.\n
+ Returns the thread name of the current thread.
+ Thread names are assigned to threads as thread attributes (see qurt_thread_attr_set_name()). Thread names
+ identify a thread during debugging or profiling.
+
+ @param[out] name Pointer to a character string, which specifies the address where the returned thread name is stored.
+ @param[in] max_len Maximum length of the character string that can be returned.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_thread_get_name (char *name, unsigned char max_len);
+
+/**@ingroup func_qurt_thread_create
+ @xreflabel{hdr:qurt_thread_create}
+ Creates a thread with the specified attributes, and makes it executable.
+
+ @datatypes
+ #qurt_thread_t \n
+ #qurt_thread_attr_t
+
+ @param[out] thread_id Returns a pointer to the thread identifier if the thread was
+ successfully created.
+ @param[in] attr Pointer to the initialized thread attribute structure that specifies
+ the attributes of the created thread.
+ @param[in] entrypoint C function pointer, which specifies the main function of the thread.
+ @param[in] arg Pointer to a thread-specific argument structure.
+
+ @return
+ #QURT_EOK -- Thread created. \n
+ #QURT_EFAILED -- Thread not created.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_create (qurt_thread_t *thread_id, qurt_thread_attr_t *attr, void (*entrypoint) (void *), void *arg);
+
+/**@ingroup func_qurt_thread_stop
+ Stops the current thread, frees the kernel TCB, and yields to the next highest ready thread.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_thread_stop(void);
+
+/** @cond internal_only */
+/**@ingroup func_qurt_thread_resume
+ When a demand-loading paging solution is enabled, this function
+ resumes the execution of a thread that was suspended due to
+ a page miss.
+
+ @param[in] thread_id Thread identifier.
+
+ @return
+ #QURT_EOK -- Thread successfully resumed. \n
+ #QURT_EFATAL -- Resume operation failed.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_resume(unsigned int thread_id);
+/** @endcond */
+
+/**@ingroup func_qurt_thread_get_id
+ Gets the identifier of the current thread.\n
+ Returns the thread identifier of the current thread.
+
+ @return
+ Thread identifier -- Identifier of the current thread.
+
+ @dependencies
+ None.
+ */
+qurt_thread_t qurt_thread_get_id (void);
+
+
+/**@ingroup func_qurt_thread_get_l2cache_partition
+ Returns the current value of the L2 cache partition assigned to the caller thread.\n
+
+ @return
+ Value of the #qurt_cache_partition_t data type.
+
+ @dependencies
+ None.
+ */
+qurt_cache_partition_t qurt_thread_get_l2cache_partition (void);
+
+/**@ingroup func_qurt_thread_set_timetest_id
+ Sets the timetest identifier of the current thread.
+ Timetest identifiers identify a thread during debugging or profiling.\n
+ @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+ specify threads in the API thread operations.
+
+ @param[in] tid Timetest identifier.
+
+ @return
+ None.
+
+ @dependencies
+ None.
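+
+ For illustration, a hedged end-to-end sketch that sets a timetest ID
+ together with the other attributes before creating and joining a thread
+ (the stack size, priority, and ID values below are placeholders):
+
+ @code
+ static char stack[4096] __attribute__((aligned(8)));
+
+ static void worker (void *arg)
+ {
+     (void)arg;            // thread body goes here
+     qurt_thread_exit (0); // wakes any thread joined on this one
+ }
+
+ void spawn_worker (void)
+ {
+     qurt_thread_attr_t attr;
+     qurt_thread_t tid;
+     int status;
+
+     qurt_thread_attr_init (&attr);
+     qurt_thread_attr_set_name (&attr, "worker");
+     qurt_thread_attr_set_stack_addr (&attr, stack);
+     qurt_thread_attr_set_stack_size (&attr, sizeof (stack));
+     qurt_thread_attr_set_priority (&attr, 100);
+     qurt_thread_attr_set_timetest_id (&attr, 42);
+     if (qurt_thread_create (&tid, &attr, worker, NULL) == QURT_EOK) {
+         (void) qurt_thread_join (tid, &status);
+     }
+ }
+ @endcode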
+ */
+void qurt_thread_set_timetest_id (unsigned short tid);
+
+/**@ingroup func_qurt_thread_set_cache_partition
+ Sets the cache partition for the current thread. This function uses the qurt_cache_partition_t type
+ to select the cache partition of the current thread for the L1 Icache, L1 Dcache, and L2 cache.
+
+ @datatypes
+ #qurt_cache_partition_t
+
+ @param[in] l1_icache L1 Icache partition.
+ @param[in] l1_dcache L1 Dcache partition.
+ @param[in] l2_cache L2 cache partition.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_thread_set_cache_partition(qurt_cache_partition_t l1_icache, qurt_cache_partition_t l1_dcache, qurt_cache_partition_t l2_cache);
+
+
+/**@ingroup func_qurt_thread_get_timetest_id
+ Gets the timetest identifier of the current thread.\n
+ Returns the timetest identifier of the current thread.\n
+ Timetest identifiers identify a thread during debugging or profiling. \n
+ @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+ specify threads in the API thread operations.
+
+ @return
+ Integer -- Timetest identifier.
+
+ @dependencies
+ None.
+ */
+unsigned short qurt_thread_get_timetest_id (void);
+
+/**@ingroup func_qurt_thread_exit
+ @xreflabel{sec:qurt_thread_exit}
+ Stops the current thread, awakens threads joined to it, then destroys the stopped
+ thread.
+
+ Threads that are suspended on the current thread (by performing a thread join,
+ Section @xref{sec:thread_join}) are awakened and passed a user-defined status value
+ that indicates the status of the stopped thread.
+
+ @note1hang Exit must be called in the context of the thread to be stopped.
+
+ @param[in] status User-defined thread exit status value.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_thread_exit(int status);
+
+/**@ingroup func_qurt_thread_join
+ @xreflabel{sec:thread_join}
+ Waits for a specified thread to finish; the specified thread is another thread within
+ the same process.
+ The caller thread is suspended until the specified thread exits. When the specified thread
+ exits, the caller thread is awakened. \n
+ @note1hang If the specified thread has already exited, this function returns immediately
+ with the result value #QURT_ENOTHREAD. \n
+ @note1cont Two threads cannot call qurt_thread_join to wait for the same thread to finish.
+ If this occurs, QuRT generates an exception (see Section @xref{sec:exceptionHandling}).
+
+ @param[in] tid Thread identifier.
+ @param[out] status Destination variable for the thread exit status. Returns an application-defined
+ value that indicates the termination status of the specified thread.
+
+ @return
+ #QURT_ENOTHREAD -- Thread has already exited. \n
+ #QURT_EOK -- Thread successfully joined with valid status value.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_join(unsigned int tid, int *status);
+
+/**@ingroup qurt_thread_detach
+ @xreflabel{sec:thread_detach}
+ Detaches a joinable thread. The specified thread is another thread within the
+ same process. The thread must have been created as a joinable thread; only joinable threads
+ can be detached.
+ If a joinable thread is detached, it finishes execution and exits.
+
+ @param[in] tid Thread identifier.
+
+ @return
+ #QURT_ENOTHREAD -- Thread specified by TID does not exist. \n
+ #QURT_EOK -- Thread successfully detached.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_detach(unsigned int tid);
+
+
+/**@ingroup func_qurt_thread_get_priority
+ Gets the priority of the specified thread. \n
+ Returns the thread priority of the specified thread.\n
+ Thread priorities are specified as numeric values in a range as large as 1 through 254, with lower
+ values representing higher priorities; 1 represents the highest possible thread priority. \n
+ Priorities 0 and 255 are used internally by the kernel for special purposes.
+
+ @note1hang QuRT can be configured to have different priority ranges.
+
+ @datatypes
+ #qurt_thread_t
+
+ @param[in] threadid Thread identifier.
+
+ @return
+ -1 -- Invalid thread identifier. \n
+ 1 through 254 -- Thread priority value.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_get_priority (qurt_thread_t threadid);
+
+/**@ingroup func_qurt_thread_set_priority
+ Sets the priority of the specified thread.\n
+ Thread priorities are specified as numeric values in a range as large as 1 through 254, with lower
+ values representing higher priorities; 1 represents the highest possible thread priority.
+ Priorities 0 and 255 are used internally by the kernel for special purposes.
+
+ @note1hang QuRT can be configured to have different priority ranges. For more
+ information, see Section @xref{sec:AppDev}.
+
+ @datatypes
+ #qurt_thread_t
+
+ @param[in] threadid Thread identifier.
+ @param[in] newprio New thread priority value.
+
+ @return
+ 0 -- Priority successfully set. \n
+ -1 -- Invalid thread identifier.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_set_priority (qurt_thread_t threadid, unsigned short newprio);
+
+
+
+/**@ingroup func_qurt_thread_attr_get
+ Gets the attributes of the specified thread.
+
+ @datatypes
+ #qurt_thread_t \n
+ #qurt_thread_attr_t
+
+ @param[in] thread_id Thread identifier.
+ @param[out] attr Pointer to the destination structure for the thread attributes.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EINVALID -- Invalid argument.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_attr_get (qurt_thread_t thread_id, qurt_thread_attr_t *attr);
+
+
+
+/**@ingroup func_qurt_thread_get_tls_base
+ Gets the base address of the thread local storage (TLS) of a dynamically loaded module
+ for the current thread.
+
+ @datatypes
+ #qurt_tls_info
+
+ @param[in] info Pointer to the TLS information for a module.
+
+ @return
+ Pointer to the TLS object for the dynamically loaded module. \n
+ NULL -- TLS information is invalid.
+
+ @dependencies
+ None.
+ */
+void * qurt_thread_get_tls_base(qurt_tls_info* info);
+
+/**@ingroup func_qurt_thread_pktcount_get
+ Gets the PKTCOUNT of a specified thread.
+
+ @datatypes
+ #qurt_thread_t
+
+ @param[in] thread_id Thread identifier.
+
+ @return
+ PKTCOUNT of the specified thread.
+
+ @dependencies
+ None.
+ */
+
+long long int qurt_thread_pktcount_get (qurt_thread_t thread_id);
+
+/**@ingroup func_qurt_thread_pktcount_set
+ Sets the PKTCOUNT for the current QuRT thread.
+
+ @return
+ Value to which the PKTCOUNT is set.
+
+ @dependencies
+ None.
+ */
+
+long long int qurt_thread_pktcount_set (long long int);
+
+/**@ingroup func_qurt_thread_stid_get
+ Gets the STID of a specified thread.
+
+ @datatypes
+ #qurt_thread_t
+
+ @param[in] thread_id Thread identifier.
+
+ @return
+ STID of the specified thread.
+
+ @dependencies
+ None.
+ */
+
+char qurt_thread_stid_get(qurt_thread_t thread_id);
+
+/**@ingroup func_qurt_thread_stid_get2
+ Returns the STID set for a thread.
+
+ @param[in] thread_id Thread identifier.
+ @param[out] stid Pointer to a variable in which the STID is returned.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_ENOTALLOWED -- Operation not allowed for the thread. \n
+ #QURT_EINVALID -- Invalid input.
+
+ @dependencies
+ None.
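+
+ A minimal sketch (illustrative only):
+
+ @code
+ unsigned int stid;
+ if (qurt_thread_stid_get2 (qurt_thread_get_id (), &stid) == QURT_EOK) {
+     // stid now holds the software thread ID of the current thread
+ }
+ @endcode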
+ */
+int qurt_thread_stid_get2(unsigned int thread_id, unsigned int *stid);
+
+/**@ingroup func_qurt_thread_stid_set
+ Sets the STID for the current thread.
+
+ @param[in] stid STID value to set.
+
+ @return
+ #QURT_EOK -- STID successfully set. \n
+ #QURT_EFAILED -- STID not set.
+
+ @dependencies
+ None.
+ */
+
+int qurt_thread_stid_set(char stid);
+
+/**@ingroup qurt_thread_stid_set2
+ Sets the STID for a specified thread.
+
+ @param[in] thread_id Thread identifier.
+ @param[in] stid STID to set for the thread.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+ #QURT_EVAL -- Failure because of invalid inputs.
+
+ @dependencies
+ None.
+*/
+int qurt_thread_stid_set2(unsigned int thread_id, unsigned int stid);
+
+/** @cond internal_only */
+/**@ingroup func_qurt_thread_get_running_ids
+ Returns the thread IDs of the running threads in the system; use only during fatal error handling.
+
+ @datatypes
+ #qurt_thread_t
+
+ @param[in,out] thread_ids Array of thread identifiers of size #QURT_MAX_HTHREAD_LIMIT + 1.
+
+ @return
+ #QURT_EINVALID -- Incorrect argument. \n
+ #QURT_ENOTALLOWED -- API not called during error handling. \n
+ #QURT_EOK -- Success; returns a NULL-terminated array of thread IDs.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_get_running_ids(qurt_thread_t *);
+/** @endcond */
+
+
+/**@ingroup func_qurt_thread_get_thread_id
+ Gets the thread identifier of the thread with the matching name in the same process
+ as the caller.
+
+ @datatypes
+ #qurt_thread_t
+
+ @param[out] thread_id Pointer to the thread identifier.
+ @param[in] name Pointer to the name of the thread.
+
+ @return
+ #QURT_EINVALID -- No thread with a matching name in the process of the caller. \n
+ #QURT_EOK -- Success.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_get_thread_id (qurt_thread_t *thread_id, char *name);
+
+/**@ingroup func_qurt_sleep
+ Suspends the current thread for the specified amount of time.
+
+ @note1hang Because QuRT timers are deferrable, this call is guaranteed to block
+ at least for the specified amount of time. If power collapse is
+ enabled, the maximum amount of time this call can block depends on
+ the earliest wakeup from power collapse past the specified duration.
+
+ @param[in] duration Duration (in microseconds) for which the thread is suspended.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_sleep (unsigned long long int duration);
+
+
+/**@ingroup func_qurt_system_set_priority_floor
+ Sets a priority floor to move threads with thread priority lower than the floor out of the running state.
+ Running threads with thread priority lower than the priority floor are moved into the kernel ready queue, and they
+ are not scheduled to run while their thread priority is lower than the floor.
+ The caller should later reset the priority floor back to the default value of #QURT_PRIORITY_FLOOR_DEFAULT.
+ Threads in the kernel ready queue are scheduled to run when their thread priority is higher than the floor.
+
+ The priority floor is set and associated with the user process of the caller. When the caller gets into QuRTOS and
+ sets a new floor, the new floor is associated with its original user process, not the QuRTOS process.
+ The floor associated with the user process is reset when the user process exits or is killed, but not
+ when the user thread of the caller exits.
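+
+ A minimal usage sketch (illustrative only; the floor value is a placeholder
+ and must not be higher than the caller's own priority):
+
+ @code
+ // Raise the floor so that lower-priority threads are not scheduled.
+ if (qurt_system_set_priority_floor (200U) == QURT_EOK) {
+     /* ... latency-sensitive work ... */
+     (void) qurt_system_set_priority_floor (QURT_PRIORITY_FLOOR_DEFAULT);
+ }
+ @endcode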
+
+ The priority floor cannot be set to a priority higher than the thread priority of the caller.
+
+ The priority floor cannot be set to a priority lower than the default #QURT_PRIORITY_FLOOR_DEFAULT system floor.
+
+ This function is not supported in Island mode.
+
+ After the system floor is set above #QURT_PRIORITY_FLOOR_DEFAULT, power collapse is skipped, and the sleep task
+ is not scheduled to run.
+
+ @param[in] priority_floor Priority floor.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_ENOTALLOWED -- Floor setting is not allowed
+
+ @dependencies
+ None.
+ */
+int qurt_system_set_priority_floor (unsigned int priority_floor);
+
+
+/**@ingroup func_qurt_thread_suspend_thread
+ Suspends a QuRT thread given its thread identifier.
+ The target thread can be in a signed user process or an unsigned user process.
+ The caller thread can be a thread from the same user process as the target thread, or from its parent process.
+ After the target thread is suspended, the kernel does not schedule it to run until it is resumed later.
+
+ If the target thread is set as non-suspendable, this function call returns an error without suspending
+ the target thread.
+
+ If the target thread is already suspended, this function call returns success to confirm
+ that the target thread is suspended.
+
+ If the target thread is in a secure user process, or CPZ process, this function call returns an error without
+ suspending the target thread.
+
+ If the target thread is running in the guest OS/root process via a QDI call, this function call does not suspend
+ the target thread in the guest OS, but marks the target thread as suspend-pending. The target thread is
+ suspended when it exits the guest OS, before executing the first instruction in the user process.
+ In this case, the function returns success even with the #QURT_THREAD_SUSPEND_SYNCHRONOUS option, while the target
+ thread can run in the guest OS, and is suspended when exiting the guest OS.
+
+ QuRT debug monitor threads that are in a user process are non-suspendable. This function does not suspend
+ those threads.
+
+ @param[in] thread_id Thread identifier.
+ @param[in] option Optional argument; multiple options can be ORed. \n
+ #QURT_THREAD_SUSPEND_SYNCHRONOUS (default) -- synchronous function call;
+ the function returns after the thread is completely suspended. \n
+ #QURT_THREAD_SUSPEND_ASYNCHRONOUS -- asynchronous function call; the function returns
+ after the kernel acts to suspend the target thread. The target thread
+ might still be running before it is completely suspended. \n
+ #QURT_THREAD_SUSPEND_KEEP_HMX (default) -- keep the HMX attachment on the target thread
+ if it locks the HMX with qurt_hmx_lock(). In this case, the HMX cannot be reused by other threads. \n
+ #QURT_THREAD_SUSPEND_DETACH_HMX -- detach the HMX from the target thread if it locks the HMX with qurt_hmx_lock().
+ Later, when the target thread resumes, the HMX is re-attached to the thread. Note that this option is only
+ supported for a caller from the same user process as the target thread, not for a caller from the parent
+ process of the target thread or other processes. With the HMX detach option, QuRT does not save the HMX
+ context, so the HMX context state is lost. It is the responsibility of the caller to ensure HMX operations
+ and HMX context state saving when calling qurt_thread_suspend_thread() with the HMX detach option.
+ If a thread from another process uses this detach option, #QURT_EHMXNOTDETACHABLE is returned; in this
+ case, if the caller is qualified to suspend the target thread, the target thread is moved to the suspended
+ state without the HMX detached.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of an invalid thread_id input \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process. \n
+ #QURT_EHMXNOTDETACHABLE -- Failure because the HMX is not detachable from the target thread.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_suspend_thread (unsigned int thread_id, unsigned int option);
+
+
+/**@ingroup func_qurt_thread_resume_thread
+ Resumes a QuRT thread given its thread identifier.
+ The target thread can be in a signed user process or an unsigned user process.
+ The caller thread can be a thread from the same user process as the target thread, or from its parent
+ process. After the target thread resumes, the kernel scheduler can schedule the thread to run based on
+ the thread priority.
+
+ This function takes an option argument; currently the only option is
+ #QURT_THREAD_RESUME_DEFAULT, which resumes the target thread in the default way.
+
+ By default, this is an asynchronous function. The function returns after the kernel moves the
+ target thread from the suspended state to the runnable state. The thread is scheduled to run based on its
+ thread priority.
+
+ If the target thread is set as non-resumable, this function call does not resume the target thread.
+
+ If the target thread has already resumed, this function confirms that the target thread is resumed
+ by returning success.
+
+ If the target thread is in a secure user process or CPZ process, this function call returns an error without
+ resuming the target thread.
+
+ If the target thread runs in the guest OS/root process via a QDI call, this function call clears the
+ suspend-pending mark on the target thread, and the target thread is not suspended when it exits the
+ guest OS.
+
+ @param[in] thread_id Thread identifier.
+ @param[in] option Optional argument, #QURT_THREAD_RESUME_DEFAULT, which resumes the target thread.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of an invalid thread_id input \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process. \n
+ #QURT_EHMXNOTAVAIL -- Failure because, when resuming an HMX thread, the HMX is not available/free for the HMX thread to resume.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_resume_thread (unsigned int thread_id, unsigned int option);
+
+
+/**@ingroup func_qurt_thread_set_thread_property
+ Sets a QuRT thread property given the thread identifier.
+ The target thread can be in a signed user process or an unsigned user process.
+ The caller thread can be from the same user process as the target thread, or from its parent process.
+
+ If the target thread is in a secure user process, or CPZ process, this function call returns an error without
+ changing the property of the target thread.
+
+ @param[in] thread_id Thread identifier. \n
+ @param[in] property_id Thread property identifier. \n
+ #QURT_THREAD_PROPERTY_SUSPENDABLE -- thread is suspendable. Default is TRUE. \n
+ #QURT_THREAD_PROPERTY_RESUMABLE -- thread is resumable. Default is TRUE.
+ @param[in] value Property value: \n
+ TRUE (1) -- TRUE for the property \n
+ FALSE (0) -- FALSE for the property
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of an invalid thread_id input \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_set_thread_property( unsigned int thread_id, unsigned int property_id, unsigned int value );
+
+/**@ingroup func_qurt_thread_get_group_id
+ Gets the group ID of the thread specified by thread_id.\n
+
+ @param[in] thread_id Thread identifier.
+ @param[out] group_id Pointer to the variable for the group identifier.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Thread ID is invalid, or the process has no groups enabled \n
+ #QURT_ENOTALLOWED -- Operation is not allowed \n
+
+ @dependencies
+ None.
+*/
+int qurt_thread_get_group_id(qurt_thread_t thread_id, unsigned int* group_id);
+
+#endif /* __ASSEMBLER__ */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_THREAD_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_thread_context.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_thread_context.h
new file mode 100755
index 0000000000000..bab09deec8889
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_thread_context.h
@@ -0,0 +1,234 @@
+#ifndef QURT_THREAD_CONTEXT_H
+#define QURT_THREAD_CONTEXT_H
+/**
+  @file qurt_thread_context.h
+  @brief Kernel thread context structure
+
+EXTERNAL FUNCTIONS
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+#include "qurt_qdi.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @cond internal_only */
+
+#define THREAD_ITERATOR_END ((qurt_thread_t)(-1)) /**< Thread iterator is complete. */
+
+
+/**@ingroup func_qurt_thread_iterator_create
+Gives the caller the ability to enumerate threads in the system.
+
+@return
+Handle of the newly created iterator, which must be passed to
+subsequent operations on the iterator.
+
+@dependencies
+None.
+*/
+static inline int qurt_thread_iterator_create(void)
+{
+ return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, QDI_OS_THREAD_ITERATOR_CREATE);
+}
+
+/**@ingroup func_qurt_thread_iterator_next
+Iterates over the list of threads in the system.
+
+@datatypes
+#qurt_thread_t
+
+@param[in] iter Iterator handle returned by qurt_thread_iterator_create().
+
+@return
+#THREAD_ITERATOR_END -- iterator has reached the end of the thread list. \n
+Other values indicate a valid thread_id.
+
+@dependencies
+None.
+*/
+static inline qurt_thread_t qurt_thread_iterator_next(int iter)
+{
+ return (qurt_thread_t)qurt_qdi_handle_invoke(iter, QDI_OS_THREAD_ITERATOR_NEXT);
+}
+
+/**@ingroup func_qurt_thread_iterator_destroy
+Cleans up thread iterator resources.
+
+@param[in] iter Iterator handle returned by qurt_thread_iterator_create().
+
+@return
+#QURT_EOK -- Successful completion of the operation \n
+#QURT_EFATAL -- Invalid handle passed
+
+@dependencies
+None.
+*/
+static inline int qurt_thread_iterator_destroy(int iter)
+{
+ return qurt_qdi_close(iter);
+}
+
+/**@ingroup func_qurt_thread_context_get_tname
+Gets the name of the thread from the specified thread ID.
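+
+A minimal enumeration sketch (illustrative only) that walks all threads
+and reads each name:
+
+@code
+char name[QURT_THREAD_ATTR_NAME_MAXLEN];
+int it = qurt_thread_iterator_create ();
+qurt_thread_t id;
+while ((id = qurt_thread_iterator_next (it)) != THREAD_ITERATOR_END) {
+    if (qurt_thread_context_get_tname ((unsigned int)id, name, sizeof (name)) == QURT_EOK) {
+        /* use name */
+    }
+}
+(void) qurt_thread_iterator_destroy (it);
+@endcode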
+ +@param[in] thread_id Thread for which name is returned. +@param[in,out] name Pointer to the local buffer where name is copied back. +@param[in] max_len Size of the local buffer. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_tname(unsigned int thread_id, char *name, unsigned char max_len); + +/**@ingroup func_qurt_thread_context_get_prio +Gets the priority for the specified thread. + +@param[in] thread_id Thread for which priority is returned. +@param[in,out] prio Pointer to the local variable where priority is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_prio(unsigned int thread_id, unsigned char *prio); + +/**@ingroup func_qurt_thread_context_get_pcycles +Gets pcycles for the specified thread. + +@param[in] thread_id Thread for which processor cycles are returned. +@param[in,out] pcycles Pointer to the local variable where processor cycles are written. + +@return +#QURT_EOK -- Success \n +Failure otherwise. + +@dependencies +None. +*/ +int qurt_thread_context_get_pcycles(unsigned int thread_id, unsigned long long int *pcycles); + +/**@ingroup func_qurt_thread_context_get_stack_base +Gets the stack base address for the specified thread. + +@param[in] thread_id Thread for which stack base address is returned. +@param[in,out] sbase Pointer to the local variable where stack base address is written. + +@return +QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_stack_base(unsigned int thread_id, unsigned int *sbase); + +/**@ingroup func_qurt_thread_context_get_stack_size +Gets the stack size for the specified thread. + +@param[in] thread_id Thread for which stack size is returned. +@param[in,out] ssize Pointer to the local variable where stack size is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_stack_size(unsigned int thread_id, unsigned int *ssize); + +/**@ingroup func_qurt_thread_context_get_pid +Gets the process ID for the specified thread. + +@param[in] thread_id Thread for which process ID is returned. +@param[in,out] pid Pointer to the local variable where process id is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_pid(unsigned int thread_id, unsigned int *pid); + +/**@ingroup func_qurt_thread_context_get_pname +Gets the process name for the specified thread. + +@param[in] thread_id Represents the thread for which process name is returned. +@param[in, out] name Pointer to the local buffer where process name is copied back. +@param[in] len Length allocated to the local buffer. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_pname(unsigned int thread_id, char *name, unsigned int len); + +/** @addtogroup thread_types +@{ */ +/** Structure that defines how TCB is interpreted to crash dump tools.*/ +/* Keys are defined in consts.h */ +struct qurt_debug_thread_info { +/** @cond */ + char name[QURT_MAX_NAME_LEN]; /**< Name of the thread. */ + struct { + unsigned key; + unsigned val; + } os_info[40]; + unsigned gen_regs[32]; /**< General mode registers. */ + unsigned user_cregs[32]; /**< User mode registers. */ + unsigned guest_cregs[32]; /**< Guest mode registers. */ + unsigned monitor_cregs[64]; /**< Monitor mode registers. 
*/
+/** @endcond */
+}; /* should add up to 1K */
+/** @} */ /* end_addtogroup thread_types */
+
+
+/**@ingroup func_qurt_system_tcb_dump_get
+Gets the debug thread information (TCB dump) contents for the specified thread.
+
+@datatypes
+#qurt_thread_t
+
+@param[in] thread_id Thread on which the operation must be performed.
+@param[in, out] ptr Pointer to the local buffer where the contents are written.
+@param[in] size Size of the debug thread information structure obtained by calling
+ qurt_system_tcb_dump_get_size().
+
+@return
+#QURT_EOK -- Success \n
+Failure otherwise
+
+@dependencies
+None.
+*/
+int qurt_system_tcb_dump_get(qurt_thread_t thread_id, void *ptr, size_t size);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_THREAD_CONTEXT_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_timer.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_timer.h
new file mode 100755
index 0000000000000..7bdfdb8f3c3df
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_timer.h
@@ -0,0 +1,560 @@
+#ifndef QURT_TIMER_H
+#define QURT_TIMER_H
+/**
+  @file qurt_timer.h
+  @brief Prototypes of the qurt_timer API
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+#include "qurt_anysignal.h"
+#include "qurt_signal2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+/**@addtogroup timer_const_macros
+@{ */
+/**
+ Default values.
+*/
+/** @xreflabel{hdr:QURT_TIMER_ONESHOT}*/
+#define QURT_TIMER_DEFAULT_TYPE QURT_TIMER_ONESHOT /**< One-shot. */
+#define QURT_TIMER_DEFAULT_DURATION 1000uL /**< Default duration. */
+#define QURT_TIMER_DEFAULT_EXPIRY 0uL /**< Default expiration. */
+
+/**
+ Conversion from microseconds to timer ticks.
+ */
+#define QURT_TIMER_TIMETICK_FROM_US(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+/**
+ Conversion from timer ticks to microseconds at the nominal frequency.
+*/
+#define QURT_TIMER_TIMETICK_TO_US(ticks) qurt_timer_timetick_to_us(ticks)
+
+/** Minimum duration value is 100 microseconds (sleep timer). */
+#define QURT_TIMER_MIN_DURATION 100uL
+
+/**
+ Maximum duration value for the Qtimer is 1,042,499 hours.
+*/
+#define QURT_TIMER_MAX_DURATION QURT_SYSCLOCK_MAX_DURATION
+
+/**
+ Timer clock for the Qtimer is 19.2 MHz.
+*/
+#define QURT_TIMER_MAX_DURATION_TICKS QURT_SYSCLOCK_MAX_DURATION_TICKS
+
+/**
+ Sleep timer error margin for the Qtimer is 1,000 ticks (about 52 us).
+*/
+#define QURT_TIMETICK_ERROR_MARGIN QURT_SYSCLOCK_ERROR_MARGIN
+
+/*
+ qurt_timer group defines.
+*/
+#define QURT_TIMER_MAX_GROUPS 5U /**< Maximum groups. */
+#define QURT_TIMER_DEFAULT_GROUP 0U /**< Default group. */
+/** @} */ /* end_addtogroup timer_const_macros */
+
+/** @addtogroup timer_types
+@{ */
+/**
+ QuRT timer types.
+ */
+typedef enum
+{
+ QURT_TIMER_ONESHOT = 0, /**< One-shot. */
+ /** @xreflabel{hdr:QURT_TIMER_PERIODIC}*/
+ QURT_TIMER_PERIODIC /**< Periodic.
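+ Generates a series of timer events at intervals equal to the timer duration.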
*/ +} qurt_timer_type_t; + + +/*============================================================================= + TYPEDEFS +=============================================================================*/ + +/** QuRT timer type.*/ +typedef unsigned int qurt_timer_t; + +/** QuRT timer duration type. */ +typedef unsigned long long qurt_timer_duration_t; + +/** QuRT timer time type. */ +typedef unsigned long long qurt_timer_time_t; + +typedef void (*pfn_t)(void); +/** QuRT timer attribute type. */ +typedef struct +{ + /** @cond */ + unsigned int magic; /**< Magic number to verify the qmsgq_attr_t pointer. */ + + qurt_timer_duration_t duration; /**< Specifies the duration of the new timer. */ + + qurt_timer_time_t expiry; /**< Specifies the absolute expiry of the new timer. */ + + qurt_timer_duration_t remaining; /**< Specifies the remaining time of an active timer. */ + + qurt_timer_type_t type; /**< Specifies the timer type; only #QURT_TIMER_ONESHOT and + #QURT_TIMER_PERIODIC are supported. */ + + unsigned int group; /**< Group number of the timer; the criterion used to disable or enable the set + of timers. */ + pfn_t pFn; /**< Callback other than the signal set */ + /** @endcond */ +} +qurt_timer_attr_t; + +/** @} */ /* end_addtogroup timer_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_timer_stop + @xreflabel{sec:qurt_timer_stop} + Stops a running timer. + The timer must be a one-shot timer. + + @note1hang Restart stopped timers with the timer restart operation, + see Section @xref{sec:qurt_timer_restart}. + + @datatypes + #qurt_timer_t + + @param[in] timer Timer object. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Invalid timer ID or duration value. \n + #QURT_ENOTALLOWED -- Timer is not a one shot timer. \n + #QURT_EMEM -- Out of memory error. + + @dependencies + None. + */ +int qurt_timer_stop (qurt_timer_t timer); + +/**@ingroup func_qurt_timer_restart + @xreflabel{sec:qurt_timer_restart} + Restarts a stopped timer with the specified duration. The timer must be a one-shot timer. + Timers stop after they have expired or after they are explicitly stopped with qurt_timer_stop(). + A restarted timer expires after the specified duration, the starting time is when the function is called. + + @note1hang Timers stop after they have expired or after they are explicitly + stopped with the timer stop operation, see Section @xref{sec:qurt_timer_stop}. + + @datatypes + #qurt_timer_t \n + #qurt_timer_duration_t + + @param[in] timer Timer object. + @param[in] duration Timer duration (in microseconds) before the restarted timer + expires again. + The valid range is #QURT_TIMER_MIN_DURATION to + #QURT_TIMER_MAX_DURATION. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Invalid timer ID or duration value. \n + #QURT_ENOTALLOWED -- Timer is not a one-shot timer. \n + #QURT_EMEM -- Out-of-memory error. + + @dependencies + None. + */ +int qurt_timer_restart (qurt_timer_t timer, qurt_timer_duration_t duration); + + +/**@ingroup func_qurt_timer_create + Creates a timer.\n + Allocates and initializes a timer object, and starts the timer. + + @note1hang A timer event handler must be defined to wait on the specified signal + to handle the timer event. + + @datatypes + #qurt_timer_t \n + #qurt_timer_attr_t \n + #qurt_anysignal_t + + @param[out] timer Pointer to the created timer object. 
+ @param[in] attr Pointer to the timer attribute structure. + @param[in] signal Pointer to the signal object set when timer expires. + @param[in] mask Signal mask, which specifies the signal to set in the signal object when the + time expires. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Not enough memory to create the timer. \n + #QURT_EINVALID -- One of the arguments in the attr field is invalid. \n + Other error code -- Operation failed. \n + + @dependencies + None. + */ +int qurt_timer_create (qurt_timer_t *timer, const qurt_timer_attr_t *attr, + const qurt_anysignal_t *signal, unsigned int mask); + +int qurt_timer_create_sig2 (qurt_timer_t *timer, const qurt_timer_attr_t *attr, + const qurt_signal2_t *signal, unsigned int mask); + +/**@ingroup func_qurt_timer_attr_init + Initializes the specified timer attribute structure with default attribute values: \n + - Timer duration -- #QURT_TIMER_DEFAULT_DURATION (Section @xref{dox:timers}) \n + - Timer type -- #QURT_TIMER_ONESHOT \n + - Timer group -- #QURT_TIMER_DEFAULT_GROUP + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the destination structure for the timer attributes. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_init(qurt_timer_attr_t *attr); + + +/*Tech Comm note: removed qurt_timer_attr_set_pfn from documentation 9/10/2020 +@ingroup func_qurt_timer_attr_set_pfn + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the destination structure for the timer attributes. + @param[in] pFn pFn. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_pfn(qurt_timer_attr_t *attr, pfn_t pFn); + + +/**@ingroup func_qurt_timer_attr_set_duration + Sets the timer duration in the specified timer attribute structure.\n + + The timer duration specifies the interval (in microseconds) between the creation of the + timer object and the generation of the corresponding timer event. + + The timer duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION (Section @xref{dox:timers}). Otherwise, the set operation is ignored. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] duration Timer duration (in microseconds). + Valid range is #QURT_TIMER_MIN_DURATION to + #QURT_TIMER_MAX_DURATION. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_duration(qurt_timer_attr_t *attr, qurt_timer_duration_t duration); + +/**@ingroup func_qurt_timer_attr_set_expiry + Sets the absolute expiry time in the specified timer attribute structure.\n + The timer expiry specifies the absolute time (in microseconds) of the generation of the + corresponding timer event.\n + Timer expiries are relative to when the system first began executing. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_time_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] time Timer expiry. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_expiry(qurt_timer_attr_t *attr, qurt_timer_time_t time); + +/**@ingroup func_qurt_timer_attr_get_duration + Gets the timer duration from the specified timer attribute structure. + The value returned is the duration that was originally set for the timer. + + @note1hang This function does not return the remaining time of an active timer; + use qurt_timer_attr_get_remaining() to get the remaining time. 
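+
+ A minimal sketch (illustrative only; my_timer is assumed to be a previously
+ created timer object):
+
+ @code
+ qurt_timer_attr_t attr;
+ qurt_timer_duration_t remaining;
+ if (qurt_timer_get_attr (my_timer, &attr) == QURT_EOK) {
+     qurt_timer_attr_get_remaining (&attr, &remaining);
+ }
+ @endcode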
+ + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in] attr Pointer to the timer attributes object + @param[out] duration Pointer to the destination variable for timer duration. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_duration(qurt_timer_attr_t *attr, qurt_timer_duration_t *duration); + +/**@ingroup func_qurt_timer_attr_get_remaining + Gets the timer remaining duration from the specified timer attribute structure. \n + + The timer remaining duration indicates (in microseconds) how much time remains before + the generation of the next timer event on the corresponding timer. + In most cases this function assumes that the timer attribute structure was obtained by + calling qurt_timer_get_attr(). + + @note1hang This attribute is read-only and thus has no set operation defined for it. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in] attr Pointer to the timer attribute object. + @param[out] remaining Pointer to the destination variable for remaining time. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_remaining(qurt_timer_attr_t *attr, qurt_timer_duration_t *remaining); + +/**@ingroup func_qurt_timer_attr_set_type + Sets the timer type in the specified timer attribute structure. + + The timer type specifies the functional behavior of the timer: \n + - A one-shot timer (#QURT_TIMER_ONESHOT) waits for the specified timer duration + and then generates a single timer event. After this the timer is nonfunctional. \n + - A periodic timer (#QURT_TIMER_PERIODIC) repeatedly waits for the specified + timer duration and then generates a timer event. The result is a series of timer + events with interval equal to the timer duration. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_type_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] type Timer type. Values are: \n + - #QURT_TIMER_ONESHOT -- One-shot timer. \n + - #QURT_TIMER_PERIODIC -- Periodic timer. @tablebulletend + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_type(qurt_timer_attr_t *attr, qurt_timer_type_t type); + +/**@ingroup func_qurt_timer_attr_get_type + Gets the timer type from the specified timer attribute structure. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_type_t + + @param[in] attr Pointer to the timer attribute structure. + @param[out] type Pointer to the destination variable for the timer type. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_type(qurt_timer_attr_t *attr, qurt_timer_type_t *type); + +/**@ingroup func_qurt_timer_attr_set_group + Sets the timer group identifier in the specified timer attribute structure.\n + The timer group identifier specifies the group that the timer belongs to. Timer groups are + used to enable or disable one or more timers in a single operation. \n + The timer group identifier value must be between 0 and (#QURT_TIMER_MAX_GROUPS - 1). + See Section @xref{dox:timers}. + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the timer attribute object. + @param[in] group Timer group identifier; + Valid range is 0 to (#QURT_TIMER_MAX_GROUPS - 1). + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_group(qurt_timer_attr_t *attr, unsigned int group); + +/**@ingroup func_qurt_timer_attr_get_group + Gets the timer group identifier from the specified timer attribute structure. 
+ + @datatypes + #qurt_timer_attr_t + + @param[in] attr Pointer to the timer attribute structure. + @param[out] group Pointer to the destination variable for the timer group identifier. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_group(qurt_timer_attr_t *attr, unsigned int *group); + +/**@ingroup func_qurt_timer_get_attr + @xreflabel{hdr:qurt_timer_get_attr} + Gets the timer attributes of the specified timer when it was created. + + @datatypes + #qurt_timer_t \n + #qurt_timer_attr_t + + @param[in] timer Timer object. + @param[out] attr Pointer to the destination structure for timer attributes. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Argument passed is not a valid timer. + + @dependencies + None. + */ +int qurt_timer_get_attr(qurt_timer_t timer, qurt_timer_attr_t *attr); + +/**@ingroup func_qurt_timer_delete + Deletes the timer.\n + Destroys the specified timer and deallocates the timer object. + + @datatypes + #qurt_timer_t + + @param[in] timer Timer object. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Argument passed is not a valid timer. + + @dependencies + None. + */ +int qurt_timer_delete(qurt_timer_t timer); + +/**@ingroup func_qurt_timer_sleep + Suspends the current thread for the specified amount of time. + The sleep duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION (Section @xref{dox:timers}). + + @datatypes + #qurt_timer_duration_t + + @param[in] duration Interval (in microseconds) between when the thread is suspended + and when it is re-awakened. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Not enough memory to perform the operation. + + @dependencies + None. + */ + +int qurt_timer_sleep(qurt_timer_duration_t duration); + +/**@ingroup func_qurt_timer_group_disable + Disables all timers that are assigned to the specified timer group. + If a specified timer is already disabled, ignore it. + If a specified timer is expired, do not process it. + If the specified timer group is empty, do nothing. + + @note1hang When a timer is disabled its remaining time does not change, thus it + cannot generate a timer event. + + @param[in] group Timer group identifier. + + @return + #QURT_EOK -- Success. + + @dependencies + None. + */ +int qurt_timer_group_disable (unsigned int group); + +/**@ingroup func_qurt_timer_group_enable + Enables all timers that are assigned to the specified timer group. + If a specified timer is already enabled, ignore it. + If a specified timer is expired, process it. + If the specified timer group is empty, do nothing. + + @param[in] group Timer group identifier. + + @return + #QURT_EOK -- Success. + + @dependencies + None. + */ +int qurt_timer_group_enable (unsigned int group); + + +/** + Notifies the timer server recovery from power collapse. The server + must account for any missed interrupts during power collapse. + */ +void qurt_timer_recover_pc (void); + +/** + Determines whether the Qtimer is initialized. + + @return + 0 -- Not initialized. \n + Nonzero -- Initialized. + */ +static inline int qurt_timer_is_init (void) {return 1;} + +/**@ingroup func_qurt_timer_get_ticks + Gets current ticks. The ticks are accumulated since the RTOS + has started. Each tick is equal to a single timer clock + cycle, where the frequency is 32 KHz on RGPT or 19.2 MHz on Qtimer. + + @return + Ticks since system started. 
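+
+ A minimal sketch (illustrative only):
+
+ @code
+ unsigned long long ticks = qurt_timer_get_ticks ();
+ unsigned long long us    = QURT_TIMER_TIMETICK_TO_US (ticks);
+ @endcode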
+ */
+unsigned long long qurt_timer_get_ticks (void);
+
+#define qurt_timer_timetick_from_us(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_TIMER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_tlb.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_tlb.h
new file mode 100755
index 0000000000000..b1b2d261d31c0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_tlb.h
@@ -0,0 +1,215 @@
+#ifndef QURT_TLB_H
+#define QURT_TLB_H
+
+/**
+  @file qurt_tlb.h
+  @brief Prototypes of TLB API
+  The TLB APIs allow explicit control of the portion of the TLB between TLB_FIRST_REPLACEABLE and TLB_LAST_REPLACEABLE.
+  Both are nonconfigurable for the time being. This portion of the TLB is permanently assigned/locked unless manually removed
+  by qurt_tlb_remove. The implementation does not change depending on the configuration, such as whether CONFIG_STATIC is set.
+  With CONFIG_STATIC=y, TLB_LAST_REPLACEABLE is set to the last TLB index, which indicates that the entire TLB is permanently
+  assigned and is not backed by a page table (the page table does not exist). TLB indices are maintained through a 64-bit bitmask.
+  A new entry is placed in the first available slot.
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2013, 2021, 2023
+All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+#include <qurt_types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_tlb_entry_create
+ Creates a new TLB entry with the specified mapping attributes in the TLB of the Hexagon processor. \n
+ @note1hang If the specified attributes are not valid (such as if the address is not aligned with the
+            size), the entry is not created and an error result is returned.\n
+ @note1cont To set the G bit in the new TLB entry, set the ASID argument to -1.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_paddr_t \n
+ #qurt_mem_cache_mode_t \n
+ #qurt_perm_t
+
+ @param[out] entry_id       TLB entry identifier.
+ @param[in] vaddr           Virtual memory address.
+ @param[in] paddr           Physical memory address.
+ @param[in] size            Size of memory region to map (in bytes).
+ @param[in] cache_attribs   Cache mode (writeback, and so on).
+ @param[in] perms           Access permissions.
+ @param[in] asid            ASID (space ID).
+
+ @return
+ #QURT_EOK -- TLB entry successfully created.\n
+ #QURT_EFATAL -- Entry is not created; the TLB is full. \n
+ #QURT_ETLBCREATESIZE -- Entry is not created; an incorrect size was specified. \n
+ #QURT_ETLBCREATEUNALIGNED -- Entry is not created; an unaligned address was specified. \n
+ #QURT_EINVALID -- Invalid cache attributes / permissions provided.
+
+ */
+int qurt_tlb_entry_create (unsigned int *entry_id, qurt_addr_t vaddr, qurt_paddr_t paddr, qurt_size_t size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perms, int asid);
+
+/**@ingroup func_qurt_tlb_entry_create_64
+ Creates a new TLB entry with the specified mapping attributes in the TLB of the Hexagon processor.
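+
+ A hedged usage sketch (the addresses and size below are placeholders; they must
+ satisfy the TLB alignment and size rules noted below):
+ @code
+ unsigned int id;
+ int err = qurt_tlb_entry_create_64 (&id,
+                                     0x20000000u,               // vaddr
+                                     0x1C0000000ULL,            // 64-bit paddr
+                                     0x1000,                    // 4 KB page
+                                     QURT_MEM_CACHE_WRITEBACK,
+                                     QURT_PERM_READ | QURT_PERM_WRITE,
+                                     -1);                       // -1 sets the G bit
+ // err is QURT_EOK on success; see the return codes listed below
+ @endcode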
\n + @note1hang If the specified attributes are not valid (the address is not aligned with the + size), the entry is not created, and an error result is returned.\n + @note1cont To set the G bit in the new TLB entry, set the asid argument to -1. + + @param[out] entry_id TLB entry identifier. + @param[in] vaddr Virtual memory address. + @param[in] paddr_64 64-bit physical memory address. + @param[in] size Size of memory region to map (in bytes). + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perms Access permissions. + @param[in] asid ASID (space ID). + + @return + #QURT_EOK -- TLB entry successfully created.\n + #QURT_EFATAL -- Entry was not created; the TLB is full. \n + #QURT_ETLBCREATESIZE -- Entry was not created; the incorrect size was specified. \n + #QURT_ETLBCREATEUNALIGNED -- Entry was not created; an unaligned address was specified. \n + #QURT_EINVALID -- Invalid cache attributes / permissions provided. + + */ +int qurt_tlb_entry_create_64 (unsigned int *entry_id, qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perms, int asid); + +/**@ingroup func_qurt_tlb_entry_delete + Deletes the specified TLB entry from the TLB of the Hexagon processor. + If the specified entry does not exist, no deletion occurs and an error result is returned. + + @param[in] entry_id TLB entry identifier. + + @return + #QURT_EOK -- TLB entry successfully deleted. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_delete (unsigned int entry_id); + +/**@ingroup func_qurt_tlb_entry_query + Searches for the specified TLB entry in the TLB of the Hexagon processor. + If the TLB entry is found, its entry identifier is returned. + + @datatypes + #qurt_addr_t + + @param[out] entry_id TLB entry identifier. + @param[in] vaddr Virtual memory address. + @param[in] asid ASID (space ID). + + @return + #QURT_EOK -- TLB entry successfully returned. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_query (unsigned int *entry_id, qurt_addr_t vaddr, int asid); + +/**@ingroup func_qurt_tlb_entry_set + Sets the TLB entry by storing an entry at the specified location + in the TLB of the Hexagon processor. + + @param[in] entry_id TLB entry identifier. + @param[in] entry 64-bit TLB entry to store. + + @return + #QURT_EOK -- Entry successfully stored in the TLB. \n + #QURT_EFATAL -- Entry not set at the specified location. + + @dependencies + None. + **/ +int qurt_tlb_entry_set (unsigned int entry_id, unsigned long long int entry); + +/**@ingroup func_qurt_tlb_entry_get + Gets the TLB entry. \n + Returns the specified 64-bit TLB entry in the TLB of the Hexagon processor. + + @param[in] entry_id TLB entry identifier. + @param[out] entry 64-bit TLB entry. + + @return + #QURT_EOK -- TLB entry successfully returned. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_get (unsigned int entry_id, unsigned long long int *entry); + +/**@ingroup func_qurt_tlb_get_pager_physaddrs + Searches the TLB of the Hexagon processor, and returns all physical addresses that belong to the pager. + Each returned address indicates the starting address of an active page. + +The function return value indicates the number of addresses returned. + + @param[out] pager_phys_addrs Pointer to the return array of pager physical addresses. + + @return + Integer -- Number of addresses returned in array. + + @dependencies + None. 
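+
+ A minimal sketch of walking the returned array (read-only; the array itself
+ is owned by the pager):
+ @code
+ unsigned int *phys;
+ unsigned int n = qurt_tlb_get_pager_physaddr (&phys);
+ for (unsigned int i = 0; i < n; i++) {
+     // phys[i] is the starting physical address of an active page
+ }
+ @endcode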
+*/ + +unsigned int qurt_tlb_get_pager_physaddr(unsigned int** pager_phys_addrs); + +/**@ingroup func_qurt_tlb_get_pager_virtaddr + Searches the TLB of the Hexagon processor, and returns all virtual addresses that belong to the pager. + Each returned address indicates the starting address of an active page. + +The function return value indicates the number of addresses returned. + + @param[out] pager_virt_addrs Pointer to the return array of pager virtual addresses. + + @return + Integer -- Number of addresses returned in the array. + + @dependencies + None. +*/ + +unsigned int qurt_tlb_get_pager_virtaddr(unsigned int** pager_virt_addrs); + + +/**@ingroup func_qurt_tlb_entry_set2 + Sets the TLB entry by storing an entry at the specified location + in the TLB of the Hexagon processor. An additional option can be passed + to lock the TLB entry in the TLB of the Hexagon processor. + + @param[in] id TLB entry identifier. + @param[in] tlb 64-bit TLB entry to store. + @param[in] lock Nonzero value indicates that the TLB entry must be locked in the hardware TLB. + + @return + #QURT_EOK -- Entry successfully stored in the TLB. \n + #QURT_EFATAL -- Entry not set at the specified location. + + @dependencies + None. + **/ +int qurt_tlb_entry_set2(unsigned id, unsigned long long tlb, unsigned lock); + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TLB_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_tls.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_tls.h new file mode 100755 index 0000000000000..6ec3b39ff5cb0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_tls.h @@ -0,0 +1,100 @@ +#ifndef QURT_TLS_H +#define QURT_TLS_H +/** + @file qurt_tls.h + @brief Prototypes of TLS APIs + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_tls_create_key + @xreflabel{sec:tls_create_key} + Creates a key for accessing a thread local storage data item.\n + Subsequent get and set operations use the key value. + + @note1hang The destructor function performs any clean-up operations needed by a thread + local storage item when its containing thread is deleted (Section @xref{sec:qurt_thread_exit}). + + @param[out] key Pointer to the newly created thread local storage key value. + @param[in] destructor Pointer to the key-specific destructor function. Passing NULL + specifies that no destructor function is defined for the key. + + @return + #QURT_EOK -- Key successfully created. \n + #QURT_ETLSAVAIL -- No free TLS key available. + + @dependencies + None. + */ +int qurt_tls_create_key (int *key, void (*destructor)(void *)); + +/**@ingroup func_qurt_tls_set_specific + Stores a data item to thread local storage along with the specified key. + + @param[in] key Thread local storage key value. + @param[in] value Pointer to user data value to store. + + @return + #QURT_EOK -- Data item successfully stored. \n + #QURT_EINVALID -- Invalid key. \n + #QURT_EFAILED -- Invoked from a non-thread context. 
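+
+ A typical pattern, sketched under the assumption that stdlib.h is available
+ and the destructor simply frees the stored pointer:
+ @code
+ static int log_key;
+
+ static void log_dtor (void *p) { free (p); }   // runs when the thread exits
+
+ void log_init (void)   { qurt_tls_create_key (&log_key, log_dtor); }
+ void log_attach (void) { qurt_tls_set_specific (log_key, malloc (64)); }
+ void *log_get (void)   { return qurt_tls_get_specific (log_key); }
+ @endcode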
+ */ +int qurt_tls_set_specific (int key, const void *value); + +/**@ingroup func_qurt_tls_get_specific + Loads the data item from thread local storage. \n + Returns the data item that is stored in thread local storage with the specified key. + The data item is always a pointer to user data. + + @param[in] key Thread local storage key value. + + @return + Pointer -- Data item indexed by key in thread local storage. \n + 0 (NULL) -- Key out of range. + + @dependencies + None. + */ +void * __attribute__((section(".text.qurt_tls_get_specific "))) qurt_tls_get_specific (int key); + + +/**@ingroup func_qurt_tls_delete_key + Deletes the specified key from thread local storage. + + @note1hang Explicitly deleting a key does not execute any destructor function that is + associated with the key (Section @xref{sec:tls_create_key}). + + @param[in] key Thread local storage key value to delete. + + @return + #QURT_EOK -- Key successfully deleted. \n + #QURT_ETLSENTRY -- Key already free. + + @dependencies + None. + */ +int qurt_tls_delete_key (int key); + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TLS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_trace.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_trace.h new file mode 100755 index 0000000000000..541f8f1d34bf6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_trace.h @@ -0,0 +1,317 @@ +#ifndef QURT_TRACE_H +#define QURT_TRACE_H +/** + @file qurt_trace.h + @brief Prototypes of system call tracing helpers API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021-2023 by Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + GLOBAL VARIABLES +=============================================================================*/ +/** @cond internal_only */ +/** @addtogroup etm_macros +@{ */ +/* ETM trace types. */ +#define QURT_ETM_TYPE_PC_ADDR (1U<<0) /**< PC address.*/ +#define QURT_ETM_TYPE_MEMORY_ADDR (1U<<1) /**< Memory address. */ +#define QURT_ETM_TYPE_TESTBUS (1U<<2) /**< Test bus. */ +#define QURT_ETM_TYPE_CYCLE_ACCURATE (1U<<3) /**< Cycle accurate. */ +#define QURT_ETM_TYPE_CYCLE_COARSE (1U<<4) /**< Cycle coarse. */ +#define QURT_ETM_TYPE_PC_AND_MEMORY_ADDR (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_MEMORY_ADDR) /**< PC and memory address. */ +#define QURT_ETM_TYPE_PC_ADDR_AND_TESTBUS (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_TESTBUS) /**< PC address and test bus. */ +#define QURT_ETM_TYPE_MEMORY_ADDR_AND_TESTBUS (QURT_ETM_TYPE_MEMORY_ADDR|QURT_ETM_TYPE_TESTBUS) /**< Memory address and test bus.*/ +#define QURT_ETM_TYPE_PC_AND_MEMORY_ADDR_AND_TESTBUS (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_MEMORY_ADDR|QURT_ETM_TYPE_TESTBUS) /**< PC, memory address, and test bus. */ + +/* ETM routes. */ +#define QURT_ETM_ROUTE_TO_QDSS 0U /**< ETM route to QDSS. */ +#define QURT_ETM_ROUTE_TO_Q6ETB 1U /**< ETM route to Q6ETB. */ + +/* ETM filters. */ +#define QURT_ETM_TRACE_FILTER_ALL_DEFAULT 0U /*< Filter all as default. */ +#define QURT_ETM_TRACE_FILTER_HNUM0 (1U<<0) /*< Filter HNUM0. */ +#define QURT_ETM_TRACE_FILTER_HNUM1 (1U<<1) /*< Filter HNUM1. */ +#define QURT_ETM_TRACE_FILTER_HNUM2 (1U<<2) /*< Filter HNUM2. 
*/ +#define QURT_ETM_TRACE_FILTER_HNUM3 (1U<<3) /*< Filter HNUM3. */ +#define QURT_ETM_TRACE_FILTER_HNUM4 (1U<<4) /*< Filter HNUM4. */ +#define QURT_ETM_TRACE_FILTER_HNUM5 (1U<<5) /*< Filter HNUM5. */ +#define QURT_ETM_TRACE_FILTER_HNUM6 (1U<<6) /*< Filter HNUM6. */ +#define QURT_ETM_TRACE_FILTER_HNUM7 (1U<<7) /*< Filter HNUM7. */ +#define QURT_ETM_TRACE_FILTER_HNUM8 (1U<<8) /*< Filter HNUM8. */ +#define QURT_ETM_TRACE_FILTER_HNUM9 (1U<<9) /*< Filter HNUM9. */ +#define QURT_ETM_TRACE_FILTER_HNUM10 (1U<<10) /*< Filter HNUM10. */ +#define QURT_ETM_TRACE_FILTER_HNUM11 (1U<<11) /*< Filter HNUM11. */ +#define QURT_ETM_TRACE_FILTER_HNUM12 (1U<<12) /*< Filter HNUM12. */ +#define QURT_ETM_TRACE_FILTER_HNUM13 (1U<<13) /*< Filter HNUM13. */ +#define QURT_ETM_TRACE_FILTER_HNUM14 (1U<<14) /*< Filter HNUM14. */ +#define QURT_ETM_TRACE_FILTER_HNUM15 (1U<<15) /*< Filter HNUM15. */ +#define QURT_ETM_TRACE_FILTER_ALL QURT_ETM_TRACE_FILTER_ALL_DEFAULT + +#define QURT_ETM_TRACE_FILTER_CLUSTER0 (1<<16) /*< Filter trace cluster0 address. */ +#define QURT_ETM_TRACE_FILTER_CLUSTER1 (1<<17) /*< Filter trace cluster1 address. */ +#define QURT_ETM_TRACE_FILTER_PC_RANGE (1<<19) /*< Filter PC address range. */ + +/* ETM memory source - PC or data access */ +#define QURT_ETM_SOURCE_PC 0U /**< ETM memory source of SAC* is PC. */ +#define QURT_ETM_SOURCE_DATA 1U /**< ETM memory source of SAC* is data. */ + +/* Period between synchronization traces */ +#define QURT_ETM_ASYNC_PERIOD 0 /**< Async.*/ +#define QURT_ETM_ISYNC_PERIOD 1 /**< Isync.*/ +#define QURT_ETM_GSYNC_PERIOD 2 /**< Gsync. */ + +/* ETM enable flags */ +#define QURT_ETM_OFF 0U /**< ETM off. */ +#define QURT_ETM_ON 1U /**< ETM on. */ +/** @endcond */ +/** @} */ /* end_addtogroup etm_macros */ + +/** @addtogroup function_tracing_macro +@{ */ +/* ETM setup return values */ +#define QURT_ETM_SETUP_OK 0 /**< ETM setup OK. */ +#define QURT_ETM_SETUP_ERR 1 /**< ETM setup error. */ +/** @} */ /* end_addtogroup function_tracing_macro */ +/* ETM breakpoint types */ +#define QURT_ETM_READWRITE_BRKPT 0U /**< ETM read/write breakpoint. */ +#define QURT_ETM_READ_BRKPT 1U /**< ETM read breakpoint. */ +#define QURT_ETM_WRITE_BRKPT 2U /**< ETM write breakpoint. */ +#define QURT_ETM_BRKPT_INVALIDATE 3U /**< Invalidate breakpoint. */ +/** @addtogroup function_tracing_macro +@{ */ +/* ATB status flags */ +#define QURT_ATB_OFF 0 /**< ATB off. */ +#define QURT_ATB_ON 1 /**< ATB on. */ +/** @} */ /* end_addtogroup function_tracing_macro */ +/* DTM enable flags */ +#define QURT_DTM_OFF 0 /**< DTM off. */ +#define QURT_DTM_ON 1 /**< DTM on. */ + +/** @addtogroup function_tracing_datatypes +@{ */ +/**STM trace information. */ +typedef struct qurt_stm_trace_info { + /** @cond */ + unsigned int stm_port_addr[6]; /* STM port address to which trace data must be written.*/ + unsigned int thread_event_id; /* Event ID for context switches.*/ + unsigned int interrupt_event_id; /* Event ID for interrupts. */ + unsigned int marker; /* Marker value that must be written at the beginning of the trace. */ + /** @endcond */ +} qurt_stm_trace_info_t; +/** @} */ /* end_addtogroup function_tracing_datatypes */ +/*============================================================================= + GLOBAL FUNCTIONS +=============================================================================*/ + + +/**@ingroup func_qurt_trace_get_marker + Gets the kernel trace marker.\n + Returns the current value of the kernel trace marker. 
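+
+ A minimal sketch of the intended pairing with qurt_trace_changed(), declared
+ below (the 0x3 mask covers the interrupt and context switch events noted there):
+ @code
+ unsigned int marker = qurt_trace_get_marker ();
+ // ... code block under observation ...
+ if (qurt_trace_changed (marker, 0x3)) {
+     // an interrupt or context switch was recorded during the block
+ }
+ @endcode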
+ The marker consists of a hardware thread identifier and an index into the kernel trace
+ buffer. The trace buffer records kernel events.
+
+ @note1hang Using this function with qurt_trace_changed()
+            determines whether certain kernel events occurred in a block of code.
+
+ @return
+ Integer -- Kernel trace marker.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_trace_get_marker(void);
+
+/**@ingroup func_qurt_trace_changed
+ Determines whether specific kernel events have occurred. \n
+ Returns a value that indicates whether the specified kernel events have been recorded in the
+ kernel trace buffer since the specified kernel trace marker was obtained.
+
+ The prev_trace_marker parameter specifies a kernel trace marker that was obtained by calling
+ qurt_trace_get_marker().
+ @cond rest_dist For more information on the mask value, see the description of the trace_mask element in
+ @xhyperref{80VB41992,80-VB419-92}. \n @endcond
+
+ @note1hang Used with qurt_trace_get_marker(), this function determines whether
+            certain kernel events occurred in a block of code.\n
+ @note1cont This function cannot determine whether a specific kernel event type has
+            occurred unless that event type has been enabled in the trace_mask element
+            of the system configuration file. \n
+ @note1cont QuRT supports the recording of interrupt and context switch events only (such as
+            a trace_mask value of 0x3).
+
+ @param[in] prev_trace_marker Previous kernel trace marker.
+ @param[in] trace_mask Mask value that indicates which kernel events to check for.
+
+ @returns
+ 1 -- Kernel events of the specified type have occurred since the
+      specified trace marker was obtained.\n
+ 0 -- No kernel events of the specified type have occurred since the
+      specified trace marker was obtained.
+
+ @dependencies
+ None.
+*/
+int qurt_trace_changed(unsigned int prev_trace_marker, unsigned int trace_mask);
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+/** @addtogroup function_tracing_macro
+@{ */
+#ifndef QURT_DEBUG
+#define QURT_TRACE(str, ...) __VA_ARGS__
+    /**< Function tracing is implemented with the QURT_TRACE debug macro, which
+    optionally generates printf statements both before and after every function call that is
+    passed as a macro argument.
+
+    For example, the following macro call in the source code:
+    @code
+    QURT_TRACE(myfunc, my_func(33))
+
+    @endcode
+    generates the following debug output:
+    @code
+    myfile:nnn: myfunc: >>> calling my_func(33)
+    myfile:nnn: myfunc: <<< my_func(33) returned
+    @endcode
+    The debug output includes the source file and line number of the function call, along with
+    the text of the call. Compile the client source file with -D __FILENAME__
+    defined for its file name.
+
+    The library function qurt_printf() generates the debug output.
+    The QURT_DEBUG symbol controls generation of the debug output. If this symbol is
+    not defined, function tracing is not generated.\n
+    @note1hang The debug macro is accessed through the QuRT API header file.
+    */
+#else
+#define QURT_TRACE(str, ...) \
+    do { \
+        qurt_printf("%s:%d: %s: >>> calling %s\n",__FILENAME__,__LINE__,(str),#__VA_ARGS__); \
+        __VA_ARGS__; \
+        qurt_printf("%s:%d: %s: <<< %s returned\n",__FILENAME__,__LINE__,(str),#__VA_ARGS__); \
+    } while (0);
+#endif
+/** @} */ /* end_addtogroup function_tracing_macro */
+
+/**@ingroup func_qurt_etm_set_pc_range
+ Sets the PC address range for ETM filtering.
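+
+ A hedged sketch that limits tracing to one code region (the linker-symbol
+ bounds below are illustrative assumptions, not part of this API):
+ @code
+ extern char __hot_start[], __hot_end[];   // assumed section bounds
+
+ unsigned int rc = qurt_etm_set_pc_range (0,
+                                          (unsigned int) __hot_start,
+                                          (unsigned int) __hot_end);
+ // rc is QURT_ETM_SETUP_OK on success
+ @endcode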
+ Depending on the Hexagon core design, a maximum of four PC ranges are supported.
+
+ @param[in] range_num 0 to 3.
+ @param[in] low_addr  Lower boundary of the PC address range.
+ @param[in] high_addr Upper boundary of the PC address range.
+
+ @returns
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_etm_set_pc_range(unsigned int range_num, unsigned int low_addr, unsigned int high_addr);
+
+/**@ingroup func_qurt_etm_set_range
+ Sets the address range for ETM filtering.
+ This function lets the caller select the source type of the addresses: #QURT_ETM_SOURCE_PC or #QURT_ETM_SOURCE_DATA.
+
+ @param[in] addr_source_type Type of the address source:\n
+                             - #QURT_ETM_SOURCE_PC \n
+                             - #QURT_ETM_SOURCE_DATA @tablebulletend
+ @param[in] trig_block_num   0 to 3.
+ @param[in] pid              PID of the process: \n
+                             - Any valid PID number enables ASID-based trace filtering. \n
+                             - QURT_ETM_NO_PID disables ASID-based trace filtering. @tablebulletend
+ @param[in] low_addr         Lower boundary of the PC address range.
+ @param[in] high_addr        Upper boundary of the PC address range.
+
+ @returns
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_etm_set_range(unsigned int addr_source_type, unsigned int trig_block_num, unsigned int pid, unsigned int low_addr, unsigned int high_addr);
+
+/**@ingroup func_qurt_etm_set_atb
+ Sets the advanced trace bus (ATB) state to notify QuRT that the ATB is actively enabled or disabled.
+ QuRT performs the corresponding actions during low-power management.
+
+ @param[in] flag Values: \n
+                 #QURT_ATB_ON \n
+                 #QURT_ATB_OFF
+
+ @returns
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_etm_set_atb(unsigned int flag);
+
+/**@ingroup func_qurt_etm_set_sync_period
+ Sets the period for each type of synchronization trace packet. \n
+ ASYNC defines the period between alignment synchronization packets.
+ The period is in terms of bytes in the packet stream. \n
+ ISYNC defines the period between instruction synchronization packets.
+ The period is per thread and is defined as the bytes sent out for that thread. \n
+ GSYNC defines the period, in thread cycles, between GSYNC packets.
+
+ @param[in] sync_type Type of synchronization packets: \n
+                      #QURT_ETM_ASYNC_PERIOD \n
+                      #QURT_ETM_ISYNC_PERIOD \n
+                      #QURT_ETM_GSYNC_PERIOD
+ @param[in] period    Period value.
+
+ @return
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+ */
+unsigned int qurt_etm_set_sync_period(unsigned int sync_type, unsigned int period);
+
+/**@ingroup func_qurt_stm_trace_set_config
+ Sets up an STM port for tracing events.
+
+ @datatypes
+ #qurt_stm_trace_info_t
+
+ @param[in] stm_config_info Pointer to the STM trace information used to set up the trace
+                            in the kernel.
+                            The structure must contain the following:\n
+                            - One port address per hardware thread \n
+                            - Event ID for context switches \n
+                            - Event ID for interrupt tracing \n
+                            - Header or marker to identify the beginning of the trace. @tablebulletend
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EINVALID -- Failure; possibly because the passed port address is not in the page table.
+
+ @dependencies
+ None.
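+
+ A minimal configuration sketch (every address, event ID, and marker value
+ below is a placeholder, not a real mapping):
+ @code
+ qurt_stm_trace_info_t cfg = {
+     .stm_port_addr      = { 0x10000000, 0x10000100, 0x10000200,
+                             0x10000300, 0x10000400, 0x10000500 },
+     .thread_event_id    = 1,
+     .interrupt_event_id = 2,
+     .marker             = 0x53544D31,   // "STM1"
+ };
+ unsigned int rc = qurt_stm_trace_set_config (&cfg);
+ // rc is QURT_EOK on success
+ @endcode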
+ */ +unsigned int qurt_stm_trace_set_config(qurt_stm_trace_info_t *stm_config_info); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TRACE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_types.h new file mode 100755 index 0000000000000..bdb83a3fe2fb2 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_types.h @@ -0,0 +1,294 @@ +#ifndef QURT_TYPES_H +#define QURT_TYPES_H +/** + @file qurt_types.h + @brief Contains types common to all configurations + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +//#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define PGA_BITFIELD_MASK(hi,lo) (((~0u)>>(31U-((hi)-(lo))))<<(lo)) +#define PGA_BITFIELD_GET(x,hi,lo) (((x)&PGA_BITFIELD_MASK((hi),(lo)))>>(lo)) +#define PGA_BITFIELD_INS(hi,lo,v) (((v)<<(lo))&PGA_BITFIELD_MASK((hi),(lo))) +#define PGA_BITFIELD_SET(x,hi,lo,v) ((x)=((x)&~PGA_BITFIELD_MASK((hi),(lo)))|PGA_BITFIELD_INS((hi),(lo),(v))) +#define QURT_PGATTR_C_GET(pga) PGA_BITFIELD_GET((pga).pga_value, 3U, 0U) /* Bits 3-0: cache */ +#define QURT_PGATTR_A_GET(pga) PGA_BITFIELD_GET((pga).pga_value, 5U, 4U) /* Bits 5-4: bus attr */ +#define QURT_PGATTR_C_SET(pga,v) PGA_BITFIELD_SET((pga).pga_value, 3U, 0U, (v)) /* Bits 3-0: cache */ +#define QURT_PGATTR_A_SET(pga,v) PGA_BITFIELD_SET((pga).pga_value, 5U, 4U, (v)) /* Bits 5-4: bus attr */ +#define QURT_PGATTR_MKRAW(v) ((qurt_pgattr_t){.pga_value = (v)}) +#define QURT_PGATTR_MK(c,a) QURT_PGATTR_MKRAW(PGA_BITFIELD_INS(3U,0U,(c))|PGA_BITFIELD_INS(5U,4U,(a))) + +/*return types for qurt_island_get_status2*/ +#define QURT_ISLAND_MODE_NORMAL 0U /**< Normal operating mode */ +#define QURT_ISLAND_MODE_ISLAND 1U /**< Island mode */ +#define QURT_ISLAND_MODE_EXITING 2U /**< In transition from Island mode to Normal mode */ + +/*============================================================================= + FORWARD DECLARATIONS & TYPEDEFS +=============================================================================*/ +/** @addtogroup memory_management_types +@{ */ +typedef unsigned int qurt_addr_t; /**< QuRT address type.*/ +typedef unsigned int qurt_paddr_t; /**< QuRT physical memory address type. */ +/** @cond rest_reg_dist */ +typedef unsigned long long qurt_addr_64_t; /**< QuRT 64-bit memory address type. */ +typedef unsigned long long qurt_paddr_64_t; /**< QuRT 64-bit physical memory address type. */ +typedef unsigned int qurt_mem_region_t; /**< QuRT memory regions type. */ +typedef unsigned int qurt_mem_fs_region_t; /**< QuRT memory FS region type. */ +/**@endcond */ +typedef unsigned int qurt_mem_pool_t; /**< QuRT memory pool type.*/ +typedef unsigned int qurt_size_t; /**< QuRT size type. */ +/** @cond */ +typedef unsigned long long qurt_mmu_entry_t;/**< QuRT MMU entry type. 
*/
+#define QURT_PHYSPOOL_NAME_LEN (32)
+typedef char qurt_physpool_name_t[QURT_PHYSPOOL_NAME_LEN];
+
+
+/*
+ * Mapping type
+ *
+ * QMEM_MAPPING_VIRTUAL is the default mode, in which the system
+ * picks the available range of the virtual address and maps it to
+ * available contiguous physical addresses. Physical-to-virtual
+ * is not guaranteed to be 1:1; both the virtual and physical memory are
+ * contiguous.
+ *
+ * In QMEM_MAPPING_IDEMPOTENT mode, the user provides the physical address;
+ * the kernel allocates 1:1 physical-to-virtual memory. The primary use
+ * of this mapping is to allocate physical-to-virtual memory 1:1.
+ *
+ * In QMEM_MAPPING_PHYS_CONTIGUOUS mode, the virtual address might
+ * not be the same as the physical address. However, the physical address of the
+ * memory region is guaranteed to be contiguous starting at the provided
+ * address, and a fixed physical address must be provided. The primary
+ * use of this mapping is to allocate physical memory from a particular
+ * address, where 1:1 physical-to-virtual is not required.
+ *
+ * QMEM_MAPPING_NONE mode must be used to reserve a virtual memory
+ * area (VMA); no physical memory is reserved or mapped to this virtual
+ * space. All standard qmem_region APIs apply to a VMA; however, the physical
+ * address is always INVALID_ADDR. qmem_region_create() in this mode
+ * returns a handle to the VMA; both virt_addr and phys_addr must
+ * be set to INVALID_ADDR, and the kernel allocates any available virtual
+ * memory of the specified size. Obtain the starting virtual address
+ * of the VMA through qmem_region_attr_getvirtaddr().
+ * The primary purpose of this mapping mode is to provide a mechanism for
+ * delayed binding in QuRT, for example, reserving virtual memory and mapping it at
+ * some later time to possibly discontiguous physical blocks. Thus, a
+ * single VMA can be partitioned among several physical-virtual mappings
+ * created via qmem_region_create() with the QMEM_MAPPING_VIRTUAL_FIXED mapping mode.
+ * Each VMA keeps track of its associated mapped regions.
+ * Deletion of a VMA succeeds only if all associated "virtual_fixed"
+ * regions are freed prior to the VMA deletion.
+ *
+ * Use QMEM_MAPPING_VIRTUAL_FIXED mode to create a region
+ * from virtual space that has been reserved via qmem_region_create()
+ * with the QMEM_MAPPING_NONE mapping. A valid virt_addr is required. If
+ * phys_addr is specified, the kernel attempts to map it accordingly;
+ * if no phys_addr is specified, the kernel maps any available physical
+ * memory. All standard qmem_region APIs apply to such a region. Remapping
+ * a virtual range without prior freeing of the region is not permitted.
+ * When such a region is deleted, its corresponding VMA remains intact.
+ *
+ * QMEM_MAPPING_PHYS_DISCONTIGUOUS mode obtains contiguous
+ * virtual memory, but the physical memory can be discontiguous. This method
+ * tries to combine small physical memory blocks to satisfy the requested
+ * size and is useful when no contiguous full block
+ * of the requested size is available. If the client does not need contiguous physical memory
+ * (for example, if the client does not use physical addressing), this helps
+ * use smaller physical memory blocks rather than consuming contiguous memory.
+ * Note: When memory is allocated through this method, the physical address is
+ * not returned to the caller by the qurt_mem_region_attr_get() API, as there might
+ * not be a single physical address.
+ *
+ */
+/**@endcond */
+/** QuRT memory region mapping type. */
+typedef enum {
+        QURT_MEM_MAPPING_VIRTUAL=0, /**< Default mode.
The region virtual address range maps to an
+                                             available contiguous area of physical memory. For the most
+                                             efficient use of virtual memory, the QuRT system
+                                             chooses the base address in physical memory. This works for most memory
+                                             use cases.*/
+        QURT_MEM_MAPPING_PHYS_CONTIGUOUS = 1, /**< The region virtual address space must be mapped to a
+                                             contiguous area of physical memory. This is necessary when the
+                                             memory region is accessed by external devices that bypass Hexagon
+                                             virtual memory addressing. The base address in physical
+                                             memory must be explicitly specified.*/
+        QURT_MEM_MAPPING_IDEMPOTENT=2, /**< The region virtual address space maps
+                                             to the identical area of physical memory. */
+        QURT_MEM_MAPPING_VIRTUAL_FIXED=3, /**< The virtual address space of the region maps either to the
+                                             specified area of physical memory or (if no area is specified)
+                                             to available physical memory. Use this mapping to create
+                                             regions from virtual space that was reserved by calling
+                                             qurt_mem_region_create() with the #QURT_MEM_MAPPING_NONE mapping. */
+        QURT_MEM_MAPPING_NONE=4, /**< Reserves a virtual memory area (VMA). Remapping a virtual range is not
+                                             permitted without first deleting the memory region. When such a region is
+                                             deleted, its corresponding virtual memory addressing remains intact. */
+        QURT_MEM_MAPPING_VIRTUAL_RANDOM=7, /**< The system chooses a random virtual address and
+                                             maps it to available contiguous physical addresses.*/
+        QURT_MEM_MAPPING_PHYS_DISCONTIGUOUS=8, /**< Virtual memory is contiguous, but the allocation is made from
+                                             discontiguous physical memory blocks. This helps when the available
+                                             contiguous blocks are smaller than the requested size.
+                                             The physical address is not provided as part of the get_attr call. */
+        QURT_MEM_MAPPING_INVALID=10, /**< Reserved as an invalid mapping type. */
+} qurt_mem_mapping_t;
+
+
+/** QuRT cache mode type. */
+typedef enum {
+        QURT_MEM_CACHE_WRITEBACK=7, /**< Write back. */
+        QURT_MEM_CACHE_NONE_SHARED=6, /**< Normal uncached memory that can be shared with other subsystems.*/
+        QURT_MEM_CACHE_WRITETHROUGH=5, /**< Write through. */
+        QURT_MEM_CACHE_WRITEBACK_NONL2CACHEABLE=0, /**< Write back non-L2-cacheable.*/
+        QURT_MEM_CACHE_WRITETHROUGH_NONL2CACHEABLE=1, /**< Write through non-L2-cacheable. */
+        QURT_MEM_CACHE_WRITEBACK_L2CACHEABLE=QURT_MEM_CACHE_WRITEBACK, /**< Write back L2 cacheable. */
+        QURT_MEM_CACHE_WRITETHROUGH_L2CACHEABLE=QURT_MEM_CACHE_WRITETHROUGH, /**< Write through L2 cacheable. */
+        QURT_MEM_CACHE_DEVICE = 4, /**< Volatile memory-mapped device. Access to device memory cannot be cancelled by interrupts, re-ordered, or replayed.*/
+        QURT_MEM_CACHE_NONE = 4, /**< Deprecated -- use #QURT_MEM_CACHE_DEVICE instead. */
+        QURT_MEM_CACHE_DEVICE_SFC = 2, /**< Enables placing limitations on the number of outstanding transactions. */
+        QURT_MEM_CACHE_INVALID=10, /**< Reserved as an invalid cache type. */
+} qurt_mem_cache_mode_t;
+
+/** Memory access permission. */
+#define QURT_PERM_NONE    0x0U /**< No permission. */
+#define QURT_PERM_READ    0x1U /**< Read permission. */
+#define QURT_PERM_WRITE   0x2U /**< Write permission. */
+#define QURT_PERM_EXECUTE 0x4U /**< Execution permission. */
+#define QURT_PERM_NODUMP  0x8U
+                     /**< Skip dumping the mapping. During a process domain dump, some mappings
+                          on host memory must be skipped to avoid a race condition
+                          where the memory is removed from the host and the DSP process
+                          crashes before the mapping is removed. */
+#define QURT_PERM_FULL (QURT_PERM_READ | QURT_PERM_WRITE | QURT_PERM_EXECUTE) /**< Read, write, and execute permission.
*/ + +typedef unsigned char qurt_perm_t; + + +/** @cond rest_reg_dist*/ +/** QuRT cache type; specifies data cache or instruction cache. */ +typedef enum { + QURT_MEM_ICACHE, /**< Instruction cache.*/ + QURT_MEM_DCACHE /**< Data cache.*/ +} qurt_mem_cache_type_t; + +/** QuRT cache operation code type. */ +typedef enum { + QURT_MEM_CACHE_FLUSH, /**< Flush. */ + QURT_MEM_CACHE_INVALIDATE, /**< Invalidate */ + QURT_MEM_CACHE_FLUSH_INVALIDATE, /**< Flush invalidate. */ + QURT_MEM_CACHE_FLUSH_ALL, /**< Flush all. */ + QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL, /**< Flush invalidate all. */ + QURT_MEM_CACHE_TABLE_FLUSH_INVALIDATE, /**< Table flush invalidate. */ + QURT_MEM_CACHE_FLUSH_INVALIDATE_L2, /**< L2 flush invalidate.*/ +} qurt_mem_cache_op_t; + +/** QuRT memory region type. */ +typedef enum { + QURT_MEM_REGION_LOCAL=0, /**< Local. */ + QURT_MEM_REGION_SHARED=1, /**< Shared.*/ + QURT_MEM_REGION_USER_ACCESS=2, /**< User access. */ + QURT_MEM_REGION_FS=4, /**< FS. */ + QURT_MEM_REGION_INVALID=10, /**< Reserved as an invalid region type. */ +} qurt_mem_region_type_t; + +/* Cache and bus attributes are combined into a value of this type for convenience, + and macros for combining and extracting fields are defined here. */ +/** @cond */ +struct qurt_pgattr { + unsigned pga_value; /**< PGA value.*/ +}; +typedef struct qurt_pgattr qurt_pgattr_t; +/** @endcond */ +/** QuRT memory region attributes type.*/ +/* QMEM_MAPPING_IDEMPOTENT and QMEM_MAPPING_PHYS_CONTIGUOUS mode can specify physaddr. + virtaddr cannot be specified for a memory region, it can only be queried by the + qmem_attr_getvirtaddr() function. + */ +typedef struct { + /** @cond */ + qurt_mem_mapping_t mapping_type; + unsigned char perms; + unsigned short owner; + qurt_pgattr_t pga; + unsigned ppn; //physical page number (physical>>12) + qurt_addr_t virtaddr; + qurt_mem_region_type_t type; + qurt_size_t size; + /** @endcond */ +} qurt_mem_region_attr_t; + + +/** QuRT user physical memory pool type. */ +typedef struct { + /** @cond */ + char name[32]; + struct ranges{ + unsigned int start; + unsigned int size; + } ranges[MAX_POOL_RANGES]; + /** @endcond */ +} qurt_mem_pool_attr_t; + +/** QuRT memory pool status type.*/ +typedef struct _qurt_mem_pool_status { + + qurt_size_t contig_size; /**< Largest contiguous free memory in bytes. */ + qurt_size_t free_size; /**< Total free memory in bytes. */ + qurt_size_t total_size; /**< Total declared memory in bytes. */ + +} qurt_mem_pool_status_t; + +typedef enum { + HEXAGON_L1_I_CACHE = 0, /**< Hexagon L1 instruction cache. */ + HEXAGON_L1_D_CACHE = 1, /**< Hexagon L1 data cache. */ + HEXAGON_L2_CACHE = 2 /**< Hexagon L2 cache. */ +} qurt_cache_type_t; + +typedef enum { + FULL_SIZE = 0, /**< Fully shared cache, without partitioning. */ + HALF_SIZE = 1, /**< 1/2 for main, 1/2 for auxiliary. */ + THREE_QUARTER_SIZE = 2, /**< 3/4 for main, 1/4 for auxiliary. */ + SEVEN_EIGHTHS_SIZE = 3 /**< 7/8 for main, 1/8 for auxiliary; for L2 cache only. */ +} qurt_cache_partition_size_t; + +typedef enum { + QURT_PROCESS_CB_GENERIC, /**< generic unconditional cb called after image loading. */ + QURT_PROCESS_NOTE_CB_PRE_MAP, /**< note cb called before segment loading. */ + QURT_PROCESS_NOTE_CB_POST_MAP /**< note cb called after segment loading. 
*/ +} qurt_process_cb_type_t; + +typedef union { + void *ptr; + int num; +} qurt_process_callback_arg_t; + + +/**@endcond*/ + +/** @} */ /* end_addtogroup memory_management_types */ +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TYPES_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_user_dma.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_user_dma.h new file mode 100755 index 0000000000000..e05a6429fd703 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_user_dma.h @@ -0,0 +1,44 @@ +#ifndef QURT_USER_DMA_H +#define QURT_USER_DMA_H + +/** + @file qurt_user_dma.h + @brief Definitions, macros, and prototypes used for handling user DMA. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup qurt_user_dma_dmsyncht + Sends the DMSyncht command to the user DMA engine. + + Call this function to ensure all posted DMA memory operations are + complete. + + This stalls the current thread until the instruction + is complete and returns. + + @return + QURT_EOK - On dmsyncht completion \n + QURT_ENOTSUPPORTED - User DMA not supported + + @dependencies + None. +*/ +int qurt_user_dma_dmsyncht(void); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_vtlb.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_vtlb.h new file mode 100755 index 0000000000000..e064042e447ac --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/include/qurt/qurt_vtlb.h @@ -0,0 +1,76 @@ +/*============================================================================= + + qurt_vtlb.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2019, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +=============================================================================*/ +#ifndef QURT_VTLB_H +#define QURT_VTLB_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| Names starting with "qurt_i_vtlb" are the internal low-level functions. +|| These should be considered subject to change. 
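+||
+|| For illustration only, a hedged sketch of the statistics query (layout per
+|| the comment on qurt_i_vtlb_statistics below):
+||
+||     unsigned stats[3];
+||     (void) qurt_i_vtlb_statistics (stats);
+||     // stats[0] total entries, stats[1] available, stats[2] peak tree size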
+*/ + +int qurt_i_vtlb_entry_create(unsigned *pIndex, + unsigned tlb_lo, + unsigned tlb_hi, + unsigned extension); + +int qurt_i_vtlb_entry_create_with_pid(unsigned *pIndex, + unsigned tlb_lo, + unsigned tlb_hi, + unsigned extension, + unsigned target_pid); + +int qurt_i_vtlb_entry_delete(unsigned index); + +int qurt_i_vtlb_entry_read(unsigned index, unsigned *tlbinfo); + +int qurt_i_vtlb_entry_write(unsigned index, unsigned tlb_lo, unsigned tlb_hi, unsigned extension); + +int qurt_i_vtlb_entry_write_with_pid(unsigned index, unsigned tlb_lo, unsigned tlb_hi, unsigned extension, unsigned target_pid); + +int qurt_i_vtlb_entry_probe(const void *vaddr, unsigned *tlbinfo, unsigned *pIndex); + +int qurt_i_vtlb_entry_probe_with_pid(const void *vaddr, unsigned *tlbinfo, unsigned *pIndex, unsigned target_pid); + + +int qurt_i_vtlb_statistics(unsigned *stats); // Returns stats[0] -- total number of VTLB entries + // stats[1] -- number of available VTLB entries + // stats[2] -- max size of VTLB tree since boot + +//can return index to an entry that was specialed, change it to take addresses instead of pages +int qurt_i_vtlb_set_special(int index, unsigned pageno, unsigned asid, unsigned size); + +int qurt_i_vtlb_queue_ppage(unsigned pageno, unsigned vtlb_index); + +#define QURT_VTLB_EXT_DEFAULT 0U +#define QURT_VTLB_EXT_LOCKED 1U +#define QURT_VTLB_EXT_EXCLUDE_DUMP 2U /* Temporary ability to skip certain mappings in pd dump */ +#define QURT_VTLB_EXT_FREELIST 0x800000u + +#define QURT_VTLB_ERR_OVERLAP -64 +#define QURT_VTLB_ERR_TREE_NO_SPACE -65 +#define QURT_VTLB_ERR_INVALID_SIZE -68 +#define QURT_VTLB_ERR_INVALID_EXT -69 +#define QURT_VTLB_ERR_DEL_PGT_LOCKED -70 +#define QURT_VTLB_ERR_PGT_LOCK_CNT -71 + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif // QURT_VTLB_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libposix.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libposix.a new file mode 100755 index 0000000000000..ca0bdbacb0604 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libposix.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libqurt.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libqurt.a new file mode 100755 index 0000000000000..91fc230d94d3b Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libqurt.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libqurtcfs.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libqurtcfs.a new file mode 100755 index 0000000000000..e7a8102d8cb40 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libqurtcfs.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libtimer_island.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libtimer_island.a new file mode 100755 index 0000000000000..32ce17efe453e Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libtimer_island.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libtimer_main.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libtimer_main.a new file mode 100755 index 0000000000000..a67c32e005b95 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/libtimer_main.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libposix.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libposix.a new file mode 100755 
index 0000000000000..1e0afa4db765b
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libposix.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libqurt.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libqurt.a
new file mode 100755
index 0000000000000..fff03b0877eb8
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libqurt.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libqurtcfs.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libqurtcfs.a
new file mode 100755
index 0000000000000..e7a8102d8cb40
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libqurtcfs.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libtimer.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libtimer.a
new file mode 100755
index 0000000000000..cd856bdb8c5cf
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev69/lib/pic/libtimer.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/bits/confname.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/bits/confname.h
new file mode 100755
index 0000000000000..d9ca3135501e3
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/bits/confname.h
@@ -0,0 +1,528 @@
+#ifndef CONFNAME_H
+#define CONFNAME_H
+/**
+  @file confname.h
+  @brief Named literals for the 'name' argument of sysconf and pathconf
+
+EXTERNAL FUNCTIONS
+  None
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  Do not include this header directly; include unistd.h instead. For now, since the
+  toolchain does not provide a hook for including bits/confname.h, we stick this
+  header in QuRT's sys/types.h
+
+Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+/* Values for the NAME argument to `pathconf' and `fpathconf'. */
+enum
+{
+  _PC_LINK_MAX,
+#define _PC_LINK_MAX _PC_LINK_MAX
+  _PC_MAX_CANON,
+#define _PC_MAX_CANON _PC_MAX_CANON
+  _PC_MAX_INPUT,
+#define _PC_MAX_INPUT _PC_MAX_INPUT
+  _PC_NAME_MAX,
+#define _PC_NAME_MAX _PC_NAME_MAX
+  _PC_PATH_MAX,
+#define _PC_PATH_MAX _PC_PATH_MAX
+  _PC_PIPE_BUF,
+#define _PC_PIPE_BUF _PC_PIPE_BUF
+  _PC_CHOWN_RESTRICTED,
+#define _PC_CHOWN_RESTRICTED _PC_CHOWN_RESTRICTED
+  _PC_NO_TRUNC,
+#define _PC_NO_TRUNC _PC_NO_TRUNC
+  _PC_VDISABLE,
+#define _PC_VDISABLE _PC_VDISABLE
+  _PC_SYNC_IO,
+#define _PC_SYNC_IO _PC_SYNC_IO
+  _PC_ASYNC_IO,
+#define _PC_ASYNC_IO _PC_ASYNC_IO
+  _PC_PRIO_IO,
+#define _PC_PRIO_IO _PC_PRIO_IO
+  _PC_SOCK_MAXBUF,
+#define _PC_SOCK_MAXBUF _PC_SOCK_MAXBUF
+  _PC_FILESIZEBITS,
+#define _PC_FILESIZEBITS _PC_FILESIZEBITS
+  _PC_REC_INCR_XFER_SIZE,
+#define _PC_REC_INCR_XFER_SIZE _PC_REC_INCR_XFER_SIZE
+  _PC_REC_MAX_XFER_SIZE,
+#define _PC_REC_MAX_XFER_SIZE _PC_REC_MAX_XFER_SIZE
+  _PC_REC_MIN_XFER_SIZE,
+#define _PC_REC_MIN_XFER_SIZE _PC_REC_MIN_XFER_SIZE
+  _PC_REC_XFER_ALIGN,
+#define _PC_REC_XFER_ALIGN _PC_REC_XFER_ALIGN
+  _PC_ALLOC_SIZE_MIN,
+#define _PC_ALLOC_SIZE_MIN _PC_ALLOC_SIZE_MIN
+  _PC_SYMLINK_MAX,
+#define _PC_SYMLINK_MAX _PC_SYMLINK_MAX
+  _PC_2_SYMLINKS
+#define _PC_2_SYMLINKS _PC_2_SYMLINKS
+};
+
+/* Values for the argument to `sysconf'.
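+   For example (sysconf() itself is declared in unistd.h, not in this header):
+
+       long page_size = sysconf(_SC_PAGESIZE);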
*/ +enum +{ + _SC_ARG_MAX, +#define _SC_ARG_MAX _SC_ARG_MAX + _SC_CHILD_MAX, +#define _SC_CHILD_MAX _SC_CHILD_MAX + _SC_CLK_TCK, +#define _SC_CLK_TCK _SC_CLK_TCK + _SC_NGROUPS_MAX, +#define _SC_NGROUPS_MAX _SC_NGROUPS_MAX + _SC_OPEN_MAX, +#define _SC_OPEN_MAX _SC_OPEN_MAX + _SC_STREAM_MAX, +#define _SC_STREAM_MAX _SC_STREAM_MAX + _SC_TZNAME_MAX, +#define _SC_TZNAME_MAX _SC_TZNAME_MAX + _SC_JOB_CONTROL, +#define _SC_JOB_CONTROL _SC_JOB_CONTROL + _SC_SAVED_IDS, +#define _SC_SAVED_IDS _SC_SAVED_IDS + _SC_REALTIME_SIGNALS, +#define _SC_REALTIME_SIGNALS _SC_REALTIME_SIGNALS + _SC_PRIORITY_SCHEDULING, +#define _SC_PRIORITY_SCHEDULING _SC_PRIORITY_SCHEDULING + _SC_TIMERS, +#define _SC_TIMERS _SC_TIMERS + _SC_ASYNCHRONOUS_IO, +#define _SC_ASYNCHRONOUS_IO _SC_ASYNCHRONOUS_IO + _SC_PRIORITIZED_IO, +#define _SC_PRIORITIZED_IO _SC_PRIORITIZED_IO + _SC_SYNCHRONIZED_IO, +#define _SC_SYNCHRONIZED_IO _SC_SYNCHRONIZED_IO + _SC_FSYNC, +#define _SC_FSYNC _SC_FSYNC + _SC_MAPPED_FILES, +#define _SC_MAPPED_FILES _SC_MAPPED_FILES + _SC_MEMLOCK, +#define _SC_MEMLOCK _SC_MEMLOCK + _SC_MEMLOCK_RANGE, +#define _SC_MEMLOCK_RANGE _SC_MEMLOCK_RANGE + _SC_MEMORY_PROTECTION, +#define _SC_MEMORY_PROTECTION _SC_MEMORY_PROTECTION + _SC_MESSAGE_PASSING, +#define _SC_MESSAGE_PASSING _SC_MESSAGE_PASSING + _SC_SEMAPHORES, +#define _SC_SEMAPHORES _SC_SEMAPHORES + _SC_SHARED_MEMORY_OBJECTS, +#define _SC_SHARED_MEMORY_OBJECTS _SC_SHARED_MEMORY_OBJECTS + _SC_AIO_LISTIO_MAX, +#define _SC_AIO_LISTIO_MAX _SC_AIO_LISTIO_MAX + _SC_AIO_MAX, +#define _SC_AIO_MAX _SC_AIO_MAX + _SC_AIO_PRIO_DELTA_MAX, +#define _SC_AIO_PRIO_DELTA_MAX _SC_AIO_PRIO_DELTA_MAX + _SC_DELAYTIMER_MAX, +#define _SC_DELAYTIMER_MAX _SC_DELAYTIMER_MAX + _SC_MQ_OPEN_MAX, +#define _SC_MQ_OPEN_MAX _SC_MQ_OPEN_MAX + _SC_MQ_PRIO_MAX, +#define _SC_MQ_PRIO_MAX _SC_MQ_PRIO_MAX + _SC_VERSION, +#define _SC_VERSION _SC_VERSION + _SC_PAGESIZE, +#define _SC_PAGESIZE _SC_PAGESIZE +#define _SC_PAGE_SIZE _SC_PAGESIZE + _SC_RTSIG_MAX, +#define _SC_RTSIG_MAX _SC_RTSIG_MAX + _SC_SEM_NSEMS_MAX, +#define _SC_SEM_NSEMS_MAX _SC_SEM_NSEMS_MAX + _SC_SEM_VALUE_MAX, +#define _SC_SEM_VALUE_MAX _SC_SEM_VALUE_MAX + _SC_SIGQUEUE_MAX, +#define _SC_SIGQUEUE_MAX _SC_SIGQUEUE_MAX + _SC_TIMER_MAX, +#define _SC_TIMER_MAX _SC_TIMER_MAX + + /* Values for the argument to `sysconf' + corresponding to _POSIX2_* symbols. 
*/ + _SC_BC_BASE_MAX, +#define _SC_BC_BASE_MAX _SC_BC_BASE_MAX + _SC_BC_DIM_MAX, +#define _SC_BC_DIM_MAX _SC_BC_DIM_MAX + _SC_BC_SCALE_MAX, +#define _SC_BC_SCALE_MAX _SC_BC_SCALE_MAX + _SC_BC_STRING_MAX, +#define _SC_BC_STRING_MAX _SC_BC_STRING_MAX + _SC_COLL_WEIGHTS_MAX, +#define _SC_COLL_WEIGHTS_MAX _SC_COLL_WEIGHTS_MAX + _SC_EQUIV_CLASS_MAX, +#define _SC_EQUIV_CLASS_MAX _SC_EQUIV_CLASS_MAX + _SC_EXPR_NEST_MAX, +#define _SC_EXPR_NEST_MAX _SC_EXPR_NEST_MAX + _SC_LINE_MAX, +#define _SC_LINE_MAX _SC_LINE_MAX + _SC_RE_DUP_MAX, +#define _SC_RE_DUP_MAX _SC_RE_DUP_MAX + _SC_CHARCLASS_NAME_MAX, +#define _SC_CHARCLASS_NAME_MAX _SC_CHARCLASS_NAME_MAX + + _SC_2_VERSION, +#define _SC_2_VERSION _SC_2_VERSION + _SC_2_C_BIND, +#define _SC_2_C_BIND _SC_2_C_BIND + _SC_2_C_DEV, +#define _SC_2_C_DEV _SC_2_C_DEV + _SC_2_FORT_DEV, +#define _SC_2_FORT_DEV _SC_2_FORT_DEV + _SC_2_FORT_RUN, +#define _SC_2_FORT_RUN _SC_2_FORT_RUN + _SC_2_SW_DEV, +#define _SC_2_SW_DEV _SC_2_SW_DEV + _SC_2_LOCALEDEF, +#define _SC_2_LOCALEDEF _SC_2_LOCALEDEF + + _SC_PII, +#define _SC_PII _SC_PII + _SC_PII_XTI, +#define _SC_PII_XTI _SC_PII_XTI + _SC_PII_SOCKET, +#define _SC_PII_SOCKET _SC_PII_SOCKET + _SC_PII_INTERNET, +#define _SC_PII_INTERNET _SC_PII_INTERNET + _SC_PII_OSI, +#define _SC_PII_OSI _SC_PII_OSI + _SC_POLL, +#define _SC_POLL _SC_POLL + _SC_SELECT, +#define _SC_SELECT _SC_SELECT + _SC_UIO_MAXIOV, +#define _SC_UIO_MAXIOV _SC_UIO_MAXIOV + _SC_IOV_MAX = _SC_UIO_MAXIOV, +#define _SC_IOV_MAX _SC_IOV_MAX + _SC_PII_INTERNET_STREAM, +#define _SC_PII_INTERNET_STREAM _SC_PII_INTERNET_STREAM + _SC_PII_INTERNET_DGRAM, +#define _SC_PII_INTERNET_DGRAM _SC_PII_INTERNET_DGRAM + _SC_PII_OSI_COTS, +#define _SC_PII_OSI_COTS _SC_PII_OSI_COTS + _SC_PII_OSI_CLTS, +#define _SC_PII_OSI_CLTS _SC_PII_OSI_CLTS + _SC_PII_OSI_M, +#define _SC_PII_OSI_M _SC_PII_OSI_M + _SC_T_IOV_MAX, +#define _SC_T_IOV_MAX _SC_T_IOV_MAX + + /* Values according to POSIX 1003.1c (POSIX threads). 
*/ + _SC_THREADS, +#define _SC_THREADS _SC_THREADS + _SC_THREAD_SAFE_FUNCTIONS, +#define _SC_THREAD_SAFE_FUNCTIONS _SC_THREAD_SAFE_FUNCTIONS + _SC_GETGR_R_SIZE_MAX, +#define _SC_GETGR_R_SIZE_MAX _SC_GETGR_R_SIZE_MAX + _SC_GETPW_R_SIZE_MAX, +#define _SC_GETPW_R_SIZE_MAX _SC_GETPW_R_SIZE_MAX + _SC_LOGIN_NAME_MAX, +#define _SC_LOGIN_NAME_MAX _SC_LOGIN_NAME_MAX + _SC_TTY_NAME_MAX, +#define _SC_TTY_NAME_MAX _SC_TTY_NAME_MAX + _SC_THREAD_DESTRUCTOR_ITERATIONS, +#define _SC_THREAD_DESTRUCTOR_ITERATIONS _SC_THREAD_DESTRUCTOR_ITERATIONS + _SC_THREAD_KEYS_MAX, +#define _SC_THREAD_KEYS_MAX _SC_THREAD_KEYS_MAX + _SC_THREAD_STACK_MIN, +#define _SC_THREAD_STACK_MIN _SC_THREAD_STACK_MIN + _SC_THREAD_THREADS_MAX, +#define _SC_THREAD_THREADS_MAX _SC_THREAD_THREADS_MAX + _SC_THREAD_ATTR_STACKADDR, +#define _SC_THREAD_ATTR_STACKADDR _SC_THREAD_ATTR_STACKADDR + _SC_THREAD_ATTR_STACKSIZE, +#define _SC_THREAD_ATTR_STACKSIZE _SC_THREAD_ATTR_STACKSIZE + _SC_THREAD_PRIORITY_SCHEDULING, +#define _SC_THREAD_PRIORITY_SCHEDULING _SC_THREAD_PRIORITY_SCHEDULING + _SC_THREAD_PRIO_INHERIT, +#define _SC_THREAD_PRIO_INHERIT _SC_THREAD_PRIO_INHERIT + _SC_THREAD_PRIO_PROTECT, +#define _SC_THREAD_PRIO_PROTECT _SC_THREAD_PRIO_PROTECT + _SC_THREAD_PROCESS_SHARED, +#define _SC_THREAD_PROCESS_SHARED _SC_THREAD_PROCESS_SHARED + + _SC_NPROCESSORS_CONF, +#define _SC_NPROCESSORS_CONF _SC_NPROCESSORS_CONF + _SC_NPROCESSORS_ONLN, +#define _SC_NPROCESSORS_ONLN _SC_NPROCESSORS_ONLN + _SC_PHYS_PAGES, +#define _SC_PHYS_PAGES _SC_PHYS_PAGES + _SC_AVPHYS_PAGES, +#define _SC_AVPHYS_PAGES _SC_AVPHYS_PAGES + _SC_ATEXIT_MAX, +#define _SC_ATEXIT_MAX _SC_ATEXIT_MAX + _SC_PASS_MAX, +#define _SC_PASS_MAX _SC_PASS_MAX + + _SC_XOPEN_VERSION, +#define _SC_XOPEN_VERSION _SC_XOPEN_VERSION + _SC_XOPEN_XCU_VERSION, +#define _SC_XOPEN_XCU_VERSION _SC_XOPEN_XCU_VERSION + _SC_XOPEN_UNIX, +#define _SC_XOPEN_UNIX _SC_XOPEN_UNIX + _SC_XOPEN_CRYPT, +#define _SC_XOPEN_CRYPT _SC_XOPEN_CRYPT + _SC_XOPEN_ENH_I18N, +#define _SC_XOPEN_ENH_I18N _SC_XOPEN_ENH_I18N + _SC_XOPEN_SHM, +#define _SC_XOPEN_SHM _SC_XOPEN_SHM + + _SC_2_CHAR_TERM, +#define _SC_2_CHAR_TERM _SC_2_CHAR_TERM + _SC_2_C_VERSION, +#define _SC_2_C_VERSION _SC_2_C_VERSION + _SC_2_UPE, +#define _SC_2_UPE _SC_2_UPE + + _SC_XOPEN_XPG2, +#define _SC_XOPEN_XPG2 _SC_XOPEN_XPG2 + _SC_XOPEN_XPG3, +#define _SC_XOPEN_XPG3 _SC_XOPEN_XPG3 + _SC_XOPEN_XPG4, +#define _SC_XOPEN_XPG4 _SC_XOPEN_XPG4 + + _SC_CHAR_BIT, +#define _SC_CHAR_BIT _SC_CHAR_BIT + _SC_CHAR_MAX, +#define _SC_CHAR_MAX _SC_CHAR_MAX + _SC_CHAR_MIN, +#define _SC_CHAR_MIN _SC_CHAR_MIN + _SC_INT_MAX, +#define _SC_INT_MAX _SC_INT_MAX + _SC_INT_MIN, +#define _SC_INT_MIN _SC_INT_MIN + _SC_LONG_BIT, +#define _SC_LONG_BIT _SC_LONG_BIT + _SC_WORD_BIT, +#define _SC_WORD_BIT _SC_WORD_BIT + _SC_MB_LEN_MAX, +#define _SC_MB_LEN_MAX _SC_MB_LEN_MAX + _SC_NZERO, +#define _SC_NZERO _SC_NZERO + _SC_SSIZE_MAX, +#define _SC_SSIZE_MAX _SC_SSIZE_MAX + _SC_SCHAR_MAX, +#define _SC_SCHAR_MAX _SC_SCHAR_MAX + _SC_SCHAR_MIN, +#define _SC_SCHAR_MIN _SC_SCHAR_MIN + _SC_SHRT_MAX, +#define _SC_SHRT_MAX _SC_SHRT_MAX + _SC_SHRT_MIN, +#define _SC_SHRT_MIN _SC_SHRT_MIN + _SC_UCHAR_MAX, +#define _SC_UCHAR_MAX _SC_UCHAR_MAX + _SC_UINT_MAX, +#define _SC_UINT_MAX _SC_UINT_MAX + _SC_ULONG_MAX, +#define _SC_ULONG_MAX _SC_ULONG_MAX + _SC_USHRT_MAX, +#define _SC_USHRT_MAX _SC_USHRT_MAX + + _SC_NL_ARGMAX, +#define _SC_NL_ARGMAX _SC_NL_ARGMAX + _SC_NL_LANGMAX, +#define _SC_NL_LANGMAX _SC_NL_LANGMAX + _SC_NL_MSGMAX, +#define _SC_NL_MSGMAX _SC_NL_MSGMAX + _SC_NL_NMAX, +#define _SC_NL_NMAX _SC_NL_NMAX + 
_SC_NL_SETMAX, +#define _SC_NL_SETMAX _SC_NL_SETMAX + _SC_NL_TEXTMAX, +#define _SC_NL_TEXTMAX _SC_NL_TEXTMAX + + _SC_XBS5_ILP32_OFF32, +#define _SC_XBS5_ILP32_OFF32 _SC_XBS5_ILP32_OFF32 + _SC_XBS5_ILP32_OFFBIG, +#define _SC_XBS5_ILP32_OFFBIG _SC_XBS5_ILP32_OFFBIG + _SC_XBS5_LP64_OFF64, +#define _SC_XBS5_LP64_OFF64 _SC_XBS5_LP64_OFF64 + _SC_XBS5_LPBIG_OFFBIG, +#define _SC_XBS5_LPBIG_OFFBIG _SC_XBS5_LPBIG_OFFBIG + + _SC_XOPEN_LEGACY, +#define _SC_XOPEN_LEGACY _SC_XOPEN_LEGACY + _SC_XOPEN_REALTIME, +#define _SC_XOPEN_REALTIME _SC_XOPEN_REALTIME + _SC_XOPEN_REALTIME_THREADS, +#define _SC_XOPEN_REALTIME_THREADS _SC_XOPEN_REALTIME_THREADS + + _SC_ADVISORY_INFO, +#define _SC_ADVISORY_INFO _SC_ADVISORY_INFO + _SC_BARRIERS, +#define _SC_BARRIERS _SC_BARRIERS + _SC_BASE, +#define _SC_BASE _SC_BASE + _SC_C_LANG_SUPPORT, +#define _SC_C_LANG_SUPPORT _SC_C_LANG_SUPPORT + _SC_C_LANG_SUPPORT_R, +#define _SC_C_LANG_SUPPORT_R _SC_C_LANG_SUPPORT_R + _SC_CLOCK_SELECTION, +#define _SC_CLOCK_SELECTION _SC_CLOCK_SELECTION + _SC_CPUTIME, +#define _SC_CPUTIME _SC_CPUTIME + _SC_THREAD_CPUTIME, +#define _SC_THREAD_CPUTIME _SC_THREAD_CPUTIME + _SC_DEVICE_IO, +#define _SC_DEVICE_IO _SC_DEVICE_IO + _SC_DEVICE_SPECIFIC, +#define _SC_DEVICE_SPECIFIC _SC_DEVICE_SPECIFIC + _SC_DEVICE_SPECIFIC_R, +#define _SC_DEVICE_SPECIFIC_R _SC_DEVICE_SPECIFIC_R + _SC_FD_MGMT, +#define _SC_FD_MGMT _SC_FD_MGMT + _SC_FIFO, +#define _SC_FIFO _SC_FIFO + _SC_PIPE, +#define _SC_PIPE _SC_PIPE + _SC_FILE_ATTRIBUTES, +#define _SC_FILE_ATTRIBUTES _SC_FILE_ATTRIBUTES + _SC_FILE_LOCKING, +#define _SC_FILE_LOCKING _SC_FILE_LOCKING + _SC_FILE_SYSTEM, +#define _SC_FILE_SYSTEM _SC_FILE_SYSTEM + _SC_MONOTONIC_CLOCK, +#define _SC_MONOTONIC_CLOCK _SC_MONOTONIC_CLOCK + _SC_MULTI_PROCESS, +#define _SC_MULTI_PROCESS _SC_MULTI_PROCESS + _SC_SINGLE_PROCESS, +#define _SC_SINGLE_PROCESS _SC_SINGLE_PROCESS + _SC_NETWORKING, +#define _SC_NETWORKING _SC_NETWORKING + _SC_READER_WRITER_LOCKS, +#define _SC_READER_WRITER_LOCKS _SC_READER_WRITER_LOCKS + _SC_SPIN_LOCKS, +#define _SC_SPIN_LOCKS _SC_SPIN_LOCKS + _SC_REGEXP, +#define _SC_REGEXP _SC_REGEXP + _SC_REGEX_VERSION, +#define _SC_REGEX_VERSION _SC_REGEX_VERSION + _SC_SHELL, +#define _SC_SHELL _SC_SHELL + _SC_SIGNALS, +#define _SC_SIGNALS _SC_SIGNALS + _SC_SPAWN, +#define _SC_SPAWN _SC_SPAWN + _SC_SPORADIC_SERVER, +#define _SC_SPORADIC_SERVER _SC_SPORADIC_SERVER + _SC_THREAD_SPORADIC_SERVER, +#define _SC_THREAD_SPORADIC_SERVER _SC_THREAD_SPORADIC_SERVER + _SC_SYSTEM_DATABASE, +#define _SC_SYSTEM_DATABASE _SC_SYSTEM_DATABASE + _SC_SYSTEM_DATABASE_R, +#define _SC_SYSTEM_DATABASE_R _SC_SYSTEM_DATABASE_R + _SC_TIMEOUTS, +#define _SC_TIMEOUTS _SC_TIMEOUTS + _SC_TYPED_MEMORY_OBJECTS, +#define _SC_TYPED_MEMORY_OBJECTS _SC_TYPED_MEMORY_OBJECTS + _SC_USER_GROUPS, +#define _SC_USER_GROUPS _SC_USER_GROUPS + _SC_USER_GROUPS_R, +#define _SC_USER_GROUPS_R _SC_USER_GROUPS_R + _SC_2_PBS, +#define _SC_2_PBS _SC_2_PBS + _SC_2_PBS_ACCOUNTING, +#define _SC_2_PBS_ACCOUNTING _SC_2_PBS_ACCOUNTING + _SC_2_PBS_LOCATE, +#define _SC_2_PBS_LOCATE _SC_2_PBS_LOCATE + _SC_2_PBS_MESSAGE, +#define _SC_2_PBS_MESSAGE _SC_2_PBS_MESSAGE + _SC_2_PBS_TRACK, +#define _SC_2_PBS_TRACK _SC_2_PBS_TRACK + _SC_SYMLOOP_MAX, +#define _SC_SYMLOOP_MAX _SC_SYMLOOP_MAX + _SC_STREAMS, +#define _SC_STREAMS _SC_STREAMS + _SC_2_PBS_CHECKPOINT, +#define _SC_2_PBS_CHECKPOINT _SC_2_PBS_CHECKPOINT + + _SC_V6_ILP32_OFF32, +#define _SC_V6_ILP32_OFF32 _SC_V6_ILP32_OFF32 + _SC_V6_ILP32_OFFBIG, +#define _SC_V6_ILP32_OFFBIG _SC_V6_ILP32_OFFBIG + _SC_V6_LP64_OFF64, +#define 
_SC_V6_LP64_OFF64 _SC_V6_LP64_OFF64 + _SC_V6_LPBIG_OFFBIG, +#define _SC_V6_LPBIG_OFFBIG _SC_V6_LPBIG_OFFBIG + + _SC_HOST_NAME_MAX, +#define _SC_HOST_NAME_MAX _SC_HOST_NAME_MAX + _SC_TRACE, +#define _SC_TRACE _SC_TRACE + _SC_TRACE_EVENT_FILTER, +#define _SC_TRACE_EVENT_FILTER _SC_TRACE_EVENT_FILTER + _SC_TRACE_INHERIT, +#define _SC_TRACE_INHERIT _SC_TRACE_INHERIT + _SC_TRACE_LOG, +#define _SC_TRACE_LOG _SC_TRACE_LOG + + _SC_LEVEL1_ICACHE_SIZE, +#define _SC_LEVEL1_ICACHE_SIZE _SC_LEVEL1_ICACHE_SIZE + _SC_LEVEL1_ICACHE_ASSOC, +#define _SC_LEVEL1_ICACHE_ASSOC _SC_LEVEL1_ICACHE_ASSOC + _SC_LEVEL1_ICACHE_LINESIZE, +#define _SC_LEVEL1_ICACHE_LINESIZE _SC_LEVEL1_ICACHE_LINESIZE + _SC_LEVEL1_DCACHE_SIZE, +#define _SC_LEVEL1_DCACHE_SIZE _SC_LEVEL1_DCACHE_SIZE + _SC_LEVEL1_DCACHE_ASSOC, +#define _SC_LEVEL1_DCACHE_ASSOC _SC_LEVEL1_DCACHE_ASSOC + _SC_LEVEL1_DCACHE_LINESIZE, +#define _SC_LEVEL1_DCACHE_LINESIZE _SC_LEVEL1_DCACHE_LINESIZE + _SC_LEVEL2_CACHE_SIZE, +#define _SC_LEVEL2_CACHE_SIZE _SC_LEVEL2_CACHE_SIZE + _SC_LEVEL2_CACHE_ASSOC, +#define _SC_LEVEL2_CACHE_ASSOC _SC_LEVEL2_CACHE_ASSOC + _SC_LEVEL2_CACHE_LINESIZE, +#define _SC_LEVEL2_CACHE_LINESIZE _SC_LEVEL2_CACHE_LINESIZE + _SC_LEVEL3_CACHE_SIZE, +#define _SC_LEVEL3_CACHE_SIZE _SC_LEVEL3_CACHE_SIZE + _SC_LEVEL3_CACHE_ASSOC, +#define _SC_LEVEL3_CACHE_ASSOC _SC_LEVEL3_CACHE_ASSOC + _SC_LEVEL3_CACHE_LINESIZE, +#define _SC_LEVEL3_CACHE_LINESIZE _SC_LEVEL3_CACHE_LINESIZE + _SC_LEVEL4_CACHE_SIZE, +#define _SC_LEVEL4_CACHE_SIZE _SC_LEVEL4_CACHE_SIZE + _SC_LEVEL4_CACHE_ASSOC, +#define _SC_LEVEL4_CACHE_ASSOC _SC_LEVEL4_CACHE_ASSOC + _SC_LEVEL4_CACHE_LINESIZE, +#define _SC_LEVEL4_CACHE_LINESIZE _SC_LEVEL4_CACHE_LINESIZE + /* Leave room here, maybe we need a few more cache levels some day. */ + + _SC_IPV6 = _SC_LEVEL1_ICACHE_SIZE + 50, +#define _SC_IPV6 _SC_IPV6 + _SC_RAW_SOCKETS, +#define _SC_RAW_SOCKETS _SC_RAW_SOCKETS + + _SC_V7_ILP32_OFF32, +#define _SC_V7_ILP32_OFF32 _SC_V7_ILP32_OFF32 + _SC_V7_ILP32_OFFBIG, +#define _SC_V7_ILP32_OFFBIG _SC_V7_ILP32_OFFBIG + _SC_V7_LP64_OFF64, +#define _SC_V7_LP64_OFF64 _SC_V7_LP64_OFF64 + _SC_V7_LPBIG_OFFBIG, +#define _SC_V7_LPBIG_OFFBIG _SC_V7_LPBIG_OFFBIG + + _SC_SS_REPL_MAX, +#define _SC_SS_REPL_MAX _SC_SS_REPL_MAX + + _SC_TRACE_EVENT_NAME_MAX, +#define _SC_TRACE_EVENT_NAME_MAX _SC_TRACE_EVENT_NAME_MAX + _SC_TRACE_NAME_MAX, +#define _SC_TRACE_NAME_MAX _SC_TRACE_NAME_MAX + _SC_TRACE_SYS_MAX, +#define _SC_TRACE_SYS_MAX _SC_TRACE_SYS_MAX + _SC_TRACE_USER_EVENT_MAX, +#define _SC_TRACE_USER_EVENT_MAX _SC_TRACE_USER_EVENT_MAX + + _SC_XOPEN_STREAMS, +#define _SC_XOPEN_STREAMS _SC_XOPEN_STREAMS + + _SC_THREAD_ROBUST_PRIO_INHERIT, +#define _SC_THREAD_ROBUST_PRIO_INHERIT _SC_THREAD_ROBUST_PRIO_INHERIT + _SC_THREAD_ROBUST_PRIO_PROTECT +#define _SC_THREAD_ROBUST_PRIO_PROTECT _SC_THREAD_ROBUST_PRIO_PROTECT + +}; +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/bits/posix1_lim.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/bits/posix1_lim.h new file mode 100755 index 0000000000000..0739958c5a6c4 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/bits/posix1_lim.h @@ -0,0 +1,34 @@ +#ifndef POSIX1_LIM_H +#define POSIX1_LIM_H +/** + @file posix1_lim.h + @brief POSIX Minimum values + +EXTERNAL FUNCTIONS + None + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None + +TODO + This header should ideally be relocated under api/posix/bits (something that + doesn't exist today) and be included from api/posix/bits/limits.h which in turn + should be
included from toolchain's limits.h + +Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ + +#ifndef _POSIX_PATH_MAX +/** @brief Maximum number of bytes in a pathname, including the terminating + nul character */ +#define _POSIX_PATH_MAX 256 +#endif + +#ifndef _POSIX_SEM_NSEMS_MAX +/** @brief Maximum number of semaphores that a process may have */ +#define _POSIX_SEM_NSEMS_MAX 16 +#endif +#endif + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/common/time.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/common/time.h new file mode 100755 index 0000000000000..76b0d39ab7039 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/common/time.h @@ -0,0 +1 @@ +#include \ No newline at end of file diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/fcntl.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/fcntl.h new file mode 100755 index 0000000000000..c80ec98a449b6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/fcntl.h @@ -0,0 +1,51 @@ +#ifndef _FCNTL_H +#define _FCNTL_H + +/*========================================================================== + * FILE: fcntl.h + * + * SERVICES: POSIX fcntl.h + * + * DESCRIPTION: The header is needed by the open() and fcntl() + * system calls, which have a variety of parameters and + * flags. They are described here. + * + * The formats of the calls to each of these are: + * + * open(path, oflag [,mode]) open a file + * fcntl(fd, cmd [,arg]) get or set file attributes + * + * Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Oflag values for open(). POSIX Table 6-4. */ +#define POSIX_O_CREAT 0x100 /* creat file if it doesn't exist */ +#define POSIX_O_EXCL 0x200 /* exclusive use flag */ +#define POSIX_O_NOCTTY 0x400 /* do not assign a controlling terminal */ +#define POSIX_O_TRUNC 0x1000 /* truncate flag */ + +/* File status flags for open() and fcntl(). POSIX Table 6-5. */ +#define POSIX_O_APPEND 0x2000 /* set append mode */ +#define POSIX_O_NONBLOCK 0x4000 /* no delay */ + +/* File access modes for open() and fcntl(). POSIX Table 6-6. */ +#define POSIX_O_RDONLY 0 /* open(name, POSIX_O_RDONLY) opens read only */ +#define POSIX_O_WRONLY 1 /* open(name, POSIX_O_WRONLY) opens write only */ +#define POSIX_O_RDWR 2 /* open(name, POSIX_O_RDWR) opens read/write */ + +/* Mask for use with file access modes. POSIX Table 6-7. 
*/ +#define POSIX_O_ACCMODE 0x3 /* mask for file access modes */ + +#ifdef __cplusplus +} +#endif + +#endif /* _FCNTL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/hooks/unistd.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/hooks/unistd.h new file mode 100755 index 0000000000000..1c618bfe36b4f --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/hooks/unistd.h @@ -0,0 +1,115 @@ +#ifndef UNISTD_H +#define UNISTD_H +/** + @file posix/hooks/unistd.h + @brief POSIX-related declarations that are missing in the toolchain + header + +EXTERNAL FUNCTIONS + None + +INITIALIZATION AND SEQUENCING REQUIREMENTS + Don't include this header directly! Instead, include unistd.h. + +Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ +#include /* For various POSIX ID types from toolchain headers */ + +#ifdef __cplusplus +extern "C" { +#endif +extern long pathconf (char const * path, int name); + +/* Process */ + +/** The getppid() function shall return the parent process ID of the calling process. + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] the parent process ID + */ +pid_t getppid(void); + +/** The getpgid() function shall return the process group ID of the process whose process ID is equal to pid + * Please refer to POSIX standard for details. + * @param thread [in] process ID + * @param value_ptr [out] process group ID + */ +pid_t getpgid(pid_t pid); + +/** The getpgrp() function shall return the process group ID of the calling process + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] process group ID of the calling process + */ +pid_t getpgrp(void); + +/** The getuid() function shall return the real user ID of the calling process. + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] the real user ID of the calling process. + */ +uid_t getuid(void); + +/** The geteuid() function shall return the effective user ID of the calling process + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] effective user ID of the calling process + */ +uid_t geteuid(void); + +/** The getegid() function shall return the effective group ID of the calling process. + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] effective group ID of the calling process. + */ +gid_t getegid(void); + +/** The getgid() function shall return the real group ID of the calling process + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] real group ID of the calling process. + */ + gid_t getgid(void); + +/** seteuid - set effective user ID + * Please refer to POSIX standard for details. + * @param thread [in] effective user ID + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. + */ +int seteuid(uid_t uid); + +/** setpgrp - set the process group ID + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. + */ +pid_t setpgrp(void);
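The getter family above is query-only; a minimal usage sketch follows (the include paths and the printf transport are illustrative — on the real target these declarations live under posix/hooks/ in the SDK tree, and on this single-process PSE51-style subset the returned IDs may well be fixed values):

#include <stdio.h>
#include <unistd.h>

static void log_identity(void)
{
    /* All six calls are simple value queries with no failure mode to handle. */
    printf("ppid=%d pgrp=%d uid=%d euid=%d gid=%d egid=%d\n",
           (int)getppid(), (int)getpgrp(), (int)getuid(),
           (int)geteuid(), (int)getgid(), (int)getegid());
}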
+ +/** setuid - set user ID + * Please refer to POSIX standard for details. + * @param thread [in] user ID + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. + */ +int setuid(uid_t uid); + +/** setpgid - set process group ID for job control + * Please refer to POSIX standard for details. + * @param thread [in] PID of process, PGID to be set + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. + */ +int setpgid(pid_t pid, pid_t pgid); + +/** setsid - create session and set process group ID + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. + */ +pid_t setsid(void); + +#ifdef __cplusplus +} +#endif +#endif + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/mqueue.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/mqueue.h new file mode 100755 index 0000000000000..74dcc2fa202c6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/mqueue.h @@ -0,0 +1,203 @@ +#ifndef _POSIX_MQUEUE_H_ +#define _POSIX_MQUEUE_H_ + +/*========================================================================== + * FILE: mqueue.h + * + * SERVICES: POSIX Message Queue API interface + * + * DESCRIPTION: POSIX Message Queue API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016, 2023 Qualcomm Technologies, Inc. + * All Rights Reserved. + * Confidential and Proprietary - Qualcomm Technologies, Inc. + *==========================================================================*/ + +#include /*ssize_t */ +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define MQ_PRIO_MAX 255 /* max priority */ +#define MQ_PRIO_DEFAULT 0 /* default priority */ + +typedef int mqd_t; + +struct mq_attr +{ + long mq_flags; /* message queue flags */ + long mq_maxmsg; /* maximum number of messages */ + long mq_msgsize; /* maximum message size */ + long mq_curmsgs; /* number of messages currently queued */ +}; + +typedef struct mq_attr mqueue_attr; + +/** \details + * This provides POSIX Message Queue API. + * + * mq_notify is not supported. + * + * Since this implementation of POSIX kernel API is a subset of PSE51, + * it only supports message sending and receiving within one process. + * Message sending and receiving among processes are not supported. + */ + +/** \defgroup mqueue POSIX Message Queue API */ +/** \ingroup mqueue */ +/** @{ */ + +/** Open a message queue. + * Please refer to POSIX standard for details. + */ +mqd_t mq_open(const char *name, int oflag, /* mode_t mode, struct mq_attr *attr */...); + +/** Close a message queue. + * Please refer to POSIX standard for details. + */ +int mq_close(mqd_t mq_desc); + +/** Remove a message queue. + * Please refer to POSIX standard for details. + */ +int mq_unlink(const char *name); + +/** Send a message to a message queue. + * Please refer to POSIX standard for details. + * + * If the queue is full, instead of blocking the sender, this function + * will return -1 with errno EAGAIN in this implementation. This behavior + * may change in the future. + */ +int mq_send(mqd_t mqdes, const char *msg_ptr, size_t msg_len, unsigned int msg_prio);
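Because mq_send() never blocks here, a full queue surfaces as -1 with errno EAGAIN rather than as a stalled sender. A minimal send-or-drop sketch built on that documented contract (the helper name and error policy are illustrative):

#include <errno.h>
#include <mqueue.h>

/* Returns 0 when queued, 1 when dropped because the queue was full, -1 otherwise. */
static int mq_send_or_drop(mqd_t q, const char *buf, size_t len)
{
    if (mq_send(q, buf, len, MQ_PRIO_DEFAULT) == 0)
        return 0;
    return (errno == EAGAIN) ? 1 : -1;   /* EAGAIN: full queue, no blocking */
}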
+ +/** Send a message to a message queue with timeout. + * Please refer to POSIX standard for details. + * @param abs_timeout [in] Only abs_timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +int mq_timedsend(mqd_t mqdes, const char *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout); + +/** Receive a message from a message queue. + * Please refer to POSIX standard for details. + */ +ssize_t mq_receive(mqd_t mqdes, char *msg_ptr, size_t msg_len, unsigned int *msg_prio); + +/** Receive a message from a message queue with timeout. + * Please refer to POSIX standard for details. + * @param abs_timeout [in] Only abs_timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +ssize_t mq_timedreceive(mqd_t mqdes, char *restrict msg_ptr, size_t msg_len, unsigned int *restrict msg_prio, const struct timespec *restrict abs_timeout); + +/** Get message queue attributes. + * Please refer to POSIX standard for details. + */ +int mq_getattr(mqd_t mqdes, struct mq_attr *mqstat); + +/** Set message queue attributes. + * Please refer to POSIX standard for details. + */ +int mq_setattr(mqd_t mqdes, const struct mq_attr *restrict mqstat, struct mq_attr *restrict omqstat); + +/** @} */ + +#define NBBY 8U /* number of bits in a byte */ + +/* + * Select uses bit masks of file descriptors in longs. These macros + * manipulate such bit fields (the filesystem macros use chars). + * FD_SETSIZE may be defined by the user, but the default here should + * be enough for most uses. + */ +#ifndef FD_SETSIZE +#define FD_SETSIZE 256U +#endif + +typedef unsigned long fd_mask; +#define NFDBITS (sizeof(fd_mask) * (unsigned int)NBBY) /* bits per mask */ + +#ifndef howmany +#define howmany(x, y) (((x) + ((y) - 1U)) / (y)) +#endif + +// equivalent of fd_set for the WinNT env +typedef struct fd_set +{ + fd_mask fds_bits[howmany(FD_SETSIZE, NFDBITS)]; +} fd_set; + +/** \addtogroup mqueue */ +/** @{ */ + +/** Sets the bit for the file descriptor fd in the file descriptor set fdset. + */ +#define FD_SET(n, p) ((p)->fds_bits[((unsigned int) (n)) / NFDBITS] |= (1UL << (((unsigned int) (n)) % NFDBITS))) + +/** Clears the bit for the file descriptor fd in the file descriptor set fdset. + */ +#define FD_CLR(n, p) ((p)->fds_bits[((unsigned int) (n)) / NFDBITS] &= ~(1UL << (((unsigned int) (n)) % NFDBITS))) + +/** Returns a non-zero value if the bit for the file descriptor fd is set in the file descriptor set pointed to by fdset, and 0 otherwise. + */ +#define FD_ISSET(n, p) ((unsigned long)(p)->fds_bits[((unsigned int) (n)) / NFDBITS] & (unsigned long)((unsigned)1U << (((unsigned int) (n)) % NFDBITS))) + +/** Copies the file descriptor set. + */ +#define FD_COPY(f, t) (void)(memcpy)((t), (f), sizeof(*(f))) + +/** Initializes the file descriptor set fdset to have zero bits for all file descriptors. + */ +#define FD_ZERO(p) (void)memset((p), 0, sizeof(*(p))) + +/** Error check the file descriptor set. + */ +#define FD_BAD(fd) ((fd) < 0 /*|| fd >= fd_arraylen || fd_array[fd].obj == 0*/) + +/*! Wait for both message queues and signals. In this implementation, only + * message queue file descriptors are supported. + * @param nfds [in] This is an integer one more than the maximum of any file + * descriptor in any of the sets. In other words, while you are busy + * adding file descriptors to your sets, you must calculate the maximum + * integer value of all of them, then increment this value by one, and + * then pass this as nfds to select(). + * @param readfds [in] the file descriptor set on all message queues. + * @param writefds [in] ignored in this implementation. + * @param errorfds [in] ignored in this implementation. + * @param timeout [in] Only timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +int pselect(int nfds, fd_set *restrict readfds, + fd_set *restrict writefds, fd_set *restrict errorfds, + const struct timespec *restrict timeout, + const sigset_t *restrict sigmask); + +/*! Wait for multiple message queues. In this implementation, only + * message queue file descriptors are supported. + * @param nfds [in] This is an integer one more than the maximum of any file + * descriptor in any of the sets. In other words, while you are busy + * adding file descriptors to your sets, you must calculate the maximum + * integer value of all of them, then increment this value by one, and + * then pass this as nfds to select(). + * @param readfds [in] the file descriptor set on all message queues. + * @param writefds [in] ignored in this implementation. + * @param errorfds [in] ignored in this implementation. + * @param timeout [in] Only timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +int select(int nfds, fd_set *restrict readfds, + fd_set *restrict writefds, fd_set *restrict errorfds, + struct timeval *restrict timeout);
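The nfds rule spelled out above (largest descriptor in any set, plus one) and the {0,0}-only timeout restriction combine into the following polling sketch; the two queue descriptors are illustrative, and the include is hedged since fd_set and select() come from this mqueue.h rather than a separate sys/select.h on this SDK:

#include <mqueue.h>

static int poll_queues(mqd_t a, mqd_t b)
{
    fd_set readfds;
    struct timeval tv = { 0, 0 };          /* only {0,0} is supported here */
    int maxfd = (a > b) ? (int)a : (int)b; /* mqd_t is a plain int in this header */

    FD_ZERO(&readfds);
    FD_SET(a, &readfds);
    FD_SET(b, &readfds);

    /* nfds: maximum descriptor value in any set, incremented by one */
    return select(maxfd + 1, &readfds, NULL, NULL, &tv);
}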
+ +/** @} */ + +/* this function is needed for the test framework, which needs to clean up memory at teardown */ +void _mq_teardown(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/pthread.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/pthread.h new file mode 100755 index 0000000000000..f64242e8dc683 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/pthread.h @@ -0,0 +1,287 @@ +#ifndef QURT_PTHREAD_H +#define QURT_PTHREAD_H + +/*========================================================================== + * FILE: pthread.h + * + * SERVICES: POSIX pthread API interface + * + * DESCRIPTION: POSIX pthread API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013,2016,2023 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + *========================================================================== + * + * EDIT HISTORY FOR MODULE + * + * This section contains comments describing changes made to the module. + * Notice that changes are listed in reverse chronological order. + * + * + * + * when who what, where, why + * -------- --- ------------------------------------------------------- + * 10/13/08 cz Initial version. + *==========================================================================*/ + +#include +#include "sys/sched.h" /* For struct sched_param */ +#include "sys/errno.h" /* error values */ +#include +#include +#include +#include +#include +#include "pthread_types.h" +#ifdef __cplusplus +extern "C" { +#endif + +/* the range of the set supported by the kernel data type used to represent CPU sets. */ +#define CONFIG_NR_CPUS QURT_THREAD_CFG_BITMASK_ALL + +#define UNIMPLEMENTED(FUNC, RETURNTYPE, ARGS) static inline RETURNTYPE FUNC ARGS { qurt_printf("Unimplemented: %s...
exiting\n", __FUNCTION__); exit(1); } + +/** @brief Magic (non-portable) value for a stack's address to enable usage + of auto-stack feature (if available) */ +#define PTHREAD_AUTO_STACK_MAGIC_ADDR_NP ((void *)0xFFF) + +/** \details + * This provides POSIX thread API. + * + */ + +/** \defgroup pthread POSIX pthread API */ +/** \ingroup pthread */ +/** @{ */ + +/** Compare Two Threads. + * Please refer to POSIX standard for details. + */ +static inline int pthread_equal(pthread_t t1, pthread_t t2) +{ + return (t1 == t2) ? 1 : 0; +} + +/** Create Thread. + * Please refer to POSIX standard for details. + */ +int pthread_create(pthread_t * tid, const pthread_attr_t * attr, void *(*start)(void *), void *arg); + +/** Terminate Calling Thread. + * Please refer to POSIX standard for details. + */ +void pthread_exit(void *value_ptr); + +/** Wait for thread termination. + * Please refer to POSIX standard for details. + * @param thread [in] the thread to be joined + * @param value_ptr [out] the pointer of the exit status + */ +int pthread_join(pthread_t thread, void **value_ptr); + +/** Detach a joinable thread. + * Please refer to POSIX standard for details. + * @param id [in] id of the tread the thread to be detached. + */ +int pthread_detach(pthread_t id); + +/** Dynamic package initialisation + * Please refer to POSIX standard for details. + */ +int pthread_once(pthread_once_t *once_control, void (*init_routine)(void)); + +pthread_t pthread_self(void); +int pthread_cancel(pthread_t thread); +static inline void pthread_yield(void) +{ + return; +} + +int pthread_kill(pthread_t thread, int sig); + +/** + * @brief Return name of thread + * @warning Donot call this in the error handling path as it may cause deadlock + * due to underlying OS calls + * @param thread [in] thread Thread whose name is to be retrieved + * @param name [out] name Buffer used to return thread name + * @param len [in] len Number of bytes available in name + * @return 0 on success, ESRCH, ERANGE on failure + */ +extern int pthread_getname_np (pthread_t thread, char * name, size_t len); + +int pthread_getschedparam(pthread_t thread, int *restrict policy, struct sched_param *restrict param); +int pthread_setschedparam(pthread_t thread, int policy, const struct sched_param *param); +int pthread_setschedprio(pthread_t thread, int prio); +int pthread_setcancelstate(int state, int *oldstate); +int pthread_setcanceltype(int type, int *oldtype); + +/* Attribute functions */ +int pthread_attr_init(pthread_attr_t *attr); +int pthread_attr_destroy(pthread_attr_t *attr); +int pthread_attr_setschedparam(pthread_attr_t *restrict attr, const sched_param *restrict param); +int pthread_attr_getschedparam(const pthread_attr_t *restrict attr, sched_param *restrict param); +int pthread_attr_setstacksize(pthread_attr_t *attr, size_t stacksize); +int pthread_attr_getstacksize(const pthread_attr_t *attr, size_t *stacksize); +int pthread_attr_setstackaddr(pthread_attr_t *attr, void * stackaddr); +int pthread_attr_getstackaddr(const pthread_attr_t *attr, void ** stackaddr); +int pthread_attr_getdetachstate(const pthread_attr_t *attr, int *detachstate); +int pthread_attr_setdetachstate(pthread_attr_t *attr, int detachstate); +int pthread_attr_setstack(pthread_attr_t *attr, void *stackaddr, size_t stacksize); +int pthread_attr_getstack(const pthread_attr_t *attr, void **stackaddr, size_t *stacksize); +int pthread_attr_setscope(pthread_attr_t *attr, int scope); +int pthread_attr_getscope(const pthread_attr_t *attr, int *scope); +int 
pthread_attr_setinheritsched(pthread_attr_t *attr, int inheritsched); +int pthread_attr_getinheritsched(const pthread_attr_t *attr, int *inheritsched); +int pthread_attr_getguardsize(const pthread_attr_t * attr, size_t * guardsize); +int pthread_attr_setautostack(pthread_attr_t *attr); +int pthread_attr_setbuspriority(pthread_attr_t *attr, unsigned short bus_priority); + +/* Qualcomm additions to pthread get/set attribute functions */ +int pthread_attr_setthreadname(pthread_attr_t *attr, const char * name); +int pthread_attr_getthreadname(const pthread_attr_t *attr, char * name, int size); +int pthread_attr_settimetestid(pthread_attr_t *attr, unsigned int tid); +int pthread_attr_gettimetestid(const pthread_attr_t *attr, unsigned int* tid); + +/* Mutexes */ +int pthread_mutex_init(pthread_mutex_t *mutex, pthread_mutexattr_t *attr); +int pthread_mutex_lock(pthread_mutex_t *mutex); +int pthread_mutex_unlock(pthread_mutex_t *mutex); +int pthread_mutex_trylock(pthread_mutex_t *mutex); +int pthread_mutex_destroy(pthread_mutex_t *mutex); +int pthread_mutex_getprioceiling(const pthread_mutex_t *restrict mutex, int *restrict prioceiling); +int pthread_mutex_setprioceiling(pthread_mutex_t *restrict mutex, int prioceiling, int *restrict old_ceiling); + +/* For Mutex with type PTHREAD_MUTEX_NORMAL, Priority Inheritance is not + * supported even if PTHREAD_PRIO_INHERIT is defined, since QURT does not support + * this kind of Mutex */ +int pthread_mutexattr_init(pthread_mutexattr_t *attr); +int pthread_mutexattr_destroy(pthread_mutexattr_t *attr); +int pthread_mutexattr_gettype(const pthread_mutexattr_t *restrict, int *restrict); +int pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type); +int pthread_mutexattr_getprotocol(const pthread_mutexattr_t *restrict, int *restrict); +int pthread_mutexattr_setprotocol(pthread_mutexattr_t *attr, int protocol); +int pthread_mutexattr_getpshared(const pthread_mutexattr_t *restrict, int *restrict); +int pthread_mutexattr_setpshared(pthread_mutexattr_t *, int); +int pthread_mutexattr_getprioceiling(const pthread_mutexattr_t *restrict attr, int *restrict prioceiling); +int pthread_mutexattr_setprioceiling(pthread_mutexattr_t *attr, int prioceiling); + +/* Spinlocks */ +int pthread_spin_init(pthread_spinlock_t *lock, int pshared); +int pthread_spin_destroy(pthread_spinlock_t *lock); +int pthread_spin_lock(pthread_spinlock_t *lock); +int pthread_spin_trylock(pthread_spinlock_t *lock); +int pthread_spin_unlock(pthread_spinlock_t *lock); + +/* Condition variables */ +int pthread_condattr_init(pthread_condattr_t *attr); +int pthread_condattr_destroy(pthread_condattr_t *attr); +int pthread_condattr_setpshared(pthread_condattr_t *attr, int pshared); +int pthread_condattr_getpshared(const pthread_condattr_t *restrict attr, int *restrict pshared); +int pthread_condattr_setclock(pthread_condattr_t *attr, clockid_t clock); +int pthread_condattr_getclock(const pthread_condattr_t *restrict attr, clockid_t *restrict clock); +int pthread_cond_init(pthread_cond_t *cond, pthread_condattr_t *attr); +int pthread_cond_destroy(pthread_cond_t *cond); +int pthread_cond_signal(pthread_cond_t *cond); +int pthread_cond_broadcast(pthread_cond_t *cond); +int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex); +int pthread_cond_timedwait(pthread_cond_t * cond, pthread_mutex_t * mutex, const struct timespec *time); + +/* Barriers */ +int pthread_barrier_init(pthread_barrier_t *restrict barrier, const pthread_barrierattr_t *restrict attr, unsigned count); +int pthread_barrier_destroy(pthread_barrier_t *barrier);
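The mutex and condition-variable declarations above follow the usual POSIX shape, so the standard predicate-loop pattern applies unchanged; a minimal sketch (the flag and its meaning are illustrative, and the static initializers are the ones defined in pthread_types.h below):

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t ready = PTHREAD_COND_INITIALIZER;
static int flag;

static void wait_until_ready(void)
{
    pthread_mutex_lock(&lock);
    while (!flag)                       /* re-check the predicate on every wakeup */
        pthread_cond_wait(&ready, &lock);
    pthread_mutex_unlock(&lock);
}

static void signal_ready(void)
{
    pthread_mutex_lock(&lock);
    flag = 1;
    pthread_cond_signal(&ready);
    pthread_mutex_unlock(&lock);
}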
+int pthread_barrier_wait(pthread_barrier_t *barrier); +int pthread_barrierattr_init(pthread_barrierattr_t *attr); +int pthread_barrierattr_destroy(pthread_barrierattr_t *attr); +int pthread_barrierattr_getpshared(const pthread_barrierattr_t *restrict attr, int *restrict pshared); + + +/*Read-Write locks*/ +int pthread_rwlock_init(pthread_rwlock_t *, const pthread_rwlockattr_t *); +int pthread_rwlock_destroy(pthread_rwlock_t *); +int pthread_rwlockattr_init(pthread_rwlockattr_t *); +int pthread_rwlockattr_destroy(pthread_rwlockattr_t *); +int pthread_rwlockattr_getpshared(const pthread_rwlockattr_t *, int *); +int pthread_rwlockattr_setpshared(pthread_rwlockattr_t *, int); +int pthread_rwlock_rdlock(pthread_rwlock_t *); +int pthread_rwlock_tryrdlock(pthread_rwlock_t *); +int pthread_rwlock_wrlock(pthread_rwlock_t *); +int pthread_rwlock_trywrlock(pthread_rwlock_t *); +int pthread_rwlock_unlock(pthread_rwlock_t *); + + +/** please refer to POSIX standard document + */ +int pthread_barrierattr_setpshared(pthread_barrierattr_t *attr, int pshared); + +/** set CPU affinity attribute in thread attributes object. + + * @param attr [in] pthread attributes + * @param cpusetsize [in] The argument cpusetsize is the length (in bytes) + of the buffer pointed to by cpuset. Typically, + this argument would be specified as + sizeof(cpu_set_t). + * @param cpuset [in] This data set is a bitset where each bit represents + a CPU (hw thread). How the system's CPUs are mapped + to bits in the bitset is system dependent. + For the QURT kernel, bit 0 corresponds to hw + thread 0, and so on. If the corresponding bit is + set to 1, then the software thread is eligible to + run on this hw thread. 0x3f means it can run on any hw + thread; 0x0 also means it can run on any hw thread. + @return On success, this function returns 0; on error, it returns a + non-zero error number. + EINVAL - cpuset specified a CPU that was outside the set supported + by the kernel. (The kernel configuration option + CONFIG_NR_CPUS defines the range of the set supported by + the kernel data type used to represent CPU sets.) + * @note This function is a non-standard GNU extension; hence the suffix "_np" + (non-portable) in the name. + */ +int pthread_attr_setaffinity_np(pthread_attr_t *attr, size_t cpusetsize, const cpu_set_t *cpuset); + +/** get CPU affinity attribute in thread attributes object. + * @param attr [in] pthread attributes + * @param cpusetsize [in] The argument cpusetsize is the length (in bytes) + of the buffer pointed to by cpuset. Typically, + this argument would be specified as + sizeof(cpu_set_t). + * @param cpuset [out] This data set is a bitset where each bit represents + a CPU (hw thread). How the system's CPUs are mapped + to bits in the bitset is system dependent. + For the QURT kernel, bit 0 corresponds to hw + thread 0, and so on. If the corresponding bit is + set to 1, then the software thread is eligible to + run on this hw thread. 0x3f means it can run on any hw + thread; 0x0 also means it can run on any hw thread. + @return On success, this function returns 0; on error, it returns a + non-zero error number. + EINVAL - cpusetsize is smaller than the size of the affinity mask + used by the kernel. + * @note This function is a non-standard GNU extension; hence the suffix "_np" + (non-portable) in the name. + */ +int pthread_attr_getaffinity_np(pthread_attr_t *attr, size_t cpusetsize, cpu_set_t *cpuset);
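Since cpu_set_t is a plain bitmask on this platform (bit N = hardware thread N, per the parameter description above), pinning a new thread reduces to seeding the attribute before creation; a sketch, with the mask value illustrative:

#include <pthread.h>

static int create_pinned(pthread_t *tid, void *(*fn)(void *), void *arg)
{
    pthread_attr_t attr;
    cpu_set_t mask = 0x3;   /* eligible on hw threads 0 and 1 only */
    int rc;

    (void)pthread_attr_init(&attr);
    rc = pthread_attr_setaffinity_np(&attr, sizeof mask, &mask);
    if (rc == 0)
        rc = pthread_create(tid, &attr, fn, arg);
    (void)pthread_attr_destroy(&attr);
    return rc;
}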
+ +/* TLS */ +int pthread_key_create(pthread_key_t *key, void (*destructor)(void*)); +int pthread_key_delete(pthread_key_t key); +int pthread_setspecific(pthread_key_t key, const void *value); +void *pthread_getspecific(pthread_key_t key); +int pthread_getattr_np(pthread_t thread, pthread_attr_t * restrict attr); + +/** @} */ + +/* A non-pthread caller calls this function to create a pthread TCB w/o creating an actual thread */ +int pthread_fake(pthread_t * restrict thread, const pthread_attr_t * restrict attr); +int pthread_fake_destroy(pthread_t thread); + +//amitkulk: move these to unistd.h after we move that header within qurt +int posix_memalign(void **memptr, size_t alignment, size_t size); +void exit(int status); +#ifdef __cplusplus +} +#endif + +#endif /* QURT_PTHREAD_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/pthread_types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/pthread_types.h new file mode 100755 index 0000000000000..51c3b9dbca243 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/pthread_types.h @@ -0,0 +1,193 @@ +#ifndef _PTHREAD_TYPES_H_ +#define _PTHREAD_TYPES_H_ + +/*========================================================================== + * FILE: pthread_types.h + * + * SERVICES: types used in POSIX API interface + * + * DESCRIPTION: POSIX API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2016, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __GNUC__ +#define restrict __restrict__ +#else +#define restrict +#endif + +#define _SSIZE_T + +#ifndef TRUE +#define TRUE 1 +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + +#define PTHREAD_MAX_THREADS 512U + +#define PTHREAD_NAME_LEN 16 +#define PTHREAD_MIN_STACKSIZE 512 //4096 +#define PTHREAD_MAX_STACKSIZE 1048576 +#define PTHREAD_DEFAULT_STACKSIZE 16384 + +#define PTHREAD_STACK_MIN (4096U*2U) +#define PTHREAD_MIN_PRIORITY 0U +#define PTHREAD_MAX_PRIORITY 255U +#define PTHREAD_DEFAULT_PRIORITY 1 + +/*Mutex initialization status*/ +#define PTHREAD_MUTEX_ATTR_UNINITIALIZED 0 +#define PTHREAD_MUTEX_ATTR_INITIALIZED 1 + +/*Conditional attributes initialization status*/ +#define PTHREAD_COND_ATTR_UNINITIALIZED 0 +#define PTHREAD_COND_ATTR_INITIALIZED 1 + +#define PTHREAD_DEFAULT_NAME "Anonymous" + +#define PTHREAD_MUTEX_INITIALIZER ((pthread_mutex_t) 0xFFFFFFFFU) + +#define PTHREAD_COND_INITIALIZER ((pthread_cond_t) 0xFFFFFFFFU) + +/* mutex and cond_var shared */ +#define PTHREAD_PROCESS_PRIVATE 0 +#define PTHREAD_PROCESS_SHARED 1 + +/* mutex type */ +#define PTHREAD_MUTEX_ERRORCHECK 0 +#define PTHREAD_MUTEX_NORMAL 1 +#define PTHREAD_MUTEX_RECURSIVE 2 +#define PTHREAD_MUTEX_DEFAULT 3 + +/* mutex protocol */ +#define PTHREAD_PRIO_NONE 0 +#define PTHREAD_PRIO_INHERIT 1 +#define PTHREAD_PRIO_PROTECT 2 + +#define PTHREAD_SPINLOCK_UNLOCKED 0 +#define PTHREAD_SPINLOCK_LOCKED 1 + +#define PTHREAD_ONCE_INIT (0) + +#define PTHREAD_MUTEX_OPAQUE //ToDo: amitkulk: debug + +typedef signed int ssize_t; + +/*detachstate of a pthread*/ +#define PTHREAD_CREATE_JOINABLE 1 +#define PTHREAD_CREATE_DETACHED 0 + +/*contention scope*/ +#define PTHREAD_SCOPE_PROCESS 1 +#define PTHREAD_SCOPE_SYSTEM 0 + +/*scheduler*/ +#define PTHREAD_INHERIT_SCHED 1 +#define PTHREAD_EXPLICIT_SCHED 0
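The stack-size bounds and name length just defined constrain what a caller may put into a pthread_attr_t; a sketch that wires the Qualcomm thread-name extension and the default stack size into thread creation (the worker function and name are illustrative):

#include <pthread.h>

static void *worker(void *arg) { (void)arg; return NULL; }

static int spawn_worker(pthread_t *tid)
{
    pthread_attr_t attr;
    int rc;

    (void)pthread_attr_init(&attr);
    (void)pthread_attr_setstacksize(&attr, PTHREAD_DEFAULT_STACKSIZE);
    (void)pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
    /* Qualcomm extension declared in pthread.h above; the name must fit PTHREAD_NAME_LEN */
    (void)pthread_attr_setthreadname(&attr, "worker0");

    rc = pthread_create(tid, &attr, worker, NULL);
    (void)pthread_attr_destroy(&attr);
    return rc;
}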
+ +/* + * Types and structure definitions + * + */ +typedef unsigned int cpu_set_t; + +typedef unsigned int pthread_t; + +typedef struct pthread_attr_t +{ + void *stackaddr; + int internal_stack; /* this flag==1 means the stack needs to be freed by posix */ + size_t stacksize; + int priority; + unsigned short timetest_id; + /* This flag indicates whether the thread will be an autostack thread */ + unsigned short autostack:1; + /* This flag indicates whether the thread's bus_priority is high or low + bus_priority = 0 -- Bus_priority is low + bus_priority = 1 -- Bus_priority is high + bus_priority = 3 -- Bus_priority is default (takes the default set for the process) + */ + unsigned short bus_priority:2; + unsigned short reserved:13; + cpu_set_t cpumask; + char name[PTHREAD_NAME_LEN]; + /* This flag indicates whether pthread lib should create thread contexts for other OSALs */ + /* This is used internally by POSIX and not available for general usage */ + int ext_context; + int detachstate; +} pthread_attr_t; + +//mutex attr +typedef struct pthread_mutexattr_t pthread_mutexattr_t; +struct pthread_mutexattr_t +{ + int is_initialized; + int type; + int pshared; + int protocol; +}; + +typedef unsigned int pthread_mutex_t; + +typedef unsigned int pthread_spinlock_t; + +typedef struct pthread_condattr_t +{ + int is_initialized; + int pshared; + clockid_t clock_id; +} pthread_condattr_t; + +typedef unsigned int pthread_cond_t; + +typedef struct pthread_barrierattr_t +{ + int is_initialized; + int pshared; +} pthread_barrierattr_t; + +typedef unsigned int pthread_barrier_t; + +typedef int pthread_key_t; + +typedef int pthread_once_t; + + +/*Read-Write locks*/ +#define PTW32_RWLOCK_MAGIC 0xfacade2 +#define PTHREAD_RWLOCK_INITIALIZER ((pthread_rwlock_t)(size_t) -1) + +struct pthread_rwlockattr_t_ +{ + int pshared; +}; + +struct pthread_rwlock_t_ +{ + pthread_mutex_t mtxExclusiveAccess; + pthread_mutex_t mtxSharedAccessCompleted; + pthread_cond_t cndSharedAccessCompleted; + int nSharedAccessCount; + int nExclusiveAccessCount; + int nCompletedSharedAccessCount; + int nMagic; +}; + +typedef struct pthread_rwlock_t_ * pthread_rwlock_t; +typedef struct pthread_rwlockattr_t_ * pthread_rwlockattr_t; +#ifdef __cplusplus +} +#endif + +#endif /* _PTHREAD_TYPES_H_ */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sched.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sched.h new file mode 100755 index 0000000000000..faf3365be9f82 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sched.h @@ -0,0 +1,21 @@ +/*============================================================================= + + sched.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved.
+=============================================================================*/ +#ifndef __SCHED_H__ +#define __SCHED_H__ + +#include "sys/sched.h" + +#endif //__SCHED_H__ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/semaphore.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/semaphore.h new file mode 100755 index 0000000000000..d9145b295ae62 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/semaphore.h @@ -0,0 +1,114 @@ +#ifndef SEMAPHORE_H +#define SEMAPHORE_H + +/*========================================================================== + * FILE: semaphore.h + * + * SERVICES: POSIX semaphore API interface + * + * DESCRIPTION: POSIX semaphore API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ +#include // Get all C sys types - includes POSIX specific +#include "sys/errno.h" // error values + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** User facing semaphore container with opaque pointer to implementation */ +typedef struct +{ + unsigned int *opaque; +} sem_t; +#define _SEM_T + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/* constant definitions */ +#define SEM_FAILED ((sem_t*) 0) + +/* @todo siqbal Should we put such configuration items in a common place + instead of this user-facing header? */ +#define SEM_VALUE_MAX ((unsigned int) 30) // If need be increase this + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/** \details + * POSIX standard comes with two kinds of semaphores: named and unnamed + * semaphores. + * + * This implementation of POSIX kernel API provides unnamed and named semaphores. + * + * + * sem_timedwait() is not provided. + */ + +/** \defgroup semaphore POSIX Semaphore API */ + +/** \ingroup semaphore */ +/** @{ */ + +/** Initialize an unnamed semaphore. + * Please refer to POSIX standard for details. + * @param pshared [in] This implementation does not support non-zero value, + * i.e., semaphore cannot be shared between processes in this implementation. + */ +int sem_init(sem_t *sem, int pshared, unsigned int value); + +/** Lock a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_wait(sem_t *sem); + +/** Lock a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_trywait(sem_t *sem); + +/** Unlock a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_post(sem_t *sem); + +/** Get the value of a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_getvalue(sem_t *sem, int *value); + +/** Destroy an unnamed semaphore. + * Please refer to POSIX standard for details. + */ +int sem_destroy(sem_t *sem); + +/** Creates and initializes a named semaphore. + * Please refer to POSIX standard for details. + */ +sem_t * sem_open(const char* name , int oflag , ...);
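Since pshared must be zero here (semaphores stay process-local, per the sem_init note above), unnamed semaphores reduce to in-process counting; a minimal sketch with an illustrative initial count:

#include <semaphore.h>

static sem_t slots;

static int init_and_use_slots(void)
{
    if (sem_init(&slots, 0, 4) != 0)   /* pshared must be 0 in this implementation */
        return -1;
    if (sem_wait(&slots) == 0)         /* claim a slot... */
        (void)sem_post(&slots);        /* ...and give it back */
    return sem_destroy(&slots);
}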
+ +/** Closes a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_close(sem_t *sem); + +/** Unlinks a named semaphore. + * Please refer to POSIX standard for details. + */ +int sem_unlink(const char *name); +/** @} */ + + +#ifdef __cplusplus +} +#endif + +#endif /* SEMAPHORE_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/signal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/signal.h new file mode 100755 index 0000000000000..35cb1f1a9a319 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/signal.h @@ -0,0 +1,201 @@ +#ifndef _SIGNAL_H_ +#define _SIGNAL_H_ + +/*========================================================================== + * FILE: signal.h + * + * SERVICES: POSIX Signal API interface + * + * DESCRIPTION: POSIX Signal API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016, 2023 Qualcomm Technologies, Inc. + * All Rights Reserved. + * Confidential and Proprietary - Qualcomm Technologies, Inc. + + *==========================================================================*/ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* POSIX signal bits */ + +#define POSIX_MSG 7 /* POSIX msg type used in Qube API */ +#define POSIX_NOTIF 8 /* POSIX msg type used in Qube API */ +#define SIGKILL 9 /* kill (cannot be caught or ignored) */ + +#define SIGRTMIN 10 +#define SIGRTMAX 32 + +/* Notification Types. */ +/* No asynchronous notification is delivered when the event of interest occurs. */ +#define SIGEV_NONE 0 +/* The signal specified in sigev_signo shall be generated for the process when + the event of interest occurs. */ +#define SIGEV_SIGNAL 1 +/* A notification function is called to perform notification. */ +#define SIGEV_THREAD 2 +#define SA_SIGINFO 1 + +/* + * Flags for sigprocmask: + */ +#define SIG_BLOCK 1 /* block specified signal set */ +#define SIG_UNBLOCK 2 /* unblock specified signal set */ +#define SIG_SETMASK 3 /* set specified signal set */ + +typedef unsigned long int sigset_t; + +union sigval +{ + int sival_int; /* Integer signal value. */ + void *sival_ptr; /* Pointer signal value. */ +}; + +typedef struct sigevent sigevent; +struct sigevent +{ + int sigev_notify; /* Notification type. */ + int sigev_signo; /* Signal number. */ + union sigval sigev_value; /* Signal value. */ + void (*sigev_notify_function)(union sigval); /* Notification function. */ + pthread_attr_t *sigev_notify_attributes; +}; + +typedef struct siginfo_t siginfo_t; +struct siginfo_t +{ + int si_signo; + int si_code; + union sigval si_value; +/* int si_errno; + pid_t si_pid; + uid_t si_uid; + void *si_addr; + int si_status; + long si_band;*/ +}; +struct sigaction +{ + void (*sa_handler)(int); + sigset_t sa_mask; + int sa_flags; + void (*sa_sigaction)(int, siginfo_t *, void *); +}; + +/* Signal functions */ + +/** \details + * This provides POSIX Signal API. Please note that this + * implementation does not fully comply with POSIX standard. + * + * In POSIX standard, Signal can be used as 'interrupt', which means + * an incoming signal will interrupt a running thread. After the + * registered signal handler is executed, the thread will resume. + * This behavior cannot be implemented w/o modifying L4 or QURT kernel. + * On the other hand, applications need to be carefully written to avoid + * problems caused by 'interrupting' signals.
+ * + * Therefore, in this implementation of POSIX signal, a thread will + * only receive signals when it explicitly waits for signals, i.e., when + * the thread calls either sigwait() or sigsuspend(). + * + * Therefore, pthread_sigmask(), which sets or gets the signal mask for a thread, + * is not supported, since the signal mask will be set by sigwait() and + * sigsuspend(). + * + * Since this implementation of POSIX kernel API is a subset of PSE51, + * only threads can send and receive signals. The functions related to + * signal operations with processes, such as kill(), sigqueue(), + * sigprocmask(), are not provided. + * + * Queued signals are not supported. + * + * Applications will use signals from SIGRTMIN to SIGRTMAX. + * + * SIGEV_SIGNAL and SIGEV_THREAD are supported. SIGEV_NONE is not + * supported. + * + */ + +/** \defgroup signal POSIX Signal API */ +/** \ingroup signal */ +/** @{ */ + +/** Wait for signals. This implementation does not support queued signals. + * + * Please refer to POSIX standard for details. + */ +int sigwait(const sigset_t *restrict set, int *restrict sig); + +/** Examine and Change Signal Action. + * Please refer to POSIX standard for details. + * + * @param act [in] A pointer to the sigaction structure that describes the + * action to be taken for the signal. Can be NULL. + * The following flags for sa_flags field in struct sigaction are not + * supported: SA_NOCLDSTOP, SA_ONSTACK, SA_RESETHAND, SA_RESTART, + * SA_NOCLDWAIT and SA_NODEFER. Only flag SA_SIGINFO is supported. + * + * @note Define sigaction as macro to avoid a warning when included from + * C++ code - it's causing a "sigaction(...) hides constructor for + * 'struct sigaction'" warning. + */ +/*lint -esym(123,sigaction) Suppress "macro used with no arguments" */ +#define sigaction(sig,act,oact) _sigaction((sig),(act),(oact)) + +/** Wait for signals. + * Please refer to POSIX standard for details. + */ +int sigsuspend(const sigset_t *sigmask); + +/** Add Signal to Signal Set. + * Please refer to POSIX standard for details. + */ +int sigaddset(sigset_t *set, int signo); + +/** Delete Signal from Signal Set. + * Please refer to POSIX standard for details. + */ +int sigdelset(sigset_t *set, int signo); + +/** Initialize and Empty Signal Set. + * Please refer to POSIX standard for details. + */ +int sigemptyset(sigset_t *set); + +/** Initialize and Fill Signal Set. + * Please refer to POSIX standard for details. + */ +int sigfillset(sigset_t *set); + +/** Test for Signal in Signal Set. + * Please refer to POSIX standard for details. + */ +int sigismember(const sigset_t *set, int signo); + +/** @} */ + +/* this is not a public api function */ +int _sigaction(int sig, const struct sigaction *act, struct sigaction *oact); + +/* have to move #include here to solve circular include problems between time.h and signal.h */ +#include + +/** Wait for the time interval specified in the timespec structure referenced + * by timeout. This implementation does not support queued signals. + * For struct siginfo_t, si_code and si_value are ignored in this implementation. + * + * Please refer to POSIX standard for details. + */ +int sigtimedwait(const sigset_t *restrict set, siginfo_t *restrict info, + const struct timespec *restrict timeout);
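Under the delivery model described above, a receiver must park itself in sigwait() to see a signal at all; a minimal sketch of such a rendezvous on the first application signal (the choice of SIGRTMIN is illustrative):

#include <signal.h>

static void await_start(void)
{
    sigset_t set;
    int sig = 0;

    (void)sigemptyset(&set);
    (void)sigaddset(&set, SIGRTMIN);   /* application signals live in SIGRTMIN..SIGRTMAX */

    /* Blocks until another thread delivers SIGRTMIN; nothing arrives asynchronously. */
    if (sigwait(&set, &sig) == 0 && sig == SIGRTMIN) {
        /* proceed with startup */
    }
}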
+ +#ifdef __cplusplus +} +#endif + +#endif /* _POSIX_SIGNAL_H_ */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sys/errno.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sys/errno.h new file mode 100755 index 0000000000000..b9edf57bab6c3 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sys/errno.h @@ -0,0 +1,20 @@ +#ifndef _SYS_ERRNO_H_ +#define _SYS_ERRNO_H_ + +/*========================================================================== + * FILE: errno.h + * + * SERVICES: POSIX errno header file + * + * DESCRIPTION: POSIX errno based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ + +#include +#ifndef EOK +#define EOK 0 +#endif + +#endif /* _SYS_ERRNO_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sys/sched.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sys/sched.h new file mode 100755 index 0000000000000..2acc34d821725 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sys/sched.h @@ -0,0 +1,67 @@ +#ifndef _POSIX_SCHED_H_ +#define _POSIX_SCHED_H_ + +/*========================================================================== + * FILE: sched.h + * + * SERVICES: POSIX Thread sched API interface + * + * DESCRIPTION: POSIX Thread sched API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + + *==========================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define SCHED_FIFO 0 /* First in, first out (FIFO) scheduling policy. */ +#define SCHED_RR 1 /* Round robin scheduling policy. */ +#define SCHED_SPORADIC 2 /* Sporadic server scheduling policy. */ +#define SCHED_OTHER 3 /* Another scheduling policy. */ + +typedef struct sched_param sched_param; +struct sched_param +{ + void *unimplemented; + int sched_priority; +}; + +/** \details + * This provides POSIX sched API. + */ + +/** \defgroup sched POSIX sched API */ +/** \ingroup sched */ +/** @{ */ + +/** Relinquish the CPU. + * Please refer to POSIX standard for details. + */ +static inline int sched_yield(void) +{ + return 0; +} + +/** Get the maximum priority. + * Please refer to POSIX standard for details. + * @param policy [in] SCHED_FIFO is the only valid input for this implementation. + */ +int sched_get_priority_max(int policy); + +/** Get the minimum priority. + * Please refer to POSIX standard for details. + * @param policy [in] SCHED_FIFO is the only valid input for this implementation. + */ +int sched_get_priority_min(int policy);
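With SCHED_FIFO as the only accepted policy, the priority range is better probed than hard-coded; a small sketch:

#include <sched.h>

static int middle_priority(void)
{
    int lo = sched_get_priority_min(SCHED_FIFO);   /* SCHED_FIFO only, per the notes above */
    int hi = sched_get_priority_max(SCHED_FIFO);
    return lo + (hi - lo) / 2;
}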
+ +/** @} */ +#ifdef __cplusplus +} +#endif + +#endif /* _POSIX_SCHED_H_ */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sys/types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sys/types.h new file mode 100755 index 0000000000000..700026f9f9e4e --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/sys/types.h @@ -0,0 +1,35 @@ +#ifndef _SYS_TYPES_H_ +#define _SYS_TYPES_H_ + +/*========================================================================== + * FILE: types.h + * + * SERVICES: types used in POSIX API interface + * + * DESCRIPTION: POSIX API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ + +#if !defined( _PID_T ) || !defined( __pid_t_defined ) +/* POSIX defines pid_t as signed 32-bit type. Hexagon toolchain's header + defines it as unsigned 32-bit type citing conflict with QuRT POSIX + compatibility layer. If any such conflicts exist, we should fix them. + pid_t is being defined *BEFORE* inclusion of generic/sys/types.h + *INTENTIONALLY* to fix this */ +typedef int pid_t; +#define _PID_T +#define __pid_t_defined +#endif +#include +#include +#include +#include + +#ifndef __DEFINED_off_t +typedef long off_t; +#define __DEFINED_off_t +#endif + +#endif /* _SYS_TYPES_H_ */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/time.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/time.h new file mode 100755 index 0000000000000..13aeb1ea9920d --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/posix/time.h @@ -0,0 +1,142 @@ +#ifndef _POSIX_TIME_H_ +#define _POSIX_TIME_H_ + +/*========================================================================== + * FILE: time.h + * + * SERVICES: POSIX Timer API interface + * + * DESCRIPTION: POSIX Timer API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + *==========================================================================*/ + + +#include + +typedef int clockid_t; /* ignored */ +#define _CLOCKID_T +#define _PROVIDE_POSIX_TIME_DECLS 1 +#include +/* @todo anandj sys/time.h has definition for struct timeval but is not + included by generic/time.h */ +#include + +#define CLOCK_FREQ_NOT_DEFINED -1 +/* Frequency of Sclk used */ +#define TIME_CONV_SCLK_FREQ 19200000 + +#define RES_CONV_FACTOR1 1 +#define RES_CONV_FACTOR2 1000000000 + +#if !defined(CLOCK_REALTIME) +# define CLOCK_REALTIME 0 +#endif + +#if !defined(CLOCK_MONOTONIC) +# define CLOCK_MONOTONIC 1 +#endif + +#if !defined(CLOCK_THREAD_CPUTIME_ID) +# define CLOCK_THREAD_CPUTIME_ID 2 +#endif + +#if !defined(CLOCK_PROCESS_CPUTIME_ID) +# define CLOCK_PROCESS_CPUTIME_ID 3 +#endif + +#if !defined(CLOCK_MONOTONIC_RAW) +# define CLOCK_MONOTONIC_RAW 4 +#endif + +#if !defined(CLOCK_REALTIME_COARSE) +# define CLOCK_REALTIME_COARSE 5 +#endif + +#if !defined(CLOCK_MONOTONIC_COARSE) +# define CLOCK_MONOTONIC_COARSE 6 +#endif + +#if !defined(CLOCK_BOOTTIME) +# define CLOCK_BOOTTIME 7 +#endif + +struct itimerspec +{ + struct timespec it_interval; /* Timer period. */ + struct timespec it_value; /* Timer expiration.
*/ +}; + +/* have to move #include here to solve circular include problems between time.h and signal.h */ +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Timer functions */ + +/** \details + * POSIX timers can be either of two types: a one-shot type or a periodic + * type. + * + * A one-shot is an armed timer that is set to an expiration time relative + * to either the current time or an absolute time. The timer expires once and + * is disarmed. + * + * A periodic timer is armed with an initial expiration time and a repetition + * interval. Every time the interval timer + * expires, the timer is reloaded with the repetition interval. The timer + * is then rearmed. + */ + +/** \defgroup timer POSIX Timer API */ + +/** \ingroup timer */ +/** @{ */ + +/** Create a POSIX timer. + * Please refer to POSIX standard for details. + * @param clockid [in] ignored in this implementation + * @param evp [in] if non-NULL, points to a sigevent structure. This + * structure, allocated by the application, defines the asynchronous + * notification to occur when the timer expires. If the evp argument is + * NULL, the effect is as if the evp argument pointed to a sigevent + * structure with the sigev_notify member having the value SIGEV_SIGNAL, + * the sigev_signo having a default signal number (SIGALRM), and the + * sigev_value member having the value of the timer ID. + */ +int timer_create(clockid_t clockid, struct sigevent *restrict evp, + timer_t *restrict timerid); + +/** Delete a POSIX timer. + * Please refer to POSIX standard for details. + */ +int timer_delete(timer_t timerid); + +/** Get the time remaining on a POSIX timer. + * Please refer to POSIX standard for details. + */ +int timer_gettime(timer_t timerid, struct itimerspec *value); + + +/** Set the time remaining on a POSIX timer. + * Please refer to POSIX standard for details. + * @param flags [in] ignored in this implementation + */ +int timer_settime(timer_t timerid, int flags, + const struct itimerspec *restrict value, + struct itimerspec *restrict ovalue); +/** Obtain ID of a process CPU-time clock + * @param pid [in] Process ID + * @param clock_id [out] Clock ID + * @return Error values as per POSIX standard + */ +int clock_getcpuclockid (pid_t pid, clockid_t * clock_id); +/** @} */ + +#ifdef __cplusplus +} +#endif + +#endif /* _POSIX_TIME_H_ */
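Putting the one-shot/periodic distinction above into code: a nonzero it_interval is what makes the timer reload after each expiry. A sketch of a 1 s periodic timer relying on the default (evp == NULL) notification described for timer_create(); the period is illustrative:

#include <time.h>

static int start_tick(timer_t *tid)
{
    struct itimerspec spec;

    if (timer_create(CLOCK_REALTIME, NULL, tid) != 0)  /* clockid is ignored here */
        return -1;

    spec.it_value.tv_sec     = 1;  /* first expiry after 1 s */
    spec.it_value.tv_nsec    = 0;
    spec.it_interval.tv_sec  = 1;  /* nonzero interval: reload and rearm on each expiry */
    spec.it_interval.tv_nsec = 0;

    return timer_settime(*tid, 0, &spec, NULL);        /* flags are ignored here */
}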
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qube/qube.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qube/qube.h new file mode 100755 index 0000000000000..1e31e2deedb38 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qube/qube.h @@ -0,0 +1,51 @@ +#ifndef QUBE_H +#define QUBE_H +/*============================================================================= + + qube.h -- H E A D E R F I L E + +GENERAL DESCRIPTION + Prototypes of qpd API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013 by Qualcomm Technologies, Inc. All Rights Reserved. + +=============================================================================*/ + + + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* Define error codes as QuRT error codes prefixed with QURT_ */ +#ifndef EOK +#define EOK QURT_EOK +#endif /* EOK */ +#ifndef EVAL +#define EVAL QURT_EVAL +#endif /* EVAL */ +#ifndef EMEM +#define EMEM QURT_EMEM +#endif /* EMEM */ +#ifndef EINVALID +#define EINVALID QURT_EINVALID +#endif /* EINVALID */ + + +/*============================================================================= + FUNCTION DECLARATIONS +=============================================================================*/ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QUBE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/atomic_ops.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/atomic_ops.h new file mode 100755 index 0000000000000..0a9a9f8ba7db5 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/atomic_ops.h @@ -0,0 +1,197 @@ +#ifndef ATOMIC_OPS_H +#define ATOMIC_OPS_H +/** + @file atomic_ops.h + + @brief Backwards-compatible type definitions. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. +=============================================================================*/ + + +/* + * Australian Public Licence B (OZPLB) + * + * Version 1-0 + * + * Copyright (c) 2007, Open Kernel Labs, Inc. + * + * All rights reserved. + * + * Developed by: Embedded, Real-time and Operating Systems Program (ERTOS) + * National ICT Australia + * http://www.ertos.nicta.com.au + * + * Permission is granted by National ICT Australia, free of charge, to + * any person obtaining a copy of this software and any associated + * documentation files (the "Software") to deal with the Software without + * restriction, including (without limitation) the rights to use, copy, + * modify, adapt, merge, publish, distribute, communicate to the public, + * sublicense, and/or sell, lend or rent out copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject + * to the following conditions: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimers in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of National ICT Australia, nor the names of its + * contributors, may be used to endorse or promote products derived + * from this Software without specific prior written permission. + * + * EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT + * PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS", AND + * NATIONAL ICT AUSTRALIA AND ITS CONTRIBUTORS MAKE NO REPRESENTATIONS, + * WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS + * REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, + * THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF + * ERRORS, WHETHER OR NOT DISCOVERABLE.
+ *
+ * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL
+ * NATIONAL ICT AUSTRALIA OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL
+ * THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION OF CONTRACT,
+ * NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER
+ * LIABILITY, INCLUDING (WITHOUT LIMITATION) LOSS OF PRODUCTION OR
+ * OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS
+ * OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR
+ * OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT,
+ * CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN
+ * CONNECTION WITH THIS LICENCE, THE SOFTWARE OR THE USE OF OR OTHER
+ * DEALINGS WITH THE SOFTWARE, EVEN IF NATIONAL ICT AUSTRALIA OR ITS
+ * CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS,
+ * DAMAGES OR OTHER LIABILITY.
+ *
+ * If applicable legislation implies representations, warranties, or
+ * conditions, or imposes obligations or liability on National ICT
+ * Australia or one of its contributors in respect of the Software that
+ * cannot be wholly or partly excluded, restricted or modified, the
+ * liability of National ICT Australia or the contributor is limited, to
+ * the full extent permitted by the applicable legislation, at its
+ * option, to:
+ * a.  in the case of goods, any one or more of the following:
+ * i.   the replacement of the goods or the supply of equivalent goods;
+ * ii.  the repair of the goods;
+ * iii. the payment of the cost of replacing the goods or of acquiring
+ *      equivalent goods;
+ * iv.  the payment of the cost of having the goods repaired; or
+ * b.  in the case of services:
+ * i.   the supplying of the services again; or
+ * ii.  the payment of the cost of having the services supplied again.
+ *
+ * The construction, validity and performance of this licence is governed
+ * by the laws in force in New South Wales, Australia.
+ */
+
+/*
+ * Author: Malcolm Purvis
+ * Author: Carlos Dyonisio
+ */
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef unsigned int atomic_plain_word_t;
+
+/*-------------------------------------------------------------------------*/
+                         /* Atomic Ops API. */
+
+/*
+ * IMPORTANT!
+ * If you plan to change the structure atomic_word_t, please add the new
+ * elements after value. For more information, read the comment in
+ * arch/arm/libs/atomic_ops/v5/src/arm_atomic_ops.spp:66
+ */
+
+typedef struct {
+    volatile atomic_plain_word_t value;
+} atomic_word_t;
+
+#define ATOMIC_INIT(i) { (i) }
+
+static inline void
+atomic_init(atomic_word_t *a, atomic_plain_word_t v)
+{
+    a->value = v;
+}
+
+#if defined(ARCH_ARM) && defined(ARCH_VER) && (ARCH_VER < 6) && \
+        (!defined(__ATOMIC_OPS_IN_KERNEL__) || defined(MACHINE_SMP))
+
+/*
+ * If it is ARMv4/v5, the function declarations may change
+ * and are defined in the arch-specific header file,
+ * as some of them cannot be declared static because of
+ * the assembler implementation.
+ */
+
+#else
+
+/* Arithmetic operations. */
+
+void atomic_sub(atomic_word_t *target, atomic_plain_word_t v);
+
+/* Architecture independent definitions.
*/ + +static inline atomic_plain_word_t atomic_read(atomic_word_t *target) +{ + return target->value; +} + +typedef unsigned long long atomic64_plain_word_t; + +typedef struct { + volatile atomic64_plain_word_t value; +} atomic64_word_t; + +static inline void +atomic64_init(atomic64_word_t *a, atomic64_plain_word_t v) +{ + a->value = v; +} + +/********************* + Support 64-bit + *********************/ + +atomic64_plain_word_t atomic64_set(atomic64_word_t* target, + atomic64_plain_word_t value); + +void atomic64_xor(atomic64_word_t* target, + atomic64_plain_word_t mask); + +/*---------------------------------------------------------------------------*/ + +/* Architecture independent definitions. */ + +static inline atomic64_plain_word_t atomic64_read(atomic64_word_t *target) +{ + return target->value; +} + +#endif + + +/* Architecture dependent definitions. */ +#include + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* ATOMIC_OPS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/atomic_ops_plat.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/atomic_ops_plat.h new file mode 100755 index 0000000000000..b54b3ff83d978 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/atomic_ops_plat.h @@ -0,0 +1,86 @@ +#ifndef ATOMIC_OPS_PLAT_H +#define ATOMIC_OPS_PLAT_H +/** + @file atomic_ops_plat.h + + @brief Prototypes of atomic operations API backwards compatible. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. +=============================================================================*/ + + +#include + +#ifdef __cplusplus +extern "C" { +#endif +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define atomic_set(a,b) qurt_atomic_set((unsigned int *)(a),(unsigned int)(b)) +#define atomic_and(a,b) qurt_atomic_and((unsigned int *)(a),(unsigned int)(b)) +#define atomic_and_return(a,b) qurt_atomic_and_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_or(a,b) qurt_atomic_or((unsigned int *)(a),(unsigned int)(b)) +#define atomic_or_return(a,b) qurt_atomic_or_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_xor(a,b) qurt_atomic_xor((unsigned int *)(a),(unsigned int)(b)) +#define atomic_xor_return(a,b) qurt_atomic_xor_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_set_bit(a,b) qurt_atomic_set_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_clear_bit(a,b) qurt_atomic_clear_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_change_bit(a,b) qurt_atomic_change_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add(a,b) qurt_atomic_add((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add_return(a,b) qurt_atomic_add_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add_unless(a,b,c) qurt_atomic_add_unless((unsigned int *)(a),(unsigned int)(b),(unsigned int)(c)) +#define atomic_sub(a,b) qurt_atomic_sub((unsigned int *)(a),(unsigned int)(b)) +#define atomic_sub_return(a,b) qurt_atomic_sub_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_inc(a) qurt_atomic_inc((unsigned int *)(a)) +#define atomic_inc_return(a) qurt_atomic_inc_return((unsigned int *)(a)) +#define atomic_dec(a) qurt_atomic_dec((unsigned 
int *)(a)) +#define atomic_dec_return(a) qurt_atomic_dec_return((unsigned int *)(a)) +#define atomic_compare_and_set(a,b,c) qurt_atomic_compare_and_set((unsigned int *)(a),(unsigned int)(b),(unsigned int)(c)) +#define atomic_barrier qurt_atomic_barrier +#define atomic_barrier_write qurt_atomic_barrier_write +#define atomic_barrier_write_smp qurt_atomic_barrier_write_smp +#define atomic_barrier_read_smp qurt_atomic_barrier_read_smp +#define atomic_barrier_smp qurt_atomic_barrier_smp + +/*============================ + * 64 bits support + *============================ */ +#define atomic64_set(a,b) qurt_atomic64_set((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_and(a,b) qurt_atomic64_and((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_and_return(a,b) qurt_atomic64_and_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_or(a,b) qurt_atomic64_or((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_or_return(a,b) qurt_atomic64_or_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_xor(a,b) qurt_atomic64_xor((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_xor_return(a,b) qurt_atomic64_xor_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_set_bit(a,b) qurt_atomic64_set_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_clear_bit(a,b) qurt_atomic64_clear_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_change_bit(a,b) qurt_atomic64_change_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_add(a,b) qurt_atomic64_add((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_add_return(a,b) qurt_atomic64_add_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_sub(a,b) qurt_atomic64_sub((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_sub_return(a,b) qurt_atomic64_sub_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_inc(a) qurt_atomic64_inc((unsigned long long *)(a)) +#define atomic64_inc_return(a) qurt_atomic64_inc_return((unsigned long long *)(a)) +#define atomic64_dec(a) qurt_atomic64_dec((unsigned long long *)(a)) +#define atomic64_dec_return(a) qurt_atomic64_dec_return((unsigned long long *)(a)) +#define atomic64_compare_and_set(a,b,c) qurt_atomic64_compare_and_set((unsigned long long *)(a),(unsigned long long )(b),(unsigned long long )(c)) +#define atomic64_barrier qurt_atomic64_barrier +#define atomic64_barrier_write qurt_atomic64_barrier_write +#define atomic64_barrier_write_smp qurt_atomic64_barrier_write_smp +#define atomic64_barrier_read_smp qurt_atomic64_barrier_read_smp +#define atomic64_barrier_smp qurt_atomic64_barrier_smp + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* ATOMIC_OPS_PLAT_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt.h new file mode 100755 index 0000000000000..4d25c9b2b6243 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt.h @@ -0,0 +1,111 @@ +#ifndef QURT_H +#define QURT_H + +/** + @file qurt.h + @brief Contains kernel header files that provide kernel OS API functions, constants, and + definitions + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013,2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+/*======================================================================
+ *
+ *                       EDIT HISTORY FOR FILE
+ *
+ *   This section contains comments describing changes made to the
+ *   module. Notice that changes are listed in reverse chronological
+ *   order.
+ *
+ *
+ *
+ *
+ * when       who     what, where, why
+ * ---------- ---     ------------------------------------------------
+ * 2011-02-25 op      Add Header file
+ * 2012-12-16 cm      (Tech Pubs) Edited/added Doxygen comments and markup.
+ ======================================================================*/
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "qurt_consts.h"
+#include "qurt_api_version.h"
+#include "qurt_alloc.h"
+#include "qurt_futex.h"
+#include "qurt_mutex.h"
+#include "qurt_pipe.h"
+#include "qurt_printf.h"
+#include "qurt_assert.h"
+#include "qurt_thread.h"
+#include "qurt_trace.h"
+#include "qurt_cycles.h"
+#include "qurt_profile.h"
+#include "qurt_sem.h"
+#include "qurt_cond.h"
+#include "qurt_barrier.h"
+#include "qurt_fastint.h"
+#include "qurt_allsignal.h"
+#include "qurt_anysignal.h"
+#include "qurt_signal.h"
+#include "qurt_rmutex.h"
+#include "qurt_pimutex.h"
+#include "qurt_signal2.h"
+#include "qurt_rmutex2.h"
+#include "qurt_pimutex2.h"
+#include "qurt_int.h"
+#include "qurt_lifo.h"
+#include "qurt_power.h"
+#include "qurt_event.h"
+#include "qurt_pmu.h"
+#include "qurt_stid.h"
+//#include "qurt_version.h"
+#include "qurt_tlb.h"
+#include "qurt_vtlb.h"
+#include "qurt_memory.h"
+#include "qurt_qdi.h"
+#include "qurt_sclk.h"
+#include "qurt_space.h"
+#include "qurt_process.h"
+#include "qurt_timer.h"
+#include "qurt_tls.h"
+#include "qurt_thread_context.h"
+#include "qurt_hvx.h"
+#include "qurt_hmx.h"
+#include "qurt_mailbox.h"
+#include "qurt_island.h"
+#include "qurt_qdi_proxy.h"
+#include "qurt_l2cfg.h"
+#include "qurt_mmap.h"
+#include "qurt_isr.h"
+#include "qurt_busywait.h"
+#include "qurt_ecc.h"
+#include "qurt_callback.h"
+#include "qurt_error.h"
+#include "qurt_except.h"
+#include "qurt_mq.h"
+#include "qurt_user_dma.h"
+#include "qurt_fs_hub.h"
+#include "qurt_os_services.h"
+
+#ifndef MAIN_ONLY
+#define INCLUDE_ISLAND_CONTENTS
+#endif
+#ifndef ISLAND_ONLY
+#define INCLUDE_MAIN_CONTENTS
+#endif
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_alloc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_alloc.h
new file mode 100755
index 0000000000000..da37a4c0a714e
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_alloc.h
@@ -0,0 +1,145 @@
+#ifndef QURT_ALLOC_H
+#define QURT_ALLOC_H
+
+/**
+  @file qurt_alloc.h
+  @brief Prototypes of kernel memory allocation API functions.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021, 2023 Qualcomm Technologies, Inc.
+  All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+/*======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_malloc
+  Dynamically allocates a memory area of the specified size on the QuRT system heap.
+  The return value is the address of the allocated memory area.
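+
+  A minimal illustrative sketch (not from the SDK documentation):
+
+    unsigned int *buf = (unsigned int *)qurt_malloc(64U * sizeof(unsigned int));
+    if (buf != NULL) {
+        buf[0] = 0xA5A5A5A5U;   // use the allocation
+        qurt_free(buf);         // return it to the QuRT system heap
+    }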
+
+  @note1hang The allocated memory area is automatically initialized to zero.
+
+  @param[in] size Size (in bytes) of the memory area.
+
+  @return
+  Nonzero -- Pointer to the allocated memory area. \n
+  0 -- Not enough memory in heap to allocate memory area.
+
+  @dependencies
+  None.
+
+ */
+/* ======================================================================*/
+void *qurt_malloc( unsigned int size);
+
+/*======================================================================*/
+/**@ingroup func_qurt_calloc
+  Dynamically allocates the specified array on the QuRT system heap.
+  The return value is the address of the allocated array.
+
+  @note1hang The allocated memory area is automatically initialized to zero.
+
+  @param[in] elsize Size (in bytes) of each array element.
+  @param[in] num    Number of array elements.
+
+  @return
+  Nonzero -- Pointer to allocated array.\n
+  Zero -- Not enough memory in heap to allocate array.
+
+  @dependencies
+  None.
+
+ */
+ /* ======================================================================*/
+void *qurt_calloc(unsigned int elsize, unsigned int num);
+
+/*======================================================================*/
+/**@ingroup func_qurt_realloc
+  Reallocates memory on the heap. \n
+  Changes the size of a memory area that is already allocated on the QuRT system heap.
+  The reallocate memory operation is functionally similar to realloc. It accepts a pointer
+  to an existing memory area on the heap, and resizes the memory area to the specified size
+  while preserving the original contents of the memory area.
+
+  @note1hang This function might change the address of the memory area.
+             If the value of ptr is NULL, this function is equivalent to
+             qurt_malloc().
+             If the value of newsize is 0, it is equivalent to qurt_free().
+             If the memory area is expanded, the added memory is not initialized.
+
+  @param[in] *ptr    Pointer to the address of the memory area.
+  @param[in] newsize Size (in bytes) of the reallocated memory area.
+
+  @return
+  Nonzero -- Pointer to reallocated memory area. \n
+  0 -- Not enough memory in heap to reallocate the memory area.
+
+  @dependencies
+  None.
+
+ */
+ /* ======================================================================*/
+void *qurt_realloc(void *ptr, int newsize);
+
+/*======================================================================*/
+/**@ingroup func_qurt_free
+  Frees allocated memory from the heap.\n
+  Deallocates the specified memory from the QuRT system heap.
+
+  @param[in] *ptr Pointer to the address of the memory to deallocate.
+
+  @return
+  None.
+
+  @dependencies
+  The memory item that the ptr value specifies must have been previously
+  allocated using one of the qurt_calloc(),
+  qurt_malloc(), or qurt_realloc() memory allocation functions.
+  Otherwise the behavior of QuRT is undefined.
+
+ */
+ /* ======================================================================*/
+void qurt_free( void *ptr);
+
+
+/**@ingroup func_qurt_memalign
+  Allocates a memory area of the specified size on the QuRT system heap,
+  aligned to the specified alignment (in bytes).
+
+  @param[in] alignment Alignment (in bytes) of the allocated memory area.
+  @param[in] size      Size (in bytes) of the memory area.
+
+  @return
+  Nonzero -- Pointer to the allocated memory area. \n
+  0 -- Not enough memory in heap to allocate memory area.
+ */
+void *qurt_memalign(unsigned int alignment, unsigned int size);
+
+/*
+|| Macro to define a static heap for a QuRT program.
+||
+|| Usage:
+||    Declare at the top-level of any C source file that
+||    is part of the build (and is guaranteed
+||    to actually be pulled into the build). Place
+||    it in the same file as main():
+||
+||    QURT_DECLARE_STATIC_HEAP(512000);
+||
+||    The only argument is the size in bytes, and it is
+||    rounded up to the nearest 64 bytes (size of an
+||    L2 cache block).
+|| +*/ + +#define QURT_DECLARE_STATIC_HEAP(sz) \ + static struct qurt_static_heap { \ + char space[(sz)] __attribute__((aligned(64))); \ + } static_heap[1]; \ + void * const override_heap_Base = &static_heap[0]; \ + void * const override_heap_Limit = &static_heap[1] + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ALLOC_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_allsignal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_allsignal.h new file mode 100755 index 0000000000000..5dc89e495130d --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_allsignal.h @@ -0,0 +1,176 @@ + +#ifndef QURT_ALLSIGNAL_H +#define QURT_ALLSIGNAL_H + +/** + @file qurt_allsignal.h + @brief Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup all_signal_types +@{ */ +/*===================================================================== + Typedefs + ======================================================================*/ + +/** +qurt_signal_t supersedes qurt_allsignal_t. This type definition was added for backwards compatibility. */ +typedef union { + /** @cond */ + unsigned long long int raw; + struct { + unsigned int waiting; /**< */ + unsigned int signals_in; /**< */ + unsigned int queue; /**< */ + unsigned int reserved; /**< */ + }X; + /** @endcond */ +} qurt_allsignal_t; +/** @} */ /* end_addtogroup all_signal_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_init + Initializes an all-signal object.\n + The all-signal object is initially cleared. + + @datatypes + #qurt_allsignal_t + + @param[out] signal Pointer to the all-signal object to initialize. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_allsignal_init(qurt_allsignal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_destroy + Destroys the specified all-signal object.\n + @note1hang All-signal objects must be destroyed when they are no longer in use. + Failure to do this causes resource leaks in the QuRT kernel. \n + @note1cont All-signal objects must not be destroyed while they are still in use. + If this occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_allsignal_destroy(qurt_allsignal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_get + Gets signal values from the all-signal object. + + Returns the current signal values of the specified all-signal object. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to access. 
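+
+ A minimal illustrative sketch (assumes no thread is currently waiting on
+ the object; not from the SDK documentation):
+
+   qurt_allsignal_t done;
+   qurt_allsignal_init(&done);
+   qurt_allsignal_set(&done, 0x3U);                  // set signals 0 and 1
+   unsigned int pending = qurt_allsignal_get(&done); // pending == 0x3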
+ + @return + Bitmask with current signal values. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_allsignal_get(qurt_allsignal_t *signal) +{ return signal->X.signals_in; } + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_wait + Waits on the all-signal object.\n + Suspends the current thread until all of the specified signals are set. + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 that it is not to be waited on. + + If a signal is set in an all-signal object, and a thread is waiting on the all-signal object for + that signal, the thread is awakened. If the awakened thread has higher priority than + the current thread, a context switch can occur. + + Unlike any-signals, all-signals do not need to explicitly clear any set signals in an all-signal + object before waiting on them again -- clearing is done automatically by the wait + operation. + + @note1hang At most, one thread can wait on an all-signal object at any given time. + Because signal clearing is done by the wait operation, no clear operation is + defined for all-signals. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to wait on. + @param[in] mask Signal mask value, which identifies the individual signals in the all-signal object + to wait on. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_allsignal_wait(qurt_allsignal_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_set + Set signals in the specified all-signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit + value of 1 indicates that a signal must be set, and 0 indicates not to set the signal. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to modify. + @param[in] mask Signal mask value identifying the individual signals to + set in the all-signal object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_allsignal_set(qurt_allsignal_t *signal, unsigned int mask); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ALLSIGNAL_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_anysignal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_anysignal.h new file mode 100755 index 0000000000000..9619e2de562b4 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_anysignal.h @@ -0,0 +1,225 @@ +#ifndef QURT_ANYSIGNAL_H +#define QURT_ANYSIGNAL_H +/** + @file qurt_anysignal.h + Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + +Copyright (c) 2021 Qualcomm Technologies, Inc. +All rights reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ ======================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*===================================================================== +Typedefs +======================================================================*/ + +/**@ingroup anysignals_types + qurt_signal_t supersedes qurt_anysignal_t. This type definition was added for backwards compatibility. */ +typedef qurt_signal_t qurt_anysignal_t; + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_init + Initializes an any-signal object.\n + The any-signal object is initially cleared. + + @datatypes + #qurt_anysignal_t + + @param[out] signal Pointer to the initialized any-signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline void qurt_anysignal_init(qurt_anysignal_t *signal) +{ + qurt_signal_init(signal); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_destroy + Destroys the specified any-signal object. + + @note1hang Any-signal objects must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Any-signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_anysignal_t + + @param[in] signal Pointer to the any-signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline void qurt_anysignal_destroy(qurt_anysignal_t *signal) +{ + qurt_signal_destroy(signal); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_wait + Wait on the any-signal object. \n + Suspends the current thread until any one of the specified signals is set. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 indicates not to wait on the signal. + If a signal is set in an any-signal object, and a thread is waiting on the any-signal object for + that signal, the thread is awakened. If the awakened thread has higher priority than + the current thread, a context switch can occur. + + @note1hang At most, one thread can wait on an any-signal object at any given time. + + @datatypes + #qurt_anysignal_t + + @param[in] signal Pointer to the any-signal object to wait on. + @param[in] mask Signal mask value, which specifies the individual signals in the any-signal + object to wait on. + + @return + Bitmask of current signal values. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline unsigned int qurt_anysignal_wait(qurt_anysignal_t *signal, unsigned int mask) +{ + return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_set + Sets signals in the specified any-signal object. \n + Signals are represented as bits 0 through 31 in the 32-bit mask value. 
A mask bit value of 1
+ indicates that a signal must be set, and 0 indicates not to set the signal.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to modify.
+ @param[in] mask   Signal mask value identifying the individual signals to
+                   set in the any-signal object.
+
+ @return
+ Bitmask of old signal values (before set).
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+unsigned int qurt_anysignal_set(qurt_anysignal_t *signal, unsigned int mask);
+
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_get
+ Gets signal values from the any-signal object.\n
+ Returns the current signal values of the specified any-signal object.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to access.
+
+ @return
+ A bitmask with the current signal values of the specified any-signal object.
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+static inline unsigned int qurt_anysignal_get(qurt_anysignal_t *signal)
+{
+    return qurt_signal_get(signal);
+}
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_clear
+ @xreflabel{sec:anysignal_clear}
+ Clears signals in the specified any-signal object.\n
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+ indicates that a signal must be cleared, and 0 indicates not to clear the signal.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to modify.
+ @param[in] mask   Signal mask value identifying the individual signals to
+                   clear in the any-signal object.
+
+ @return
+ Bitmask -- Old signal values (before clear).
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+unsigned int qurt_anysignal_clear(qurt_anysignal_t *signal, unsigned int mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_wait_timed
+ Waits on the any-signal object. \n
+ Suspends the current thread until any of the specified signals is set or the timeout expires.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+ indicates that a signal must be waited on, and 0 indicates not to wait on the signal.
+ If a signal is set in an any-signal object, and a thread was waiting on the any-signal object for
+ that signal, the thread is awakened. If the awakened thread has higher priority than
+ the current thread, a context switch can occur.
+
+ @note1hang At most, one thread can wait on an any-signal object at any given time.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in]  signal   Pointer to the any-signal object to wait on.
+ @param[in]  mask     Signal mask value, which specifies the individual signals in the any-signal
+                      object to wait on.
+ @param[out] signals  Bitmask of current signal values.
+ @param[in]  duration Interval (in microseconds); the value must be between #QURT_TIMER_MIN_DURATION and
+                      #QURT_TIMER_MAX_DURATION.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_ETIMEDOUT -- Timeout. \n
+ #QURT_EINVALID -- Duration out of range.
+
+ @dependencies
+ None.
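+
+ A minimal illustrative sketch (assumes a previously initialized any-signal
+ object sig_obj, and that 10 ms lies within the allowed duration bounds;
+ not from the SDK documentation):
+
+   unsigned int sigs = 0;
+   int rc = qurt_anysignal_wait_timed(&sig_obj, 0x1U, &sigs, 10000ULL);
+   if (rc == QURT_EOK) {
+       // signal 0 was set within 10 ms; sigs holds the signal values
+   } else if (rc == QURT_ETIMEDOUT) {
+       // 10 ms elapsed with no signal
+   }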
+ */ +/* ======================================================================*/ + +int qurt_anysignal_wait_timed(qurt_anysignal_t *signal, unsigned int mask, unsigned int *signals, unsigned long long int duration); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ANYSIGNAL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_api_version.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_api_version.h new file mode 100755 index 0000000000000..dfe53ae755054 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_api_version.h @@ -0,0 +1,77 @@ +#ifndef QURT_API_VERSION_H +#define QURT_API_VERSION_H +/*============================================================================== + +qurt_api_version.h + +GENERAL DESCRIPTION + API version file + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ + +/*============================================================================== + CONSTANTS AND DEFINITIONS +==============================================================================*/ +/** + * Each field of the QURT_API_VERSION definitions is an 8-bit unsigned integer. + * Main release has first 3 fields updated - Major, Minor and Release. + * - QURT_API_VERSION = Major, Minor, Release. + * Patch releases are supported by adding the extra field. + * - QURT_API_VERSION = Major, Minor, Release, Patch. + */ +// Major version is incremented for incompatible API changes. +#define QURT_API_VER_MAJOR 1 + +// Minor version is incremented for backward-compatible enhancements in the API +// set. +#define QURT_API_VER_MINOR 4 + +// RELEASE version is incremented for each release within a `MAJOR.MINOR` +// release. +#define QURT_API_VER_RELEASE 1 + +// Patch version is incremented when new API content is introduced on older LTS +// release. +#define QURT_API_VER_PATCH 0 + +/* Update the QURT_API_VERSION function macro. */ +#define QURT_API_VERSION_ENCODE(major, minor, release, patch) \ + ((((major) & 0xFF) << 24) | (((minor) & 0xFF) << 16) | \ + (((release) & 0xFF) << 8) | ((patch) & 0xFF)) + +/* Update the QURT_API_VERSION Macro. */ +#define QURT_API_VERSION \ + QURT_API_VERSION_ENCODE(QURT_API_VER_MAJOR, QURT_API_VER_MINOR, \ + QURT_API_VER_RELEASE, QURT_API_VER_PATCH) + +/** Usage: + * + * #if QURT_API_VERSION >= QURT_API_VERSION_ENCODE(1,4,0,0) + * qurt_func_2(a,b,c); + * #else + * qurt_func(a); + * #endif + * + */ +/* + Gets the QuRT API version. + + @return + QuRT API version. + + @dependencies + None. + */ +unsigned int qurt_api_version(void); + +#endif /* QURT_API_VERSION_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_assert.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_assert.h new file mode 100755 index 0000000000000..13cc2afd2e973 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_assert.h @@ -0,0 +1,51 @@ +#ifndef QURT_ASSERT_H +#define QURT_ASSERT_H +/** + @file qurt_assert.h + @brief Prototypes of qurt_assert API + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/**@ingroup func_qurt_assert_error + Writes diagnostic information to the debug buffer, and raises an error to the QuRT kernel. + + @datatypes + None. + + @param[in] filename Pointer to the file name string. + @param[in] lineno Line number. + + @return + None. + + @dependencies + None. + */ +void qurt_assert_error(const char *filename, int lineno) __attribute__((noreturn)); + +#define qurt_assert(cond) ((cond)?(void)0:qurt_assert_error(__QURTFILENAME__,__LINE__)) + +/** @} */ /* end_ingroup func_qurt_assert */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ASSERT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_atomic_ops.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_atomic_ops.h new file mode 100755 index 0000000000000..d9b2cff7d737c --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_atomic_ops.h @@ -0,0 +1,1298 @@ +#ifndef QURT_ATOMIC_OPS_H +#define QURT_ATOMIC_OPS_H +/** + @file qurt_atomic_ops.h + @brief Prototypes of kernel atomic operations API. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021, 2022 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +/* + * Australian Public Licence B (OZPLB) + * + * Version 1-0 + * + * Copyright (c) 2007, Open Kernel Labs, Inc. + * + * All rights reserved. + * + * Developed by: Embedded, Real-time and Operating Systems Program (ERTOS) + * National ICT Australia + * http://www.ertos.nicta.com.au + * + * Permission is granted by National ICT Australia, free of charge, to + * any person obtaining a copy of this software and any associated + * documentation files (the "Software") to deal with the Software without + * restriction, including (without limitation) the rights to use, copy, + * modify, adapt, merge, publish, distribute, communicate to the public, + * sublicense, and/or sell, lend or rent out copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject + * to the following conditions: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimers in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of National ICT Australia, nor the names of its + * contributors, may be used to endorse or promote products derived + * from this Software without specific prior written permission. 
+ * + * EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT + * PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS", AND + * NATIONAL ICT AUSTRALIA AND ITS CONTRIBUTORS MAKE NO REPRESENTATIONS, + * WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS + * REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, + * THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF + * ERRORS, WHETHER OR NOT DISCOVERABLE. + * + * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL + * NATIONAL ICT AUSTRALIA OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL + * THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION OF CONTRACT, + * NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER + * LIABILITY, INCLUDING (WITHOUT LIMITATION) LOSS OF PRODUCTION OR + * OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS + * OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR + * OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT, + * CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN + * CONNECTION WITH THIS LICENCE, THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS WITH THE SOFTWARE, EVEN IF NATIONAL ICT AUSTRALIA OR ITS + * CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS, + * DAMAGES OR OTHER LIABILITY. + * + * If applicable legislation implies representations, warranties, or + * conditions, or imposes obligations or liability on National ICT + * Australia or one of its contributors in respect of the Software that + * cannot be wholly or partly excluded, restricted or modified, the + * liability of National ICT Australia or the contributor is limited, to + * the full extent permitted by the applicable legislation, at its + * option, to: + * a. in the case of goods, any one or more of the following: + * i. the replacement of the goods or the supply of equivalent goods; + * ii. the repair of the goods; + * iii. the payment of the cost of replacing the goods or of acquiring + * equivalent goods; + * iv. the payment of the cost of having the goods repaired; or + * b. in the case of services: + * i. the supplying of the services again; or + * ii. the payment of the cost of having the services supplied again. + * + * The construction, validity and performance of this licence is governed + * by the laws in force in New South Wales, Australia. + */ + +/* + * Author: Malcolm Purvis + * + * This file is only included by the main atomic_ops.h, so all of that + * file's definitions are available. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + +///* Sanity check to ensure the smp flag is set in machines.py */ +//#if defined(__ATOMIC_OPS_IN_KERNEL__) && !defined(MACHINE_SMP) && CONFIG_NUM_UNITS > 1 +//#error CONFIG_NUM_UNITS > 1 but smp not defined in machines.py. +//#endif +#define QURT_INLINE __attribute__((always_inline)) + +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_atomic_set + Sets the atomic variable with the specified value. 
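+
+  A minimal illustrative sketch (not from the SDK documentation):
+
+    unsigned int word = 0;
+    (void)qurt_atomic_set(&word, 0x10U);   // word is now 0x10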
+ + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] value Value to set. + + @return + Value successfuly set. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_set(unsigned int* target, unsigned int value) +{ + unsigned long tmp; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " memw_locked(%2, p0) = %3\n" + " if !p0 jump 1b\n" + : "=&r" (tmp),"+m" (*target) + : "r" (target), "r" (value) + : "p0"); + return value; +} + +/**@ingroup func_qurt_atomic_and + Bitwise AND operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise AND. + + @return + None + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_and(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = and(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target),"r" (mask) + : "p0"); +} + +/**@ingroup func_qurt_atomic_and_return + Bitwise AND operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise AND. + + @return + AND result of atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_and_return(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = and(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_or + Bitwise OR operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise OR. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_or(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); +} + +/**@ingroup func_qurt_atomic_or_return + Bitwise OR operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise OR. + + @return + Returns the OR result of the atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_or_return(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_xor + Bitwise XOR operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. 
+ + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise XOR. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_xor(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = xor(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); +} + +/**@ingroup func_qurt_atomic_xor_return + Bitwise XOR operation of the atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask Mask for bitwise XOR. + + @return + XOR result of atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_xor_return(unsigned int* target, unsigned int mask) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = xor(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_set_bit + Sets a bit in the atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to set. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_set_bit(unsigned int *target, unsigned int bit) +{ + unsigned int result; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit % ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int *wtarget= (unsigned int *)&target[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = setbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic_clear_bit + Clears a bit in the atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to clear. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_clear_bit(unsigned int *target, unsigned int bit) +{ + unsigned int result; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit % ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int *wtarget= (unsigned int *)&target[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = clrbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic_change_bit + Toggles a bit in a atomic variable at a bit position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to toggle. + + @return + None. + + @dependencies + None. 
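+
+  A minimal illustrative sketch (toggles bit 3 of a shared word twice;
+  not from the SDK documentation):
+
+    unsigned int flags = 0;
+    qurt_atomic_change_bit(&flags, 3U);   // flags == 0x8
+    qurt_atomic_change_bit(&flags, 3U);   // flags == 0x0 again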
+*/ +static inline QURT_INLINE void +qurt_atomic_change_bit(unsigned int *target, unsigned int bit) +{ + unsigned int result; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1fU; + unsigned int *wtarget= (unsigned int *)&target[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = togglebit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget),"r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic_add + Adds an integer to atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to add. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_add(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic_add_return + Adds an integer to atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to add. + + @return + Result of arithmetic sum. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_add_return(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_add_unless + Adds the delta value to an atomic variable unless the current value in the target + matches the unless variable. + + @note1hang The function retries until load lock and store conditional + are successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] delta Value to add to the current value. + @param[in] unless Perform the addition only when the current value is not + equal to this unless value. + @return + TRUE -- 1 - Addition was performed. \n + FALSE -- 0 - Addition was not done. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_add_unless(unsigned int* target, + unsigned int delta, + unsigned int unless) +{ + unsigned int current_val; + unsigned int new_val; + + __asm__ __volatile__( + "1: %0 = memw_locked(%3)\n" + " p0 = cmp.eq(%0, %5)\n" + " if p0 jump 2f\n" + " %1 = add(%0, %4)\n" + " memw_locked(%3, p0) = %1\n" + " if !p0 jump 1b\n" + "2:\n" + : "=&r" (current_val),"=&r" (new_val),"+m" (*target) + : "r" (target), "r" (delta), "r" (unless) + : "p0"); + + return (unsigned int)(current_val != unless); +} + +/**@ingroup func_qurt_atomic_sub + Subtracts an integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to subtract. + + @return + None. + + @dependencies + None. 
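+
+  A minimal illustrative sketch (not from the SDK documentation):
+
+    unsigned int credits = 10U;
+    qurt_atomic_sub(&credits, 3U);   // credits == 7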
+*/ +static inline QURT_INLINE void +qurt_atomic_sub(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic_sub_return + Subtracts an integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to subtract. + + @return + Result of arithmetic subtraction. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_sub_return(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_inc + Increments an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_inc(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); +} + +/**@ingroup func_qurt_atomic_inc_return + Increments an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Incremented value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_inc_return(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_dec + Decrements an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_dec(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #-1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); +} + +/**@ingroup func_qurt_atomic_dec_return + Decrements an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Decremented value. + + @dependencies + None. 
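+
+  A minimal illustrative sketch of a reference-count release (assumes a
+  shared counter named refcount; not from the SDK documentation):
+
+    if (qurt_atomic_dec_return(&refcount) == 0U) {
+        // last reference dropped; safe to tear down the shared object
+    }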
+*/ +static inline QURT_INLINE unsigned int +qurt_atomic_dec_return(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #-1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_compare_and_set + Compares the current value of the atomic variable with the + specified value and set to a new value when compare is successful. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] old_val Old value to compare. + @param[in] new_val New value to set. + + @return + FALSE -- Specified value is not equal to the current value. \n + TRUE --Specified value is equal to the current value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_compare_and_set(unsigned int* target, + unsigned int old_val, + unsigned int new_val) +{ + unsigned int current_val; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " p0 = cmp.eq(%0, %3)\n" + " if !p0 jump 2f\n" + " memw_locked(%2, p0) = %4\n" + " if !p0 jump 1b\n" + "2:\n" + : "=&r" (current_val),"+m" (*target) + : "r" (target), "r" (old_val), "r" (new_val) + : "p0"); + + return (unsigned int)(current_val == old_val); +} + +/**@ingroup func_qurt_atomic_barrier + Allows the compiler to enforce an ordering constraint on memory operation issued + before and after the function. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_barrier(void) +{ + __asm__ __volatile__ ( + "" + : + : + : + "memory"); +} + + +/**@ingroup func_qurt_atomic64_set + Sets the 64-bit atomic variable with the specified value. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] value 64-bit value to set. + + @return + Successfuly set value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_set(unsigned long long* target, unsigned long long value) +{ + unsigned long long tmp; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " memd_locked(%2, p0) = %3\n" + " if !p0 jump 1b\n" + : "=&r" (tmp),"+m" (*target) + : "r" (target), "r" (value) + : "p0"); + return value; +} + +/**@ingroup func_qurt_atomic64_and_return + Bitwise AND operation of a 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise AND. + + @return + AND result of 64-bit atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_and_return(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = and(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_or + Bitwise OR operation of a 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise OR. + + @return + None. + + @dependencies + None. 
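+
+  A minimal illustrative sketch (not from the SDK documentation):
+
+    unsigned long long mask64 = 0ULL;
+    qurt_atomic64_or(&mask64, 1ULL << 40);   // set bit 40 atomically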
+*/ +static inline QURT_INLINE void +qurt_atomic64_or(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_or_return + Bitwise OR operation of a 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise OR. + + @return + OR result of the atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_or_return(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_xor_return + Bitwise XOR operation of 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise XOR. + + @return + XOR result of atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_xor_return(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = xor(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_set_bit + Sets a bit in a 64-bit atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to set. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_set_bit(unsigned long long *target, unsigned int bit) +{ + unsigned int result; + unsigned int *wtarget; + unsigned int *pwtarget = (unsigned int *)target; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1FU; + wtarget = (unsigned int *)&pwtarget[aword]; + + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = setbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_clear_bit + Clears a bit in a 64-bit atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to clear. + + @return + None. + + @dependencies + None. 
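+
+   @par Example
+   A bitmap sketch (illustrative only; the mask and slot number are
+   hypothetical):
+   @code
+   unsigned long long slot_mask = 0ULL;
+
+   qurt_atomic64_set_bit(&slot_mask, 42U);    // mark slot 42 in use
+   qurt_atomic64_clear_bit(&slot_mask, 42U);  // release slot 42
+   @endcode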
+*/ +static inline QURT_INLINE void +qurt_atomic64_clear_bit(unsigned long long *target, unsigned int bit) +{ + unsigned int result; + unsigned int *wtarget; + unsigned int *pwtarget = (unsigned int *)target; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1FU; + wtarget = (unsigned int *)&pwtarget[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = clrbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_change_bit + Toggles a bit in a 64-bit atomic variable at a bit position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to toggle. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_change_bit(unsigned long long *target, unsigned int bit) +{ + unsigned int result; + unsigned int *wtarget; + unsigned int *pwtarget = (unsigned int *)target; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1FU; + wtarget = (unsigned int *)&pwtarget[aword]; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = togglebit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget),"r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_add + Adds a 64-bit integer to 64-bit atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v 64-bit integer value to add. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_add(unsigned long long *target, unsigned long long v) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_add_return + Adds a 64-bit integer to 64-bit atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v 64-bit integer value to add. + + @return + Result of arithmetic sum. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_add_return(unsigned long long *target, unsigned long long v) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = add(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_sub_return + Subtracts a 64-bit integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v 64-bit integer value to subtract. + + @return + Result of arithmetic subtraction. + + @dependencies + None. 
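+
+   @par Example
+   A credit-counter sketch (illustrative only; the variable is hypothetical):
+   @code
+   unsigned long long credits = 100ULL;
+
+   // Consume 10 credits and observe the remaining balance atomically.
+   unsigned long long remaining = qurt_atomic64_sub_return(&credits, 10ULL);
+   @endcode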
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_sub_return(unsigned long long *target, unsigned long long v)
+{
+   unsigned long long result;
+
+   __asm__ __volatile__(
+       "1: %0 = memd_locked(%2)\n"
+       "   %0 = sub(%0, %3)\n"
+       "   memd_locked(%2, p0) = %0\n"
+       "   if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target), "r" (v)
+       : "p0");
+
+   return result;
+}
+
+/**@ingroup func_qurt_atomic64_inc
+   Increments a 64-bit atomic variable by one.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target  Pointer to the atomic variable.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_inc(unsigned long long *target)
+{
+   unsigned long long result;
+   unsigned long long inc = 1;
+
+   __asm__ __volatile__(
+       "1: %0 = memd_locked(%2)\n"
+       "   %0 = add(%0, %3)\n"
+       "   memd_locked(%2, p0) = %0\n"
+       "   if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target),"r" (inc)
+       : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_inc_return
+   Increments a 64-bit atomic variable by one.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target  Pointer to the atomic variable.
+
+   @return
+   Incremented value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_inc_return(unsigned long long *target)
+{
+   unsigned long long result;
+   unsigned long long inc = 1;
+
+   __asm__ __volatile__(
+       "1: %0 = memd_locked(%2)\n"
+       "   %0 = add(%0, %3)\n"
+       "   memd_locked(%2, p0) = %0\n"
+       "   if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target),"r" (inc)
+       : "p0");
+
+   return result;
+}
+
+/**@ingroup func_qurt_atomic64_dec_return
+   Decrements a 64-bit atomic variable by one.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target  Pointer to the atomic variable.
+
+   @return
+   Decremented value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_dec_return(unsigned long long *target)
+{
+   unsigned long long result;
+   long long minus1 = 0xFFFFFFFFFFFFFFFFLL;
+
+   __asm__ __volatile__(
+       "1: %0 = memd_locked(%2)\n"
+       "   %0 = add(%0, %3)\n"
+       "   memd_locked(%2, p0) = %0\n"
+       "   if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target),"r" (minus1)
+       : "p0");
+
+   return result;
+}
+
+/**@ingroup func_qurt_atomic64_compare_and_set
+   Compares the current value of a 64-bit atomic variable with
+   the specified value and sets it to a new value when the comparison is successful.
+
+   @note1hang The function keeps retrying until load lock and store conditional
+              is successful.
+
+   @param[in,out] target   Pointer to the atomic variable.
+   @param[in]     old_val  64-bit old value to compare.
+   @param[in]     new_val  64-bit new value to set.
+
+   @return
+   FALSE -- Specified value is not equal to the current value. \n
+   TRUE -- Specified value is equal to the current value.
+
+   @dependencies
+   None.
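+
+   @par Example
+   A typical retry loop that atomically doubles a shared counter (a sketch;
+   shared_val is a hypothetical global, not part of this header):
+   @code
+   unsigned long long old_val;
+   do {
+       old_val = shared_val;            // snapshot the current value
+   } while (!qurt_atomic64_compare_and_set(&shared_val,
+                                           old_val, old_val * 2ULL));
+   @endcode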
+*/
+static inline QURT_INLINE int
+qurt_atomic64_compare_and_set(unsigned long long *target,
+                              unsigned long long old_val,
+                              unsigned long long new_val)
+{
+   unsigned long long current_val;
+
+   __asm__ __volatile__(
+       "1: %0 = memd_locked(%2)\n"
+       "   p0 = cmp.eq(%0, %3)\n"
+       "   if !p0 jump 2f\n"
+       "   memd_locked(%2, p0) = %4\n"
+       "   if !p0 jump 1b\n"
+       "2:\n"
+       : "=&r" (current_val),"+m" (*target)
+       : "r" (target), "r" (old_val), "r" (new_val)
+       : "p0");
+
+   return (int)(current_val == old_val);
+}
+
+/**@ingroup func_qurt_atomic64_barrier
+   Allows the compiler to enforce an ordering constraint on memory operations
+   issued before and after the function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_barrier(void)
+{
+   /** @cond */
+    __asm__ __volatile__ (
+        ""
+        :
+        :
+        :
+        "memory");
+   /** @endcond */
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ATOMIC_OPS_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_barrier.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_barrier.h
new file mode 100755
index 0000000000000..7c6f787d43bc2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_barrier.h
@@ -0,0 +1,140 @@
+#ifndef QURT_BARRIER_H
+#define QURT_BARRIER_H
+
+/**
+  @file qurt_barrier.h
+  @brief Prototypes of Kernel barrier API functions.
+
+  EXTERNALIZED FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021 Qualcomm Technologies, Inc. All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup barrier_types
+@{ */
+/*=====================================================================
+ Constants and macros
+======================================================================*/
+#define QURT_BARRIER_SERIAL_THREAD 1 /**< Serial thread. */
+#define QURT_BARRIER_OTHER         0 /**< Other. */
+
+#ifndef ASM
+#include
+
+/*=====================================================================
+Typedefs
+======================================================================*/
+
+/** QuRT barrier type.
+ */
+typedef union {
+   /** @cond */
+   struct {
+      unsigned short threads_left;
+      unsigned short count;
+      unsigned int threads_total;
+      unsigned int queue;
+      unsigned int reserved;
+   };
+   unsigned long long int raw;
+   /** @endcond */
+} qurt_barrier_t;
+
+/** @} */ /* end_addtogroup barrier_types */
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/*======================================================================*/
+/**@ingroup func_qurt_barrier_init
+   Initializes a barrier object.
+
+   @datatypes
+   #qurt_barrier_t
+
+   @param[out] barrier        Pointer to the barrier object to initialize.
+   @param[in]  threads_total  Total number of threads to synchronize on the barrier.
+
+   @return
+   Unused integer value.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_barrier_init(qurt_barrier_t *barrier, unsigned int threads_total);
+
+/*======================================================================*/
+/**@ingroup func_qurt_barrier_destroy
+   Destroys the specified barrier.
+
+   @note1hang Barriers must be destroyed when they are no longer in use.
Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Barriers must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_barrier_t + + @param[in] barrier Pointer to the barrier object to destroy. + + @return + Unused integer value. + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_barrier_destroy(qurt_barrier_t *barrier); + +/*======================================================================*/ +/**@ingroup func_qurt_barrier_wait + Waits on the barrier.\n + Suspends the current thread on the specified barrier. \n + The function return value indicates whether the thread was the last one to + synchronize on the barrier. + When a thread waits on a barrier, it is suspended on the barrier: \n + - If the total number of threads waiting on the barrier is less than the assigned value + of the barrier, no other action occurs. \n + - If the total number of threads waiting on the barrier equals the assigned value of the + barrier, all threads currently waiting on the barrier are awakened, allowing them to + execute past the barrier. + + @note1hang After its waiting threads are awakened, a barrier is automatically reset + and can be used again in the program without the need for re-initialization. + + @datatypes + #qurt_barrier_t + + @param[in] barrier Pointer to the barrier object to wait on. + + @return + #QURT_BARRIER_OTHER -- Current thread awakened from barrier. \n + #QURT_BARRIER_SERIAL_THREAD -- Current thread is last caller of barrier. + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_barrier_wait(qurt_barrier_t *barrier); + + +#endif + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_BARRIER_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_busywait.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_busywait.h new file mode 100755 index 0000000000000..a4dab80a2520a --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_busywait.h @@ -0,0 +1,62 @@ +#ifndef QURT_BUSYWAIT_H +#define QURT_BUSYWAIT_H + +/** + @file qurt_busywait.h + @brief Implementation of the busywait() function for + hardware based blocking waits that use the QTIMER as a reference. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ============================================================================*/ +/*============================================================================= + * + * EDIT HISTORY FOR FILE + * + * This section contains comments describing changes made to the + * module. Changes are listed in reverse chronological + * order. 
+ *
+ *
+ * when       who     what, where, why
+ * ---------- ---     -------------------------------------------------------
+ * 2018-03-20 pg      Add Header file
+ ============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_busywait
+   Pauses the execution of a thread for a specified time.\n
+   Use for small microsecond delays.
+
+   @note1hang The function does not return to the caller until
+              the time duration has expired.
+
+   @param[in] pause_time_us  Time to pause in microseconds.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_busywait (unsigned int pause_time_us);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_BUSYWAIT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_callback.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_callback.h
new file mode 100755
index 0000000000000..dc9b896c63454
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_callback.h
@@ -0,0 +1,235 @@
+#ifndef QURT_CALLBACK_H
+#define QURT_CALLBACK_H
+
+/**
+  @file qurt_callback.h
+  Definitions, macros, and prototypes for the QuRT callback framework.
+
+  The QDI framework allows the development of root process drivers and services that
+  a user process client can interact with in a secure manner. The QDI framework does
+  this by elevating the privilege of the user process thread, temporarily allowing
+  the thread to execute in root context and letting it fall back to user context once
+  the QDI invocation is finished.
+
+  The QuRT callback framework provides a safe mechanism for root process drivers
+  to execute callback functions in a user process. The framework hosts
+  dedicated worker threads in corresponding processes that handle the execution
+  of the callback function. This ensures that the callbacks occur in the context of
+  the appropriate process thread, as a result maintaining privilege boundaries.
+
+  Prerequisites for use of this framework are:
+  1. Driver is a QDI driver and the client communicates with drivers using QDI
+     invocations.
+  2. Appropriate callback configuration is specified in cust_config.xml for
+     the user process that intends to use this framework.
+
+  qurt_cb_data_t is the public data structure that allows a client to store all
+  the required information about the callback, including the callback function
+  and the arguments to pass to this function when it executes.
+  The client uses the QDI interface to register this structure with the root driver.
+
+  The callback framework provides the following APIs that a root driver can use to
+  invoke a callback. These functions are described in the qurt_qdi_driver.h header file.
+
+  qurt_qdi_cb_invoke_async() triggers an asynchronous callback wherein the
+  invoking thread does not wait for the callback to finish executing.
+
+  qurt_qdi_cb_invoke_sync() triggers a synchronous callback. Upon invocation
+  the invoking thread is suspended until the callback function finishes execution.
+
+  qurt_qdi_cb_invoke_sync_with_data() invokes a synchronous callback similar to
+  qurt_qdi_cb_invoke_sync(). It allows the user to pass large data along with
+  the callback invocation, to be utilized during the callback execution.
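+
+  As a rough client-side registration sketch (illustrative only -- the driver
+  handle, QDI method ID, and callback function below are hypothetical, not
+  part of this header):
+
+      qurt_cb_data_t cb_data;
+      qurt_cb_data_init(&cb_data);
+      qurt_cb_data_set_cbfunc(&cb_data, (void *)my_event_cb);
+      qurt_cb_data_set_cbarg(&cb_data, 0x1234);
+      // Hand the registration data to the root driver over QDI:
+      qurt_qdi_handle_invoke(driver_handle, MY_DRIVER_REGISTER_CB, &cb_data);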
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_qdi.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int qurt_cb_result_t;
+
+/* Callback framework error codes.
+   The callback framework returns a nonzero value if a callback invocation is
+   unsuccessful. The following macros highlight the cause of failure in more detail.
+*/
+#define QURT_CB_ERROR            -1 /* Callback registration failed.\n*/
+#define QURT_CB_OK                0 /* Success.\n*/
+#define QURT_CB_MALLOC_FAILED    -2 /* QuRTOS malloc failure.\n*/
+#define QURT_CB_WAIT_CANCEL      -3 /* Process exit cancelled wait operation.\n*/
+#define QURT_CB_CONFIG_NOT_FOUND -4 /* Callback configuration for process was not found.\n*/
+#define QURT_CB_QUEUE_FULL       -5 /* Callback queue is serving at maximum capacity.*/
+/** @addtogroup cb_types
+@{ */
+/** Callback registration data structure.
+    This data structure is used by a client attempting to register a callback with a QDI driver.
+    It holds the address of the callback function and the argument supplied to the callback
+    function when it executes.
+*/
+typedef struct {
+   /** @cond */
+   void* cb_func;   /*< Pointer to the callback function. */
+   unsigned cb_arg; /*< Not interpreted by the framework.*/
+   /** @endcond */
+} qurt_cb_data_t;
+
+/** @cond */
+/* Defines used as default if cust_config does not specify them. */
+#define CALLBACK_WORKER_STACK_SIZE 0x2000
+/** @endcond */
+/** @} */ /* end_addtogroup cb_types */
+/**@ingroup func_qurt_cb_data_init
+   Initializes the callback data structure.
+   An entity registering a callback with the root process driver must call this function
+   to initialize the callback registration data structure to the default value.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data  Pointer to the callback data structure.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_init (qurt_cb_data_t* cb_data){
+   cb_data->cb_func = NULL;
+   cb_data->cb_arg = 0;
+}
+
+/**@ingroup func_qurt_cb_data_set_cbfunc
+   Sets up the callback function in the callback registration data structure.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data  Pointer to the callback data structure.
+   @param[in] cb_func  Pointer to the callback function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_set_cbfunc (qurt_cb_data_t* cb_data, void* cb_func){
+   cb_data->cb_func = cb_func;
+}
+
+/**@ingroup func_qurt_cb_data_set_cbarg
+   Sets up the callback argument.
+   This function sets up the argument passed to the callback function when it executes.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data  Pointer to the callback data structure.
+   @param[in] cb_arg   Argument for the callback function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_set_cbarg (qurt_cb_data_t* cb_data, unsigned cb_arg){
+   cb_data->cb_arg = cb_arg;
+}
+
+/** @cond */
+/**@ingroup driver_support_functions
+   Invokes an asynchronous callback for a specified process.
+   A driver that resides in the root process calls this API to launch a callback in
+   a process described by the client_handle.
+   After the callback is invoked, the framework queues the callback as per its
+   priority and subsequently executes it.
+   The caller of this function is not suspended during the callback execution period.
+   The API returns immediately with a success/failure error code.
+
+   @note1hang This function is only accessible to drivers in the root process.
+              User process invocations shall fail with a negative error code return value.
+
+   @param client_handle  Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data        Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio           Priority at which the callback should execute.
+                         This parameter is optional. If -1 is passed, the callback framework
+                         executes the callback at the priority of the API caller.
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_async(int client_handle,
+                                          qurt_cb_data_t* cb_data,
+                                          int prio);
+
+
+/**@ingroup driver_support_functions
+   Invokes a synchronous callback for a specified process.
+   A driver that resides in a root process calls this API to launch a sync callback in
+   a process described by the client_handle.
+   After the callback is invoked, the framework queues the callback as per its
+   priority and subsequently executes it.
+   The caller of this function is suspended during the callback execution period.
+   If the process in which to execute the callback exits or terminates, the caller is
+   woken up with error code #QURT_CB_WAIT_CANCEL (refer to qurt_callback.h).
+
+   @note1hang This function is only accessible to drivers in the root process.
+              User process invocations shall fail with a negative error code return value.
+
+   @param client_handle  Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data        Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio           Priority at which the callback should execute.
+                         This parameter is optional. If -1 is passed, the callback framework
+                         executes the callback at the priority of the API caller.
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_sync(int client_handle,
+                                         qurt_cb_data_t* cb_data,
+                                         int prio);
+
+/**@ingroup driver_support_functions
+   Invokes a synchronous callback for a specified process, passing driver data to the user PD.
+   This function is similar to qurt_qdi_cb_invoke_sync() and allows the driver to pass arbitrary data to
+   the user process as part of the callback invocation.
+
+   @param client_handle  Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data        Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio           Priority at which the callback should execute.
+                         This parameter is optional. If -1 is passed, the callback framework
+                         executes the callback at the priority of the API caller.
+   @param data           Driver arbitrary data to pass to the user process. Memory pointed to by data
+                         must be accessible to the user PD. The root driver can allocate such memory by
+                         using qurt_mem_mmap().
+   @param data_len       Driver arbitrary data length.
+
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
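+
+   @par Example
+   A driver-side sketch (illustrative only; the handle, registration data, and
+   buffer below are hypothetical):
+   @code
+   // Runs cb_data->cb_func in the client process and blocks until it
+   // returns, passing a shared buffer of driver data along with it.
+   qurt_cb_result_t rc = qurt_qdi_cb_invoke_sync_with_data(client_handle,
+                                                           &cb_data, -1,
+                                                           shared_buf, 256U);
+   if (rc != QURT_CB_OK) {
+       // callback could not be delivered to the framework
+   }
+   @endcode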
+ */ +qurt_cb_result_t qurt_qdi_cb_invoke_sync_with_data( int client_handle, + qurt_cb_data_t* cb_data, + int prio, + void *data, + unsigned data_len + ); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_clade.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_clade.h new file mode 100755 index 0000000000000..d7442cf98dd94 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_clade.h @@ -0,0 +1,62 @@ +#ifndef QURT_CLADE_H +#define QURT_CLADE_H +/** + @file qurt_clade.h + @brief Prototypes of Cache Line Accelerated Decompression Engine (CLADE) API. + CLADE is a cache line level memory compression system that is used to + decrease DRAM usage. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_clade2_get + Reads the value of the clade2 register. + + @param[in] offset Offset from the clade2 cfg base. + @param[out] *value Pointer to the register value read from the offset. + + @return + #QURT_EOK - Successfully read the value from the register at offset \n + #QURT_EINVALID - Offset passed is incorrect + + @dependencies + None. + */ +int qurt_clade2_get(unsigned short offset, unsigned int *value); + +/**@ingroup func_qurt_clade2_set + Sets the PMU register; only PMU_SEL register can be set. + + @param[in] offset Offset from the QURTK_clade2_cfg_base. + @param[in] value Value to set at offset. + + @return + #QURT_EOK -- Successfully set the value at offset. \n + #QURT_ENOTALLOWED -- Set operation performed at an offset other than CLADE2_PMU_SELECTION_REG. + + @dependencies + None. + */ +int qurt_clade2_set(unsigned short offset, unsigned int value); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_CLADE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_cond.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_cond.h new file mode 100755 index 0000000000000..6e65ed82a8393 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_cond.h @@ -0,0 +1,219 @@ +#ifndef QURT_COND_H +#define QURT_COND_H +/** + @file qurt_cond.h + @brief Prototypes of kernel condition variable object API functions. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 Qualcomm Technologies, Inc. + All rights reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup condition_variables_types +@{ */ +/*===================================================================== + Typedefs + ======================================================================*/ + +/** QuRT condition variable type. 
*/
+typedef union {
+   /** @cond */
+   unsigned long long raw;
+   struct {
+      unsigned int count;
+      unsigned int n_waiting;
+      unsigned int queue;
+      unsigned int reserved;
+   }X;
+   /** @endcond */
+} qurt_cond_t;
+
+/** @} */ /* end_addtogroup condition_variables_types */
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_init
+   Initializes a condition variable object.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[out] cond  Pointer to the initialized condition variable object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+/* ======================================================================*/
+void qurt_cond_init(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_destroy
+   Destroys the specified condition variable.
+
+   @note1hang Conditions must be destroyed when they are no longer in use. Failure to do
+              this causes resource leaks in the QuRT kernel.\n
+   @note1cont Conditions must not be destroyed while they are still in use. If this occurs,
+              the behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[in] cond  Pointer to the condition variable object to destroy.
+
+   @return
+   None.
+
+ */
+/* ======================================================================*/
+void qurt_cond_destroy(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_signal
+   Signals a waiting thread that the specified condition is true. \n
+
+   When a thread wishes to signal that a condition is true on a shared data item, it must
+   perform the following procedure: \n
+   -# Lock the mutex that controls access to the data item. \n
+   -# Perform the signal condition operation. \n
+   -# Unlock the mutex.
+
+   @note1hang Failure to properly lock and unlock a mutex of a condition variable can cause
+              the threads to never be suspended (or suspended but never awakened).
+
+   @note1cont Use condition variables only with regular mutexes -- attempting to use
+              recursive mutexes or priority inheritance mutexes results in undefined behavior.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[in] cond  Pointer to the condition variable object to signal.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+/* ======================================================================*/
+void qurt_cond_signal(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_broadcast
+   Signals multiple waiting threads that the specified condition is true.\n
+   When a thread wishes to broadcast that a condition is true on a shared data item, it must
+   perform the following procedure: \n
+   -# Lock the mutex that controls access to the data item. \n
+   -# Perform the broadcast condition operation. \n
+   -# Unlock the mutex.\n
+
+   @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause
+              the threads to never be suspended (or suspended but never awakened).
+
+   @note1cont Use condition variables only with regular mutexes -- attempting to use
+              recursive mutexes or priority inheritance mutexes results in undefined behavior.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[in] cond  Pointer to the condition variable object to signal.
+
+   @return
+   None.
+
+   @dependencies
+   None.
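+
+   @par Example
+   The canonical signal/wait pattern (a sketch; the mutex, condition variable,
+   and predicate names are hypothetical):
+   @code
+   // Producer: publish the state change, then wake all waiters.
+   qurt_mutex_lock(&mtx);
+   data_ready = 1;
+   qurt_cond_broadcast(&cond);
+   qurt_mutex_unlock(&mtx);
+
+   // Consumer: always re-check the predicate in a loop around the wait.
+   qurt_mutex_lock(&mtx);
+   while (!data_ready) {
+       qurt_cond_wait(&cond, &mtx);
+   }
+   qurt_mutex_unlock(&mtx);
+   @endcode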
+ */ +/* ======================================================================*/ +void qurt_cond_broadcast(qurt_cond_t *cond); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_wait + Suspends the current thread until the specified condition is true. + When a thread wishes to wait for a specific condition on a shared data item, it must + perform the following procedure: \n + -# Lock the mutex that controls access to the data item. \n + -# If the condition is not satisfied, perform the wait condition operation on the + condition variable (suspends the thread and unlocks the mutex). + + @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause + the threads to never be suspended (or suspended but never awakened). + + @note1cont Use condition variables only with regular mutexes -- attempting to use + recursive mutexes or priority inheritance mutexes results in undefined behavior. + + @datatypes + #qurt_cond_t \n + #qurt_mutex_t + + @param[in] cond Pointer to the condition variable object to wait on. + @param[in] mutex Pointer to the mutex associated with condition variable to wait on. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_cond_wait(qurt_cond_t *cond, qurt_mutex_t *mutex); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_wait2 + Suspends the current thread until the specified condition is true. + When a thread wishes to wait for a specific condition on a shared data item, it must + perform the following procedure: \n + -# Lock the mutex that controls access to the data item. \n + -# If the condition is not satisfied, perform the wait condition operation on the + condition variable, which suspends the thread and unlocks the mutex. + + @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause + the threads to never be suspended (or suspended but never awakened). + + @note1cont Use condition variables only with regular mutexes -- attempting to use + recursive mutexes or priority inheritance mutexes results in undefined behavior. + + @note1cont This is the same API as qurt_cond_wait(), use this version + when using mutexes of type #qurt_rmutex2_t. + + @datatypes + #qurt_cond_t \n + #qurt_rmutex2_t + + @param[in] cond Pointer to the condition variable object to wait on. + @param[in] mutex Pointer to the mutex associated with the condition variable to wait on. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_cond_wait2(qurt_cond_t *cond, qurt_rmutex2_t *mutex); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_COND_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_consts.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_consts.h new file mode 100755 index 0000000000000..b1e35998e73b6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_consts.h @@ -0,0 +1,315 @@ +#ifndef QURT_CONSTS_H +#define QURT_CONSTS_H + +/** + @file qurt_consts.h + @brief QuRT constants and definitions + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2013-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Constants and macros
+ ======================================================================*/
+
+/* Definitions of system events. System events suspend
+   a thread and put it into suspending_list.
+   The system event number is saved in the CONTEXT::error::cause field
+   of the suspended thread. An event handler thread such as a
+   page fault handler or system error handler can wake up the suspended
+   thread.
+ */
+#define QURT_EVENT_PAGEFAULT    0x1 /* Page fault event. */
+#define QURT_EVENT_SYSTEM_ERR   0x2 /* System error event. */
+#define QURT_EVENT_SUSPEND      0x3
+#define QURT_EVENT_PROCESS_EXIT 0x4 /* Process termination event. */
+
+#define QURT_SYSENV_MAX_THREADS_TYPE         1  /* Maximum threads object. */
+#define QURT_SYSENV_PROCNAME_TYPE            2  /* Process name object. */
+#define QURT_SYSENV_MAX_PI_PRIO_TYPE         3  /* Maximum pi priority object. */
+#define QURT_SYSENV_ARCH_REV_TYPE            4  /* Architecture version object. */
+#define QURT_SYSENV_APP_HEAP_TYPE            5  /* Application heap object. */
+#define QURT_SYSENV_REGION_ATTR_DEFAULT      7  /* Default region attributes. */
+#define QURT_SYSENV_STACK_PROFILE_COUNT_TYPE 8  /* Stack profile count type. */
+#define QURT_SYSENV_ISLAND_CONFIG_TYPE       9  /* Island configuration check. */
+#define QURT_SYSENV_HTHREADS_TYPE            10 /* Active threads object. */
+#define QURT_SYSENV_CONFIG_IMAGE_START_LO    11 /* Config image start address for DTB parsing. */
+#define QURT_SYSENV_CONFIG_IMAGE_START_HI    12 /* Config image start address for DTB parsing. */
+#define QURT_SYSENV_CHIPPARAMS_LO            13 /* ChipParams for DTB parsing. */
+#define QURT_SYSENV_CHIPPARAMS_HI            14 /* ChipParams for DTB parsing. */
+#define QURT_SYSENV_PLATPARAMS               15 /* Platformparams for DTB parsing. */
+#define QURT_SYSENV_CONFIG_IMAGE_SIZE        16 /* Config image size for DTB parsing. */
+#define QURT_SYSENV_L2_CACHE_LINE_SIZE       17 /* L2 cache line size. */
+
+/* Get Q6 regs */
+#define QURT_GET_SSR     1
+#define QURT_GET_CCR     2
+#define QURT_GET_CFGBASE 3
+#define QURT_GET_SYSCFG  4
+#define QURT_GET_REV     5
+
+
+/** @cond rest_reg_dist */
+/** @addtogroup performance_monitor_macros
+@{ */
+
+/* PMU */
+#define QURT_PMUCNT0   0 /**< */
+#define QURT_PMUCNT1   1 /**< */
+#define QURT_PMUCNT2   2 /**< */
+#define QURT_PMUCNT3   3 /**< */
+#define QURT_PMUCFG    4 /**< */
+#define QURT_PMUEVTCFG 5 /**< */
+
+/* New since V55 */
+#define QURT_PMUCNT4    6  /**< */
+#define QURT_PMUCNT5    7  /**< */
+#define QURT_PMUCNT6    8  /**< */
+#define QURT_PMUCNT7    9  /**< */
+#define QURT_PMUEVTCFG1 10 /**< */
+
+/* New since V61 */
+#define QURT_PMUSTID0 11 /**< */
+#define QURT_PMUSTID1 12 /**< */
+
+#define QURT_PMUCNTSTID0 13 /**< */
+#define QURT_PMUCNTSTID1 14 /**< */
+#define QURT_PMUCNTSTID2 15 /**< */
+#define QURT_PMUCNTSTID3 16 /**< */
+#define QURT_PMUCNTSTID4 17 /**< */
+#define QURT_PMUCNTSTID5 18 /**< */
+#define QURT_PMUCNTSTID6 19 /**< */
+#define QURT_PMUCNTSTID7 20 /**< */
+
+/** @} */ /* end_addtogroup performance_monitor_macros */
+/** @endcond */
+
+/*
+   Power collapse operation
+*/
+#define QURT_POWER_SHUTDOWN                 0 /**< */
+#define QURT_TCXO_SHUTDOWN                  1 /**< */
+#define QURT_POWER_CMD_PREPARE              0 /**< */
+#define QURT_POWER_CMD_PERFORM              1 /**< */
+#define QURT_POWER_CMD_EXIT                 2 /**< */
+#define QURT_POWER_CMD_FAIL_EXIT            3 /**< */
+#define QURT_POWER_CMD_PERFORM_L2_RETENTION 4 /**< */
+#define QURT_POWER_CMD_PERFORM_SAVE_TCM     5 /**< */
+#define QURT_POWER_CMD_DEEP_SLEEP           6 /**< */
+
+
+/**
@addtogroup thread_macros +@{ */ +#define QURT_MAX_HTHREAD_LIMIT 8U /**< Limit on the maximum number of hardware threads supported by QuRT for any + Hexagon version. Use this definition to define arrays, and so on, in + target independent code. */ +/** @} */ /* end_addtogroup thread_macros */ + +/** @cond internal_only */ +/** @addtogroup power_management_macros +@{ */ +/** + L2 cache retention mode +*/ +#define QURT_POWER_SHUTDOWN_TYPE_L2NORET QURT_POWER_CMD_PERFORM /**< */ +#define QURT_POWER_SHUTDOWN_TYPE_L2RET QURT_POWER_CMD_PERFORM_L2_RETENTION /**< */ +#define QURT_POWER_SHUTDOWN_TYPE_SAVETCM QURT_POWER_CMD_PERFORM_SAVE_TCM /**< */ +/** @} */ /* end_addtogroup power_management_macros */ +/** @endcond */ + +/* + QURT_system_state + Use for debugging the shutdown/startup process. + + State transition for cold boot: + QURT_BOOT_SETUP_ISDB --> QURT_CBOOT_BSP_INIT --> + QURT_CBOOT_END_CLEAN_INIT --> QURT_CBOOT_END_OS_INIT --> + QURT_CBOOT_KERNEL_INIT_DONE --> QURT_CBOOT_PLAT_CONFIG_DONE --> + QURT_CBOOT_ROOT_TASK_STARTED + + State transition for power collapse: + QURT_PREPARE_SINGLE_MODE --> QURT_PERFORM_IPEND --> + QURT_PERFORM_SAVE_TLB --> QURT_PERFORM_SWITCH_PC --> + cache flush states (dependent on L2 retention config) + + State transition for warm boot: + QURT_BOOT_SETUP_ISDB --> QURT_WBOOT_INIT_TLB --> + QURT_WBOOT_SET_1TO1_MAP --> QURT_WBOOT_REMOVE_1TO1_MAP --> + QURT_CBOOT_END_CLEAN_INIT --> QURT_CBOOT_END_OS_INIT +*/ +#define QURT_PREPARE_SINGLE_MODE 1 /**< */ +#define QURT_PREPARE_END 2 /**< */ +#define QURT_PERFORM_IPEND 3 /**< */ +#define QURT_PERFORM_SAVE_ISDP 4 /**< */ +#define QURT_PERFORM_SAVE_PMU 5 /**< */ +#define QURT_PERFORM_SAVE_TLB 6 /**< */ +#define QURT_PERFORM_SWITCH_PC 7 /**< */ +#define QURT_PERFORM_EXIT 8 /**< */ +#define QURT_FLUSH_L1CACHE 9 /**< */ +#define QURT_FLUSH_L2CACHE 0xA /**< */ +#define QURT_FLUSH_CACHE_DONE 0xB /**< */ +#define QURT_SWITCH_PC_DONE 0xC /**< */ +#define QURT_BOOT_SETUP_ISDB 0xD /**< */ +#define QURT_WBOOT_INIT_TLB 0xE /**< */ +#define QURT_WBOOT_SET_1TO1_MAP 0xF /**< */ +#define QURT_WBOOT_CFG_ADV_SYSCFG 0x10 /**< */ +#define QURT_WBOOT_REMOVE_1TO1_MAP 0x11 /**< */ +#define QURT_CBOOT_BSP_INIT 0x12 /**< */ +#define QURT_CBOOT_END_CLEAN_L1CACHE 0x13 /**< */ +#define QURT_CBOOT_END_CLEAN_INIT 0x14 /**< */ +#define QURT_CBOOT_END_OS_INIT 0x15 /**< */ +#define QURT_CBOOT_TLB_DUMP_LOAD 0x16 /**< */ +#define QURT_CBOOT_TLB_STATIC_LOAD 0x17 /**< */ +#define QURT_CBOOT_KERNEL_INIT_DONE 0x18 /**< */ +#define QURT_CBOOT_PLAT_CONFIG_DONE 0x19 /**< */ +#define QURT_CBOOT_ROOT_TASK_STARTED 0x1A /**< */ +#define QURT_IMPRECISE_EXCEPTION 0x1B /**< */ +#define QURT_WBOOT_DEBUG_L2_START 0x1C /**< */ +#define QURT_WBOOT_DEBUG_L2_END 0x1D /**< */ +#define QURT_NMI_SAVE_L2VIC_COMPLETE 0x1E /**< */ +#define QURT_NMI_HANDLER_COMPLETE 0x1F /**< */ +#define QURT_NMI_AFTER_SAVE_GLOBAL 0x20 /**< */ +#define QURT_WBOOT_START 0x21 /**< */ +#define QURT_ENTER_ISLAND 0x22 /**< */ +#define QURT_EXIT_ISLAND 0x23 /**< */ +#define QURT_LOAD_NOTIFIER_TCB 0x24 /**< */ +#define QURT_ABNORMAL_RESET 0x25 /**< */ +/* + Thread attributes +*/ + +#define QURT_THREAD_ATTR_GP 0x00000002 /*< */ +#define QURT_THREAD_ATTR_UGP 0x00000003 /*< User general pointer (UGP)*/ +#define QURT_THREAD_ATTR_PREFETCH 0x00000004 /*< */ +#define QURT_THREAD_ATTR_TID 0x00000005 /*< */ +#define QURT_THREAD_ATTR_CACHE_PART 0x00000007 /*< */ +#define QURT_THREAD_ATTR_COPROCESSOR 0x00000008 /*< */ +#define QURT_THREAD_ATTR_GET_L2CACHE_PART 0x00000009 /*< */ +#define QURT_THREAD_ATTR_SET_FRML 
0x0000000A /*< */ +#define QURT_THREAD_ATTR_STID_GET 0x0000000B /*< */ +#define QURT_THREAD_ATTR_STID_SET 0x0000000C /*< */ +#define QURT_THREAD_ATTR_AUTOSTACK 0x0000000D /*< */ +#define QURT_THREAD_ATTR_SYSTEM_THREAD 0x0000000E /*< */ +#define QURT_THREAD_ATTR_STID_SET2 0x0000000F /*< */ +#define QURT_THREAD_ATTR_STID_SET2_ACKNOWLEDGE 0x00000010 /*< */ +#define QURT_THREAD_ATTR_STID_GET2 0x00000011 /*< */ + +/** Cache operations*/ +#define QURT_DCCLEAN 0U /* Clean Dcache. */ +#define QURT_DCINV 1U /* Invalidate Dcache. */ +#define QURT_DCCLEANINV 2U /* Clean and invalidate Dcache. */ +#define QURT_ICINV 3U /* Invalidate Icache. */ +#define QURT_DUMP_DCTAGS 4U /* For testing purpose. */ +#define QURT_FLUSH_ALL 5U /* Flush entire L1 and L2 cache. */ +#define QURT_TABLE_FLUSH 6U /* Flush based on table of physical pages */ +#define QURT_CLEAN_INVALIDATE_ALL 7U /* Flush and invalidate entire L1 and L2 cache. */ +#define QURT_L2CACHE_LOCK_LINES 8U /* l2 cache lock lines */ +#define QURT_L2CACHE_UNLOCK_LINES 9U /* l2 cache unlock lines */ +#define QURT_CLEAN 10U /* Flush L1 and L2 cache */ +#define QURT_CLEAN_INVALIDATE 11U /* Flush and invalidate L1 and L2 cache. */ +#define QURT_CLEAN_INVALIDATE_L2 12U /* Flush and invalidate entire L2 cache. */ + +/**@ingroup chapter_prefined_symbols */ +/**@xreflabel{hdr:QURT_API_VERSION}*/ + + +/* Process state. */ +#define QURT_UPDATE_PROCESS_STATE 0 /**< */ +#define QURT_MP_INIT 1 /*< */ +#define QURT_MP_RUNNING 2 /*< */ +#define QURT_MP_STOPPED 3 /*< */ + +/* QuRT reset reason. */ +#define QURT_NORMAL_BOOT 0 /* Normal boot. */ +#define QURT_WARM_BOOT 1 /* Power collapse warm boot. */ +#define QURT_WARM_BOOT_L2_RETENTION 2 /* Power collapse with L2 retention warm boot. */ +#define QURT_WARM_BOOT_SAVE_TCM 3 /* Power collapse with saving TCM. */ +#define QURT_QUICK_BOOT 4 /* Deep sleep. */ + +/* QuRT Wait for Idle command */ +#define QURT_WAIT_FOR_IDLE_DISABLE 0 /*< */ +#define QURT_WAIT_FOR_IDLE_ENABLE 1 /*< */ +#define QURT_WAIT_FOR_IDLE 2 /*< */ +#define QURT_WAIT_FOR_IDLE_CANCEL 3 /*< */ + +/*QuRT island exit stages */ +#define QURT_ISLAND_EXIT_STAGE1 1 /*< */ +#define QURT_ISLAND_EXIT_STAGE2 2 /*< */ + +#define QURT_MAX_NAME_LEN 64 /*< */ + +#define MAX_POOL_RANGES 16 /*< */ + +/* key definitions for debug thread info */ +//#define MAX_TCB_KEY 40 //whatever is a good number or makes debug thread structure be 1K +#define KEY_SCHDULER_STATE 1 /*< */ +#define KEY_PRIORITY 2 /*< */ +#define KEY_PRIORITY_ORIG 3 /*< */ +#define KEY_STACK_BOTTOM 4 // Currently not populated +#define KEY_STACK_TOP 5 // Currently not populated +#define KEY_HVX_STATE 6 /*< */ +#define KEY_FUTEX_OBJECT 7 /*< */ +#define KEY_THREAD_ID 8 /*< */ +#define KEY_PROFILE_CYCLE_LO 9 // Currently not populated +#define KEY_PROFILE_CYCLE_HI 10 // Currently not populated +#define KEY_ERROR_ADDRESS 11 // This holds the BADVA +#define KEY_ERROR_CAUSE 12 // This is the same as QURT_error_info.cause +#define KEY_ERROR_CAUSE2 13 // This is the same as QURT_error_info.cause2 +#define KEY_ERROR_SSR 14 /*< Holds the SSR value */ +#define QURT_RESERVED -1 + +/* VTLB method IDs. 
*/ +#define QURT_VTLB_ENTRY_CREATE 0U +#define QURT_VTLB_ENTRY_DELETE 1U +#define QURT_VTLB_ENTRY_READ 2U +#define QURT_VTLB_ENTRY_WRITE 3U +#define QURT_VTLB_ENTRY_PROBE 4U +#define QURT_VTLB_ENTRY_SPLIT 5U +#define QURT_VTLB_ENTRY_MERGE 6U +#define QURT_VTLB_ENTRY_STATISTICS 7U +#define QURT_VTLB_ENTRY_SET_SPECIAL 8U +#define QURT_VTLB_QUEUE_PPAGE 9U +#define QURT_VTLB_RECLAIM_STACK_PAGES 10U +#define QURT_VTLB_ASID_SET_STATE_FAST 11U +#define QURT_VTLB_ASID_SET_STATE 12U +#define QURT_VTLB_ENTRY_SET_EXTENSION 13U +#define QURT_VTLB_ENTRY_CLEAR_EXTENSION 14U + +/* VTCM window access control HWIO programming. */ +#define QURT_VTCM_WINDOW_ENABLE 1U +#define QURT_VTCM_WINDOW_DISABLE 0U +#define QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT 0xFFFU +#define QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT 0U + +/** @cond */ +/* ETM source - PC or data access */ +#define QURT_ETM_SOURCE_PC 0U /**< Memory source of SAC* is PC. */ +#define QURT_ETM_SOURCE_DATA 1U /**< Memory source of SAC* is data. */ + +/* ETM PID status flags */ +#define QURT_ETM_NO_PID 0xFFFFFFFF /**< No PID is selected. */ +/** @endcond */ + +/* execution context */ +#define QURT_CTX_USER 1 +#define QURT_CTX_GUEST 2 + +/* Profiling STID */ +#define QURT_STID_DEFAULT 0U + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_CONSTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_cycles.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_cycles.h new file mode 100755 index 0000000000000..b599493f5d563 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_cycles.h @@ -0,0 +1,301 @@ + +#ifndef QURT_CYCLES_H +#define QURT_CYCLES_H 1 +/** + @file qurt_cycles.h + Prototypes of kernel pcycle API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + /*===================================================================== + Functions + ======================================================================*/ + +/*======================================================================*/ + +/**@ingroup func_qurt_profile_reset_idle_pcycles + @xreflabel{hdr:qurt_profile_reset_idle_pcycles} + Sets the per-hardware-thread idle cycle counts to zero. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_profile_reset_idle_pcycles (void); + +/*======================================================================*/ +/**@ingroup func_qurt_profile_get_thread_pcycles + @xreflabel{hdr:qurt_profile_get_thread_pcycles} + Gets the count of the running processor cycles for the current thread.\n + Returns the current running processor cycle count for the current QuRT thread. + + @note1hang Profiling shall be enabled first to start the cycle counting. + The cycles are accumulated once the profiling is enabled and + resets on #qurt_profile_reset_threadid_pcycles + + @return + Integer -- Running processor cycle count for current thread. + + @dependencies + None. 
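+
+   @par Example
+   A profiling sketch (illustrative only; do_work() is a hypothetical
+   workload):
+   @code
+   qurt_profile_enable(1);            // start cycle counting
+
+   do_work();
+
+   unsigned long long cycles = qurt_profile_get_thread_pcycles();
+   qurt_profile_enable(0);            // stop cycle counting
+   @endcode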
+*/
+/* ======================================================================*/
+unsigned long long int qurt_profile_get_thread_pcycles(void);
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_core_pcycles
+   @xreflabel{hdr:qurt_get_core_pcycles}
+   Gets the count of core processor cycles executed.\n
+   Returns the current number of running processor cycles executed since the Hexagon
+   processor was last reset.
+
+   This value is based on the hardware core clock, which varies in speed according to the
+   processor clock frequency.
+
+   @note1hang Because the hardware core clock stops running when the processor shuts
+              down (due to all of the hardware threads being idle), treat the cycle values returned
+              by this operation as relative rather than absolute.
+
+   @note1cont Thread cycle counts are valid only in the V4 Hexagon processor version.
+
+   @return
+   Integer -- Current count of core processor cycles.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+unsigned long long int qurt_get_core_pcycles(void);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_idle_pcycles
+
+   @deprecated Use #qurt_profile_get_idle_pcycles2 instead.
+
+   Gets the current idle processor cycle counts for a maximum of 6 hardware threads. Use
+   #qurt_profile_get_idle_pcycles2 for reading pcycles without a limitation on the maximum
+   number of hardware threads.
+
+   This operation accepts a pointer to a user-defined array, and writes to the array the current
+   idle cycle count for each hardware thread.
+
+   Each count value represents the number of processor cycles that have elapsed on the
+   corresponding hardware thread while that thread has been in Wait mode.\n
+
+
+   @note1hang This operation does not return the idle cycles that occur when the Hexagon
+              processor shuts down (due to all of the hardware threads being idle).
+              Idle cycle counts are accumulated irrespective of whether profiling is enabled,
+              and reset on #qurt_profile_reset_idle_pcycles.
+
+   @param[out] pcycles  User array where the function stores the current idle cycle count values.
+                        Array size should be a minimum of the number of hardware threads intended.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_get_idle_pcycles (unsigned long long *pcycles);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_idle_pcycles2
+   Gets the current idle processor cycle counts for the maximum available hardware threads.
+
+   This operation accepts a pointer to a user-defined array with length in bytes, and writes
+   to the array the current idle cycle count for each hardware thread.
+
+   Each count value represents the number of processor cycles that have elapsed on the
+   corresponding hardware thread while that thread has been in Wait mode.\n
+
+   @note1hang This operation does not return the idle cycles that occur when the Hexagon
+              processor shuts down (due to all of the hardware threads being idle).
+              Idle cycle counts are accumulated irrespective of whether profiling is enabled,
+              and reset on #qurt_profile_reset_idle_pcycles.
+
+   @param[out] pcycles  User array where the function stores the current idle cycle count values.
+                        Array size should be equivalent to the number of hardware threads intended.
+ Call #qurt_sysenv_get_max_hw_threads to determine the array size required. + + @param[in] length_in_bytes Length of pcycles array in bytes. If the array size is smaller + than the required for the maximum available hardware threads, + it returns error code. + + @return + #QURT_EOK -- Successful operation. Stored all the data to the destination array + #QURT_EFAILED -- Operation failed due to smaller #pcycles array + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_profile_get_idle_pcycles2 (unsigned long long *pcycles, unsigned int length_in_bytes); + +/*======================================================================*/ +/**@ingroup func_qurt_profile_get_threadid_pcycles + + @deprecated use #qurt_profile_get_threadid_pcycles2 instead + + Gets the current per-hardware-thread running cycle counts for the specified QuRT + thread for a maximum of 6 hardware threads. + + Each count value represents the number of processor cycles that have elapsed on the + corresponding hardware thread while that thread has been scheduled for the specified + QuRT thread. + + @note1hang Profiling shall be enabled first to start the cycle counting. + The cycles are accumulated once the profiling is enabled and + resets on #qurt_profile_reset_threadid_pcycles + + @param[in] thread_id Valid thread identifier. + @param[out] pcycles Pointer to a user array where the function stores the current running + cycle count values. Array size should be a minimum of the number of + hardware threads intended. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_profile_get_threadid_pcycles (int thread_id, unsigned long long *pcycles); + +/*======================================================================*/ +/**@ingroup func_qurt_profile_get_threadid_pcycles2 + + Gets the current per-hardware-thread running cycle counts for the specified QuRT + thread for maximum available hardware threads. + + Each count value represents the number of processor cycles that have elapsed on the + corresponding hardware thread while that thread has been scheduled for the specified + QuRT thread. + + @note1hang Profiling shall be enabled first to start the cycle counting. + The cycles are accumulated once the profiling is enabled and + resets on #qurt_profile_reset_threadid_pcycles + + @param[in] thread_id Thread identifier. + @param[out] pcycles Pointer to a user array where the function stores the current running + cycle count values. Array size should be equivalent to the number of + hardware threads intended. + Call #qurt_sysenv_get_max_hw_threads to determine the array size required. + @param[in] length_in_bytes Length of pcycles array in bytes. If the array size is smaller + than the required for the maximum available hardware threads, it + returns error code. + + @return + #QURT_EOK -- Successful operation. Stored all the data to the destination array + #QURT_EFAILED -- Operation failed due to smaller #pcycles array + #QURT_ENOTHREAD -- Operation failed due to invalid #thread_id + + @dependencies + None. 
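+
+   @par Example
+   A sizing sketch (illustrative only), using #QURT_MAX_HTHREAD_LIMIT from
+   qurt_consts.h as a safe upper bound; thread_id is hypothetical:
+   @code
+   unsigned long long pcycles[QURT_MAX_HTHREAD_LIMIT];
+
+   int rc = qurt_profile_get_threadid_pcycles2(thread_id, pcycles,
+                                               sizeof(pcycles));
+   if (rc != QURT_EOK) {
+       // buffer too small, or thread_id is invalid
+   }
+   @endcode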
+*/
+/* ======================================================================*/
+int qurt_profile_get_threadid_pcycles2 (int thread_id, unsigned long long *pcycles, unsigned int length_in_bytes);
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_reset_threadid_pcycles
+   @xreflabel{hdr:qurt_profile_reset_threadid_pcycles}
+   Sets the per-hardware-thread running cycle counts to zero for the specified QuRT thread.
+
+   @param[in] thread_id Thread identifier.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_reset_threadid_pcycles (int thread_id);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_enable
+   @xreflabel{hdr:qurt_profile_enable}
+   Enables profiling.\n
+   Enables or disables cycle counting of the running and idle processor cycles.
+   Profiling is disabled by default. \n
+
+   @note1hang Enabling profiling does not automatically reset the cycle counts -- this must be
+              done explicitly by calling the reset operations before starting cycle counting.
+              Cycle counting starts the instant profiling is enabled with this API, and
+              halts when profiling is disabled.
+
+   @param[in] enable Profiling. Values: \n
+                     - 0 -- Disable profiling \n
+                     - 1 -- Enable profiling @tablebulletend
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_enable (int enable);
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_hthread_pcycles
+   @xreflabel{hdr:qurt_get_hthread_pcycles}
+   Reads the GCYCLE_nT register to allow performance measurement when N threads are in run mode.\n
+
+   @note1hang Returns 0 when the architecture is earlier than v67 or for an invalid hardware thread ID.
+
+   @param[in] n Threads in run mode. Valid values are 1 through the number of hardware
+                threads on the target.
+
+   @return
+   Value read from the GCYCLE_nT register. This value indicates the total number of pcycles executed
+   from reset to the current point of execution when n threads are in run mode.
+
+   @dependencies
+   PMU must be enabled.
+*/
+/* ======================================================================*/
+unsigned int qurt_get_hthread_pcycles(int n);
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_hthread_commits
+   @xreflabel{hdr:qurt_get_hthread_commits}
+   Reads the GCOMMIT_nT register to allow performance measurement when N threads are in run mode.\n
+
+   @note1hang Returns 0 when the architecture is earlier than v67 or for an invalid hardware thread ID.
+
+   @param[in] n Threads in run mode. Valid values are 1 through the number of hardware
+                threads on the target.
+
+   @return
+   Value read from the GCOMMIT_nT register. This value indicates the total number of packets
+   committed from reset to the current point of execution when n threads are in run mode.
+
+   @dependencies
+   PMU must be enabled.
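+
+   A small sketch of how the two per-run-mode counters can be combined
+   (illustrative only; assumes a v67+ target with the PMU already enabled):
+   @code
+   // Packets per cycle while exactly two threads were in run mode.
+   unsigned int cycles_2t  = qurt_get_hthread_pcycles(2);
+   unsigned int commits_2t = qurt_get_hthread_commits(2);
+   double ppc_2t = (cycles_2t != 0U) ? (double)commits_2t / (double)cycles_2t : 0.0;
+   @endcode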
+*/
+/* ======================================================================*/
+unsigned int qurt_get_hthread_commits(int n);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_devtree.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_devtree.h
new file mode 100755
index 0000000000000..4adee45bb44a2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_devtree.h
@@ -0,0 +1,161 @@
+#ifndef QURT_DEVTREE_H
+#define QURT_DEVTREE_H
+/**
+  @file qurt_devtree.h
+  @brief Prototypes and structures for device tree aware QuRT library functions.
+
+Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+*/
+/* qurt_callback is included by qurt_qdi_driver.h and depends on NULL being defined.
+   The callback is not used here, so define NULL here to avoid pulling in extra headers. */
+#ifndef NULL
+#define NULL ((void *) 0)
+#endif
+
+#include "libfdt.h"
+#include "DTBExtnLib.h"
+#include "qurt_qdi_ext.h"
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define INVALID_BLOB_ID (-1)
+#define DEFAULT_BLOB_ID 0
+
+/** QuRT device tree mapping macros. */
+#define QURT_DT_MAPPING_FAILED (-1)
+#define QURT_DT_FLAG_ISLAND 0x1
+#define QURT_DT_FLAG_PHYSADDR 0x2
+
+/** Device Tree type for the Root PD Device Tree.
+    The Root PD Device Tree typically describes the hardware in the subsystem.
+    This is the /soc portion of the Device Tree. */
+#define QURT_DT_BLOB_TYPE_ROOT 0
+
+/** Device Tree type for the Local Device Tree.
+    The Local Device Tree typically contains the software settings.
+    This is the /sw portion of the Device Tree. */
+#define QURT_DT_BLOB_TYPE_LOCAL 1
+
+int qurt_devtree_init(void);
+
+/**@ingroup func_qurt_dt_mapping_create
+   Creates a memory mapping from the specified property of the specified device
+   tree node. Returns virtual addresses and sizes.
+
+   @param[in]  devtreeNode Device tree node handle.
+   @param[in]  flags       Flags to configure the memory mapping.
+   @param[in]  regionName  Name of the region property to use for the mapping; should
+                           resemble a region. NULL selects by index instead.
+   @param[in]  regionIdx   Index of the region to map when regionName is NULL.
+   @param[out] vaddr       Return pointer for the virtual region address.
+   @param[out] size        Return pointer for the virtual region size.
+
+   @return
+   Result code indicating success or failure. \n
+*/
+int qurt_dt_mapping_create(fdt_node_handle *devtreeNode, int flags, char *regionName, int regionIdx,
+                           unsigned long long *vaddr, unsigned long long *size);
+
+/**@ingroup func_qurt_dt_mapping_create2
+
+   Creates a memory mapping from the specified property of the specified device
+   tree node.
+
+   Returns virtual addresses and sizes according to the architecture (that is, either 32-bit or 64-bit).
+
+   @param[in] devtreeNode Device tree node.
+
+   @param[in] dt_map_flags Flags to configure the memory mapping; reserved for future use.
+                           (0) - Default value; assumes the details from the DT node are a physical address and size.
+                           QURT_DT_FLAG_ISLAND
+
+                           NOTE: The PA must be added to the corresponding island spec to create an island mapping.
+
+   @param[in] regionName  NULL, or the name of the range to return; should
+                          resemble a region. Example: reg-names = "base", "rx", "tx";
+
+   @param[in] regionIdx   Index of the range to return. Example: reg = <0x1000 0x20>, <0x10000 0x100>, <0x18000 0x100>;
+
+                          NOTE: If the client specifies both regionName and regionIdx, regionName
+                          takes precedence and regionIdx is ignored.
+
+   @param[in] dt_map_perm Mapping access permissions (R/W):
+                          QURT_PERM_READ
+                          QURT_PERM_WRITE
+
+   @param[in] cache_attr  QuRT cache mode types:
+                          QURT_MEM_CACHE_DEVICE
+                          QURT_MEM_CACHE_WRITEBACK
+                          Other cache type enums from qurt_types.h can also be passed.
+
+                          NOTE: There is no default value for the cache and permission attributes.
+                          The client must always pass one of the defined flags.
+
+   @param[out] vaddr Return pointer to the variable that holds the virtual address.
+   @param[out] size  Return pointer for the virtual region size.
+
+   @return
+   #QURT_EOK               Success; the mapping was created properly.
+   #QURT_DT_MAPPING_FAILED Failed to create the mapping.
+   #QURT_EINVALID          Mismatch in the architecture.
+
+   Otherwise, an FdtLib or third-party error code.
+
+*/
+int qurt_dt_mapping_create2(fdt_node_handle *devtreeNode, unsigned int dt_map_flags,
+        char *regionName, int regionIdx, unsigned int dt_map_perm, int cache_attr, void **vaddr, size_t *size);
+
+/**@ingroup func_qurt_dt_isr_register
+   Device tree aware registration of an interrupt service routine (ISR) to an ISR thread.
+   The interrupt defined in the specified device tree node is enabled when this function returns success.
+
+   @datatypes
+   #qurt_thread_t \n
+   #fdt_node_handle
+
+   @param[in] dt_node        Device tree node that specifies the interrupt property.
+   @param[in] dt_int_index   Index of the specific interrupt to use within the device tree node structure.
+                             Specify either this or dt_int_name; use -1 if the name string is used.
+   @param[in] dt_int_name    Name of the specific interrupt to use within the device tree node structure.
+                             Specify either this or dt_int_index; use NULL if the index is used.
+   @param[in] isr_thread_id  ISR thread ID, returned from qurt_isr_create(), defined by qurt_isr_register2().
+   @param[in] prio           Priority of the ISR, defined by qurt_isr_register2().
+   @param[in] flags          Defines the ACK type. Values: \n
+                             #QURT_INT_NON_DELAYED_ACK - ISR is acknowledged by the interrupt handle routine
+                             in the kernel.
+                             #QURT_INT_DELAYED_ACK - Client chooses to acknowledge.
+                             Defined by qurt_isr_register2().
+   @param[in] isr            ISR with prototype void isr (void *arg, int int_num), defined by qurt_isr_register2().
+   @param[in] arg            First argument of the ISR when it is called to service the interrupt, defined by qurt_isr_register2().
+
+   @return
+   #QURT_EOK -- Successfully registered the ISR for the interrupt \n
+   #QURT_EINT -- Interrupt not configured \n
+   #QURT_EINVALID -- Invalid thread ID \n
+   #QURT_EDISABLED -- The feature is disabled \n
+   #QURT_EDUPLICATE -- Interrupt is already registered
+
+   @dependencies
+   Create the thread ID with qurt_isr_create().
+   Complete the ISR registration with qurt_isr_register2().
+ */
+int qurt_dt_isr_register(fdt_node_handle *dt_node, int dt_int_index, char * dt_int_name, qurt_thread_t isr_thread_id,
+                         unsigned short prio, unsigned short flags, void (*isr) (void *, int), void *arg);
+
+/**@ingroup func_qurt_dt_blob_id_get
+   Returns the blob ID for the blob type passed.
+   The value returned from this API can be passed as the blob ID parameter to DTBExtnLib APIs.
+
+   @param[in] blob_type Blob type to look up.
+
+   @return
+   Blob ID for the passed blob type.
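+
+   Illustrative sketch (not part of the SDK header):
+   @code
+   int blob_id = qurt_dt_blob_id_get(QURT_DT_BLOB_TYPE_LOCAL);
+   if (blob_id != INVALID_BLOB_ID) {
+       // blob_id can now be passed as the blob ID parameter to DTBExtnLib APIs.
+   }
+   @endcode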
+*/ +int qurt_dt_blob_id_get(unsigned int blob_type); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_ecc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_ecc.h new file mode 100755 index 0000000000000..09312684e99af --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_ecc.h @@ -0,0 +1,168 @@ +#ifndef QURT_ECC_H +#define QURT_ECC_H + + +/*===================================================================== + + @file qurt_ecc.h + @brief Prototypes of QuRT memory ECC API functions + + Copyright (c) 2018, 2020-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup exception_handling_types +@{ */ +// ECC memory definition +typedef enum { + QURT_ECC_MEM_L1_ICACHE = 0, /**< ECC memory L1 ICache. */ + QURT_ECC_MEM_L1_DCACHE = 1, /**< ECC memory L1 DCache.*/ + QURT_ECC_MEM_L2_CACHE = 2, /**< ECC memory L2 Cache.*/ + QURT_ECC_MEM_VTCM = 3 /**< ECC memory VTCM.*/ +} qurt_ecc_memory_t; +/** @} */ /* end_addtogroup exception_handling_types */ + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/** @addtogroup exception_handling_macros +@{ */ + +#define QURT_ECC_ERR_DETECTED_STATUS 0 /**< ECC error detected. */ +#define QURT_ECC_ERR_TYPE 1 /**< ECC error type.*/ +// ECC status type + +#define QURT_ECC_CORRECTABLE_COUNT (1<<0) /**< ECC correctable count.*/ +#define QURT_ECC_UNCORRECTABLE_COUNT (1<<1) /**< ECC uncorrectable count.*/ +#define QURT_ECC_REGION_LOGGING (1<<2) /**< ECC region logging.*/ +// ECC enable/disable definition + +#define QURT_ECC_PROTECTION_DISABLE (0<<0) /**< Bit 0. */ +#define QURT_ECC_PROTECTION_ENABLE (1<<0) /**< Bit 0. */ +/** @} */ /* end_addtogroup exception_handling_macros */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + + +/**@ingroup func_qurt_ecc_enable + Enables or disables ECC protection on a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values: + - #QURT_ECC_MEM_L1_ICACHE + - #QURT_ECC_MEM_L1_DCACHE + - #QURT_ECC_MEM_L2_CACHE + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] enable Set to one of the following values: + - #QURT_ECC_PROTECTION_ENABLE + - #QURT_ECC_PROTECTION_DISABLE @tablebulletend + + @return + - #QURT_EOK -- ECC enabling or disabling setup is performed successfully + - Others -- Failure + + @dependencies + None. + */ +int qurt_ecc_enable( qurt_ecc_memory_t memory, unsigned int enable ); + + +/**@ingroup func_qurt_ecc_get_error_status + Gets ECC error status for a specified memory. 
+ + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following: + - #QURT_ECC_MEM_L1_ICACHE + - #QURT_ECC_MEM_L1_DCACHE + - #QURT_ECC_MEM_L2_CACHE + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] type Set to one of the following: + - #QURT_ECC_ERR_DETECTED_STATUS + - #QURT_ECC_ERR_TYPE @tablebulletend + + @return + Returns the following when the type is #QURT_ECC_ERR_DETECTED_STATUS: + - 0 -- No error detected \n + - 1 -- At least one error detected \n + Returns the following when the type is #QURT_ECC_ERR_TYPE: \n + - 0 through 1 -- Correctable error \n + - 2 -- Uncorrectable error + + @dependencies + None. + */ +int qurt_ecc_get_error_status( qurt_ecc_memory_t memory, unsigned int type ); + + +/**@ingroup func_qurt_ecc_get_error_count + Gets the ECC error count for a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values:\n + - #QURT_ECC_MEM_L1_ICACHE \n + - #QURT_ECC_MEM_L1_DCACHE \n + - #QURT_ECC_MEM_L2_CACHE \n + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] type Set to one of the following values: \n + - #QURT_ECC_CORRECTABLE_COUNT \n + - #QURT_ECC_UNCORRECTABLE_COUNT @tablebulletend + + @return + Error count for the specified error type. + + @dependencies + None. + */ +int qurt_ecc_get_error_count( qurt_ecc_memory_t memory, unsigned int type ); + + +/**@ingroup func_qurt_ecc_clear_error_count + Clears ECC error count or region logging for a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values: \n + - #QURT_ECC_MEM_L1_ICACHE \n + - #QURT_ECC_MEM_L1_DCACHE \n + - #QURT_ECC_MEM_L2_CACHE \n + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] type Set to one or multiple OR'ed of the following values: \n + - #QURT_ECC_CORRECTABLE_COUNT \n + - #QURT_ECC_UNCORRECTABLE_COUNT \n + - #QURT_ECC_REGION_LOGGING @tablebulletend + + @return + #QURT_EOK -- Error count successfully cleared \n + Others -- Failure at clearing the error count + + @dependencies + None. + */ +int qurt_ecc_clear_error_count( qurt_ecc_memory_t memory, unsigned int type ); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ECC_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_error.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_error.h new file mode 100755 index 0000000000000..f4666b396c378 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_error.h @@ -0,0 +1,149 @@ +#ifndef QURT_ERROR_H +#define QURT_ERROR_H + +/** + @file qurt_error.h + Error results- QURT defines a set of standard symbols for the error result values. This file lists the + symbols and their corresponding values. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021-2022 , 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc.. + ======================================================================*/ +#include "qurt_except.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup chapter_error +@{ */ + +/*===================================================================== +Constants and macros +======================================================================*/ +#define QURT_EOK 0 /**< Operation successfully performed. */ +#define QURT_EVAL 1 /**< Wrong values for the parameters. The specified page does not exist. 
*/ +#define QURT_EMEM 2 /**< Not enough memory to perform the operation.*/ + +#define QURT_EINVALID 4 /**< Invalid argument value; invalid key. */ +/** @cond */ +#define QURT_EUNKNOWN 6 /**< Defined but never used in QuRT. */ +#define QURT_ENOMSGS 7 /**< Message queue is empty. */ +#define QURT_EBADF 9 /**< Bad message queue descriptor. */ +/** @endcond */ +#define QURT_EFAILED 12 /**< Operation failed. */ + +#define QURT_ENOTALLOWED 13 /**< Operation not allowed. */ + +/** @cond */ +#define QURT_EDUPCLSID 14 /*< Duplicate class ID. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_ENOREGISTERED 20 /**< No registered interrupts.*/ +/** @endcond */ + + +/** @cond */ +#define QURT_EISDB 21 /*< Power collapse failed due to ISDB being enabled. */ +#define QURT_ESTM 22 /*< Power collapse failed in a Single-threaded mode check. */ +/** @endcond */ + + +/** @cond rest_reg_dist */ +#define QURT_ETLSAVAIL 23 /**< No free TLS key is available. */ +#define QURT_ETLSENTRY 24 /**< TLS key is not already free. */ +/** @endcond */ + +#define QURT_EINT 26 /**< Invalid interrupt number (not registered). */ +/** @cond rest_reg_dist */ +#define QURT_ESIG 27 /**< Invalid signal bitmask (cannot set more than one signal at a time). */ +/** @endcond */ + +/** @cond */ +#define QURT_EHEAP 28 /**< No heap space is available. */ +#define QURT_ENOSPC 28 /**< No space to create another queue in the system. */ +#define QURT_EMEMMAP 29 /**< Physical address layout is not supported by the kernel. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_ENOTHREAD 30 /**< Thread no longer exists. */ +/** @endcond */ +/** @cond */ +#define QURT_EL2CACHE 31 /**< L2cachable is not supported in kernel invalidate/cleaninv. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_EALIGN 32 /**< Not aligned. */ +#define QURT_EDEREGISTERED 33 /**< Interrupt is already deregistered.*/ +/** @endcond */ + +/** @cond internal_only */ + +#define QURT_ETLBCREATESIZE 34 /**< TLB create error -- Incorrect size.*/ +#define QURT_ETLBCREATEUNALIGNED 35 /**< TLB create error -- Unaligned address.*/ +/** @endcond */ +/** @cond rest_reg_dist*/ +#define QURT_EEXISTS 35 /**< File or message queue already exists. */ +#define QURT_ENAMETOOLONG 36 /**< Name too long for message queue creation. */ +#define QURT_EPRIVILEGE 36 /**< Caller does not have privilege for this operation.*/ + +#define QURT_ECANCEL 37 /**< A cancellable request was canceled because the associated process was asked to exit.*/ +/** @endcond */ + +/** @cond */ +#define QURT_EISLANDTRAP 38 /*< Unsupported TRAP is called in Island mode.*/ + +#define QURT_ERMUTEXUNLOCKNONHOLDER 39 /*< Rmutex unlock by a non-holder.*/ +#define QURT_ERMUTEXUNLOCKFATAL 40 /*< Rmutex unlock error, all except the non-holder error.*/ +#define QURT_EMUTEXUNLOCKNONHOLDER 41 /*< Mutex unlock by a non-holder.*/ +#define QURT_EMUTEXUNLOCKFATAL 42 /*< Mutex unlock error, all except the non-holder error.*/ +#define QURT_EINVALIDPOWERCOLLAPSE 43 /*< Invalid power collapse mode requested. */ +/** @endcond */ +#define QURT_EISLANDUSEREXIT 44 /**< User call has resulted in island exit.*/ +#define QURT_ENOISLANDENTRY 45 /**< Island mode had not yet been entered.*/ +#define QURT_EISLANDINVALIDINT 46 /**< Exited Island mode due to an invalid island interrupt.*/ +/** @cond rest_reg_dist */ +#define QURT_ETIMEDOUT 47 /**< Operation timed-out. */ +#define QURT_EALREADY 48 /**< Operation already in progress. */ +/** @endcond */ + +#define QURT_ERETRY 49 /*< Retry the operation. 
*/ +#define QURT_EDISABLED 50 /*< Resource disabled. */ +#define QURT_EDUPLICATE 51 /*< Duplicate resource. */ +#define QURT_EBADR 53 /*< Invalid request descriptor. */ +#define QURT_ETLB 54 /*< Exceeded maximum allowed TLBs. */ +#define QURT_ENOTSUPPORTED 55 /*< Operation not supported. */ +/** @cond rest_reg_dist */ +#define QURT_ENORESOURCE 56 /**< No resource. */ +/** @endcond */ + +#define QURT_EDTINIT 57 /**< Problem with device tree intialization. */ +#define QURT_EBUFLOCK 58 /*< Buffer lock failed because it was already locked many times. */ +#define QURT_ELOCKED 59 /**< Current operation failed as the buffer is locked. */ +#define QURT_EMSGSIZE 90 /*< Message queue msg_len is greater than mq_msgsize attribute of the message queue. */ + + +#define QURT_ENOTCONFIGURED 91 /*< Interrupt is NOT configured. */ + +#define QURT_EBANDWIDTHLIMIT 92 /*< Message queue send exceed the bandwidth limit. */ + +#define QURT_ECFIVIOLATION 93 /*< CFI violation detected. */ + +#define QURT_EDESTROY 94 /**< A destroy request was made to waiting threads.*/ + +#define QURT_EHMXNOTAVAIL 95 /**< HMX is not available to target thread.*/ +#define QURT_EHMXNOTDETACHABLE 96 /**< HMX is not detachable from target thread.*/ + +#define QURT_EFATAL -1 /**< Fatal error. */ + +/** @} */ /* end_addtogroup chapter_error */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ERROR_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_event.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_event.h new file mode 100755 index 0000000000000..987f0fe79f227 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_event.h @@ -0,0 +1,452 @@ +#ifndef QURT_EVENT_H +#define QURT_EVENT_H +/** + @file qurt_event.h + @brief Prototypes of kernel event API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2018-2021, 2023 Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include "qurt_consts.h" +#include "qurt_thread.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * System environment object type. + */ +/**@addtogroup sys_env_types +@{ */ +/** QuRT swap pool information type. */ +typedef struct qurt_sysenv_swap_pools { + /** @cond */ + unsigned int spoolsize; /* Swap pool size.*/ + unsigned int spooladdr; /* Swap pool start address.*/ + /** @endcond */ +}qurt_sysenv_swap_pools_t; + +/**QuRT application heap information type. */ +typedef struct qurt_sysenv_app_heap { + /** @cond */ + unsigned int heap_base; /* Heap base address.*/ + unsigned int heap_limit; /* Heap end address.*/ + /** @endcond */ +} qurt_sysenv_app_heap_t ; + +/** QuRT architecture version information type. */ +typedef struct qurt_sysenv_arch_version { + /** @cond */ + unsigned int arch_version; /*Architecture version.*/ + /** @endcond */ +}qurt_arch_version_t; + +/** QuRT maximum hardware threads information type. */ +typedef struct qurt_sysenv_max_hthreads { + /** @cond */ + unsigned int max_hthreads; /*Maximum number of hardware threads.*/ + /** @endcond */ +}qurt_sysenv_max_hthreads_t; + +/** QuRT active hardware threads information type. 
*/ +typedef struct qurt_sysenv_hthreads { + /** @cond */ + unsigned int hthreads; /*Maximum number of hardware threads.*/ + /** @endcond */ +}qurt_sysenv_hthreads_t; + +/** QuRT maximum pi priority information type. */ +typedef struct qurt_sysenv_max_pi_prio { + /** @cond */ + unsigned int max_pi_prio; /*Maximum pi priority.*/ + /** @endcond */ +}qurt_sysenv_max_pi_prio_t; + +/** QuRT process name information type. */ +typedef struct qurt_sysenv_procname { + /** @cond */ + union { + unsigned int asid; /*Address space ID.*/ + unsigned int pid; /*Process ID.*/ + }; + char name[QURT_MAX_NAME_LEN]; /* Process name.*/ + /** @endcond */ +}qurt_sysenv_procname_t; + +/** QuRT stack profile count information type. */ +typedef struct qurt_sysenv_stack_profile_count { + /** @cond */ + unsigned int count; /*Stack profile count for usage.*/ + unsigned int count_watermark; /*Stack profile count for watermark.*/ + /** @endcond */ +}qurt_sysenv_stack_profile_count_t; + +/** + QuRT system error event type. + */ +typedef struct _qurt_sysevent_error_t +{ + unsigned int thread_id; /**< Thread ID. */ + unsigned int fault_pc; /**< Fault PC. */ + unsigned int sp; /**< Stack pointer. */ + unsigned int badva; /**< Virtual data address where the exception occurred. */ + unsigned int cause; /**< QuRT error result. */ + unsigned int ssr; /**< Supervisor status register. */ + unsigned int fp; /**< Frame pointer. */ + unsigned int lr; /**< Link register. */ + unsigned int pid; /**< PID of the process to which this thread belongs.*/ + } qurt_sysevent_error_t ; + +typedef struct _qurt_sysevent_error_1_t +{ + unsigned int thread_id; /**< Thread ID. */ + unsigned int fault_pc; /**< Fault PC. */ + unsigned int sp; /**< Stack pointer. */ + unsigned int badva; /**< Virtual data address where the exception occurred. */ + unsigned int cause; /**< QuRT error result. */ + unsigned int ssr; /**< Supervisor status register. */ + unsigned int fp; /**< Frame pointer. */ + unsigned int lr; /**< Link register. */ + unsigned int pid; /**< PID of the process to which this thread belongs.*/ + unsigned int fkey; /**< Framekey.*/ + unsigned int reserved1; /**< Reserved.*/ + unsigned int reserved2; /**< Reserved.*/ + unsigned int reserved3; /**< Reserved.*/ + } qurt_sysevent_error_1_t ; + +/** QuRT page fault error event information type. */ +typedef struct qurt_sysevent_pagefault { + qurt_thread_t thread_id; /**< Thread ID of the page fault thread. */ + unsigned int fault_addr; /**< Accessed address that caused the page fault. */ + unsigned int ssr_cause; /**< SSR cause code for the page fault. */ +} qurt_sysevent_pagefault_t ; +/** @} */ /* @endaddtogroup sys_env_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/*======================================================================*/ +/** + Gets the environment swap pool 0 information from the kernel. + + @datatypes + #qurt_sysenv_swap_pools_t + + @param[out] pools Pointer to the pools information. + + @return + #QURT_EOK -- Success. + + @dependencies + None. +*/ +int qurt_sysenv_get_swap_spool0 (qurt_sysenv_swap_pools_t *pools ); + +/* + Gets the environment swap pool 1 information from the kernel. + + @datatypes + #qurt_sysenv_swap_pools_t + + @param[out] pools Pointer to the pools information. + + @return + #QURT_EOK -- Success. + + @dependencies + None. 
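+
+   A minimal call sketch (illustrative only; not part of the SDK header):
+
+     qurt_sysenv_swap_pools_t pool;
+     if (qurt_sysenv_get_swap_spool1(&pool) == QURT_EOK) {
+         // pool.spooladdr and pool.spoolsize describe swap pool 1.
+     }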
+*/ +int qurt_sysenv_get_swap_spool1(qurt_sysenv_swap_pools_t *pools ); + +/**@ingroup func_qurt_sysenv_get_app_heap + Gets information on the program heap from the kernel. + + @datatypes + #qurt_sysenv_app_heap_t + + @param[out] aheap Pointer to information on the program heap. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_app_heap(qurt_sysenv_app_heap_t *aheap ); + +/**@ingroup func_qurt_sysenv_get_arch_version + Gets the Hexagon processor architecture version from the kernel. + + @datatypes + #qurt_arch_version_t + + @param[out] vers Pointer to the Hexagon processor architecture version. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter + + @dependencies + None. +*/ +int qurt_sysenv_get_arch_version(qurt_arch_version_t *vers); + +/**@ingroup func_qurt_sysenv_get_max_hw_threads + Gets the maximum number of hardware threads supported in the Hexagon processor. + The API includes the disabled hardware threads to reflect the maximum + hardware thread count. + For example, if the image is configured for four hardware threads and hthread_mask is set to 0x5 in + cust_config.xml, only HW0 and HW2 are initialized by QuRT. + HW1 and HW3 are not used at all. Under such a scenario, + qurt_sysenv_get_max_hw_threads() still returns four. + + @datatypes + #qurt_sysenv_max_hthreads_t + + @param[out] mhwt Pointer to the maximum number of hardware threads supported in the Hexagon processor. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_max_hw_threads(qurt_sysenv_max_hthreads_t *mhwt ); + +/**@ingroup func_qurt_sysenv_get_hw_threads + Gets the number of hardware threads initialized by QuRT in Hexagon processor. + For example, if the image is configured for four hardware threads and hthread_mask is set to 0x5 in + cust_config.xml, QuRT only initializes HW0 and HW2. + HW1 and HW3 are not used. In this scenario, qurt_sysenv_get_hw_threads() returns 2. + + @datatypes + #qurt_sysenv_hthreads_t + + @param[out] mhwt Pointer to the number of hardware threads active in the Hexagon processor. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_hw_threads(qurt_sysenv_hthreads_t *mhwt ); + +/**@ingroup func_qurt_sysenv_get_max_pi_prio + Gets the maximum priority inheritance mutex priority from the kernel. + + @datatypes + #qurt_sysenv_max_pi_prio_t + + @param[out] mpip Pointer to the maximum priority inheritance mutex priority. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_max_pi_prio(qurt_sysenv_max_pi_prio_t *mpip ); + +/**@ingroup func_qurt_sysenv_get_process_name2 + Gets information on the system environment process names based on the client_handle argument. + + @datatypes + #qurt_sysenv_procname_t + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[out] pname Pointer to information on the process names in the system. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_process_name2(int client_handle, qurt_sysenv_procname_t *pname ); + +/**@ingroup func_qurt_sysenv_get_process_name + Gets information on the system environment process names from the kernel. 
+ + @datatypes + #qurt_sysenv_procname_t + + @param[out] pname Pointer to information on the process names in the system. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_process_name(qurt_sysenv_procname_t *pname ); + +/**@ingroup func_qurt_sysenv_get_stack_profile_count + Gets information on the stack profile count from the kernel. + + @datatypes + #qurt_sysenv_stack_profile_count_t + + @param[out] count Pointer to information on the stack profile count. + + @return + #QURT_EOK -- Success. + + @dependencies + None. +*/ +int qurt_sysenv_get_stack_profile_count(qurt_sysenv_stack_profile_count_t *count ); + +/**@ingroup func_qurt_exception_wait + Registers the program exception handler. + This function assigns the current thread as the QuRT program exception handler and suspends the + thread until a program exception occurs. + + When a program exception occurs, the thread is awakened with error information + assigned to the parameters of this operation. + + @note1hang If no program exception handler is registered, or if the registered handler + calls exit, QuRT raises a kernel exception. + If a thread runs in Supervisor mode, any errors are treated as kernel + exceptions. + + @param[out] ip Pointer to the instruction memory address where the exception occurred. + @param[out] sp Stack pointer. + @param[out] badva Pointer to the virtual data address where the exception occurred. + @param[out] cause Pointer to the QuRT error result code. + + @return + Registry status: \n + Thread identifier -- Handler successfully registered. \n + #QURT_EFATAL -- Registration failed. + + @dependencies + None. +*/ +unsigned int qurt_exception_wait (unsigned int *ip, unsigned int *sp, + unsigned int *badva, unsigned int *cause); + +unsigned int qurt_exception_wait_ext (qurt_sysevent_error_t * sys_err); + +/**@ingroup func_qurt_exception_wait3 + Registers the current thread as the QuRT program exception handler, and suspends the thread until a + program exception occurs. + When a program exception occurs, the thread is awakened with error information assigned to the specified + error event record. + If a program exception is raised when no handler is registered (or when a handler is registered, but it calls + exit), the exception is treated as fatal.\n + @note1hang If a thread runs in Monitor mode, all exceptions are treated as kernel exceptions.\n + @note1cont This function differs from qurt_exception_wait() by returning the error information in a data + structure rather than as individual variables. It also returns additional information (for example, SSR, FP, and LR). + + @param[out] sys_err Pointer to the qurt_sysevent_error_1_t type structure. + @param[in] sys_err_size Size of the qurt_sysevent_error_1_t structure. + + @return + Registry status: \n + - #QURT_EFATAL -- Failure. \n + - Thread ID -- Success. + + @dependencies + None. +*/ + +unsigned int qurt_exception_wait3(void * sys_err, unsigned int sys_err_size); + +/**@ingroup func_qurt_exception_raise_nonfatal + Raises a nonfatal program exception in the QuRT program system. + + For more information on program exceptions, see Section @xref{dox:exception_handling}. + + This operation never returns -- the program exception handler is assumed to perform all + exception handling before terminating or reloading the QuRT program system. + + @note1hang The C library function abort() calls this operation to indicate software + errors. 
+
+   @param[in] error QuRT error result code (Section @xref{dox:error_results}).
+
+   @return
+   Integer -- Unused.
+
+   @dependencies
+   None.
+*/
+int qurt_exception_raise_nonfatal (int error) __attribute__((noreturn));
+
+
+/**@ingroup func_qurt_exception_raise_fatal
+   Raises a fatal program exception in the QuRT system.
+
+   Fatal program exceptions terminate the execution of the QuRT system without invoking
+   the program exception handler.
+
+   For more information on fatal program exceptions, see Section @xref{dox:exception_handling}.
+
+   This operation always returns, so the calling program can perform the necessary shutdown
+   operations (data logging, and so on).
+
+   @note1hang Context switches do not work after this operation has been called.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_exception_raise_fatal (void);
+
+unsigned int qurt_enable_floating_point_exception(unsigned int mask);
+
+/**@ingroup func_qurt_exception_enable_fp_exceptions
+   Enables the specified floating point exceptions as QuRT program exceptions.
+
+   The exceptions are enabled by setting the corresponding bits in the Hexagon
+   control user status register (USR).
+
+   The mask argument specifies a mask value identifying the individual floating
+   point exceptions to set. The exceptions are represented as defined symbols
+   that map into bits 0 through 31 of the 32-bit flag value.
+   Multiple floating point exceptions are specified by OR'ing together the individual
+   exception symbols.\n
+   @note1hang This function must be called before performing any floating point operations.
+
+   @param[in] mask Floating point exception types. Values: \n
+                   - #QURT_FP_EXCEPTION_ALL \n
+                   - #QURT_FP_EXCEPTION_INEXACT \n
+                   - #QURT_FP_EXCEPTION_UNDERFLOW \n
+                   - #QURT_FP_EXCEPTION_OVERFLOW \n
+                   - #QURT_FP_EXCEPTION_DIVIDE0 \n
+                   - #QURT_FP_EXCEPTION_INVALID @tablebulletend
+
+   @return
+   Updated contents of the USR.
+
+   @dependencies
+   None.
+*/
+
+static inline unsigned int qurt_exception_enable_fp_exceptions(unsigned int mask)
+{
+    return qurt_enable_floating_point_exception(mask);
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_EVENT_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_except.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_except.h
new file mode 100755
index 0000000000000..e1684c80e3d50
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_except.h
@@ -0,0 +1,185 @@
+#ifndef QURT_EXCEPT_H
+#define QURT_EXCEPT_H
+
+/**
+  @file qurt_except.h
+  @brief Defines Cause and Cause2 codes for error-handling.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+  ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+   QuRT supports error handling to handle CPU-detected exceptions and software errors.
+   QuRT treats all errors as either fatal errors or nonfatal errors.
+
+   @section sec1 Fatal errors
+   All supervisor mode exceptions are treated as fatal errors.
+   If a registered exception handler calls qurt_exit(), it is treated as a fatal error.
+   Fatal errors result in saving the context of the primary hardware thread to QURT_error_info and the rest of the thread contexts to the corresponding TCBs.
+   All hardware threads are eventually stopped and the cache is flushed.
+   The NMI exception is treated a little differently from other fatal errors: QuRT saves the contexts of all the hardware threads into QURT_error_info.\n
+
+   @subsection subsection1 Debugging fatal errors
+   - QURT_error_info.status.status -- Indicates that an error occurred.
+   - QURT_error_info.status.cause -- Cause code for the fatal error; Cause and Cause2 details are listed below.
+   - QURT_error_info.status.cause2 -- Cause2 code for the fatal error; Cause and Cause2 details are listed below.
+   - QURT_error_info.status.fatal -- Indicates whether a fatal error occurred. A user error can result in a fatal error if the exception handler is not registered.
+   - QURT_error_info.status.hw_tnum -- Indicates the index of QURT_error_info.locregs[], where the context is saved when the error is a fatal error.
+   - QURT_error_info.global_regs -- Contains the values of the global registers of Q6.
+   - QURT_error_info.local_regs[QURT_error_info.status.hw_tnum] -- Provides the CPU context when the error is a supervisor error.
+
+
+
+   @subsection subsection2 Debugging nonfatal errors
+   - QURT_error_info.user_errors -- All user errors are logged here.
+   - QURT_error_info.user_errors.counter -- Index of the last logged error.
+   - QURT_error_info.user_errors.entry[0...counter] -- Structure for a logged error.
+   - QURT_error_info.user_errors.entry[0...counter].error_tcb -- TCB for the user error.
+   - QURT_error_info.user_errors.entry[0...counter].error_tcb.error -- Information about the error; Cause, Cause2, Badva, and hardware thread ID.
+   - QURT_error_info.user_errors.entry[0...counter].error_code -- ((cause2 << 8) bitwise-OR (cause)); Cause and Cause2 details are listed below.
+   - QURT_error_info.user_errors.entry[0...counter].hw_thread -- Hardware thread ID for the error.
+   - QURT_error_info.user_errors.entry[0...counter].pcycle -- Pcycle for the error.
+
+@note
+   Important usage note:
+   Cause and Cause2 are error codes that distinguish multiple errors.
+   SSR and BADVA are inconclusive without the vector number.
+   Cause and Cause2 can each range from 1 to 255, and every cause can have 1 to 255 error codes.
+   Hence the system can have up to 255 * 255 unique error codes.
+   The combination is represented as ((cause2 << 8) bitwise-OR (cause)).
+   Some Cause2 codes are statically defined, whereas others are obtained from the SSR[7:0] cause codes, depending on the cause code.
+   SSR cause codes are defined in the Hexagon reference manual.
+   All possible combinations are listed below.
+*/
+/** @addtogroup chapter_error
+@{ */
+/* cause - error type - 8-bits*/
+#define QURT_EXCEPT_PRECISE             0x01U   /**< Precise exception occurred. For this cause code, Cause2 is SSR[7:0].*/
+#define QURT_EXCEPT_NMI                 0x02U   /**< NMI occurred; Cause2 is not defined. */
+#define QURT_EXCEPT_TLBMISS             0x03U   /**< TLBMISS RW occurred; for this cause code, Cause2 is SSR[7:0]. */
+#define QURT_EXCEPT_RSVD_VECTOR         0x04U   /**< Interrupt raised on a reserved vector, which must never occur. Cause2 is not defined. */
+#define QURT_EXCEPT_ASSERT              0x05U   /**< Kernel assert. The Cause2 QURT_ABORT_* values are listed below. */
+#define QURT_EXCEPT_BADTRAP             0x06U   /**< trap0(num) called with unsupported num. Cause2 is 0. */
+#define QURT_EXCEPT_UNDEF_TRAP1         0x07U   /**< Trap1 is not supported. Using Trap1 causes this error. Cause2 is not defined. */
+#define QURT_EXCEPT_EXIT                0x08U   /**< Application called qurt_exit() or qurt_exception_raise_nonfatal(). Can be called from the C library.
Cause2 is "[Argument passed to qurt_exception_raise_nonfatal() & 0xFF]". */ +#define QURT_EXCEPT_TLBMISS_X 0x0AU /**< TLBMISS X (execution) occurred. Cause2 is not defined. */ +#define QURT_EXCEPT_STOPPED 0x0BU /**< Running thread stopped due to fatal error on other hardware thread. Cause2 is not defined. */ +#define QURT_EXCEPT_FATAL_EXIT 0x0CU /**< Application called qurt_fatal_exit(). Cause2 is not defined. */ +#define QURT_EXCEPT_INVALID_INT 0x0DU /**< Kernel received an invalid L1 interrupt. Cause2 is not defined. */ +#define QURT_EXCEPT_FLOATING_POINT 0x0EU /**< Kernel received an floating point error. Cause2 is not defined. */ +#define QURT_EXCEPT_DBG_SINGLE_STEP 0x0FU /**< Cause2 is not defined. */ +#define QURT_EXCEPT_TLBMISS_RW_ISLAND 0x10U /**< Read write miss in Island mode. Cause2 QURT_TLB_MISS_RW_MEM* are listed below. */ +#define QURT_EXCEPT_TLBMISS_X_ISLAND 0x11U /**< Execute miss in Island mode. For this cause code, Cause2 is SSR[7:0]. */ +#define QURT_EXCEPT_SYNTHETIC_FAULT 0x12U /**< Synthetic fault with user request that kernel detected. Cause2 QURT_SYNTH_* are listed below. */ +#define QURT_EXCEPT_INVALID_ISLAND_TRAP 0x13U /**< Invalid trap in Island mode. Cause2 is trap number. */ +#define QURT_EXCEPT_UNDEF_TRAP0 0x14U /**< trap0(num) was called with unsupported num. Cause2 is trap number. */ +#define QURT_EXCEPT_PRECISE_DMA_ERROR 0x28U /**< Precise DMA error. Cause2 is DM4[15:8]. Badva is DM5 register. */ + +#define QURT_ECODE_UPPER_LIBC (0U << 16) /**< Upper 16 bits is 0 for libc. */ +#define QURT_ECODE_UPPER_QURT (0U << 16) /**< Upper 16 bits is 0 for QuRT. */ +#define QURT_ECODE_UPPER_ERR_SERVICES (2U << 16) /**< Upper 16 bits is 2 for error service. */ +/** @cond */ +#define QURT_ECODE_ISLAND_INVALID_QDI 3U /**< Passing invalid QDI method in island. */ +/** @endcond */ + +/* Cause2 for QURT_EXCEPT_SYNTHETIC_FAULT cause- 8bits */ +#define QURT_SYNTH_ERR 0x01U /**< */ +#define QURT_SYNTH_INVALID_OP 0x02U /**< */ +#define QURT_SYNTH_DATA_ALIGNMENT_FAULT 0x03U /**< */ +#define QURT_SYNTH_FUTEX_INUSE 0x04U /**< */ +#define QURT_SYNTH_FUTEX_BOGUS 0x05U /**< */ +#define QURT_SYNTH_FUTEX_ISLAND 0x06U /**< */ +#define QURT_SYNTH_FUTEX_DESTROYED 0x07U /**< */ +#define QURT_SYNTH_PRIVILEGE_ERR 0x08U /**< */ + +/* Cause2 - Abort cause reason - 8 bits */ +/* ERR_ASSERT cause */ +#define QURT_ABORT_FUTEX_WAKE_MULTIPLE 0x01U /**< Abort cause - futex wake multiple. */ +#define QURT_ABORT_WAIT_WAKEUP_SINGLE_MODE 0x02U /**< Abort cause - thread waiting to wake up in Single Threaded mode. */ +#define QURT_ABORT_TCXO_SHUTDOWN_NOEXIT 0x03U /**< Abort cause - call TCXO shutdown without exit. */ +#define QURT_ABORT_FUTEX_ALLOC_QUEUE_FAIL 0x04U /**< Abort cause - futex allocation queue failure - QURTK_futexhash_lifo empty. */ +#define QURT_ABORT_INVALID_CALL_QURTK_WARM_INIT 0x05U /**< Abort cause - invalid call QURTK_warm_init() in NONE CONFIG_POWER_MGMT mode. */ +#define QURT_ABORT_THREAD_SCHEDULE_SANITY 0x06U /**< Abort cause - sanity schedule thread is not supposed to run on the current hardware thread. */ +#define QURT_ABORT_REMAP 0x07U /**< Remap in the page table; the correct behavior must remove mapping if necessary. */ +#define QURT_ABORT_NOMAP 0x08U /**< No mapping in page table when removing a user mapping. */ +#define QURT_ABORT_OUT_OF_SPACES 0x09U +#define QURT_ABORT_INVALID_MEM_MAPPING_TYPE 0x0AU /**< Invalid memory mapping type when creating qmemory. */ +#define QURT_ABORT_NOPOOL 0x0BU /**< No pool available to attach. 
*/ +#define QURT_ABORT_LIFO_REMOVE_NON_EXIST_ITEM 0x0CU /**< Cannot allocate more futex waiting queue. */ +#define QURT_ABORT_ARG_ERROR 0x0DU +#define QURT_ABORT_ASSERT 0x0EU /**< Assert abort. */ +#define QURT_ABORT_FATAL 0x0FU /**< Fatal error; must never occur. */ +#define QURT_ABORT_FUTEX_RESUME_INVALID_QUEUE 0x10U /**< Abort cause - invalid queue ID in futex resume. */ +#define QURT_ABORT_FUTEX_WAIT_INVALID_QUEUE 0x11U /**< Abort cause - invalid queue ID in futex wait. */ +#define QURT_ABORT_FUTEX_RESUME_INVALID_FUTEX 0x12U /**< Abort cause - invalid futex object in hashtable. */ +#define QURT_ABORT_NO_ERHNDLR 0x13U /**< No registered error handler. */ +#define QURT_ABORT_ERR_REAPER 0x14U /**< Exception in the reaper thread. */ +#define QURT_ABORT_FREEZE_UNKNOWN_CAUSE 0x15U /**< Abort in thread freeze operation. */ +#define QURT_ABORT_FUTEX_WAIT_WRITE_FAILURE 0x16U /**< During futex wait processing, could not perform a necessary write operation to userland data; most likely due to a DLPager eviction. */ +#define QURT_ABORT_ERR_ISLAND_EXP_HANDLER 0x17U /**< Exception in Island exception handler task. */ +#define QURT_ABORT_L2_TAG_DATA_CHECK_FAIL 0x18U /**< Detected error in L2 tag/data during warm boot. The L2 tag/data check is done when CONFIG_DEBUG_L2_POWER_COLLAPSE is enabled. */ +#define QURT_ABORT_ERR_SECURE_PROCESS 0x19U /**< Abort error in secure process. */ +#define QURT_ABORT_ERR_EXP_HANDLER 0x20U /**< No exception handler, or the handler caused an exception. */ +#define QURT_ABORT_ERR_NO_PCB 0x21U /**< PCB of the thread context failed initialization, PCB was NULL. */ +#define QURT_ABORT_NO_PHYS_ADDR 0x22U /**< Unable to find the physical address for the virtual address. */ +#define QURT_ABORT_OUT_OF_FASTINT_CONTEXTS 0x23U /**< Fast interrupt contexts exhausted. */ +#define QURT_ABORT_CLADE_ERR 0x24U /**< Fatal error seen with CLADE interrupt. */ +#define QURT_ABORT_ETM_ERR 0x25U /**< Fatal error seen with ETM interrupt. */ +#define QURT_ABORT_ECC_DED_ASSERT 0x26U /**< ECC two-bit DED error. */ +#define QURT_ABORT_VTLB_ERR 0x27U /**< Fatal error in the VTLB layer. */ +#define QURT_ABORT_TLB_ENCODE_DECODE_FAILURE 0x28U /**< Failure during the TLB encode or decode operation. */ +#define QURT_ABORT_VTLB_WALKOBJS_BOUND_FAILURE 0x29U /**< Failure to lookup entry in the page table. */ +#define QURT_ABORT_PHY_MEMORY_OWNERSHIP_FAILURE 0x30U /**< Failure to claim phy memory ownership. */ +#define QURT_ABORT_JTLB_SIZE_CHECK_FAIL 0x31U /**< JTLB size configured is more than actual size in hardware */ +#define QURT_ABORT_AUTOSTACK_ASSERT 0x32U /**< Error while handling stack flimit exception. */ + +/* Cause2 - TLB-miss_X - 8bits */ +#define QURT_TLB_MISS_X_FETCH_PC_PAGE 0x60U /**< */ +#define QURT_TLB_MISS_X_2ND_PAGE 0x61U /**< */ +#define QURT_TLB_MISS_X_ICINVA 0x62U /**< */ + +/* Cause2 - TLB-miss_RW - 8bits */ +#define QURT_TLB_MISS_RW_MEM_READ 0x70U /**< */ +#define QURT_TLB_MISS_RW_MEM_WRITE 0x71U /**< */ + +/** @cond rest_reg_dist */ +/* Cause2 - Floating point exception - 8 bits */ +#define QURT_FLOATING_POINT_EXEC_ERR 0xBFU /**< Execute floating-point. 
*/ +/** @endcond */ + +/** Cause2 - autostackv2 - 8 bits */ +#define QURT_AUTOSTACKV2_CANARY_NOT_MATCH 0xC1U +#define QURT_AUTOSTACKV2_POOL_IDX_OFF_RANGE 0xC2U + +/** Cause2 - CFI violation - 8 bits */ +#define QURT_CFI_VIOLATION 0xC3U + +/** @cond rest_reg_dist*/ +/* Enable floating point exceptions */ +#define QURT_FP_EXCEPTION_ALL 0x1FU << 25 /**< */ +#define QURT_FP_EXCEPTION_INEXACT 0x1U << 29 /**< */ +#define QURT_FP_EXCEPTION_UNDERFLOW 0x1U << 28 /**< */ +#define QURT_FP_EXCEPTION_OVERFLOW 0x1U << 27 /**< */ +#define QURT_FP_EXCEPTION_DIVIDE0 0x1U << 26 /**< */ +#define QURT_FP_EXCEPTION_INVALID 0x1U << 25 /**< */ + +/** @endcond */ +/** @} */ /* end_addtogroup chapter_error */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_EXCEPT_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_fastint.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_fastint.h new file mode 100755 index 0000000000000..ea65dc0917fc0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_fastint.h @@ -0,0 +1,71 @@ +#ifndef QURT_FASTINT_H +#define QURT_FASTINT_H + +/** + @file qurt_fastint.h + @brief QuRT fast interrupt functions + + Copyright (c) 2013-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + + ======================================================================*/ + +/*======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_fastint_register + Register fast interrupt callback function + + Fast interrupt callback should be designed to perform the minimal necessary + actions for the interrupt, and/or perform some operations, such as signaling + another regular software thread to start any additional processing. + The callback should be a fast and short function. When a fast interrupt callback + is running, the corresponding interrupt cannot be re-enabled until the callback + returns. + + The fast interrupt callback must not use any system blocking calls, such as + mutex lock or signal wait. Otherwise, it results in errors. + + The fast interrupt callback function has a single integer argument and the + function ends with no return. The argument value passed in is the interrupt + number, and therefore a single callback function can handle + multiple fast interrupts. + + @param[in] intno Interrupt number to register. + @param[in] fn Interrupt callback function. + + @return + #QURT_EOK -- Fast interrupt registration is successful. \n + #QURT_EINVALID -- Interrupt is already registered. \n + #QURT_EINT -- Invalid interrupt number. +*/ +/* ======================================================================*/ +unsigned int qurt_fastint_register(int intno, void (*fn)(int)); + + +/*======================================================================*/ +/**@ingroup func_qurt_fastint_deregister + Deregisters the fast interrupt callback function. + + @param[in] intno Level-one interrupt number to deregister. Valid range is 1 and 10 through 31 + (simulator only). + + @return + #QURT_EOK -- Interrupt deregistration is successful. \n + #QURT_EINT -- Invalid interrupt number (not registered). \n + #QURT_EINVALID -- Invalid interrupt number (already deregistered). + + @dependencies + None. 
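+
+   A minimal register/deregister sketch (illustrative only; the interrupt
+   number 16, the callback body, and example_register() are assumptions,
+   not SDK requirements):
+   @code
+   static void my_fastint_cb(int intno)
+   {
+       // Keep this short: e.g., signal a worker thread, then return.
+       (void)intno;
+   }
+
+   void example_register(void)
+   {
+       if (qurt_fastint_register(16, my_fastint_cb) == QURT_EOK) {
+           // ... the interrupt is live until deregistered ...
+           (void)qurt_fastint_deregister(16);
+       }
+   }
+   @endcode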
+*/
+/* ======================================================================*/
+unsigned int qurt_fastint_deregister(int intno);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FASTINT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_fs_hub.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_fs_hub.h
new file mode 100755
index 0000000000000..aaa050a6c838b
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_fs_hub.h
@@ -0,0 +1,58 @@
+#ifndef QURT_FS_HUB_H
+#define QURT_FS_HUB_H
+
+/**
+  @file qurt_fs_hub.h
+  @brief Definitions, macros, and prototypes used when writing a
+         QDI driver that provides file-system functionality.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+  ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+  This structure tracks a file descriptor for a file-system-hub QDI driver.
+  A file system's QDI interface should use this object to encapsulate the
+  true file descriptor and return a QDI handle. This QDI handle
+  is then used as the file descriptor by the file-system hub.
+ */
+typedef struct qurt_qdi_fs_obj
+{
+    qurt_qdi_obj_t qdi_obj;
+    int client_handle;
+    int fd;
+} qurt_qdi_fs_obj_t;
+
+
+/**@ingroup fs_hub_support_functions
+  This function allows a file system to register its QDI interface with the file-system hub.
+  Once registered, all file open operations for any filenames containing the mount point are
+  forwarded to the QDI interface.
+
+  The mount-point string must be enclosed in two forward slashes, for example, "/mountpoint/".
+
+  @param mtpoint Mount point for the file system being registered.
+  @param opener  Opener structure for the QDI driver interface.
+
+  @return
+  QURT_EOK -- Successfully registered the QDI driver with the file-system hub. \n
+  Negative error code -- Failed to register with the file-system hub.
+ */
+int qurt_fs_hub_mtpoint_register(const char *mtpoint, qurt_qdi_obj_t *opener);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_futex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_futex.h
new file mode 100755
index 0000000000000..1fdcc79a43f01
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_futex.h
@@ -0,0 +1,82 @@
+#ifndef QURT_FUTEX_H
+#define QURT_FUTEX_H
+/**
+  @file qurt_futex.h
+
+  @brief Prototypes of QuRT futex API functions
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2020-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+  ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+  Functions
+======================================================================*/
+
+
+/**@ingroup func_qurt_futex_wait
+   Moves the caller thread into the waiting state when the memory object at the
+   given address contains a value that is the same as the specified value.
+
+   @param[in] lock Pointer to the object memory.
+   @param[in] val  Value to check against the object content.
+
+   @return
+   #QURT_EOK -- Success \n
+   Other values -- Failure
+
+   @dependencies
+   None.
+ */
+int qurt_futex_wait(void *lock, int val);
+
+
+/**@ingroup func_qurt_futex_wait_cancellable
+   If the memory object at the given address contains a value that is the same as
+   the specified value, moves the caller thread into the waiting state.
+   The kernel can cancel the waiting state when there is a special need.
+
+   @param[in] lock Pointer to the object memory.
+   @param[in] val  Value to check against the object content.
+
+   @return
+   #QURT_EOK -- Success \n
+   Other values -- Failure
+
+   @dependencies
+   None.
+ */
+int qurt_futex_wait_cancellable(void *lock, int val);
+
+
+/**@ingroup func_qurt_futex_wake
+   Wakes up a specified number of threads that have been waiting
+   for the object to change with qurt_futex_wait().
+
+   @param[in] lock      Pointer to the object memory.
+   @param[in] n_to_wake Maximum number of threads to wake up.
+
+   @return
+   Number of threads woken up by this function.
+
+   @dependencies
+   None.
+ */
+int qurt_futex_wake(void *lock, int n_to_wake);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FUTEX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_hmx.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_hmx.h
new file mode 100755
index 0000000000000..e4037dbeae514
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_hmx.h
@@ -0,0 +1,226 @@
+#ifndef QURT_HMX_H
+#define QURT_HMX_H
+/**
+  @file qurt_hmx.h
+  @brief Prototypes of the QuRT HMX API.
+
+Copyright (c) 2019-2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        TYPEDEFS
+=============================================================================*/
+
+
+/** @addtogroup hmx_types
+@{ */
+/* HMX locking type */
+#define QURT_HMX_NON_SHARED_LOCK    0U  /**< HMX locking type. */
+#define QURT_HMX_SHARED_LOCK        1U  /**< HMX locking type. */
+
+/* HMX unlocking type */
+#define QURT_HMX_NON_SHARED_UNLOCK  0U  /**< HMX unlocking type. */
+#define QURT_HMX_SHARED_UNLOCK      1U  /**< HMX unlocking type. */
+
+/* HMX hardware context */
+#define QURT_HMX_UNIT_0             0U  /**< HMX hardware context #0. */
+#define QURT_HMX_UNIT_1             1U  /**< HMX hardware context #1. */
+/** @} */ /* end_addtogroup hmx_types */
+
+
+/*=============================================================================
+        FUNCTIONS
+=============================================================================*/
+
+
+/**@ingroup func_qurt_hmx_lock2
+   Locks a HMX unit with the specified locking type.
+
+   #QURT_HMX_NON_SHARED_LOCK:
+   - If a HMX unit is available, lock the unit and return success of #QURT_EOK.
+   - If the HMX unit is already locked by another thread, the caller thread is suspended
+     until the HMX is available and gets locked by this function.
+   - If no HMX hardware is supported, return #QURT_EVAL.
+
+   #QURT_HMX_SHARED_LOCK:
+   - If a HMX unit is available, enable HMX access for the caller thread, and return
+     success of #QURT_EOK.
+   - If the HMX is already enabled on the caller thread, return #QURT_EFAILED.
+ - If the HMX is locked by another thread in the same user process of the caller + thread with locking type of #QURT_HMX_SHARED_LOCK, enable HMX access for the caller + thread, and return success of #QURT_EOK. + - If the HMX is locked by another thread in the same user process of the caller + thread with locking type of #QURT_HMX_NON_SHARED_LOCK, return #QURT_EFAILED. + - If the HMX is locked by a thread from another user process different from the + user process of the caller thread, return #QURT_EFAILED. + - If there is no HMX hardware supported, return #QURT_EVAL. + + @param[in] type Locking type. + + @return + #QURT_EOK -- HMX lock successful.\n + #QURT_EFAILED -- Failure due to wrong locking condition.\n + #QURT_EVAL -- Failure because no HMX hardware is supported. + + @dependencies + None. + + */ +int qurt_hmx_lock2(unsigned int type); + + +/**@ingroup func_qurt_hmx_unlock2 + Unlocks a HMX unit with the unlocking type. + + #QURT_HMX_NON_SHARED_UNLOCK: + - If there is a HMX unit locked by the caller thread, unlock the HMX unit and clear the + HMX accumulators (assuming a fixed point type). + - If there is no HMX unit locked by the caller thread, return #QURT_EFAILED. + - If there is no HMX hardware supported, return #QURT_EVAL. + + #QURT_HMX_SHARED_UNLOCK: + - If the caller thread has locked HMX with type #QURT_HMX_SHARED_LOCK, disable the + HMX access on the caller thread, and return success of #QURT_EOK. + Note: If the caller thread is the last thread that unlocks for #QURT_HMX_SHARED_LOCK + in its user process, the unlock function clears the HMX accumulators. + - If the caller thread has locked HMX with type #QURT_HMX_NON_SHARED_LOCK, return + failure of #QURT_EFAILED. + - If the caller thread has not locked HMX, return failure of #QURT_EFAILED. + - If there is no HMX hardware supported, returns #QURT_EVAL. + + @param[in] type Locking type. + + @return + #QURT_EOK -- HMX is unlocked successful. \n + #QURT_EFAILED -- Failure due to wrong unlocking condition. \n + #QURT_EVAL -- Failure because no HMX hardware is supported. + + @dependencies + None. + + */ +int qurt_hmx_unlock2(unsigned int type); + + +/**@ingroup func_qurt_hmx_lock + Locks a HMX unit. + If a HMX unit is available, this function locks the unit and returns right away. + If there is no HMX unit available, the caller is blocked until a HMX is available + and is locked by the function. + + @return + #QURT_EOK -- HMX lock successful. \n + #QURT_EFAILED -- Failure due to wrong locking condition. \n + #QURT_EVAL -- Failure because no HMX hardware is supported. + + @dependencies + None. + */ +int qurt_hmx_lock(void); + + +/**@ingroup func_qurt_hmx_unlock + Unlocks a HMX unit. + If a HMX unit is locked by the caller thread, unlock the HMX unit and clear its + accumulators(assuming fixed point type). + If there is no HMX unit locked by the caller thread, return failure. + + @return + #QURT_EOK -- HMX unlock successful. \n + #QURT_EFAILED -- Failure due to wrong unlocking condition. \n + #QURT_EVAL -- Failure because no HMX hardware is supported. + + @dependencies + None. + */ +int qurt_hmx_unlock(void); + + +/**@ingroup func_qurt_hmx_try_lock + Tries to lock a HMX unit. + If a HMX unit is available, this function locks the unit and returns right away; + if there is no HMX unit available, the function returns failure without blocking the caller. + + @return + #QURT_EOK -- HMX lock successful \n + #QURT_EFAILED -- Failure due to wrong locking condition.\n + #QURT_EVAL -- Failure because no HMX hardware is supported. 
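+
+  Usage sketch (illustrative only; both helper functions are assumptions):
+
+  @code
+  if (qurt_hmx_try_lock() == QURT_EOK) {
+      run_hmx_kernel();        // matrix work that uses HMX
+      (void)qurt_hmx_unlock();
+  } else {
+      run_scalar_fallback();   // HMX busy or not present
+  }
+  @endcode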
+ + @dependencies + None. + */ +int qurt_hmx_try_lock(void); + + +/**@ingroup func_qurt_hmx_assign + Assign a HMX unit to a target thread specified by its thread identifier. + The HMX unit (HMX hardware context) is specified by hmx_unit. + The caller of this function is limited to the SRM process. + If the requested hmx_unit is already assigned to another thread with QURT_HMX_NON_SHARED_LOCK, + kernel will detach it from the thread, and re-assign it to the target thread. + If the target thread has HVX enabled, it cannot have HMX enabled. + + Locking type + #QURT_HMX_NON_SHARED_LOCK: + - If the HMX unit is available, lock the HMX unit and return success of #QURT_EOK. + - If the HMX unit is already enabled on the target thread, return #QURT_EOK. + - If the HMX unit is already locked by another thread, detach the HMX from the thread. + Re-assign the HMX unit to the target thread, and return #QURT_EOK. + + @param[in] thread_id Thread identifier + @param[in] type Locking type + #QURT_HMX_NON_SHARED_LOCK -- non-shared lock + @param[in] hmx_unit HMX hardware context number + #QURT_HMX_UNIT_0 + #QURT_HMX_UNIT_1 + + @return + #QURT_EOK -- The HMX is assigned successfully. This includes the case that \n + the target thread already has HMX assigned. \n + #QURT_EFAILED -- Failure due to wrong assigning conditions. \n + #QURT_EINVALID -- Failure because no HMX hardware is supported. + + @dependencies + None. + */ +int qurt_hmx_assign ( unsigned int thread_id, unsigned int type, unsigned int hmx_unit ); + + +/**@ingroup func_qurt_hmx_release + Release a HMX unit from a target thread specified by its thread identifier. + The HMX unit (HMX hardware context) is specified by hmx_unit. + The caller of this function is limited to the SRM process. + + Qurt detaches the specified HMX unit from the target thread, and return success of + #QURT_EOK. If the HMX unit is already released from the target thread, return #QURT_EOK. + + @param[in] thread_id Thread identifier + @param[in] hmx_unit HMX hardware context number + #QURT_HMX_UNIT_0 + #QURT_HMX_UNIT_1 + + @return + #QURT_EOK -- The HMX is released successfully. This includes the case that \n + the target thread already has the HMX released. \n + #QURT_EFAILED -- Failure due to wrong assigning condition. \n + #QURT_EINVALID -- Failure because no HMX hardware is supported. + + @dependencies + None. + */ +int qurt_hmx_release ( unsigned int thread_id, unsigned int hmx_unit ); + + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_HMX_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_hvx.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_hvx.h new file mode 100755 index 0000000000000..13c213d49ac84 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_hvx.h @@ -0,0 +1,421 @@ +#ifndef QURT_HVX_H +#define QURT_HVX_H +/** + @file qurt_hvx.h + @brief Prototypes of QuRT HVX API. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021-2022 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ TYPEDEFS
+=============================================================================*/
+/** @cond */
+
+typedef enum {
+    QURT_HVX_MODE_64B  = 0, /**< HVX mode of 64 bytes */
+    QURT_HVX_MODE_128B = 1  /**< HVX mode of 128 bytes */
+} qurt_hvx_mode_t;
+/** @endcond */
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+/** @cond internal_only*/
+/** @addtogroup hvx_macros
+@{ */
+#define QURT_HVX_HW_UNITS_2X128B_4X64B 0x00000204 /**< Bits 15 through 8 are for the number of 128B units. */
+                                                  /**< Bits 7 through 0 are for the number of 64B units. */
+#define QURT_HVX_HW_UNITS_4X128B_0X64B 0x00000400
+#define QURT_HVX_HW_UNITS_6X128B_0X64B 0x00000600
+
+/* HVX locking status */
+
+#define QURT_HVX_UNLOCKED (0)  /* Has not locked an HVX unit */
+#define QURT_HVX_LOCKED   (1)  /* Has locked an HVX unit */
+#define QURT_HVX_ERROR    (-1) /* Error, no HVX support */
+
+/* Input values for HVX reservation */
+
+#define QURT_HVX_RESERVE_ALL           (4)    /* All the HVX units in terms of 64B_MODE are requested to be reserved */
+#define QURT_HVX_RESERVE_ALL_AVAILABLE (0xff) /* All remaining unlocked HVX units in terms of 64B_MODE are requested to be reserved */
+
+/* Return values for HVX reservation */
+
+#define QURT_HVX_RESERVE_NOT_SUPPORTED  (-1) /* There is no HVX hardware, or fewer units in the hardware than requested */
+#define QURT_HVX_RESERVE_NOT_SUCCESSFUL (-2) /* Some HVX units are already locked/reserved by another PD, thus not enough units are left for the reservation. */
+#define QURT_HVX_RESERVE_ALREADY_MADE   (-3) /* There is already a HVX reservation made. */
+#define QURT_HVX_RESERVE_CANCEL_ERR     (-4) /* Canceling the reservation failed because this protection domain made no reservation before. */
+
+// HVX set requests
+
+#define QURT_HVX_64B             0 /**< */
+#define QURT_HVX_128B            1 /**< */
+#define QURT_HVX_NO_USE          2 /**< */
+#define QURT_HVX_RELEASE_CONTEXT 3 /**< */
+#define QURT_HVX_IMMEDIATE_USE   4 /**< */
+
+// HVX set masks
+
+#define QURT_HVX_64B_PREFERRED   (1<<(QURT_HVX_64B + 8))  /**< */
+#define QURT_HVX_128B_PREFERRED  (1<<(QURT_HVX_128B + 8)) /**< */
+#define QURT_HVX_64B_ACCEPTABLE  (1<<(QURT_HVX_64B + 12)) /**< */
+#define QURT_HVX_128B_ACCEPTABLE (1<<(QURT_HVX_128B + 12))/**< */
+
+// HVX set return "result"
+
+#define QURT_EOK           0    /**< */
+#define QURT_HVX_SET_ERROR 0xFF /**< */
+
+// hvx_mode_assigned for QURT_HVX_IMMEDIATE_USE
+#define QURT_HVX_64B_ASSIGNED  (1<<(QURT_HVX_64B + 8))  /**< */
+#define QURT_HVX_128B_ASSIGNED (1<<(QURT_HVX_128B + 8)) /**< */
+
+// Sizes of HVX dump buffer
+
+#define QURT_HVX_V65_64B_VSIZE  2084U /**< 64 x 32 + 8 x 4 + 4 (version). */
+#define QURT_HVX_V65_128B_VSIZE 4164U /**< 128 x 32 + 16 x 4 + 4 (version). */
+#define QURT_HVX_V66_128B_VSIZE 4420U /**< 128 x (32 +2) + 16 x 4 + 4 (version). */
+#define QURT_HVX_V68_128B_VSIZE 4164U /**< 128 x 32 + 16 x 4 + 4 (version). */
+#define QURT_HVX_V79_128B_VSIZE 4740U /**< 128 x (32+4+1) + 4 (version).
*/ +#define QURT_HVX_VREG_BUF_SIZE QURT_HVX_V79_128B_VSIZE /**< */ + +// HVX dump versions + +#define QURT_HVX_DUMP_V65_64B 1U /**< */ +#define QURT_HVX_DUMP_V65_128B 2U /**< */ +#define QURT_HVX_DUMP_V66_128B 3U /**< */ +#define QURT_HVX_DUMP_V68_128B 4U /**< */ +#define QURT_HVX_DUMP_V79_128B 5U /**< */ +/** @} */ /* end_addtogroup hvx_macros */ +/** @endcond */ +/** @cond */ +// Qurt data struct for hvx_set input +typedef struct qurt_hvx_set_struct_ { + unsigned char set_req; // LSB + struct { + unsigned char preferred_mask:4; + unsigned char acceptable_mask:4; + }; + unsigned short resvd; // MSB +} qurt_hvx_set_struct_t; // 4 bytes + + +// Qurt data struct for hvx_set return +typedef struct qurt_hvx_set_return_str_ { + unsigned char result; // LSB + unsigned char hvx_mode_assigned; + unsigned short resvd; // MSB +} qurt_hvx_set_return_struct_t; // 4 bytes +/** @endcond */ + + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_hvx_lock + Locks one HVX unit specified by the HVX mode. + + @note1hang Input variable can be 128B_MODE or 64B_MODE. If an HVX unit in this mode + is available, this function locks the unit and returns right away. + If the current HVX mode is different from the requested mode, the current + thread is blocked. When all HVX units become idle, QuRT changes + the mode, locks the HVX unit, and returns. + + Starting from Q6v65 with HVX context switch support, qurt_hvx_lock() is + mapped as qurt_hvx_set(64_BYTE or 128_BYTE). + + @datatypes + #qurt_mode_t + + @param[in] lock_mode #QURT_HVX_MODE_64B or #QURT_HVX_MODE_128B. + + @return + #QURT_EOK -- Success \n + Other value -- Failure + + @dependencies + None. + + */ +int qurt_hvx_lock(qurt_hvx_mode_t lock_mode); + +/**@ingroup func_qurt_hvx_unlock + Unlocks the HVX unit held by this software thread. + + @note1hang Starting from Q6v65 with HVX context switch support, qurt_hvx_unlock() + maps as qurt_hvx_set(QURT_HVX_RELEASE_CONTEXT). + + @return + #QURT_EOK -- Successful return \n + Other values -- Failure + + @dependencies + None. + + */ +int qurt_hvx_unlock(void); + +/**@ingroup func_qurt_hvx_try_lock + Tries to lock one HVX unit specified by the HVX mode. + + @note1hang Input variable can be 128B_MODE or 64B_MODE. If an HVX unit in this mode + is available, this function locks the unit and returns #QURT_EOK; Otherwise, + the function returns a failure, but does not block the current software + thread to wait for the HVX unit. + Starting from Q6v65 with HVX context switch support, qurt_hvx_try_lock() + maps to qurt_hvx_set(FOR_IMMEDIATE_USE| preferred_mask | acceptable_mask); + + @datatypes + #qurt_mode_t + + @return + #QURT_EOK -- Successful return \n + Other values -- Failure + + @dependencies + None. + + */ +int qurt_hvx_try_lock(qurt_hvx_mode_t lock_mode); + +/**@ingroup func_qurt_hvx_get_mode + Gets the current HVX mode configured by QuRT. + + @note1hang Returns #QURT_HVX_MODE_128B or #QURT_HVX_MODE_64B, based on + the current HVX configuration. + + @param[out] + None. + + @return + #QURT_HVX_MODE_128B \n + #QURT_HVX_MODE_64B \n + -1 -- Not available. + + @dependencies + None. + */ +int qurt_hvx_get_mode(void); + + +/**@ingroup func_qurt_hvx_get_units + Gets the HVX hardware configuration that the chipset supports. + + @note1hang The function returns the HVX hardware configuration supported by the chipset. 
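+
+  Decoding sketch (illustrative only): per the QURT_HVX_HW_UNITS_* macro
+  comments above, the 64B and 128B unit counts are packed into the low two
+  bytes of the return value.
+
+  @code
+  int units = qurt_hvx_get_units();
+  if (units > 0) {
+      unsigned num_64b  = (unsigned)units & 0xFFU;         // bits 7..0
+      unsigned num_128b = ((unsigned)units >> 8) & 0xFFU;  // bits 15..8
+  }
+  @endcode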
+ + @return + Bitmask of the units: 1X64, 2X64, 4X64, 1X128, 2X128, and so on.\n + - QURT_HVX_HW_UNITS_2X126B_4X64B -- V60, V62, or V65 HVX \n + - QURT_HVX_HW_UNITS_4X128B_0X64B -- V66 CDSP or newer \n + - 0 -- not available + + @dependencies + None. + + */ +int qurt_hvx_get_units(void); + + +/**@ingroup func_qurt_hvx_reserve + Reserves HVX units in terms of 64-byte mode for the protection domain (PD) of the caller. + + @note1hang Only one HVX reservation in the system is supported. + If one HVX unit is already locked by the application in the same PD, the unit is + added to the returned count as one reserved unit for the PD. + Starting from Q6v65 with HVX context switch support, qurt_hvx_reserve() + only does basic sanity checks on HVX units. + + @datatypes + None. + + @param[in] num_units Number of HVX units in terms of 64B_MODE to reserve for the PD. + QURT_HVX_RESERVE_ALL to reserve all the HVX units. + QURT_HVX_RESERVE_ALL_AVAILABLE to reserve the remaining unlocked units. + + @return + Number of units successfully reserved, including the units already locked in the same PD. \n + #QURT_HVX_RESERVE_NOT_SUPPORTED \n + #QURT_HVX_RESERVE_NOT_SUCCESSFUL \n + #QURT_HVX_RESERVE_ALREADY_MADE + + + @dependencies + None. + + */ +int qurt_hvx_reserve(int num_units); + + +/**@ingroup func_qurt_hvx_cancel_reserve + Cancels the HVX reservation in the protection domain (PD) of the caller. + + @note1hang Only one HVX reservation in the system is supported. + + @return + 0 -- Success \n + #QURT_HVX_RESERVE_CANCEL_ERR -- Failure + + @dependencies + None. + + */ +int qurt_hvx_cancel_reserve(void); + + +/**@ingroup func_qurt_hvx_get_lock_val + Gets the HVX locking status value of the thread of the caller. + + @note1hang Returns the status of whether the thread of the caller already locks a HVX unit or not. + + @datatypes + None. + + @return + #QURT_HVX_UNLOCKED \n + #QURT_HVX_LOCKED \n + #QURT_HVX_ERROR + + @dependencies + None. + */ +int qurt_hvx_get_lock_val(void); + +/** @cond internal_only*/ +/**@ingroup func_qurt_hvx_set + Sets the HVX configuration for the software thread of the caller. + + @datatypes + None. + + @param[in] input_arg Composed of set_request | hvx_preferred_mode_mask + | hvx_acceptable_mode_mask where set_request can be set to: \n + - #QURT_HVX_64B \n + - #QURT_HVX_128B \n + - #QURT_HVX_NO_USE \n + - #QURT_HVX_RELEASE_CONTEXT \n + - #QURT_HVX_IMMEDIATE_USE \n + When set_request is QURT_HVX_IMMEDIATE_USE, + hvx_preferred_mode_mask can be set to: \n + - #QURT_HVX_64B_PREFERRED \n + - #QURT_HVX_128B_PREFERRED + When set_request is QURT_HVX_IMMEDIATE_USE, + hvx_acceptable_mode_mask can be set to: \n + - #QURT_HVX_64B_ACCEPTABLE \n + - #QURT_HVX_128B_ACCEPTABLE @tablebulletend + + @return + Result of the HVX setting in the least significant 8 bits of the returned data. \n + #QURT_EOK -- 0 \n + #QURT_HVX_SET_ERROR -- 0xFF \n + When #QURT_HVX_IMMEDIATE_USE has a result of #QURT_EOK, + bit 8 to bit 15 of the returned data contain hvx_mode_assigned:\n + - #QURT_HVX_64B_ASSIGNED \n + - #QURT_HVX_128B_ASSIGNED + + @dependencies + None. + */ +unsigned int qurt_hvx_set(unsigned int input_arg); + + +/**@ingroup func_qurt_system_hvx_regs_get_maxsize + Returns the maximum buffer size for saving HVX registers. + + @datatypes + None. + + @return + 0 -- No HVX supported in the target. \n + #QURT_HVX_VREG_BUF_SIZE -- Maximum buffer size for saving HVX registers. + + @dependencies + None. 
+ */
+unsigned int qurt_system_hvx_regs_get_maxsize(void);
+
+
+/**@ingroup func_qurt_system_hvx_regs_get_size
+  Returns the buffer size for saving HVX registers for a specified thread.
+
+  @param[in] thread_id  Thread ID of the target thread.
+
+  @return
+  0 -- No HVX assigned to the thread. \n
+  size -- Size of the buffer in bytes for saving HVX registers for the specified thread: \n
+  - #QURT_HVX_V65_64B_VSIZE -- 64 x 32 + 8 x 4 + 4 (version) \n
+  - #QURT_HVX_V65_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  - #QURT_HVX_V66_128B_VSIZE -- 128 x (32 +2) + 16 x 4 + 4 (version) \n
+  - #QURT_HVX_V68_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  - #QURT_HVX_V79_128B_VSIZE -- 128 x (32+4+1) + 4 (version)
+
+
+  @dependencies
+  None.
+
+ */
+unsigned int qurt_system_hvx_regs_get_size(unsigned int thread_id);
+
+
+
+/**@ingroup func_qurt_system_hvx_regs_get
+  Saves the HVX registers into the specified buffer.
+  Returns the size of the data saved into the buffer.
+  After calling this function for the first time on a specified thread_id, the QuRT kernel removes the internal HVX saving buffer
+  from the specified thread. When calling the function on the same thread_id for the second time, this function returns 0.
+
+  @param[in] thread_id  Thread ID of the target thread.
+  @param[in] pBuf  Pointer to the buffer for HVX register saving.
+        The first four bytes of the buffer are for saving the HVX version. HVX registers are saved from
+        the fifth byte of the buffer. The address of the fifth byte should be 256-byte aligned.
+        For example, a buffer can be declared at first as: \n
+        unsigned char vbuf[QURT_HVX_VREG_BUF_SIZE+256];\n
+        unsigned char *pBuf; \n
+        then align the buffer pointer to: \n
+        pBuf = vbuf; \n
+        pBuf += (256 - 4 - (unsigned)pBuf%256);
+  @param[in] size  Size of the buffer provided, pointed to by pBuf. The buffer size should not be smaller than that
+        returned from qurt_system_hvx_regs_get_size(), and pBuf should be aligned as described above.
+  @param[out] pBuf Buffer returned with the saved HVX registers (unsigned char hvx_regs[];), which are saved from the fifth
+        byte of the buffer, and the HVX version (unsigned int hvx_version;), whose first four bytes
+        contain one of the HVX dump versions:\n
+        - #QURT_HVX_DUMP_V65_64B \n
+        - #QURT_HVX_DUMP_V65_128B \n
+        - #QURT_HVX_DUMP_V66_128B \n
+        - #QURT_HVX_DUMP_V68_128B \n
+        - #QURT_HVX_DUMP_V79_128B \n
+        @tablebulletend
+
+  @return
+  Total bytes of the data saved in the provided buffer. \n
+  0 -- No HVX assigned to the thread \n
+  #QURT_HVX_V65_64B_VSIZE -- 64 x 32 + 8 x 4 + 4 (version) \n
+  #QURT_HVX_V65_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V66_128B_VSIZE -- 128 x (32 +2) + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V68_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V79_128B_VSIZE -- 128 x (32+4+1) + 4 (version)
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_system_hvx_regs_get(unsigned int thread_id, void *pBuf, size_t size);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_HVX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_int.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_int.h
new file mode 100755
index 0000000000000..386aeda1051eb
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_int.h
@@ -0,0 +1,509 @@
+#ifndef QURT_INT_H
+#define QURT_INT_H
+/**
+  @file qurt_int.h
+  @brief QuRT interrupt functions.
+ + + + Copyright (c) 2013-2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + + +/** @cond rest_reg_dist */ +/** @addtogroup interrupts_constants +@{ */ +#define SIG_INT_ABORT 0x80000000 /**< */ +#define QURT_INT_NON_DELAYED_ACK 0 +#define QURT_INT_DELAYED_ACK 1 +#define QURT_INT_ACK_DEFAULT QURT_INT_NON_DELAYED_ACK +#define QURT_INT_DRV_DEFAULT 0 +#define QURT_INT_PRIORITY_DEFAULT 0xFF + +/** QuRT interrupt property. */ +#define QURT_INT_CONFIGID_POLARITY 0x1U /**< */ +#define QURT_INT_CONFIGID_LOCK 0x2U /**< */ + +/** QuRT interrupt lock.*/ +#define QURT_INT_LOCK_DEFAULT 0x0 /**< Default. */ +#define QURT_INT_LOCK_DISABLE 0x0 /**< Interrupt can be enabled or disabled or deregistered. */ +#define QURT_INT_LOCK_ENABLE 0x1 /**< Interrupt is locked and cannot be enabled, disabled, or deregistered.*/ +/** @} */ /* end_addtogroup interrupts_constants */ + +/** @addtogroup Qurt_interrupt_type +@{ */ +/** Trigger type bit fields for a PDC interrupt:\n + @verbatim + Polarity Edge Output\n + 0 00 Level sensitive active low + 0 01 Rising edge sensitive + 0 10 Falling edge sensitive + 0 11 Dual edge sensitive + 1 00 Level sensitive active high + 1 01 Falling edge sensitive + 1 10 Rising edge sensitive + 1 11 Dual edge sensitive + @endverbatim +*/ +#define QURT_INT_TRIGGER_TYPE_SET(pol, edge) ((((pol) & 0x01U) << 2) | ((edge) & 0x03U)) /**< */ + +#define QURT_INT_TRIGGER_LEVEL_LOW QURT_INT_TRIGGER_TYPE_SET(0U, 0x00U) /**< */ +#define QURT_INT_TRIGGER_LEVEL_HIGH QURT_INT_TRIGGER_TYPE_SET(1U, 0x00U) /**< */ +#define QURT_INT_TRIGGER_RISING_EDGE QURT_INT_TRIGGER_TYPE_SET(1U, 0x02U) /**< */ +#define QURT_INT_TRIGGER_FALLING_EDGE QURT_INT_TRIGGER_TYPE_SET(0U, 0x02U) /**< */ +#define QURT_INT_TRIGGER_DUAL_EDGE QURT_INT_TRIGGER_TYPE_SET(0U, 0x03U) /**< */ +#define QURT_INT_TRIGGER_USE_DEFAULT 0xffU /**< */ +/** @} */ /* end_addtogroup Qurt_interrupt_type */ + +/*===================================================================== + Functions +======================================================================*/ + +/**@ingroup func_qurt_interrupt_register + @xreflabel{sec:interrupt_register} + Registers the interrupt.\n + Enables the specified interrupt and associates it with the specified QuRT signal object and + signal mask. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 indicates not to wait. + + When the interrupt occurs, the signal specified in the signal mask is set in the signal + object. An IST conventionally waits on that signal to + handle the interrupt. The thread that registers the interrupt is set as the IST. + + Up to 31 separate interrupts can be registered to a single signal object, as determined by + the number of individual signals the object can store. QuRT reserves signal 31. Thus a + single IST can handle several different interrupts. + + QuRT reserves some interrupts for internal use -- the remainder are available for use by + applications, and thus are valid interrupt numbers. If the specified interrupt number is + outside the valid range, the register operation returns the status value QURT_EINT. 
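+
+  A typical IST loop, sketched for illustration (INT_NUM, the choice of
+  signal bit 0, and handle_device() are assumptions, not part of this API):
+
+  @code
+  qurt_anysignal_t sig;
+  qurt_anysignal_init(&sig);
+  if (qurt_interrupt_register(INT_NUM, &sig, 1 << 0) == QURT_EOK) {
+      for (;;) {
+          unsigned int got = qurt_anysignal_wait(&sig, (1 << 0) | SIG_INT_ABORT);
+          if (got & SIG_INT_ABORT) {
+              break;                            // kernel aborted the wait
+          }
+          qurt_anysignal_clear(&sig, 1 << 0);   // clear before acknowledging
+          handle_device();                      // service the device
+          (void)qurt_interrupt_acknowledge(INT_NUM);
+      }
+      (void)qurt_interrupt_deregister(INT_NUM);
+  }
+  @endcode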
+
+  Only one thread can be registered at a time to a specific interrupt. Attempting to register
+  an already-registered interrupt returns the status value QURT_EVAL.
+
+  Only one signal bit in a signal object can be registered at a time to a specific interrupt.
+  Attempting to register multiple signal bits to an interrupt returns the status value
+  QURT_ESIG.
+
+  When a signal object is registered to an interrupt, QuRT can only set its signal bits
+  when receiving the interrupt. The QuRT signal API from another
+  software thread cannot set the signal even for unused signal bits.
+
+  @note1hang The valid range for an interrupt number can differ on target execution
+             environments other than the simulator. For more information, see the
+             appropriate hardware document.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] int_num      L2VIC interrupt to register; valid range is 0 to 1023.
+  @param[in] int_signal   Any-signal object to wait on (Section @xref{dox:any_signals}).
+  @param[in] signal_mask  Signal mask value indicating the signal to receive the interrupt.
+
+  @return
+  #QURT_EOK -- Interrupt successfully registered.\n
+  #QURT_EINT -- Invalid interrupt number. \n
+  #QURT_ESIG -- Invalid signal bitmask (cannot set more than one
+                signal at a time). \n
+  #QURT_EVAL -- Interrupt already registered.
+
+  @dependencies
+  None.
+*/
+ unsigned int qurt_interrupt_register(int int_num, qurt_anysignal_t *int_signal, int signal_mask);
+
+/**@ingroup func_qurt_interrupt_register2
+  @xreflabel{sec:interrupt_register2}
+  Registers the interrupt.\n
+  Enables the specified interrupt, associates it with the specified QuRT signal object and
+  signal mask, and sets interrupt flags.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+  indicates that a signal must be waited on, and 0 indicates not to wait.
+
+  When the interrupt occurs, the signal specified in the signal mask is set in the signal
+  object. An IST conventionally waits on that signal to
+  handle the interrupt. The thread that registers the interrupt is set as the IST.
+
+  Up to 31 separate interrupts can be registered to a single signal object, as determined by
+  the number of individual signals that the object can store. QuRT reserves signal 31. Thus a
+  single IST can handle several different interrupts.
+
+  QuRT reserves some interrupts for internal use -- the remainder are available for use by
+  applications, and thus are valid interrupt numbers. If the specified interrupt number is
+  outside the valid range, the register operation returns the status value #QURT_EINT.
+
+  Only one thread can be registered at a time to a specific interrupt. Attempting to register
+  an already-registered interrupt returns the status value #QURT_EVAL.
+
+  Only one signal bit in a signal object can be registered at a time to a specific interrupt.
+  Attempting to register multiple signal bits to an interrupt returns the status value
+  #QURT_ESIG.
+
+  When a signal object is registered to an interrupt, QuRT can only set its signal bits
+  when receiving the interrupt. The QuRT signal API from another
+  software thread cannot set the signal even for unused signal bits.
+
+  @note1hang The valid range for an interrupt number can differ on target execution
+             environments other than the simulator. For more information, see the
+             appropriate hardware document.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] int_num     L2VIC interrupt to register; valid range is 0 to 1023.
+  @param[in] int_signal  Any-signal object to wait on (Section @xref{dox:any_signals}).
+ @param[in] signal_mask Signal mask value indicating signal to receive the interrupt. + @param[in] flags Defines interrupt property, supported property is interrupt lock enable/disable. + Possible values for flags: \n + - #QURT_INT_LOCK_ENABLE + - #QURT_INT_LOCK_DISABLE @tablebulletend + + @return + #QURT_EOK -- Interrupt successfully registered.\n + #QURT_EINT -- Invalid interrupt number. \n + #QURT_ESIG -- Invalid signal bitmask (cannot set more than one + signal at a time). \n + #QURT_EVAL -- Interrupt already registered. + + @dependencies + None. +*/ + unsigned int qurt_interrupt_register2(int int_num, qurt_anysignal_t *int_signal, int signal_mask, unsigned int flags); +/* + * Waits for registered interrupt signal + + * Suspend the current thread until one of its registered interrupts occurs. The second input mask, + * contains the interrupt signals the IST expects to receive. The interrupt signals are registered + * with interrupts via qurt_register_interrupt API. + * + * The signals returned in the signal variable indicate which interrupts occurred. Use function + * qurt_anysignal_get to read the signals. IST must locally maintain a table that maps a signal to + * a specific interrupt. IST also checks if signal #SIG_INT_ABORT is received. If so, the IST + * must quit from interrupt receiving loop. + * + * For detail information on this API, see QuRT User Manual Section 4.2.5 + * + * Prototype + * + * unsigned int qurt_anysignal_wait(qurt_anysignal_t *int_signal, unsigned int mask) + */ + +/**@ingroup func_qurt_interrupt_acknowledge + Acknowledges an interrupt after it has been processed.\n + Re-enables an interrupt and clears its pending status. This is done after an interrupt is + processed by an IST. + + Interrupts are automatically disabled after they occur. To re-enable an interrupt, an IST + performs the acknowledge operation after it has finished processing the interrupt and + just before suspending itself (such as by waiting on the interrupt signal). + + @note1hang To prevent losing or reprocessing subsequent occurrences of the interrupt, + an IST must clear the interrupt signal (Section @xref{sec:anysignal_clear}) before + acknowledging the interrupt. + + @param[in] int_num Interrupt that is being re-enabled. + + @return + #QURT_EOK -- Interrupt acknowledge was successful. \n + #QURT_EDEREGISTERED -- Interrupt is already de-registered. + + @dependencies + None. +*/ +int qurt_interrupt_acknowledge(int int_num); + +/**@ingroup func_qurt_interrupt_deregister + Disables the specified interrupt and disassociates it from a QuRT signal object. + If the specified interrupt was never registered (Section @xref{sec:interrupt_register}), the deregister operation + returns the status value #QURT_EINT. + + @note1hang If an interrupt is deregistered while an IST waits + to receive it, the IST might wait indefinitely for the interrupt to occur. To avoid + this problem, the QuRT kernel sends the signal #SIG_INT_ABORT to awaken an + IST after determining that it has no interrupts registered. + + @param[in] int_num L2VIC to deregister; valid range is 0 to 1023. + + @return + #QURT_EOK -- Success.\n + #QURT_EINT -- Invalid interrupt number (not registered). + + @dependencies + None. + +*/ +unsigned int qurt_interrupt_deregister(int int_num); +/** @endcond */ + +/**@ingroup func_qurt_interrupt_disable + Disables an interrupt with its interrupt number.\n + The interrupt must be registered prior to calling this function. 
+ After qurt_interrupt_disable() returns, the Hexagon subsystem + can no longer send the corresponding interrupt to the Hexagon + core, until qurt_interrupt_enable() is called + for the same interrupt. + + Avoid calling qurt_interrupt_disable() and qurt_interrupt_enable() frequently within + a short period of time.\n + - A pending interrupt can already be in the Hexagon core when qurt_interrupt_disable() + is called. Therefore, some time later, the pending interrupt is received on a Hexagon + hardware thread.\n + - After the Hexagon subsystem sends an interrupt to the Hexagon core, the Hexagon + hardware automatically disables the interrupt until kernel software re-enables the interrupt + at the interrupt acknowledgement stage. If qurt_interrupt_enable() is called from a certain + thread at an ealier time, the interrupt is re-enabled earlier and can trigger + sending a new interrupt to the Hexagon core while kernel software is still processing + the previous interrupt. + + @param[in] int_num Interrupt number. + + @return + #QURT_EOK -- Interrupt successfully disabled.\n + #QURT_EINT -- Invalid interrupt number.\n + #QURT_ENOTALLOWED -- Interrupt is locked. \n + #QURT_EVAL -- Interrupt is not registered. + + @dependencies + None. +*/ + unsigned int qurt_interrupt_disable(int int_num); + + +/**@ingroup func_qurt_interrupt_enable + Enables an interrupt with its interrupt number.\n + The interrupt must be registered prior to calling this function. + + @param[in] int_num Interrupt number. + + @return + #QURT_EOK -- Interrupt successfully enabled.\n + #QURT_EINT -- Invalid interrupt number.\n + #QURT_ENOTALLOWED -- Interrupt is locked. \n + #QURT_EVAL -- Interrupt is not registered. + + @dependencies + None. + +*/ + unsigned int qurt_interrupt_enable(int int_num); + + +/**@ingroup func_qurt_interrupt_status + Returns a value that indicates the pending status of the specified interrupt. + + @param[in] int_num Interrupt number that is being checked. + @param[out] status Interrupt status; 1 indicates that an interrupt is + pending, 0 indicates that an interrupt is not pending. + + @return + #QURT_EOK -- Success. \n + #QURT_EINT -- Failure; invalid interrupt number. + + @dependencies + None. + */ +unsigned int qurt_interrupt_status(int int_num, int *status); + + +/**@ingroup func_qurt_interrupt_get_status + Gets the status of the specified interrupt in L2VIC. + + @param[in] int_num Interrupt number that is being checked. + @param[in] status_type 0 -- interrupt pending status \n + 1 -- interrupt enabling status + @param[out] status 0 -- OFF \n + 1 -- ON + + @return + #QURT_EOK -- Success. \n + #QURT_EINT -- Failure; invalid interrupt number. + + @dependencies + None. + */ +unsigned int qurt_interrupt_get_status(int int_num, int status_type, int *status); + +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_interrupt_clear + Clears the pending status of the specified interrupt. + + @note1hang This operation is intended for system-level use, and must be used with care. + + @param[in] int_num Interrupt that is being re-enabled. + + @return + #QURT_EOK -- Success.\n + #QURT_EINT -- Invalid interrupt number. + + @dependencies + None. + */ +unsigned int qurt_interrupt_clear(int int_num); + + +/**@ingroup func_qurt_interrupt_get_config + Gets the L2VIC interrupt configuration. \n + This function returns the type and polarity of the specified L2VIC interrupt. + + @param[in] int_num L2VIC interrupt that is being re-enabled. + @param[out] int_type Pointer to an interrupt type. 
\n + 0 -- Level-triggered interrupt \n + 1 -- Eedge-triggered interrupt + @param[out] int_polarity Pointer to interrupt polarity.\n + 0 -- Active-high interrupt \n + 1 -- Active-low interrupt. + + @return + #QURT_EOK -- Configuration successfully returned.\n + #QURT_EINT -- Invalid interrupt number. + + @dependencies + None. + */ +unsigned int qurt_interrupt_get_config(unsigned int int_num, unsigned int *int_type, unsigned int *int_polarity); + +/**@ingroup func_qurt_interrupt_set_config + Sets the type and polarity of the specified L2VIC interrupt. + + @note1hang Deregister L2VIC interrupts before reconfiguring them. + + @param[in] int_num L2VIC interrupt that is being re-enabled. + @param[in] int_type Interrupt type. \n + 0 -- Level-triggered interrupt\n + 1 -- Edge-triggered interrupt + @param[in] int_polarity Interrupt polarity. \n + 0 -- Active-high interrupt \n + 1 -- Active-low interrupt + + @return + #QURT_EOK -- Success. \n + #QURT_ENOTALLOWED -- Not allowed; the interrupt is being registered.\n + #QURT_EINT -- Invalid interrupt number. + + @dependencies + None. + */ +unsigned int qurt_interrupt_set_config(unsigned int int_num, unsigned int int_type, unsigned int int_polarity); + +/**@ingroup func_qurt_interrupt_set_config2 + Sets the type and polarity of the specified L2VIC interrupt. + + @note1hang L2VIC interrupts must be deregistered before they can be reconfigured. + + @param[in] int_num L2VIC interrupt that is being re-enabled. + @param[in] int_type Notified to the hardware configuration callback function and used to + modify the L2VIC type. Possible values: \n + - #QURT_INT_TRIGGER_USE_DEFAULT \n + - #QURT_INT_TRIGGER_LEVEL_HIGH \n + - #QURT_INT_TRIGGER_LEVEL_LOW \n + - #QURT_INT_TRIGGER_RISING_EDGE \n + - #QURT_INT_TRIGGER_FALLING_EDGE \n + - #QURT_INT_TRIGGER_DUAL_EDGE @tablebulletend + + @return + #QURT_EOK -- Success. \n + #QURT_ENOTALLOWED -- Not allowed; the interrupt is being registered.\n + #QURT_EINT -- Invalid interrupt number. + + @dependencies + None. + */ +unsigned int qurt_interrupt_set_config2(unsigned int int_num, unsigned int int_type); + +/**@ingroup func_ qurt_interrupt_set_config3 + Sets the specified configuration value for the specified property of the specified L2VIC interrupt. + + @note1hang L2VIC interrupts must be deregistered before they can be reconfigured for polarity. + + @param[in] int_num L2VIC interrupt to re-enable. + @param[in] config_id Property to configure: \n + - #QURT_INT_CONFIGID_POLARITY \n + - #QURT_INT_CONFIGID_LOCK @tablebulletend + @param[in] config_val Dependent on the second argument config_id, specifies the value to set. \n + Values for #QURT_INT_CONFIGID_POLARITY: \n + - #QURT_INT_TRIGGER_USE_DEFAULT \n + - #QURT_INT_TRIGGER_LEVEL_HIGH \n + - #QURT_INT_TRIGGER_LEVEL_LOW \n + - #QURT_INT_TRIGGER_RISING_EDGE \n + - #QURT_INT_TRIGGER_FALLING_EDGE \n + - #QURT_INT_TRIGGER_DUAL_EDGE \n + + Values for #QURT_INT_CONFIGID_LOCK: \n + - #QURT_INT_LOCK_ENABLE\n + - #QURT_INT_LOCK_DISABLE @tablebulletend + + @return + #QURT_EOK -- Success. \n + #QURT_ENOTALLOWED -- Not allowed; the interrupt is being registered or is locked for enable/disable.\n + #QURT_EINT -- Invalid interrupt number. + + @dependencies + None. +*/ +unsigned int qurt_interrupt_set_config3(unsigned int int_num, unsigned int config_id, unsigned int config_val); + + +/**@ingroup func_qurt_interrupt_raise + Raises the interrupt. \n + This function triggers a level-triggered L2VIC + interrupt, and accepts interrupt numbers in the range of 0 to 1023. 
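+
+  Usage sketch (illustrative only; MY_INT is an assumption):
+
+  @code
+  if (qurt_interrupt_raise(MY_INT) != QURT_EOK) {
+      // raising is not supported for this interrupt on this target
+  }
+  @endcode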
+ + @param[in] interrupt_num Interrupt number. + + @return + #QURT_EOK -- Success \n + -1 -- Failure; the interrupt is not supported. + + @dependencies + None. + */ +int qurt_interrupt_raise(unsigned int interrupt_num); + +/**@ingroup func_qurt_interrupt_raise2 + Raises the interrupt and returns the current pcycle value. + + @param[in] interrupt_num Interrupt number. + + @return + 0xFFFFFFFFFFFFFFFF -- Failure; the interrupt is not supported.\n + Other value -- pcycle count at the time the interrupt is raised. + + @dependencies + None. + */ +unsigned long long qurt_interrupt_raise2(unsigned int interrupt_num); +/** @endcond */ + +/** @cond internal_only */ +/**@ingroup func_qurt_isr_subcall + Indicates whether the current function is called from a callback procedure (either short or long). + + @return + #QURT_EOK -- TRUE \n + #QURT_EVAL -- FALSE. + + @dependencies + None. + */ +int qurt_isr_subcall(void); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_INT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_island.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_island.h new file mode 100755 index 0000000000000..f0c8ee27cf8b0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_island.h @@ -0,0 +1,122 @@ +#ifndef QURT_ISLAND_H +#define QURT_ISLAND_H + +/** + @file qurt_island.h + @brief Prototypes of power API + The APIs allow entering and exiting island mode where the memory + accesses are limited to local memory. + + EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2018-2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved. + +=============================================================================*/ + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_island_get_status + Gets Island mode status. + + Returns a value that indicates whether the QuRT system executes in Island mode. + + @return + 0 - Normal mode. \n + 1 - Island mode. + + @dependencies + None. +*/ +unsigned int qurt_island_get_status (void); + +/**@ingroup func_qurt_island_get_status2 + Gets Island mode status especially that differentiates between island partial exit and complete exit. + + Returns a value that indicates the current state. + + @note1hang Transition from NORMAL mode to ISLAND mode happens in single + threaded mode. Whereas transition from ISLAND mode to other modes + happen in multi-threaded mode. So, a thread that gets island mode + status as NORMAL can assume the same status till it continues to + run. A thread that gets island mode status as ISLAND should + assume that the status may change to EXITING or NORMAL while it + runs. A thread that gets island mode status as EXITING should + assume that the status may change to NORMAL while it runs. If + the thread goes to wait state in after reading the status, it should get + the island mode state again and not assume the previous state. + @note2hang This api returns more intrinsic states than qurt_island_get_status, + when qurt_island_get_status returns 0, this api could return + QURT_ISLAND_MODE_EXITING or QURT_ISLAND_MODE_ISLAND + + @param[in/out] data field is reserved for future use. If NULL pointer is passed, + the field will be ignored. If a valid pointer is passed, + QuRT will return back a bitmask which can be interpreted as follows: + data[31] - Valid bit. 
Set to 1 to indicate data[30:0] are valid.
+                 Otherwise set to 0.
+                 data[30:0] - Reserved for future definition.
+
+  @return
+  QURT_ISLAND_MODE_NORMAL - Normal mode \n
+  QURT_ISLAND_MODE_ISLAND - Island mode \n
+  QURT_ISLAND_MODE_EXITING - Exiting Island mode \n
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_island_get_status2 (unsigned int *data);
+
+
+
+/**@ingroup func_qurt_island_get_exit_status
+  Gets the reason for the last Island mode exit.
+
+  @param[out] cause_code  Pointer that returns the cause code of the last
+                          island exit reason. \n
+              - #QURT_EISLANDUSEREXIT -- Island exit due to a user call for island exit.\n
+              - #QURT_ENOISLANDENTRY -- API called before exiting island. \n
+              - #QURT_EISLANDINVALIDINT -- Island exit due to an invalid interrupt in Island mode. @tablebulletend
+
+  @param[out] int_num  Pointer that holds the invalid interrupt number that caused
+                       the island exit when the cause code is #QURT_EISLANDINVALIDINT.
+                       For other cases, it is -1.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_island_get_exit_status(unsigned int *cause_code, int *int_num);
+
+/**@ingroup func_qurt_island_get_enter_timestamp
+  Gets the most recent timestamp at which the system exited STM during island entry.
+
+  @param[out] island_enter_timestamp  Returns a pointer to the most recent timestamp
+              recorded after the system exited STM during island entry. If the system never
+              attempted to enter island, the island_enter_timestamp return pointer holds a value
+              of zero.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_island_get_enter_timestamp(unsigned long long *island_enter_timestamp);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ISLAND_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_isr.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_isr.h
new file mode 100755
index 0000000000000..db29ea2f265d7
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_isr.h
@@ -0,0 +1,177 @@
+#ifndef QURT_ISR_H
+#define QURT_ISR_H
+
+/*=====================================================================
+
+  @file qurt_isr.h
+
+  @brief Prototypes of QuRT ISR API functions
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2017, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ Functions
+=============================================================================*/
+
+
+/**@ingroup func_qurt_isr_set_hw_config_callback
+  Sets the callback function for the configuration related to interrupt hardware.
+  In a process, the callback function can only be set once.
+
+  @param[in] cb_addr  Address of the callback function.
+
+  @return
+  #QURT_EOK -- The callback function is set successfully. \n
+  #QURT_EFAILED -- Failure. The callback function has been set before.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_set_hw_config_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_set_hw_enable_callback
+  Sets the callback function for enabling the configuration related to interrupt hardware.
+  In a process, the callback function can only be set once.
+
+  @param[in] cb_addr  Address of the callback function.
+
+  @return
+  #QURT_EOK -- The callback function is set successfully. \n
+  #QURT_EFAILED -- Failure. The callback function has been set before.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_set_hw_enable_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_set_hw_disable_callback
+  Sets the callback function for disabling the configuration related to interrupt hardware.
+  In a process, the callback function can only be set once.
+
+  @param[in] cb_addr  Address of the callback function.
+
+  @return
+  #QURT_EOK -- The callback function is set successfully. \n
+  #QURT_EFAILED -- Failure. The callback function has been set before.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_set_hw_disable_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_create
+  Creates an ISR thread with the specified attributes, and makes it executable.
+
+  @datatypes
+  #qurt_thread_t \n
+  #qurt_thread_attr_t
+
+  @param[out] thread_id  Returns a pointer to the thread identifier if the thread was
+                         successfully created.
+  @param[in]  pAttr      Pointer to the initialized thread attribute structure that specifies
+                         the attributes of the created thread.
+
+  @return
+  #QURT_EVAL -- Invalid arguments. \n
+  #QURT_EOK -- Thread created. \n
+  #QURT_EFAILED -- Thread not created.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_create (qurt_thread_t *thread_id, qurt_thread_attr_t *pAttr);
+
+/**@ingroup func_qurt_isr_register2
+  Registers an interrupt service routine (ISR) callback with the specified attributes to an ISR thread.
+  The interrupt is enabled when this function returns success.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] isr_thread_id  ISR thread ID, returned from qurt_isr_create().
+  @param[in] int_num        The interrupt number.
+  @param[in] prio           Priority of the ISR.
+  @param[in] flags          Defines the ACK type. Values: \n
+                            QURT_INT_NON_DELAYED_ACK - ISR is acknowledged by the interrupt handling routine
+                            in the kernel. \n
+                            QURT_INT_DELAYED_ACK - Client chooses to acknowledge.
+  @param[in] int_type       Interrupt trigger type, notified to the registered function. Values: \n
+                            - QURT_INT_TRIGGER_USE_DEFAULT
+                            - QURT_INT_TRIGGER_LEVEL_HIGH
+                            - QURT_INT_TRIGGER_LEVEL_LOW
+                            - QURT_INT_TRIGGER_RISING_EDGE
+                            - QURT_INT_TRIGGER_FALLING_EDGE
+                            - QURT_INT_TRIGGER_DUAL_EDGE
+  @param[in] isr            Interrupt service routine with prototype void isr (void *arg, int int_num).
+  @param[in] arg            First argument of the ISR when it is called to service the interrupt.
+
+  @return
+  QURT_EOK -- Successfully registered the ISR for the interrupt. \n
+  QURT_EINT -- Interrupt not configured. \n
+  QURT_EINVALID -- Invalid thread ID. \n
+  QURT_EDISABLED -- The feature is disabled. \n
+  QURT_EDUPLICATE -- Interrupt is already registered.
+
+  @dependencies
+  The thread ID should be created using qurt_isr_create().
+ */
+int qurt_isr_register2 (qurt_thread_t isr_thread_id, int int_num, unsigned short prio, unsigned short flags, unsigned int int_type, void (*isr) (void *, int), void *arg);
+
+/**@ingroup func_qurt_isr_deregister2
+  Deregisters the ISR for the specified interrupt.
+  The interrupt is disabled when this function returns success.
+
+  @param[in] int_num  The interrupt number.
+
+  @return
+  QURT_EOK -- ISR deregistered successfully. \n
+  QURT_ENOREGISTERED -- Interrupt with int_num is not registered.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_deregister2 (int int_num);
+
+/**@ingroup func_qurt_isr_delete
+  Makes the ISR thread exit and releases its kernel resources.
+
+  @note1hang The ISR thread must not be actively processing interrupts,
+             otherwise the call fails and returns an error.
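+
+  Lifecycle sketch (illustrative only; MY_INT, my_isr, and the default
+  attribute setup are assumptions):
+
+  @code
+  qurt_thread_t isr_tid;
+  qurt_thread_attr_t attr;
+  qurt_thread_attr_init(&attr);
+  if (qurt_isr_create(&isr_tid, &attr) == QURT_EOK) {
+      (void)qurt_isr_register2(isr_tid, MY_INT, 100, QURT_INT_NON_DELAYED_ACK,
+                               QURT_INT_TRIGGER_USE_DEFAULT, my_isr, NULL);
+      // ... interrupts are serviced by my_isr() here ...
+      (void)qurt_isr_deregister2(MY_INT);
+      (void)qurt_isr_delete(isr_tid);
+  }
+  @endcode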
+ + @param[in] thread-id of the ISR thread that needs to be deleted. + + @return + QURT_ENOTALLOWED -- ISR thread is processing an interrupt + QURT_EINVALID -- Invalid ISR thread ID + QURT_EOK -- Success + + @dependencies + Thread ID should be created using qurt_isr_create() + */ +int qurt_isr_delete (qurt_thread_t isr_tid); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ISR_H */ + + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_l2cfg.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_l2cfg.h new file mode 100755 index 0000000000000..7e26b30a580d9 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_l2cfg.h @@ -0,0 +1,98 @@ +#ifndef QURT_L2CFG_H +#define QURT_L2CFG_H +/** + @file qurt_l2cfg.h + @brief QuRT APIs for L2 configuration and system configuration + +EXTERNAL FUNCTIONS + qurt_l2cfg_set + qurt_l2cfg_get + qurt_system_config_get + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + +/* Definition for system configuration */ +/** @addtogroup l2cfg_macros +@{ */ +#define QURT_CORE_CFG_HMX_INT8_SPATIAL 0x78 /**< HMX fixed-point spatial size */ +#define QURT_CORE_CFG_HMX_INT8_DEPTH 0x7C /**< HMX fixed-point output depth */ +/** @} */ /* end_addtogroup l2cfg_macros */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_l2cfg_set + Sets the value of a L2 configuration register. A register can be set *IFF* its + initial value is configured. + + @param[in] offset Offset of L2 configuration register; must be multiple of 4. + @param[in] value Value to set the register to. + + @return + #QURT_EOK -- Success. \n + #QURT_EFAILED -- Internal mapping that covers L2CFG register file absent; likely + a configuration problem. \n + #QURT_EINVALID -- Argument error. \n + #QURT_ENOTALLOWED -- Setting this register is prohibited. + + @dependencies + None. + */ +int qurt_l2cfg_set (unsigned short offset, unsigned int value); + +/**@ingroup func_qurt_l2cfg_get + Gets the value of a L2 configuration register. + + @param[in] offset Offset of L2 configuration register; must be multiple of 4. + @param[out] value Pointer to value of the register. + + @return + #QURT_EOK -- Success. \n + #QURT_EFAILED -- Internal mapping that covers L2CFG register file absent; + likely a configuration problem. \n + #QURT_EINVALID -- Argument error. + + @dependencies + None. + + */ +int qurt_l2cfg_get (unsigned short offset, unsigned int * value); + + +/**@ingroup func_qurt_system_config_get + Gets the system configuration information. + + @param[in] index Index to system configuration. Values:\n + - #QURT_CORE_CFG_HMX_INT8_SPATIAL \n + - #QURT_CORE_CFG_HMX_INT8_DEPTH @tablebulletend + + @param[out] data Pointer to a word for returned data. + + @return + #QURT_EOK -- Get the configuration data successful. \n + Other values -- Failure (no such configuration available). + + @dependencies + None. 
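+
+  Usage sketch (illustrative only):
+
+  @code
+  unsigned int spatial = 0;
+  if (qurt_system_config_get(QURT_CORE_CFG_HMX_INT8_SPATIAL, &spatial) == QURT_EOK) {
+      // 'spatial' holds the HMX fixed-point spatial size
+  }
+  @endcode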
+ + */ +int qurt_system_config_get(int index, unsigned int *data); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_L2CFG_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_lifo.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_lifo.h new file mode 100755 index 0000000000000..dc399fccc5f0f --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_lifo.h @@ -0,0 +1,71 @@ +#ifndef QURT_LIFO_H +#define QURT_LIFO_H +/** + @file qurt_lifo.h + + @brief + Provide lock free LastInFirstOut algorithm, which can be used in a + variety of situations for allocation/free fixed size buffer + This implementation touches the first word of your FREED buffer. Even + though it does not matter how you use it when it is allocated, you might want + to be a bit careful not to put your MAGIC number as the first field. + Because it will not hold the magic value for "freed" + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + /*===================================================================== + Functions + ======================================================================*/ + +/*======================================================================*/ +/** + Pops an element out of the LIFO. + + @param[in] freelist Pointer to the head of your list. + + @return + Top object from the list + + @dependencies + None. +*/ +/* ======================================================================*/ +void * qurt_lifo_pop(void *freelist); + + +/*======================================================================*/ +/** + Pushes an element into the LIFO. + + @param[in] freelist Pointer to the head of your list. + @param[in] buf Pointer to your buffer to push into the list. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_lifo_push(void *freelist, void *buf); + +void qurt_lifo_remove(void *freelist, void *buf); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_LIFO_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mailbox.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mailbox.h new file mode 100755 index 0000000000000..a6cd91c611782 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mailbox.h @@ -0,0 +1,176 @@ +#ifndef QURT_MAILBOX_H +#define QURT_MAILBOX_H + +/** + @file qurt_mailbox.h + @brief Definitions, macros, and prototypes used for QuRT mailbox + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2015, 2021-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/* Definitions on typedef and return values */ + +#define QURT_MAILBOX_ID_NULL 0 +#define QURT_MAILBOX_ERROR -1 +#define QURT_MAILBOX_ID_ERROR -2 +#define QURT_MAILBOX_NON_VALID_DATA -3 +#define QURT_MAILBOX_FULL -4 +#define QURT_MAILBOX_DELETED -5 +#define QURT_MAILBOX_RECEIVE_HALTED -6 +#define QURT_MAILBOX_BANDWIDTH_LIMIT -7 + + +/*============================================================================= + FORWARD DECLARATIONS & TYPEDEFS +=============================================================================*/ + +#define QURT_MAILBOX_AT_QURTOS 0U // Receiver is QurtOS +#define QURT_MAILBOX_AT_ROOTPD 1U // Receiver is RootPD (ASID=0) +#define QURT_MAILBOX_AT_USERPD 2U // Receiver is User PD (ASID!=0) +#define QURT_MAILBOX_AT_SECUREPD 3U // Receiver is Secure PD + +typedef unsigned char qurt_mailbox_receiver_cfg_t; + +#define QURT_MAILBOX_SEND_OVERWRITE 0U // When there is already valid content, overwrite it +#define QURT_MAILBOX_SEND_NON_OVERWRITE 1U // When there is already valid content, return failure + +typedef unsigned char qurt_mailbox_send_option_t; + + +#define QURT_MAILBOX_RECV_WAITING 0U // When there is no valid content, wait for it +#define QURT_MAILBOX_RECV_NON_WAITING 1U // When there is no valid content, return failure immediately +#define QURT_MAILBOX_RECV_PEEK_NON_WAITING 2U // Read the content, but doesn't remove it from the mailbox. No waiting. + +typedef unsigned char qurt_mailbox_recv_option_t; + + +/*============================================================================= + EXTERNS & FUNCTIONS +=============================================================================*/ +/* Function prototype */ + +/**@ingroup qurt_mailbox_create + Creates a QuRT mailbox. + + @param name Mailbox name up to 8 characters. + @param recv_opt Configuration on the receiver process. + + @return + Mailbox ID -- Mailbox Identifier \n + #QURT_MAILBOX_ID_NULL -- NULL, failure at creating mailbox + + @dependencies + None. +*/ +unsigned long long qurt_mailbox_create(char *name, qurt_mailbox_receiver_cfg_t recv_opt); + + +/**@ingroup qurt_mailbox_get_id + Gets a QuRT mailbox identifier. + + @param name Mailbox name up to 8 characters. + + @return + Mailbox ID -- Mailbox identifier \n + #QURT_MAILBOX_ID_NULL -- NULL, failure at getting mailbox ID + + @dependencies + None. +*/ +unsigned long long qurt_mailbox_get_id(char *name); + + +/**@ingroup qurt_mailbox_send + Sends data to a QuRT mailbox. + + @param mailbox_id Mailbox identifier. + @param send_opt Option for mailbox send. + @param data Data to send. + + + @return + #QURT_EOK Success \n + #QURT_MAILBOX_ID_ERROR Mailbox ID error.\n + #QURT_MAILBOX_ERROR Other errors.\n + #QURT_MAILBOX_FULL Valid data already exists, non-overwriting.\n + #QURT_MAILBOX_BANDWIDTH_LIMIT Reached the bandwidth limitation. + + @dependencies + None. 
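+
+  Usage sketch (illustrative only; the mailbox name "mbx0" and the receiver
+  configuration are assumptions):
+
+  @code
+  unsigned long long id = qurt_mailbox_create("mbx0", QURT_MAILBOX_AT_ROOTPD);
+  if (id != QURT_MAILBOX_ID_NULL) {
+      (void)qurt_mailbox_send(id, QURT_MAILBOX_SEND_NON_OVERWRITE, 0x1234ULL);
+
+      unsigned long long data = 0;
+      (void)qurt_mailbox_receive(id, QURT_MAILBOX_RECV_WAITING, &data);
+      (void)qurt_mailbox_delete(id);
+  }
+  @endcode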
+*/
+int qurt_mailbox_send(unsigned long long mailbox_id, qurt_mailbox_send_option_t send_opt, unsigned long long data);
+
+
+/**@ingroup qurt_mailbox_receive
+   Receives data from a QuRT mailbox.
+
+   @param mailbox_id   Mailbox identifier.
+   @param recv_opt     Option for mailbox receive.
+   @param data         Pointer to the data buffer for receiving.
+
+   @return
+   #QURT_EOK                      Success. \n
+   #QURT_MAILBOX_ID_ERROR         Mailbox ID error. \n
+   #QURT_MAILBOX_ERROR            Other errors. \n
+   #QURT_MAILBOX_NON_VALID_DATA   No current valid data; the previous content is put in the buffer. \n
+   #QURT_MAILBOX_RECEIVE_HALTED   Receive halted; the waiting thread is woken up. \n
+   #QURT_MAILBOX_DELETED          Mailbox is deleted, and the waiting thread is woken up.
+
+   @dependencies
+   None.
+*/
+int qurt_mailbox_receive(unsigned long long mailbox_id, qurt_mailbox_recv_option_t recv_opt, unsigned long long *data);
+
+
+/**@ingroup qurt_mailbox_delete
+   Deletes a QuRT mailbox.
+
+   A mailbox can only be deleted from the process that created the mailbox.
+
+   @param mailbox_id   Mailbox identifier.
+
+   @return
+   #QURT_EOK                Success. \n
+   #QURT_MAILBOX_ID_ERROR   Mailbox ID error. \n
+   #QURT_MAILBOX_ERROR      Other errors.
+
+   @dependencies
+   None.
+*/
+int qurt_mailbox_delete(unsigned long long mailbox_id);
+
+
+/**@ingroup qurt_mailbox_receive_halt
+   Halts receiving on a QuRT mailbox and wakes up waiting threads.
+
+   @param mailbox_id   Mailbox identifier.
+
+   @return
+   #QURT_EOK                Success. \n
+   #QURT_MAILBOX_ID_ERROR   Mailbox ID error. \n
+   #QURT_MAILBOX_ERROR      Other errors.
+
+   @dependencies
+   None.
+*/
+int qurt_mailbox_receive_halt(unsigned long long mailbox_id);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif // QURT_MAILBOX_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_memory.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_memory.h
new file mode 100755
index 0000000000000..90ce2586fec50
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_memory.h
@@ -0,0 +1,1487 @@
+#ifndef QURT_MEMORY_H
+#define QURT_MEMORY_H
+/**
+   @file qurt_memory.h
+   @brief Prototypes of kernel memory API functions.
+
+   EXTERNALIZED FUNCTIONS
+   none
+
+   INITIALIZATION AND SEQUENCING REQUIREMENTS
+   none
+
+   Copyright (c) Qualcomm Technologies, Inc.
+   All Rights Reserved.
+   Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+
+#include 
+#include 
+//#include 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup memory_management_macros
+@{ */
+#define QURT_SYSTEM_ALLOC_VIRTUAL 1 /**< Allocates available virtual memory in the address space of all
+                                         processes.*/
+/** @} */ /* end_addtogroup memory_management_macros */
+/**@cond rest_reg_dist */
+/** @addtogroup memory_management_types
+@{ */
+/** @xreflabel{hdr:qurt_mem_default_pool} */
+extern qurt_mem_pool_t qurt_mem_default_pool __attribute__((section(".data"))); /**< Memory pool object.*/
+/** @} */ /* end_addtogroup memory_management_types */
+
+/** @cond rest_reg_dist */
+/** Mapping attribute information. */
+typedef struct {
+    qurt_paddr_64_t paddr;
+    qurt_size_t size;
+    qurt_mem_cache_mode_t cache_mode;
+    qurt_perm_t perms;
+} qurt_mapping_attr_t;
+/** @endcond */
+/** @} */ /* end_addtogroup mapping_attribute_types*/
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_mem_cache_clean
+   Performs a cache clean operation on the data stored in the specified memory area.
+   Performs a syncht on all the data cache operations when the Hexagon processor version is V60 or greater.
+
+   @note1hang Perform the flush all operation only on the data cache.
+
+   @note1cont This operation flushes and invalidates the contents of all cache lines from start address
+              to end address (start address + size). The contents of the adjoining buffer can be
+              flushed and invalidated if they fall in any of the cache lines.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_size_t \n
+   #qurt_mem_cache_op_t \n
+   #qurt_mem_cache_type_t
+
+   @param[in] addr     Address of data to flush.
+   @param[in] size     Size (in bytes) of data to flush.
+   @param[in] opcode   Type of cache clean operation. Values:
+                       - #QURT_MEM_CACHE_FLUSH
+                       - #QURT_MEM_CACHE_INVALIDATE
+                       - #QURT_MEM_CACHE_FLUSH_INVALIDATE
+                       - #QURT_MEM_CACHE_FLUSH_ALL\n
+                       @note1 #QURT_MEM_CACHE_FLUSH_ALL is valid only when the type is #QURT_MEM_DCACHE @tablebulletend
+   @param[in] type     Cache type. Values:
+                       - #QURT_MEM_ICACHE
+                       - #QURT_MEM_DCACHE @tablebulletend
+
+   @return
+   #QURT_EOK -- Cache operation performed successfully.\n
+   #QURT_EVAL -- Invalid cache type.\n
+
+   @dependencies
+   None.
+*/
+int qurt_mem_cache_clean(qurt_addr_t addr, qurt_size_t size, qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type);
+
+/**@ingroup func_qurt_mem_cache_clean2
+   Performs a data cache clean operation on the data stored in the specified memory area.
+
+   This API only performs the following data cache operations:\n
+   - #QURT_MEM_CACHE_FLUSH\n
+   - #QURT_MEM_CACHE_INVALIDATE\n
+   - #QURT_MEM_CACHE_FLUSH_INVALIDATE -- flushes/invalidates the contents of all cache lines from start address
+     to end address (start address + size). The contents of the adjoining buffer can be
+     flushed/invalidated if they fall in any of the cache lines.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_size_t \n
+   #qurt_mem_cache_op_t \n
+   #qurt_mem_cache_type_t
+
+   @param[in] addr     Address of data to flush.
+   @param[in] size     Size (in bytes) of data to flush.
+   @param[in] opcode   Type of cache clean operation. Values:\n #QURT_MEM_CACHE_FLUSH\n #QURT_MEM_CACHE_INVALIDATE\n
+                       #QURT_MEM_CACHE_FLUSH_INVALIDATE
+   @param[in] type     Cache type. Values: \n #QURT_MEM_DCACHE
+
+   @return
+   #QURT_EOK -- Cache operation performed successfully.\n
+   #QURT_EVAL -- Invalid cache type.
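+
+   A minimal usage sketch (illustrative only; the buffer and its use by a
+   hypothetical DMA peer are assumptions). Flush a buffer after the CPU
+   writes it so that another bus master observes the data:
+   @code
+   static char dma_buf[256];   // hypothetical shared buffer
+   // ... CPU fills dma_buf ...
+   if (qurt_mem_cache_clean2((qurt_addr_t)dma_buf, sizeof(dma_buf),
+                             QURT_MEM_CACHE_FLUSH, QURT_MEM_DCACHE) != QURT_EOK) {
+       // QURT_EVAL: an invalid cache type was passed.
+   }
+   @endcode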
+
+   @dependencies
+   None.
+*/
+int qurt_mem_cache_clean2(qurt_addr_t addr, qurt_size_t size, qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type);
+
+/**@ingroup func_qurt_mem_cache_phys_clean
+   Performs a cache clean operation on the data stored in the specified memory area based on address match and mask.
+   Operates on a cache line when (LINE.PhysicalPageNumber & mask) == addrmatch.
+
+   @note1hang The addrmatch value should be the upper 24-bit physical address to match against.
+
+   @datatypes
+   #qurt_mem_cache_op_t \n
+
+   @param[in] mask        24-bit address mask.
+   @param[in] addrmatch   Physical page number (24 bits) of memory to use as an address match.
+   @param[in] opcode      Type of cache clean operation. Values:
+                          - #QURT_MEM_CACHE_FLUSH
+                          - #QURT_MEM_CACHE_INVALIDATE @tablebulletend
+
+   @return
+   #QURT_EOK -- Cache operation performed successfully.\n
+   #QURT_EVAL -- Invalid operation.
+
+   @dependencies
+   None.
+*/
+
+int qurt_mem_cache_phys_clean(unsigned int mask, unsigned int addrmatch, qurt_mem_cache_op_t opcode);
+
+/**@ingroup func_qurt_mem_l2cache_line_lock
+   Performs an L2 cache line locking operation. This function locks selective lines in the L2 cache memory.
+
+   @note1hang Perform the line lock operation only on a 32-byte aligned size and address.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_size_t
+
+   @param[in] addr   Address of the L2 cache memory line to lock; the address must be 32-byte aligned.
+   @param[in] size   Size (in bytes) of L2 cache memory to line lock; size must be a multiple of 32 bytes.
+
+   @return
+   #QURT_EOK -- Success.\n
+   #QURT_EALIGN -- Data alignment or address failure. \n
+   #QURT_EINVALID -- Improper addr and size passed (for example, integer overflow due to addr + size). \n
+   #QURT_EFAILED -- Failed to lock the cache line because all the ways were locked for the corresponding set of an
+                    address in the range of addr and addr+size, or the address range is not L2 cacheable.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_l2cache_line_lock(qurt_addr_t addr, qurt_size_t size);
+
+/**@ingroup func_qurt_mem_l2cache_line_unlock
+   Performs an L2 cache line unlocking operation. This function unlocks selective lines in the L2 cache memory.
+
+   @note1hang Perform the line unlock operation only on a 32-byte aligned size and address.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_size_t
+
+   @param[in] addr   Address of the L2 cache memory line to unlock; the address must be 32-byte aligned.
+   @param[in] size   Size (in bytes) of the L2 cache memory line to unlock; size must be a multiple of 32 bytes.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EALIGN -- Data alignment or address failure. \n
+   #QURT_EFAILED -- Operation failed, cannot find the matching tag.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_l2cache_line_unlock(qurt_addr_t addr, qurt_size_t size);
+
+/**@ingroup func_qurt_mem_region_attr_init
+   @xreflabel{sec:qurt_mem_region_attr_init}
+   Initializes the specified memory region attribute structure with default attribute values: \n
+   - Mapping -- #QURT_MEM_MAPPING_VIRTUAL \n
+   - Cache mode -- #QURT_MEM_CACHE_WRITEBACK \n
+   - Physical address -- -1 \n
+   - Virtual address -- -1 \n
+   - Memory type -- #QURT_MEM_REGION_LOCAL \n
+   - Size -- -1
+
+   @note1hang The memory physical address attribute must be explicitly set by calling the
+              qurt_mem_region_attr_set_physaddr() function. The size and pool attributes are set directly
+              as parameters in the memory region create operation.
+
+   @datatypes
+   #qurt_mem_region_attr_t
+
+   @param[in,out] attr   Pointer to the destination structure for the memory region attributes.
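+
+   A minimal usage sketch (illustrative only; the 64 KB size and writeback
+   cache mode are arbitrary choices for the example):
+   @code
+   qurt_mem_region_attr_t attr;
+   qurt_mem_region_t region;
+   qurt_mem_region_attr_init(&attr);   // start from defaults
+   qurt_mem_region_attr_set_cache_mode(&attr, QURT_MEM_CACHE_WRITEBACK);
+   if (qurt_mem_region_create(&region, 0x10000, qurt_mem_default_pool,
+                              &attr) != QURT_EOK) {
+       // QURT_EMEM or QURT_EINVALID: out of memory or bad attributes.
+   }
+   @endcode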
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_mem_region_attr_init(qurt_mem_region_attr_t *attr);
+
+/**@ingroup func_qurt_mem_pool_attach
+   Initializes a memory pool object to attach to a pool predefined in the system
+   configuration file.
+
+   Memory pool objects assign memory regions to physical memory in different
+   Hexagon memory units. They are specified in memory region create operations
+   (Section @xref{sec:mem_region_create}).
+
+   @note1hang QuRT predefines the memory pool object #qurt_mem_default_pool
+              (Section @xref{dox:mem_management}) for allocating memory regions in SMI memory. The pool attach
+              operation is necessary only when allocating memory regions in nonstandard
+              memory units such as TCM.
+
+   @datatypes
+   #qurt_mem_pool_t
+
+   @param[in]  name   Pointer to the memory pool name.
+   @param[out] pool   Pointer to the memory pool object.
+
+   @return
+   #QURT_EOK -- Attach operation successful.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_pool_attach(char *name, qurt_mem_pool_t *pool);
+
+/**@ingroup func_qurt_mem_pool_attach2
+   Gets the identifier that corresponds to a pool object created specifically for a client, for example, HLOS_PHYSPOOL.
+   The client_handle is used to look up the client-specific pool.
+
+   Memory pool objects assign memory regions to physical memory in different
+   Hexagon memory units. Memory pool objects are specified during mapping creation operations
+   (qurt_mem_mmap() and qurt_mem_region_create()).
+
+   @note1hang QuRT predefines the memory pool object #qurt_mem_default_pool
+              (Section @xref{dox:mem_management}) for allocating memory regions in SMI memory. The pool_attach2
+              operation is necessary only when allocating memory regions in memory units specific to the client.
+
+   @datatypes
+   #qurt_mem_pool_t
+
+   @param[in]  client_handle   Client identifier used by the OS to look up the identifier
+                               for the client-specific pool.
+   @param[in]  name            Pointer to the memory pool name.
+   @param[out] pool            Pointer to the memory pool object.
+
+   @return
+   #QURT_EOK -- Attach operation successful.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_pool_attach2(int client_handle, char *name, qurt_mem_pool_t *pool);
+
+/**@ingroup func_qurt_mem_pool_create
+   @xreflabel{hdr:qurt_mem_pool_create}
+   Dynamically creates a memory pool object from a physical address range.
+
+   The pool is assigned a single memory region with the specified base address and size.
+
+   The base address and size values passed to this function must be aligned to 4K byte
+   boundaries, and must be expressed as the actual base address and size values divided by 4K.
+
+   For example, the function call:
+   @code
+   qurt_mem_pool_create ("TCM_PHYSPOOL", 0xd8020, 0x20, &pool)
+   @endcode
+   ... is equivalent to the following static pool definition in the QuRT system configuration file:
+   @code
+   <physical_pool name="TCM_PHYSPOOL">
+       <region base="0xd8020000" size="0x20000" />
+   </physical_pool>
+   @endcode
+
+   @cond rest_dist For more information on the system configuration file, see @xhyperref{80VB41979,80-VB419-79}. @endcond
+
+   @note1hang Dynamically created pools are not identical to static pools. In particular,
+              qurt_mem_pool_attr_get() is not valid with dynamically created pools.
+
+   @note1cont Dynamic pool creation permanently consumes system resources, and cannot be undone.
+
+   @datatypes
+   #qurt_mem_pool_t
+
+   @param[in]  name   Pointer to the memory pool name.
+   @param[in]  base   Base address of the memory region (divided by 4K).
+   @param[in]  size   Size (in bytes) of the memory region (divided by 4K).
+   @param[out] pool   Pointer to the memory pool object.
+
+   @return
+   #QURT_EOK -- Success.
+ + @dependencies + None. +*/ +int qurt_mem_pool_create(char *name, unsigned base, unsigned size, qurt_mem_pool_t *pool); + +/**@ingroup func_qurt_mem_pool_add_pages + Adds a physical address range to the specified memory pool object.\n + + @note1hang Call this operation only with root privileges (guest OS mode). + + @datatypes + #qurt_mem_pool_t + + @param[in] pool Memory pool object. + @param[in] first_pageno First page number of the physical address range (equivalent to address >> 12) + @param[in] size_in_pages Number of pages in the physical address range (equivalent to size >> 12) + + @return + #QURT_EOK -- Pages successfully added. + + @dependencies + None. +*/ +int qurt_mem_pool_add_pages(qurt_mem_pool_t pool, + unsigned first_pageno, + unsigned size_in_pages); + +/**@ingroup func_qurt_mem_pool_remove_pages + Removes a physical address range from the specified memory pool object. + + If any part of the address range is in use, this operation returns an + error without changing the state. + + @note1hang Call this operation only with root privileges (guest-OS mode). + + @note1cont In the future, this operation will support (via the flags parameter) the + removal of a physical address range when part of the range is in use. + + @datatypes + #qurt_mem_pool_t + + @param[in] pool Memory pool object. + @param[in] first_pageno First page number of the physical address range (equivalent to address >> 12) + @param[in] size_in_pages Number of pages in the physical address range (equivalent to size >> 12) + @param[in] flags Remove options. Values: \n + - 0 -- Skip holes in the range that are not part of the pool (default) \n + - #QURT_POOL_REMOVE_ALL_OR_NONE -- Pages are removed only if the specified + physical address range is entirely contained (with no holes) in the + pool free space. @tablebulletend + @param[in] callback Callback procedure called when pages were successfully removed. + Not called if the operation failed. Passing 0 as the parameter + value causes the callback to not be called. + @param[in] arg Value passed as an argument to the callback procedure. + + @return + #QURT_EOK -- Pages successfully removed. + + @dependencies + None. +*/ +int qurt_mem_pool_remove_pages(qurt_mem_pool_t pool, + unsigned first_pageno, + unsigned size_in_pages, + unsigned flags, + void (*callback)(void *), + void *arg); +/**@ingroup memory_management_types*/ +#define QURT_POOL_REMOVE_ALL_OR_NONE 1 /**< */ + +/**@ingroup func_qurt_mem_pool_attr_get + Gets the memory pool attributes. \n + Retrieves pool configurations based on the pool handle, and fills in + the attribute structure with configuration values. + + @datatypes + #qurt_mem_pool_t \n + #qurt_mem_pool_attr_t + + @param[in] pool Pool handle obtained from qurt_mem_pool_attach(). + @param[out] attr Pointer to the memory region attribute structure. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Corrupt handle; pool handle is invalid. +*/ +int qurt_mem_pool_attr_get (qurt_mem_pool_t pool, qurt_mem_pool_attr_t *attr); + +/**@ingroup func_qurt_mem_pool_attr_get_size + Gets the size of the specified memory pool range. + + @datatypes + #qurt_mem_pool_attr_t \n + #qurt_size_t + + @param[in] attr Pointer to the memory pool attribute structure. + @param[in] range_id Memory pool range key. + @param[out] size Pointer to the destination variable for the range size. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Range is invalid. + + @dependencies + None. 
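+
+   A minimal usage sketch (illustrative only; the pool name "DEFAULT_PHYSPOOL"
+   and range index 0 are assumptions):
+   @code
+   qurt_mem_pool_t pool;
+   qurt_mem_pool_attr_t pattr;
+   qurt_size_t range_size = 0;
+   if (qurt_mem_pool_attach("DEFAULT_PHYSPOOL", &pool) == QURT_EOK &&
+       qurt_mem_pool_attr_get(pool, &pattr) == 0) {
+       (void)qurt_mem_pool_attr_get_size(&pattr, 0, &range_size);
+   }
+   @endcode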
+*/
+static inline int qurt_mem_pool_attr_get_size (qurt_mem_pool_attr_t *attr, int range_id, qurt_size_t *size){
+    if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+        (*size) = 0;
+        return QURT_EINVALID;
+    }
+    else {
+        (*size) = attr->ranges[range_id].size;
+    }
+    return QURT_EOK;
+}
+
+/**@ingroup func_qurt_mem_pool_attr_get_addr
+   Gets the start address of the specified memory pool range.
+
+   @datatypes
+   #qurt_mem_pool_attr_t \n
+   #qurt_addr_t
+
+   @param[in]  attr       Pointer to the memory pool attribute structure.
+   @param[in]  range_id   Memory pool range key.
+   @param[out] addr       Pointer to the destination variable for the range start address.
+
+   @return
+   0 -- Success. \n
+   #QURT_EINVALID -- Range is invalid.
+
+   @dependencies
+   None.
+*/
+static inline int qurt_mem_pool_attr_get_addr (qurt_mem_pool_attr_t *attr, int range_id, qurt_addr_t *addr){
+    if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+        (*addr) = 0;
+        return QURT_EINVALID;
+    }
+    else {
+        (*addr) = (attr->ranges[range_id].start)<<12;
+    }
+    return QURT_EOK;
+}
+
+/**@ingroup func_qurt_mem_pool_attr_get_addr_64
+   Gets the 64-bit start address of the specified memory pool range.
+
+   @datatypes
+   #qurt_mem_pool_attr_t \n
+   #qurt_addr_64_t
+
+   @param[in]  attr       Pointer to the memory pool attribute structure.
+   @param[in]  range_id   Memory pool range key.
+   @param[out] addr       Pointer to the destination variable for the range start address.
+
+   @return
+   0 -- Success. \n
+   #QURT_EINVALID -- Range is invalid.
+
+   @dependencies
+   None.
+*/
+static inline int qurt_mem_pool_attr_get_addr_64 (qurt_mem_pool_attr_t *attr, int range_id, qurt_addr_64_t *addr){
+    if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+        (*addr) = 0;
+        return QURT_EINVALID;
+    }
+    else {
+        (*addr) = ((qurt_addr_64_t)attr->ranges[range_id].start)<<12;
+    }
+    return QURT_EOK;
+}
+
+
+/**@ingroup func_qurt_mem_pool_status_get
+   Gets the memory pool status. \n
+   Based on the pool handle, retrieves the largest contiguous free memory,
+   total free memory, and total memory declared for the pool, in bytes. Fills in
+   the memory status structure with the values.
+
+   @datatypes
+   #qurt_mem_pool_t \n
+   #qurt_mem_pool_status_t
+
+   @param[in]  pool     Pool handle.
+   @param[out] status   Pointer to the memory pool status structure.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EINVALID -- Corrupt handle; pool handle is invalid.
+*/
+int qurt_mem_pool_status_get (qurt_mem_pool_t pool, qurt_mem_pool_status_t *status);
+
+
+/**@ingroup func_qurt_mem_pool_is_available
+   Checks whether the number of pages that the page_count argument indicates
+   can be allocated from the specified pool.
+
+   @datatypes
+   #qurt_mem_pool_t \n
+   #qurt_mem_mapping_t \n
+
+   @param[in] pool           Pool handle obtained from qurt_mem_pool_attach().
+   @param[in] page_count     Number of 4K pages.
+   @param[in] mapping_type   Mapping type, of type #qurt_mem_mapping_t.
+
+   @return
+   0 -- Success. \n
+   #QURT_EINVALID -- Mapping type is invalid. \n
+   #QURT_EMEM -- Specified pages cannot be allocated from the pool.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_pool_is_available(qurt_mem_pool_t pool, int page_count, qurt_mem_mapping_t mapping_type);
+
+
+/**@ingroup func_qurt_mem_region_create
+   @xreflabel{sec:mem_region_create}
+   Creates a memory region with the specified attributes.
+
+   The application initializes the memory region attribute structure with
+   qurt_mem_region_attr_init() and qurt_mem_region_attr_set_bus_attr().
+
+   If the virtual address attribute is set to its default value
+   (Section @xref{sec:qurt_mem_region_attr_init}), the virtual address of the memory region is
+   automatically assigned any available virtual address value.
+
+   If the memory mapping attribute is set to virtual mapping, the physical address of the memory region
+   is also automatically assigned.\n
+
+   @note1hang The physical address attribute is explicitly set in the attribute structure only
+              for memory regions with physical-contiguous mapping.
+
+   Memory regions are always assigned to memory pools. The pool value specifies the memory pool
+   that the memory region is assigned to.
+
+   @note1hang If attr is specified as NULL, the memory region is created with default
+              attribute values (Section @xref{sec:qurt_mem_region_attr_init}).
+              QuRT predefines the memory pool object #qurt_mem_default_pool
+              (Section @xref{dox:mem_management}), which allocates memory regions in SMI memory.
+
+   @datatypes
+   #qurt_mem_region_t \n
+   #qurt_size_t \n
+   #qurt_mem_pool_t \n
+   #qurt_mem_region_attr_t
+
+   @param[out] region   Pointer to the memory region object.
+   @param[in]  size     Memory region size (in bytes). If size is not an integral multiple of 4K,
+                        it is rounded up to a 4K boundary.
+   @param[in]  pool     Memory pool of the region.
+   @param[in]  attr     Pointer to the memory region attribute structure.
+
+   @return
+   #QURT_EOK -- Memory region successfully created.\n
+   #QURT_EMEM -- Not enough memory to create the region. \n
+   #QURT_EINVALID -- Invalid cache attributes / permissions provided in the attribute.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_region_create(qurt_mem_region_t *region, qurt_size_t size, qurt_mem_pool_t pool, qurt_mem_region_attr_t *attr);
+
+/**@ingroup func_qurt_mem_region_delete
+   Deletes the specified memory region.
+
+   If the caller application created the memory region, it is removed and the system reclaims its
+   assigned memory.
+
+   If a different application created the memory region (and shared it with the caller
+   application), only the local memory mapping to the region is removed; the system does
+   not reclaim the memory.
+
+   @datatypes
+   #qurt_mem_region_t
+
+   @param[in] region   Memory region object.
+
+   @return
+   #QURT_EOK -- Region successfully deleted. \n
+   #QURT_ELOCKED -- Buffer is locked. Mapping delete failed.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_region_delete(qurt_mem_region_t region);
+
+
+/**@ingroup func_qurt_mem_region_attr_get
+   @xreflabel{sec:mem_region_attr_get}
+   Gets the memory attributes of the specified memory region.
+   After a memory region is created, its attributes cannot be changed.
+
+   @datatypes
+   #qurt_mem_region_t \n
+   #qurt_mem_region_attr_t
+
+   @param[in]  region   Memory region object.
+   @param[out] attr     Pointer to the destination structure for memory region attributes.
+
+   @return
+   #QURT_EOK -- Operation successfully performed. \n
+   Error code -- Failure.
+
+   @dependencies
+   None.
+*/
+int qurt_mem_region_attr_get(qurt_mem_region_t region, qurt_mem_region_attr_t *attr);
+
+
+/**@ingroup func_qurt_mem_region_attr_set_type
+   Sets the memory type in the specified memory region attribute structure.
+
+   The type indicates whether the memory region is local to an application or shared between
+   applications.
+   @cond rest_dist For more information, see @xhyperref{80VB41992,80-VB419-92}. @endcond
+
+   @datatypes
+   #qurt_mem_region_attr_t \n
+   #qurt_mem_region_type_t
+
+   @param[in,out] attr   Pointer to the memory region attribute structure.
+   @param[in]     type   Memory type.
Values: \n + - #QURT_MEM_REGION_LOCAL \n + - #QURT_MEM_REGION_SHARED @tablebulletend + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_type(qurt_mem_region_attr_t *attr, qurt_mem_region_type_t type){ + attr->type = type; +} + +/**@ingroup func_qurt_mem_region_attr_get_size + Gets the memory region size from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_size_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] size Pointer to the destination variable for memory region size. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_size(qurt_mem_region_attr_t *attr, qurt_size_t *size){ + (*size) = attr->size; +} + +/**@ingroup func_qurt_mem_region_attr_get_type + Gets the memory type from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_region_type_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] type Pointer to the destination variable for the memory type. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_type(qurt_mem_region_attr_t *attr, qurt_mem_region_type_t *type){ + (*type) = attr->type; +} + +/**@ingroup func_qurt_mem_region_attr_set_physaddr + Sets the memory region 32-bit physical address in the specified memory attribute structure. + + @note1hang The physical address attribute is explicitly set only for memory regions with + physical contiguous mapping. Otherwise QuRT automatically sets it + when the memory region is created. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_paddr_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr Memory region physical address. + + @return + None. + */ +static inline void qurt_mem_region_attr_set_physaddr(qurt_mem_region_attr_t *attr, qurt_paddr_t addr){ + attr->ppn = (unsigned)(((unsigned)(addr))>>12); +} + +/**@ingroup func_qurt_mem_region_attr_get_physaddr + Gets the memory region physical address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr Pointer to the destination variable for memory region physical address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_physaddr(qurt_mem_region_attr_t *attr, unsigned int *addr){ + (*addr) = (unsigned)(((unsigned) (attr->ppn))<<12); +} + +/**@ingroup func_qurt_mem_region_attr_set_virtaddr + Sets the memory region virtual address in the specified memory attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_addr_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr Memory region virtual address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_virtaddr(qurt_mem_region_attr_t *attr, qurt_addr_t addr){ + attr->virtaddr = addr; +} + +/**@ingroup func_qurt_mem_region_attr_get_virtaddr + Gets the memory region virtual address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr Pointer to the destination variable for the memory region virtual address. + + @return + None. + + @dependencies + None. 
+ */ +static inline void qurt_mem_region_attr_get_virtaddr(qurt_mem_region_attr_t *attr, unsigned int *addr){ + (*addr) = (unsigned int)(attr->virtaddr); +} + +/**@ingroup func_qurt_mem_region_attr_set_mapping + Sets the memory mapping in the specified memory region attribute structure. + + The mapping value indicates how the memory region is mapped in virtual memory. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_mapping_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] mapping Mapping. Values: + - #QURT_MEM_MAPPING_VIRTUAL + - #QURT_MEM_MAPPING_PHYS_CONTIGUOUS + - #QURT_MEM_MAPPING_IDEMPOTENT + - #QURT_MEM_MAPPING_VIRTUAL_FIXED + - #QURT_MEM_MAPPING_NONE + - #QURT_MEM_MAPPING_VIRTUAL_RANDOM + - #QURT_MEM_MAPPING_INVALID @tablebulletend + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_mapping(qurt_mem_region_attr_t *attr, qurt_mem_mapping_t mapping){ + attr->mapping_type = mapping; +} + +/**@ingroup func_qurt_mem_region_attr_get_mapping + Gets the memory mapping from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_mapping_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] mapping Pointer to the destination variable for memory mapping. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_mapping(qurt_mem_region_attr_t *attr, qurt_mem_mapping_t *mapping){ + (*mapping) = attr->mapping_type; +} + +/**@ingroup func_qurt_mem_region_attr_set_cache_mode + Sets the cache operation mode in the specified memory region attribute structure. + + @cond rest_dist For more information on the cache, see @xhyperref{80VB41992,80-VB419-92}.@endcond + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_cache_mode_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] mode Cache mode. Values: \n + - #QURT_MEM_CACHE_WRITEBACK \n + - #QURT_MEM_CACHE_WRITETHROUGH\n + - #QURT_MEM_CACHE_WRITEBACK_NONL2CACHEABLE\n + - #QURT_MEM_CACHE_WRITETHROUGH_NONL2CACHEABLE\n + - #QURT_MEM_CACHE_WRITEBACK_L2CACHEABLE\n + - #QURT_MEM_CACHE_WRITETHROUGH_L2CACHEABLE\n + - #QURT_MEM_CACHE_NONE @tablebulletend + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_cache_mode(qurt_mem_region_attr_t *attr, qurt_mem_cache_mode_t mode){ + QURT_PGATTR_C_SET(attr->pga, (unsigned)mode); +} + +/**@ingroup func_qurt_mem_region_attr_get_cache_mode + Gets the cache operation mode from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_cache_mode_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] mode Pointer to the destination variable for cache mode. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_cache_mode(qurt_mem_region_attr_t *attr, qurt_mem_cache_mode_t *mode){ + unsigned int mode_temp = QURT_PGATTR_C_GET(attr->pga); + (*mode) = (qurt_mem_cache_mode_t)mode_temp; +} + +/**@ingroup func_qurt_mem_region_attr_set_bus_attr + Sets the (A1, A0) bus attribute bits in the specified memory region attribute structure. + + @cond rest_dist For more information on the bus attribute bits, see the @xhyperref{80VB41992,80-VB419-92}. @endcond + + @datatypes + #qurt_mem_region_attr_t + + @param[in,out] attr Pointer to the memory region attribute structure. 
+ @param[in] abits The (A1, A0) bits to use with the memory region, expressed as a 2-bit binary number. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_bus_attr(qurt_mem_region_attr_t *attr, unsigned abits){ + QURT_PGATTR_A_SET(attr->pga, abits); +} + +/**@ingroup func_qurt_mem_region_attr_get_bus_attr + Gets the (A1, A0) bus attribute bits from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] pbits Pointer to an unsigned integer that is filled in with + the (A1, A0) bits from the memory region attribute structure, expressed as a 2-bit binary number. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_bus_attr(qurt_mem_region_attr_t *attr, unsigned *pbits){ + (*pbits) = QURT_PGATTR_A_GET(attr->pga); +} + +void qurt_mem_region_attr_set_owner(qurt_mem_region_attr_t *attr, int handle); +void qurt_mem_region_attr_get_owner(qurt_mem_region_attr_t *attr, int *p_handle); +void qurt_mem_region_attr_set_perms(qurt_mem_region_attr_t *attr, unsigned perms); +void qurt_mem_region_attr_get_perms(qurt_mem_region_attr_t *attr, unsigned *p_perms); + +/**@ingroup func_qurt_mem_map_static_query + Determines whether a memory page is statically mapped. + Pages are specified by the following attributes: physical address, page size, cache mode, + and memory permissions. \n + - If the specified page is statically mapped, vaddr returns the virtual + address of the page. \n + - If the page is not statically mapped (or if it does not exist as specified), vaddr + returns -1 as the virtual address value.\n + The system configuration file defines QuRT memory maps. + + @datatypes + #qurt_addr_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[out] vaddr Virtual address corresponding to paddr. + @param[in] paddr Physical address. + @param[in] page_size Size of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Specified page is statically mapped, vaddr returns the virtual address. \n + #QURT_EMEM -- Specified page is not statically mapped, vaddr returns -1. \n + #QURT_EVAL -- Specified page does not exist. + + @dependencies + None. + */ +int qurt_mem_map_static_query(qurt_addr_t *vaddr, qurt_addr_t paddr, unsigned int page_size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + + +/**@ingroup func_qurt_mem_region_query + Queries a memory region. \n + This function determines whether a dynamically-created memory region (Section @xref{sec:mem_region_create}) exists for the + specified virtual or physical address. + When a memory region has been determined to exist, its attributes are + accessible (Section @xref{sec:mem_region_attr_get}). + + @note1hang This function returns #QURT_EFATAL if #QURT_EINVALID is passed to both + vaddr and paddr (or to neither). + + @datatypes + #qurt_mem_region_t \n + #qurt_paddr_t + + @param[out] region_handle Pointer to the memory region object (if it exists). + @param[in] vaddr Virtual address to query; if vaddr is specified, paddr must be set to + the value #QURT_EINVALID. + @param[in] paddr Physical address to query; if paddr is specified, vaddr must be set to + the value #QURT_EINVALID. + + @return + #QURT_EOK -- Query successfully performed. \n + #QURT_EMEM -- Region not found for the specified address. \n + #QURT_EFATAL -- Invalid input parameters. 
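+
+   A minimal usage sketch (illustrative only; some_vaddr stands for a virtual
+   address obtained elsewhere). Query by virtual address, passing
+   #QURT_EINVALID for the unused physical address:
+   @code
+   qurt_mem_region_t region;
+   qurt_mem_region_attr_t rattr;
+   if (qurt_mem_region_query(&region, some_vaddr, QURT_EINVALID) == QURT_EOK) {
+       (void)qurt_mem_region_attr_get(region, &rattr);
+   }
+   @endcode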
+
+   @dependencies
+   None.
+ */
+int qurt_mem_region_query(qurt_mem_region_t *region_handle, qurt_addr_t vaddr, qurt_paddr_t paddr);
+
+
+/**@ingroup func_qurt_mapping_create
+   @xreflabel{hdr:qurt_mapping_create}
+   Creates a memory mapping in the page table.
+   Not supported if called from a user process; it always returns #QURT_EMEM in that case.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_size_t \n
+   #qurt_mem_cache_mode_t \n
+   #qurt_perm_t
+
+   @param[in] vaddr           Virtual address.
+   @param[in] paddr           Physical address.
+   @param[in] size            Size (4K-aligned) of the mapped memory page.
+   @param[in] cache_attribs   Cache mode (writeback, and so on).
+   @param[in] perm            Access permissions.
+
+   @return
+   #QURT_EOK -- Mapping created. \n
+   #QURT_EMEM -- Failed to create mapping. \n
+   #QURT_EINVALID -- Invalid cache attributes / permissions provided.
+
+   @dependencies
+   None.
+*/
+int qurt_mapping_create(qurt_addr_t vaddr, qurt_addr_t paddr, qurt_size_t size,
+                        qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm);
+
+/**@ingroup func_qurt_mapping_remove
+   @xreflabel{hdr:qurt_mapping_remove}
+   Deletes the specified memory mapping from the page table.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_size_t
+
+   @param[in] vaddr   Virtual address.
+   @param[in] paddr   Physical address.
+   @param[in] size    Size of the mapped memory page (4K-aligned).
+
+   @return
+   #QURT_EOK -- Mapping removed. \n
+   #QURT_ELOCKED -- Buffer is locked. Mapping delete failed.
+
+   @dependencies
+   None.
+
+ */
+int qurt_mapping_remove(qurt_addr_t vaddr, qurt_addr_t paddr, qurt_size_t size);
+
+/**@ingroup func_qurt_lookup_physaddr
+   Translates a virtual memory address to the physical memory address to which it maps. \n
+   The lookup occurs in the process of the caller. Use qurt_lookup_physaddr2() to look up the
+   physical address of another process.
+
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_paddr_t
+
+   @param[in] vaddr   Virtual address.
+
+   @return
+   Nonzero -- Physical address to which the virtual address is mapped.\n
+   0 -- Virtual address not mapped.
+
+   @dependencies
+   None.
+*/
+qurt_paddr_t qurt_lookup_physaddr (qurt_addr_t vaddr);
+
+/**@ingroup func_qurt_mem_region_attr_set_physaddr_64
+   Sets the memory region 64-bit physical address in the specified memory attribute structure.
+
+   @note1hang The physical address attribute is explicitly set only for memory regions with
+              physical contiguous mapping. Otherwise it is automatically set by
+              QuRT when the memory region is created.
+
+   @datatypes
+   #qurt_mem_region_attr_t \n
+   #qurt_paddr_64_t
+
+   @param[in,out] attr      Pointer to the memory region attribute structure.
+   @param[in]     addr_64   Memory region 64-bit physical address.
+
+   @return
+   None.
+ */
+static inline void qurt_mem_region_attr_set_physaddr_64(qurt_mem_region_attr_t *attr, qurt_paddr_64_t addr_64){
+    attr->ppn = (unsigned)(((unsigned long long)(addr_64))>>12);
+}
+
+/**@ingroup func_qurt_mem_region_attr_get_physaddr_64
+   Gets the memory region 64-bit physical address from the specified memory region attribute structure.
+
+   @datatypes
+   #qurt_mem_region_attr_t \n
+   #qurt_paddr_64_t
+
+   @param[in]  attr      Pointer to the memory region attribute structure.
+   @param[out] addr_64   Pointer to the destination variable for the memory region 64-bit physical address.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_mem_region_attr_get_physaddr_64(qurt_mem_region_attr_t *attr, qurt_paddr_64_t *addr_64){
+    (*addr_64) = (unsigned long long)(((unsigned long long)(attr->ppn))<<12);
+}
+
+/**@ingroup func_qurt_mem_map_static_query_64
+   Determines whether a memory page is statically mapped.
+   The following attributes specify pages: 64-bit physical address, page size, cache mode,
+   and memory permissions. \n
+   If the specified page is statically mapped, vaddr returns the virtual
+   address of the page.
+   If the page is not statically mapped (or if it does not exist as specified), vaddr
+   returns -1 as the virtual address value.\n
+   QuRT memory maps are defined in the system configuration file.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_paddr_64_t \n
+   #qurt_mem_cache_mode_t \n
+   #qurt_perm_t
+
+   @param[out] vaddr           Virtual address corresponding to paddr.
+   @param[in]  paddr_64        64-bit physical address.
+   @param[in]  page_size       Size of the mapped memory page.
+   @param[in]  cache_attribs   Cache mode (writeback, and so on).
+   @param[in]  perm            Access permissions.
+
+   @return
+   #QURT_EOK -- Specified page is statically mapped; a virtual address is returned in vaddr. \n
+   #QURT_EMEM -- Specified page is not statically mapped; -1 is returned in vaddr. \n
+   #QURT_EVAL -- Specified page does not exist.
+
+   @dependencies
+   None.
+ */
+int qurt_mem_map_static_query_64(qurt_addr_t *vaddr, qurt_paddr_64_t paddr_64, unsigned int page_size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm);
+
+/**@ingroup func_qurt_mem_region_query_64
+   Determines whether a dynamically created memory region (Section @xref{sec:mem_region_create}) exists for the
+   specified virtual or physical address. When a memory region has been determined to exist, its attributes are
+   accessible (Section @xref{sec:mem_region_attr_get}).
+
+   @note1hang This function returns #QURT_EFATAL if #QURT_EINVALID is passed to both
+              vaddr and paddr (or to neither).
+
+   @datatypes
+   #qurt_mem_region_t \n
+   #qurt_addr_t \n
+   #qurt_paddr_64_t
+
+   @param[out] region_handle   Pointer to the memory region object (if it exists).
+   @param[in]  vaddr           Virtual address to query; if vaddr is specified, paddr must be set to
+                               the value #QURT_EINVALID.
+   @param[in]  paddr_64        64-bit physical address to query; if paddr is specified, vaddr must be set to
+                               the value #QURT_EINVALID.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EMEM -- Region not found for the specified address. \n
+   #QURT_EFATAL -- Invalid input parameters.
+
+   @dependencies
+   None.
+ */
+int qurt_mem_region_query_64(qurt_mem_region_t *region_handle, qurt_addr_t vaddr, qurt_paddr_64_t paddr_64);
+
+/**@ingroup func_qurt_mapping_create_64
+   @xreflabel{hdr:qurt_mapping_create_64}
+   Creates a memory mapping in the page table.
+   Not supported if called from a user process; it always returns #QURT_EMEM in that case.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_paddr_64_t \n
+   #qurt_size_t \n
+   #qurt_mem_cache_mode_t \n
+   #qurt_perm_t
+
+   @param[in] vaddr           Virtual address.
+   @param[in] paddr_64        64-bit physical address.
+   @param[in] size            Size (4K-aligned) of the mapped memory page.
+   @param[in] cache_attribs   Cache mode (writeback, and so on).
+   @param[in] perm            Access permissions.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EMEM -- Failure. \n
+   #QURT_EINVALID -- Invalid cache attributes / permissions provided.
+
+   @dependencies
+   None.
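+
+   A minimal usage sketch (illustrative only; the addresses and size are
+   hypothetical, the #QURT_PERM_READ / #QURT_PERM_WRITE macros are assumed
+   to come from the QuRT types header, and the call must run in a context
+   where mapping creation is supported):
+   @code
+   if (qurt_mapping_create_64(0x40000000u,       // vaddr
+                              0x180000000ull,    // 64-bit paddr
+                              0x1000u,           // one 4K page
+                              QURT_MEM_CACHE_WRITEBACK,
+                              QURT_PERM_READ | QURT_PERM_WRITE) == QURT_EOK) {
+       (void)qurt_mapping_remove_64(0x40000000u, 0x180000000ull, 0x1000u);
+   }
+   @endcode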
+*/ +int qurt_mapping_create_64(qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size, + qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + +/**@ingroup func_qurt_mapping_remove_64 + @xreflabel{hdr:qurt_mapping_remove_64} + Deletes the specified memory mapping from the page table. + + @datatypes + #qurt_addr_t \n + #qurt_paddr_64_t \n + #qurt_size_t + + @param[in] vaddr Virtual address. + @param[in] paddr_64 64-bit physical address. + @param[in] size Size of the mapped memory page (4K-aligned). + + @return + #QURT_EOK -- Success. + #QURT_ELOCKED -- Buffer is locked. Mapping delete failed. + + @dependencies + None. + + */ +int qurt_mapping_remove_64(qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size); + +/**@ingroup func_qurt_lookup_physaddr_64 + Translates a virtual memory address to the 64-bit physical memory address it is mapped to. \n + The lookup occurs in the process of the caller. Use qurt_lookup_physaddr2() to lookup the physical + address of another process. + + @datatypes + #qurt_paddr_64_t \n + #qurt_addr_t + + @param[in] vaddr Virtual address. + + @return + Nonzero -- 64-bit physical address to which the virtual address is mapped. \n + 0 -- Virtual address has not been mapped. + + @dependencies + None. +*/ +qurt_paddr_64_t qurt_lookup_physaddr_64 (qurt_addr_t vaddr); +/** @endcond */ + +/** @cond internal_only */ +/**@ingroup func_qurt_mapping_reclaim + Deallocates all QuRT resources associated with the specified virtual + memory area, making it available for user memory management:\n + - The associated physical memory areas are freed and added to the + specified physical pool.\n + - The associated TLB entries are deleted and made available for TLB + management.\n + - The virtual memory area is not freed -- it is left in + place as allocated, but unmapped virtual memory. Access to this + memory area generates an exception.\n + + The virtual memory area must be statically allocated. + If no pool is specified, the freed physical memory is not added to any pool. + + @note1hang The virtual memory area is restricted to being filled with locked + TLB entries that are contiguous within the memory area, and contained by it. + + @datatypes + #qurt_addr_t \n + #qurt_size_t \n + #qurt_mem_pool_t + + @param[in] vaddr Virtual address of the memory area to free. + @param[in] vsize Size (in bytes) of the memory area to free. + @param[in] pool Handle to the physical pool where freed physical memory is added. + If set to 0, freed physical memory is not added to any pool. + + @return + 0 -- Success. \n + Nonzero -- Failure that indicates a partial success, or that the request was malformed. \n @note1hang The expected behavior is that + QuRT logs messages related to the failure, and callers are free to ignore the return value. + + @dependencies + None. +*/ +int qurt_mapping_reclaim(qurt_addr_t vaddr, qurt_size_t vsize, qurt_mem_pool_t pool); +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_mem_configure_cache_partition + Configures the Hexagon cache partition at the system level. + + A partition size value of #SEVEN_EIGHTHS_SIZE is applicable only to the L2 cache. + + The L1 cache partition is not supported in Hexagon processor version V60 or greater. + + @note1hang Call this operation only with QuRT OS privilege. + + @datatypes + #qurt_cache_type_t \n + #qurt_cache_partition_size_t + + @param[in] cache_type Cache type for partition configuration. 
Values: \n
+                            - #HEXAGON_L1_I_CACHE \n
+                            - #HEXAGON_L1_D_CACHE \n
+                            - #HEXAGON_L2_CACHE @tablebulletend
+
+   @param[in] partition_size   Cache partition size. Values: \n
+                               - #FULL_SIZE \n
+                               - #HALF_SIZE \n
+                               - #THREE_QUARTER_SIZE \n
+                               - #SEVEN_EIGHTHS_SIZE @tablebulletend
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EVAL -- Error.
+
+   @dependencies
+   None.
+ */
+int qurt_mem_configure_cache_partition(qurt_cache_type_t cache_type, qurt_cache_partition_size_t partition_size);
+
+
+/**@ingroup func_qurt_mem_syncht
+   @xreflabel{hdr:qurt_mem_syncht}
+   Performs heavy-weight synchronization of memory transactions.
+
+   This operation does not return until all previous memory transactions (cached and uncached load/store,
+   mem_locked, and so on) that originated from the current thread are complete and globally observable.
+
+   @note1hang This operation is implemented as a wrapper for the Hexagon syncht instruction.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_mem_syncht(void){
+    #ifdef __HEXAGON_ARCH__
+    __asm__ __volatile__ (" SYNCHT \n");
+    #endif
+}
+
+/**@ingroup func_qurt_mem_barrier
+   @xreflabel{hdr:qurt_mem_barrier}
+   Creates a barrier for memory transactions.
+
+   This operation ensures that all previous memory transactions are globally observable before any
+   future memory transactions are globally observable.
+
+   @note1hang This operation is implemented as a wrapper for the Hexagon barrier instruction.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_mem_barrier(void){
+    #ifdef __HEXAGON_ARCH__
+    __asm__ __volatile__ (" BARRIER \n");
+    #endif
+}
+/** @endcond */
+
+/** @cond internal_only */
+/**@ingroup func_qurt_system_mem_alloc
+   Requests that the kernel allocate memory from the kernel-owned pool.
+
+   @param[in] size    Size in bytes (aligned to 4K) to allocate.
+   @param[in] align   Any alignment that must be considered for the allocation.
+   @param[in] flags   Supports the #QURT_SYSTEM_ALLOC_VIRTUAL flag; allocates
+                      available virtual memory in the address space of all processes.
+
+   @return
+   #QURT_EFATAL -- Allocation failed. \n
+   Start address of the successful allocation.
+
+   @dependencies
+   None.
+*/
+unsigned qurt_system_mem_alloc(unsigned size, unsigned align, unsigned flags);
+/** @endcond */
+/** @cond rest_reg_dist*/
+/**@ingroup func_qurt_lookup_physaddr2
+   Translates the virtual memory address of the specified process to the 64-bit
+   physical memory address to which it is mapped.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_paddr_64_t
+
+   @param[in] vaddr   Virtual address.
+   @param[in] pid     PID.
+
+   @return
+   Nonzero -- 64-bit physical address to which the virtual address is mapped. \n
+   0 -- Virtual address is not mapped.
+
+   @dependencies
+   None.
+*/
+qurt_paddr_64_t qurt_lookup_physaddr2(qurt_addr_t vaddr, unsigned int pid);
+/** @endcond */
+
+/**@ingroup func_qurt_mapping_attr_get
+   Gets the mapping attributes for a given virtual address and PID.
+
+   @datatypes
+   #qurt_addr_t \n
+   #qurt_mapping_attr_t
+
+   @param[in]  vaddr   Virtual address for which the attributes are required.
+   @param[in]  pid     Process ID of the target process.
+   @param[out] attr    Pointer to the mapping attribute structure.
+
+   @return
+   0 -- Success. \n
+   #QURT_EINVALID -- Incorrect virtual address or PID.
+*/
+int qurt_mapping_attr_get(qurt_addr_t vaddr, unsigned int pid, qurt_mapping_attr_t *attr);
+
+
+/**@ingroup func_qurt_mapping_attr_get_cache_mode
+   Gets the cache operation mode in the specified memory mapping attribute structure.
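+
+   A minimal usage sketch (illustrative only; some_vaddr and the PID value 0
+   are hypothetical):
+   @code
+   qurt_mapping_attr_t mattr;
+   qurt_mem_cache_mode_t cmode;
+   if (qurt_mapping_attr_get(some_vaddr, 0U, &mattr) == 0) {
+       qurt_mapping_attr_get_cache_mode(&mattr, &cmode);
+   }
+   @endcode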
+
+
+   @datatypes
+   #qurt_mapping_attr_t \n
+   #qurt_mem_cache_mode_t
+
+   @param[in]  attr         Pointer to the memory mapping attribute structure.
+   @param[out] cache_mode   Pointer to the destination variable for the cache mode.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_mapping_attr_get_cache_mode(qurt_mapping_attr_t *attr, qurt_mem_cache_mode_t *cache_mode)
+{
+    (*cache_mode) = attr->cache_mode;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_physaddr
+   Gets the physical memory address in the specified memory mapping attribute structure.
+
+
+   @datatypes
+   #qurt_mapping_attr_t \n
+   #qurt_paddr_64_t
+
+   @param[in]  attr       Pointer to the memory mapping attribute structure.
+   @param[out] physaddr   Pointer to the destination variable for the physical address.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_mapping_attr_get_physaddr(qurt_mapping_attr_t *attr, qurt_paddr_64_t *physaddr)
+{
+    (*physaddr) = attr->paddr;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_perms
+   Gets the permissions in the specified memory mapping attribute structure.
+
+
+   @datatypes
+   #qurt_mapping_attr_t \n
+   #qurt_perm_t
+
+   @param[in]  attr    Pointer to the memory mapping attribute structure.
+   @param[out] perms   Pointer to the destination variable for the permissions.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_mapping_attr_get_perms(qurt_mapping_attr_t *attr, qurt_perm_t *perms)
+{
+    (*perms) = attr->perms;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_size
+   Gets the size in the specified memory mapping attribute structure. This represents the size of the
+   TLB entry that covers the virtual address.
+
+
+   @datatypes
+   #qurt_mapping_attr_t \n
+   #unsigned int
+
+   @param[in]  attr   Pointer to the memory mapping attribute structure.
+   @param[out] size   Pointer to the destination variable for the size.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline void qurt_mapping_attr_get_size(qurt_mapping_attr_t *attr, unsigned int *size)
+{
+    (*size) = attr->size;
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_MEMORY_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mmap.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mmap.h
new file mode 100755
index 0000000000000..c3bd875910af7
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mmap.h
@@ -0,0 +1,359 @@
+#ifndef QURT_MMAP_H
+#define QURT_MMAP_H
+/**
+   @file qurt_mmap.h
+   @brief Prototypes of memory mapping/unmapping APIs.
+          The APIs allow the user to map, unmap, and change permissions
+          on memory regions.
+
+   EXTERNAL FUNCTIONS
+   None.
+
+   INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+   Copyright (c) 2018-2021, 2022, 2023 Qualcomm Technologies, Inc.
+   All Rights Reserved.
+   Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_mem_mmap
+   Creates a memory mapping with the specified attributes.
+   This API allows the root process caller to create a mapping on behalf of a user
+   process. If the client_handle belongs to a valid user process, the resulting
+   mapping is created for that process.
+   If -1 is passed in place of client_handle, the API creates the mapping
+   for the underlying process of the caller.
+
+   @note1hang If the specified attributes are not valid, an error result is returned.
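+
+   A minimal usage sketch (illustrative only; the anonymous shared 4 KB
+   allocation from the default pool is an assumed flag/prot combination):
+   @code
+   void *va = qurt_mem_mmap(-1,                      // caller's own process
+                            qurt_mem_default_pool,   // allocate from default physpool
+                            NULL, NULL, 0x1000u,
+                            QURT_PROT_READ | QURT_PROT_WRITE,
+                            QURT_MAP_SHARED | QURT_MAP_ANON,
+                            -1, 0ULL);
+   if (va == QURT_MAP_FAILED) {
+       // Mapping creation failed.
+   }
+   @endcode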
+
+   @param[in] client_handle   Client handle to use for this mapping (optional).
+   @param[in] pool            Optional argument that specifies a pool handle
+                              if the user wants to allocate memory from a specific pool.
+                              The default value for this argument is NULL.
+   @param[in] pRegion         Map region. This argument is unused, and the default value is NULL.
+   @param[in] addr            Virtual memory address.
+   @param[in] length          Size of the mapping in bytes.
+   @param[in] prot            Mapping access permissions (R/W/X).
+   @param[in] flags           Mapping modes.\n
+                              - #QURT_MAP_NAMED_MEMSECTION \n
+                              - #QURT_MAP_FIXED \n
+                              - #QURT_MAP_NONPROCESS_VPOOL \n
+                              - #QURT_MAP_TRYFIXED \n
+                              - #QURT_MAP_ANON \n
+                              - #QURT_MAP_PHYSADDR \n
+                              - #QURT_MAP_VA_ONLY @tablebulletend
+   @param[in] fd              File designator.
+   @param[in] offset          Offset in the file.
+
+   @return
+   Valid virtual address -- Success.\n
+   #QURT_MAP_FAILED -- Mapping creation failed.
+ */
+void *qurt_mem_mmap(int client_handle,
+                    qurt_mem_pool_t pool,
+                    qurt_mem_region_t *pRegion,
+                    void *addr,
+                    size_t length,
+                    int prot,
+                    int flags,
+                    int fd,
+                    unsigned long long offset);
+
+/**@ingroup func_qurt_mem_mmap2
+   Creates a memory mapping with the specified attributes. Returns a more descriptive
+   error code in case of failure.
+   This API allows the root process caller to create a mapping on behalf of a user
+   process. If the client_handle belongs to a valid user process, the resulting
+   mapping is created for that process.
+   If -1 is passed in place of client_handle, the API creates the mapping
+   for the underlying process of the caller.
+
+   @note1hang If the specified attributes are not valid, an error result is returned.
+
+   @param[in] client_handle   Client handle to use for this mapping (optional).
+   @param[in] pool            Optional argument that allows the user to specify a pool handle
+                              when the user wants to allocate memory from a specific pool.
+                              The default value for this argument is NULL.
+   @param[in] pRegion         Map region (unused argument); the default value is NULL.
+   @param[in] addr            Virtual memory address.
+   @param[in] length          Size of the mapping in bytes.
+   @param[in] prot            Mapping access permissions (R/W/X),
+                              cache attributes, bus attributes, user mode.
+   @param[in] flags           Mapping modes:
+                              Shared, Private, or Anonymous.
+   @param[in] fd              File designator.
+   @param[in] offset          Offset in the file.
+
+   @return
+   Valid virtual address -- Success.\n
+   #QURT_EMEM -- Physical address is not available. \n
+   #QURT_EFAILED -- VA is not available or mapping failed.\n
+   #QURT_EINVALID -- Invalid argument was passed (for example, an unaligned VA/PA).
+ */
+void *qurt_mem_mmap2(int client_handle,
+                     qurt_mem_pool_t pool,
+                     qurt_mem_region_t *pRegion,
+                     void *addr,
+                     size_t length,
+                     int prot,
+                     int flags,
+                     int fd,
+                     unsigned long long offset);
+
+/**@ingroup func_qurt_mem_mmap_by_name
+   Creates a memory mapping for a named memsection using the specified attributes.
+   The named memsection should be specified in cust_config.xml.
+
+   @note1hang If the specified attributes are not valid or the named memsection is not found,
+              an error result is returned.
+
+   @param[in] name     Name of the memsection in cust_config.xml that specifies
+                       this mapping. Should be less than 25 characters.
+   @param[in] addr     Virtual memory address.
+   @param[in] length   Size of the mapping in bytes.
+   @param[in] prot     Mapping access permissions (R/W/X),
+                       cache attributes, bus attributes, user mode.
+   @param[in] flags    Mapping modes, such as
+                       Shared, Private, or Anonymous.
+   @param[in] offset   Offset relative to the physical address range specified in the memsection.
+                       If offset + length exceeds the size of the memsection, failure is
+                       returned.
+   @return
+   Valid virtual address -- Success.\n
+   #QURT_MAP_FAILED -- Mapping creation failed.
+ */
+void *qurt_mem_mmap_by_name(const char* name,
+                            void *addr,
+                            size_t length,
+                            int prot,
+                            int flags,
+                            unsigned long long offset);
+
+/**@ingroup func_qurt_mem_mprotect2
+   Changes access permissions and attributes on an existing mapping based on the client_handle argument.
+
+   @note1hang If the specified virtual address is not found or invalid attributes are passed,
+              an error code is returned.
+
+   @note2 When an error is returned, the attributes/permissions might have changed for part of the
+          mapping while remaining unchanged for the rest. Clients should not use such mappings further.
+
+   @param[in] client_handle   Obtained from the current invocation function (Section 3.4.1).
+   @param[in] addr            Virtual memory address.
+   @param[in] length          Size of the mapping in bytes.
+   @param[in] prot            Mapping access permissions (R/W/X),
+                              cache attributes, bus attributes, user mode.
+   @return
+   #QURT_EOK -- Successfully changes permissions on the mapping.\n
+   #QURT_EFATAL -- Failed to change permissions on the mapping. \n
+   #QURT_EINVALID -- Attributes / permissions requested are invalid.
+ */
+int qurt_mem_mprotect2(int client_handle, const void *addr,
+                       size_t length,
+                       int prot);
+
+/**@ingroup func_qurt_mem_mprotect
+   Changes access permissions and attributes on an existing mapping.
+
+   @note1hang If the specified virtual address is not found or invalid attributes are passed,
+              an error code is returned.\n
+
+   @note2 When an error is returned, the attributes/permissions might have changed for part of the
+          mapping while remaining unchanged for the rest. Clients should not use such mappings further.
+
+   @param[in] addr     Virtual memory address.
+   @param[in] length   Size of the mapping in bytes.
+   @param[in] prot     Mapping access permissions (R/W/X),
+                       cache attributes, bus attributes, user mode.
+   @return
+   #QURT_EOK -- Successfully changes permissions on the mapping. \n
+   #QURT_EFATAL -- Failed to change permissions on the mapping. \n
+   #QURT_EINVALID -- Attributes / permissions requested are invalid.
+ */
+int qurt_mem_mprotect(const void *addr,
+                      size_t length,
+                      int prot);
+
+/**@ingroup func_qurt_mem_munmap
+   Removes an existing mapping.
+
+   @note1hang If the specified mapping is not found in the context of the caller process
+              or invalid attributes are passed, an error code is returned.
+
+   @param[in] addr     Virtual memory address.
+   @param[in] length   Size of the mapping in bytes.
+
+   @return
+   #QURT_EOK -- Successfully removed the mapping. \n
+   #QURT_EFATAL -- Failed to remove the mapping. \n
+   #QURT_ELOCKED -- Buffer is locked. Mapping delete failed.
+ */
+int qurt_mem_munmap(void *addr,
+                    size_t length);
+
+/**@ingroup func_qurt_mem_munmap2
+   Removes an existing mapping for a specified process.
+
+   @note1hang This API allows a root process entity, such as a driver, to remove a mapping
+              that was created for a user process. If the specified mapping is not found in the context
+              of the client handle or invalid attributes are passed, an error code is returned.
+
+   @param[in] client_handle   Client handle of the user process that owns this mapping.
+   @param[in] addr            Virtual memory address.
+   @param[in] length          Size of the mapping in bytes.
+
+   @return
+   #QURT_EOK -- Successfully removed the mapping. \n
+   #QURT_EFATAL -- Failed to remove the mapping. \n
+
+/**@ingroup func_qurt_mem_munmap2
+   Removes an existing mapping for a specified process.
+
+   @note1hang This API allows a root process entity, such as a driver, to remove a mapping
+   that was created for a user process. If the specified mapping is not found in the context
+   of the client handle or invalid attributes are passed, an error code is returned.
+
+   @param[in]  client_handle  Client handle of the user process that owns this mapping.
+   @param[in]  addr      Virtual memory address.
+   @param[in]  length    Size of mapping in bytes.
+
+   @return
+   #QURT_EOK -- Successfully removed the mapping. \n
+   #QURT_EFATAL -- Failed to remove the mapping. \n
+   #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+ */
+int qurt_mem_munmap2(int client_handle,
+                     void *addr,
+                     size_t length);
+
+/**@ingroup func_qurt_mem_munmap3
+   Removes an existing mapping or reservation for a specified process.
+
+   @param[in]  client_handle  Client handle of the user process that owns this mapping.
+   @param[in]  addr      Pointer to a virtual memory address.
+   @param[in]  length    Size of mapping in bytes.
+   @param[in]  flags     Specifies the flags.
+
+   @return
+   #QURT_EOK -- Successfully removed the mapping or reservation. \n
+   #QURT_EFATAL -- Failed to remove the mapping. \n
+   #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+ */
+int qurt_mem_munmap3(int client_handle,
+                     void *addr,
+                     size_t length,
+                     int flags);
+
+/*
+|| The macros here follow the style of the standard mmap() macros, but with
+|| QURT_ prepended to avoid name conflicts, and to avoid having a dependency
+|| on sys/mman.h.
+||
+|| Wherever possible, any values here that are also present in sys/mman.h
+|| should have the same value in both places so that we can accept "mmap"
+|| calls without having to remap parameters to new values.
+||
+|| In the future, it would be desirable to have a regression test that
+|| checks, for instance, that these macros match. Example:
+||
+||    assert(QURT_MAP_FAILED == MAP_FAILED);
+||    ... repeat as needed ...
+*/
+
+/** @addtogroup memory_mapping_macros
+@{ */
+/** @cond */
+#define QURT_PROT_NONE       0x00U   /**< */
+#define QURT_PROT_READ       0x01U   /**< */
+#define QURT_PROT_WRITE      0x02U   /**< */
+#define QURT_PROT_EXEC       0x04U   /**< */
+#define QURT_PROT_NODUMP     0x08U   /**< Skip dumping the mapping. During PD dump, some mappings
+                                          on host memory must be skipped to avoid a race condition
+                                          where the memory is removed from the host and the DSP process
+                                          crashes before the mapping is removed. */
+#define QURT_PROT_ISLAND     0x10U   /**< Island mapping. */
+
+#define QURT_MAP_SHARED      0x0001U /**< Shared. */
+#define QURT_MAP_PRIVATE     0x0002U /**< Private. */
+/** @endcond */
+#define QURT_MAP_NAMED_MEMSECTION 0x0004U /**< Named memsection. */
+#define QURT_MAP_FIXED       0x0010U /**< Fixed virtual address. */
+#define QURT_MAP_RENAME      0x0020U /**< Rename. */
+#define QURT_MAP_NORESERVE   0x0040U /**< No reserve. */
+#define QURT_MAP_INHERIT     0x0080U /**< Inherit. */
+#define QURT_MAP_NONPROCESS_VPOOL 0x0100U /**< Use a virtual address outside of the default range of the
+                                          processes. This option is only supported in the root process,
+                                          and only when the virtual memory split is enabled in the XML.
+                                          The root process can use this flag to create a mapping for a
+                                          user process. For example, if the virtual address space is
+                                          configured for a 3G/1G split, the root process can use this flag
+                                          to create a mapping in the top 1 GB area for the user process or
+                                          the lower 3 GB area for the root process. This is useful for
+                                          shared buffer use cases. */
+#define QURT_MAP_HASSEMAPHORE 0x0200U /**< Has semaphore. */
+#define QURT_MAP_TRYFIXED    0x0400U /**< Try to create a mapping at the virtual address that was passed.
+                                          If the passed virtual address fails, use a random virtual address. */
+#define QURT_MAP_WIRED       0x0800U /**< Wired. */
+#define QURT_MAP_FILE        0x0000U /**< File. */
+#define QURT_MAP_ANON        0x1000U /**< Allocate physical memory from the pool that was passed.
+                                          By default, memory is allocated from the default physpool. */
+#define QURT_MAP_VA_ONLY     0x2000U /**< Reserve a virtual address without
+                                          mapping it.
*/
+
+/** @cond */
+#define QURT_MAP_ALIGNED(n)      ((n) << QURT_MAP_ALIGNMENT_SHIFT)
+#define QURT_MAP_ALIGNMENT_SHIFT 24
+
+
+#define QURT_MAP_ALIGNMENT_MASK  QURT_MAP_ALIGNED(0xff) /**< */
+#define QURT_MAP_ALIGNMENT_64KB  QURT_MAP_ALIGNED(16)   /**< */
+#define QURT_MAP_ALIGNMENT_16MB  QURT_MAP_ALIGNED(24)   /**< */
+#define QURT_MAP_ALIGNMENT_4GB   QURT_MAP_ALIGNED(32)   /**< */
+#define QURT_MAP_ALIGNMENT_1TB   QURT_MAP_ALIGNED(40)   /**< */
+#define QURT_MAP_ALIGNMENT_256TB QURT_MAP_ALIGNED(48)   /**< */
+#define QURT_MAP_ALIGNMENT_64PB  QURT_MAP_ALIGNED(56)   /**< */
+/** @endcond */
+#define QURT_MAP_FAILED ((void *) -1) /**< Mapping creation failed. */
+
+/*
+|| The macros below are extensions beyond the standard mmap flags, but follow
+|| the style of the mmap flags.
+*/
+/** @cond */
+// Describe bitfields in (prot)
+#define QURT_PROT_CACHE_BOUNDS 16U,19U,7U /**< Bits 16 through 19 are the cache attribute, default is 7. */
+#define QURT_PROT_BUS_BOUNDS   20U,21U,0U /**< Bits 20 through 21 are the bus attributes, default is 0. */
+#define QURT_PROT_USER_BOUNDS  22U,23U,3U /**< Bits 22 through 23 are the user mode, default is 3;
+                                               a default of 3 means to derive the user mode setting from the
+                                               default mode of the client. */
+
+// Describe bitfields in (flags)
+#define QURT_MAP_PHYSADDR_BOUNDS 15U,15U,0U /**< Bit 15 is physaddr, default is 0. */
+#define QURT_MAP_TYPE_BOUNDS     16U,19U,0U /**< Bits 16 through 19 are the mapping type, default is 0. */
+#define QURT_MAP_REGION_BOUNDS   20U,23U,0U /**< Bits 20 through 23 are the region type, default is 0. */
+/** @endcond */
+
+// These macros get OR'ed into (prot)
+#define QURT_PROT_CACHE_MODE(n) QURT_MMAP_BUILD(QURT_PROT_CACHE_BOUNDS,(n)) /**< */
+#define QURT_PROT_BUS_ATTR(n)   QURT_MMAP_BUILD(QURT_PROT_BUS_BOUNDS,(n))   /**< */
+#define QURT_PROT_USER_MODE(n)  QURT_MMAP_BUILD(QURT_PROT_USER_BOUNDS,(n))  /**< */
+// These macros get OR'ed into (flags)
+
+#define QURT_MAP_PHYSADDR   QURT_MMAP_BUILD(QURT_MAP_PHYSADDR_BOUNDS,1U) /**< Use the physical address that was passed in the offset field.
+                                               This is allowed only for the root process. */
+#define QURT_MAP_TYPE(n)    QURT_MMAP_BUILD(QURT_MAP_TYPE_BOUNDS,(n))   /**< */
+#define QURT_MAP_REGION(n)  QURT_MMAP_BUILD(QURT_MAP_REGION_BOUNDS,(n)) /**< */
+/** @} */ /* end_addtogroup memory_mapping_macros */
+/** @cond */
+// These macros extract fields from (prot)
+#define QURT_PROT_GET_CACHE_MODE(n) QURT_MMAP_EXTRACT(QURT_PROT_CACHE_BOUNDS,(n)) /**< */
+#define QURT_PROT_GET_BUS_ATTR(n)   QURT_MMAP_EXTRACT(QURT_PROT_BUS_BOUNDS,(n))   /**< */
+#define QURT_PROT_GET_USER_MODE(n)  QURT_MMAP_EXTRACT(QURT_PROT_USER_BOUNDS,(n))  /**< */
+
+// These macros extract fields from (flags)
+#define QURT_MAP_GET_TYPE(n)   QURT_MMAP_EXTRACT(QURT_MAP_TYPE_BOUNDS,(n))   /**< */
+#define QURT_MAP_GET_REGION(n) QURT_MMAP_EXTRACT(QURT_MAP_REGION_BOUNDS,(n)) /**< */
+
+// Macros for bitfield insertion and extraction
+#define QURT_MMAP_MASK(lo,hi) (~((~0u) << ((hi)-(lo)+1U))) /**< Mask of same size as [lo..hi].
*/
+#define QURT_MMAP_BUILD_(lo,hi,def,n)   ((((n)^(def))&QURT_MMAP_MASK((lo),(hi)))<<(lo)) /**< */
+#define QURT_MMAP_EXTRACT_(lo,hi,def,n) ((((n)>>(lo))&QURT_MMAP_MASK((lo),(hi)))^(def)) /**< */
+#define QURT_MMAP_BUILD(a,b)   QURT_MMAP_BUILD_(a,b)   /**< */
+#define QURT_MMAP_EXTRACT(a,b) QURT_MMAP_EXTRACT_(a,b) /**< */
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mq.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mq.h
new file mode 100755
index 0000000000000..580c83d3de41a
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mq.h
@@ -0,0 +1,458 @@
+#ifndef QURT_MQ_H
+#define QURT_MQ_H
+/**
+  @file qurt_mq.h
+
+  @brief  Prototypes of secure message queues API functions.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2019-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+======================================================================*/
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        CONSTANTS AND MACROS
+=============================================================================*/
+#define QURT_MQ_NAME_MAXLEN 16U /**< Maximum name length. */
+
+
+/*=============================================================================
+        FORWARD DECLARATIONS & TYPEDEFS
+=============================================================================*/
+/* This enum must be generated in accordance with process class numbers.
+   For now it is made to match the generated version; do not change this unless
+   there is a corresponding change in process_class.py. Indices start from 0;
+   basically: QURT_MQ_SECURITY_SCOPE_ = (1 << QURTK_process_class_index_)
+*/
+typedef enum {
+    QURT_MQ_SECURITY_SCOPE_KERNEL      = ( 1U << 0 ),
+    QURT_MQ_SECURITY_SCOPE_SRM         = ( 1U << 1 ),
+    QURT_MQ_SECURITY_SCOPE_SECURE      = ( 1U << 2 ),
+    QURT_MQ_SECURITY_SCOPE_CPZ         = ( 1U << 3 ),
+    QURT_MQ_SECURITY_SCOPE_ROOT        = ( 1U << 4 ),
+    QURT_MQ_SECURITY_SCOPE_SIGNED      = ( 1U << 5 ),
+    QURT_MQ_SECURITY_SCOPE_UNSIGNED    = ( 1U << 6 ),
+    QURT_MQ_SECURITY_SCOPE_SECURE_ROOT = ( 1U << 7 )
+} qurt_mq_security_scope_t;
+
+typedef enum {
+    QURT_MQ_CARDINALITY_PTP = (1U << 0),
+    QURT_MQ_CARDINALITY_MTO = (1U << 1)
+} qurt_mq_cardinality_t;
+
+typedef unsigned int qurt_mqd_t;
+
+typedef union{
+    struct {
+        unsigned int perms:2;
+        unsigned int cardinality:1;
+        unsigned int blocking:1;
+
+        qurt_mq_security_scope_t creator_scope: 8;
+        qurt_mq_security_scope_t allowed_scope: 8; /* can be a bitmask in case of MTO */
+        unsigned int queue_closed: 1;
+        unsigned int reserved: 11;
+    }; /* anonymous struct */
+    unsigned int raw;
+} qurt_mq_flags_t;
+
+
+/* Permissions are from qurt_types.h; X (execute) is blocked, though. */
+#if 0
+/** Memory access permission. */
+typedef enum {
+    QURT_PERM_READ=0x1U,  /**< */
+    QURT_PERM_WRITE=0x2U, /**< */
+    QURT_PERM_EXECUTE=0x4U, /**< */
+    QURT_PERM_FULL=QURT_PERM_READ|QURT_PERM_WRITE|QURT_PERM_EXECUTE, /**< */
+} qurt_perm_t;
+#endif
+
+struct qurt_mq_attr {
+    unsigned flags;      /**< Configured flags. Only meaningful with get_attr(); only used for qurt_mq_flags_t.perms. */
+    unsigned mq_maxmsg;  /**< Maximum number of messages. Used with create() and get_attr.
*/
+    unsigned short mq_send_msgsize; /**< Maximum size (bytes) of a message in the receiver-facing queue,
+                                         from sender to receiver. */
+    unsigned short mq_recv_msgsize; /**< Maximum size (bytes) of a message in the sender-facing queue,
+                                         from receiver to sender. */
+    unsigned client_pid;            /**< Process ID of the client that is allowed to open the message queue
+                                         that was created using qurt_mq_create(). */
+    qurt_mq_cardinality_t cardinality; /**< Cardinality of the message queue connection, see below. */
+    qurt_mq_security_scope_t scope; /**< Security scope of the senders to the queue. */
+};
+
+
+/*=============================================================================
+        EXTERNS & FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_mq_attr_init
+   Initializes attributes to the default values used for creating the queue.
+
+   The initialize operation sets the following default attribute values: \n
+   - flags -- QURT_PERM_READ | QURT_PERM_WRITE \n
+   - mq_maxmsg -- 1 \n
+   - mq_send_msgsize -- 8 \n
+   - mq_recv_msgsize -- 8 \n
+   - client_pid -- -1 \n
+   - cardinality -- QURT_MQ_CARDINALITY_PTP \n
+   - scope -- QURT_MQ_SECURITY_SCOPE_SIGNED \n
+
+   @datatypes
+   #qurt_mq_attr
+
+   @param[in,out] attr  Pointer to the message queue attribute object to initialize.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_mq_attr_init(struct qurt_mq_attr * attr);
+
+/**@ingroup qurt_mq_attr_set_send_msgsize
+   Sets the maximum message size, in bytes, that the sender can send.
+   The maximum message length is configurable using the XML configuration, but is limited to a maximum of 62 bytes.
+
+   @datatypes
+   #qurt_mq_attr
+
+   @param[in,out] attr  Pointer to the message queue attribute object.
+   @param[in]     len   Length of message in bytes.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_mq_attr_set_send_msgsize (struct qurt_mq_attr *attr, size_t len);
+
+/**@ingroup qurt_mq_attr_set_recv_msgsize
+   Sets the maximum message size, in bytes, that the receiver can read.
+   The maximum message length is configurable using the XML configuration, but is limited to a maximum of 62 bytes.
+
+   @datatypes
+   #qurt_mq_attr
+
+   @param[in,out] attr  Pointer to the message queue attribute object.
+   @param[in]     len   Length of message in bytes.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_mq_attr_set_recv_msgsize (struct qurt_mq_attr *attr, size_t len);
+
+/**@ingroup qurt_mq_attr_set_maxmsg
+   Sets the maximum number of messages that can be queued in the message queue.
+   The message depth is configurable using the XML configuration.
+
+   @datatypes
+   #qurt_mq_attr
+
+   @param[in,out] attr   Pointer to the message queue attribute object.
+   @param[in]     depth  Maximum number of messages that can be queued.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_mq_attr_set_maxmsg (struct qurt_mq_attr *attr, unsigned int depth);
+
+/**@ingroup qurt_mq_attr_set_scope
+   Sets the scope of the message queue. A message queue created with a security
+   scope allows only a process class of that scope to open the message queue.
+
+   @datatypes
+   #qurt_mq_attr \n
+   #qurt_mq_security_scope_t
+
+   @param[in,out] attr   Pointer to the message queue attribute object.
+   @param[in]     scope  Scope of the message queue: \n
+         #QURT_MQ_SECURITY_SCOPE_KERNEL \n
+         #QURT_MQ_SECURITY_SCOPE_SRM \n
+         #QURT_MQ_SECURITY_SCOPE_SECURE \n
+         #QURT_MQ_SECURITY_SCOPE_CPZ \n
+         #QURT_MQ_SECURITY_SCOPE_ROOT \n
+         #QURT_MQ_SECURITY_SCOPE_SIGNED \n
+         #QURT_MQ_SECURITY_SCOPE_UNSIGNED
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_mq_attr_set_scope (struct qurt_mq_attr *attr, qurt_mq_security_scope_t scope);
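A sketch of the attribute flow these setters support (illustrative only; the queue name "srv_q" and the sizes below are made up, and `<qurt.h>` is an assumed umbrella include):

```c
#include <qurt.h>  /* assumed umbrella include */

/* Hypothetical server-side setup: a small point-to-point queue restricted
 * to signed processes. Names must fit QURT_MQ_NAME_MAXLEN (16) bytes. */
static int create_server_queue(qurt_mqd_t *mqd)
{
    struct qurt_mq_attr attr;

    qurt_mq_attr_init(&attr);                 /* defaults described above */
    qurt_mq_attr_set_maxmsg(&attr, 4);        /* up to 4 queued messages */
    qurt_mq_attr_set_send_msgsize(&attr, 32); /* within the 62-byte limit */
    qurt_mq_attr_set_recv_msgsize(&attr, 32);
    qurt_mq_attr_set_scope(&attr, QURT_MQ_SECURITY_SCOPE_SIGNED);

    return qurt_mq_create(mqd, "srv_q", &attr);
}
```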
+
+
+/**@ingroup qurt_mq_attr_set_client_pid
+   Sets the client_pid that can open this message queue.
+   If client_pid is set, the allowed scope is not considered when opening the message queue.
+
+   @datatypes
+   #qurt_mq_attr
+
+   @param[in,out] attr        Pointer to the message queue attribute object.
+   @param[in]     client_pid  Valid PID for the client process.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_mq_attr_set_client_pid (struct qurt_mq_attr *attr, unsigned client_pid);
+
+/**@ingroup qurt_mq_attr_set_flags
+   Sets the properties of the message queue.
+   The current implementation only sets the permissions for the message queue through the flags attribute.
+   The default is #QURT_PERM_READ | #QURT_PERM_WRITE; explicit permissions are not implemented.
+
+   @datatypes
+   #qurt_mq_attr
+
+   @param[in,out] attr   Pointer to the message queue attribute object.
+   @param[in]     flags  Permissions for the message queue.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_mq_attr_set_flags (struct qurt_mq_attr *attr, unsigned int flags);
+
+/**@ingroup qurt_mq_create
+   Creates a message queue with the provided name and attributes.
+   The calling process becomes the owner of the queue.
+   The name of the message queue is limited to 16 characters, including the NULL terminator.
+
+   @datatypes
+   #qurt_mq_attr \n
+   #qurt_mqd_t
+
+   @param[out] mqd   Returns the message queue descriptor if
+                     the message queue was successfully created.
+   @param[in]  name  String identifier of the message queue.
+   @param[in]  attr  Pointer to the initialized message queue attribute
+                     structure that specifies the attributes of the created message queue.
+
+   @return
+   #QURT_EOK -- Message queue created. \n
+   #QURT_EINVALID -- Invalid arguments. \n
+   #QURT_ENOSPC -- Maximum number of queues in the system is exceeded.
+
+   @dependencies
+   None.
+*/
+int qurt_mq_create(qurt_mqd_t *mqd, const char *name, struct qurt_mq_attr * attr);
+
+/**@ingroup qurt_mq_open
+   Opens a message queue connection between a process and a created message queue.
+
+   @datatypes
+   #qurt_mq_attr \n
+   #qurt_mqd_t
+
+   @param[out] mqd   Returns the message queue descriptor if the message
+                     queue was successfully opened.
+   @param[in]  name  String identifier of the message queue.
+   @param[in]  flags Flags that contain the properties that define the behavior of the message queue connection.
+                     Permissions:\n
+                     #QURT_PERM_READ \n
+                     #QURT_PERM_WRITE \n
+                     #QURT_PERM_READ | QURT_PERM_WRITE @tablebulletend
+                     Default is QURT_PERM_READ | QURT_PERM_WRITE; explicit permissions are not implemented. \n
+                     Cardinality: \n
+                     #QURT_MQ_CARDINALITY_PTP (default) \n
+                     #QURT_MQ_CARDINALITY_MTO (not implemented) \n
+                     Blocking: suspend the thread until the message queue with the specified name is created. \n
+                     Scope: security boundary to which the message queue and its users are constrained.
+                     It is coupled with the process privilege level/scope.\n
+                     #QURT_MQ_SECURITY_SCOPE_KERNEL \n
+                     #QURT_MQ_SECURITY_SCOPE_SRM \n
+                     #QURT_MQ_SECURITY_SCOPE_SECURE \n
+                     #QURT_MQ_SECURITY_SCOPE_CPZ \n
+                     #QURT_MQ_SECURITY_SCOPE_ROOT \n
+                     #QURT_MQ_SECURITY_SCOPE_SIGNED \n
+                     #QURT_MQ_SECURITY_SCOPE_UNSIGNED @tablebulletend
+
+   @return
+   #QURT_EOK -- Message queue connection successfully opened. \n
+   #QURT_EFAILED -- Message queue connection failed (for a non-blocking message queue). \n
+   #QURT_ENOTALLOWED -- Open failed due to security scope mismatch.
+
+   @dependencies
+   None.
+*/
+int qurt_mq_open (qurt_mqd_t *mqd, const char *name, qurt_mq_flags_t flags);
+
+/**@ingroup qurt_mq_send
+   Sends a message over the message queue.\n
+   - If the message queue is full, the calling thread shall be
+     suspended until space becomes available to enqueue the message. \n
+   - If a thread is suspended on an empty queue waiting
+     to receive a message, qurt_mq_send shall resume that thread.
+
+   @datatypes
+   #qurt_mqd_t
+
+   @param[in] mqd      Message queue descriptor.
+   @param[in] msg_ptr  Pointer to the message buffer.
+   @param[in] msg_len  Length of the message buffer in bytes.
+
+   @return
+   #QURT_EOK -- Message queue send was successful.\n
+   #QURT_EMSGSIZE -- Message size in the msg_len field is greater than the max_message_len specified during queue creation.\n
+   #QURT_ENOTALLOWED -- Send failed due to security scope mismatch.
+
+   @dependencies
+   None.
+*/
+int qurt_mq_send(qurt_mqd_t mqd, const char *msg_ptr, size_t msg_len);
+
+/**@ingroup qurt_mq_send_timed
+   Sends a message over the message queue.\n
+   - If the message queue is full, the calling thread shall be
+     suspended until space becomes available to enqueue the message or until the timeout is reached. \n
+   - If a thread is suspended on an empty queue waiting
+     to receive a message, qurt_mq_send_timed shall resume that thread.\n
+   - If the timeout is reached, qurt_mq_send_timed shall return #QURT_ETIMEDOUT.
+
+   @datatypes
+   #qurt_mqd_t
+
+   @param[in] mqd       Message queue descriptor.
+   @param[in] msg_ptr   Pointer to the message buffer.
+   @param[in] duration  Interval (in microseconds); the duration value must be
+                        between #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+   @param[in] msg_len   Length of the message buffer in bytes.
+
+   @return
+   #QURT_EOK -- Message queue send was successful. \n
+   #QURT_EMSGSIZE -- Message size in the msg_len field is greater than the max_message_len specified during queue creation.\n
+   #QURT_ENOTALLOWED -- Send failed due to security scope mismatch. \n
+   #QURT_ETIMEDOUT -- Timeout.
+
+   @dependencies
+   None.
+*/
+int qurt_mq_send_timed(qurt_mqd_t mqd, const char *msg_ptr, unsigned long long int duration, size_t msg_len);
+
+ /**@ingroup qurt_mq_recv
+   Receives a message from the message queue. \n
+   - If the message queue is empty, the calling thread shall be
+     suspended until a message is enqueued in the message queue. \n
+   - If a thread is suspended on a full queue waiting to
+     send a message, qurt_mq_recv shall resume that thread.
+
+   @datatypes
+   #qurt_mqd_t
+
+   @param[in]     mqd      Message queue descriptor.
+   @param[out]    msg_ptr  Pointer to the message buffer.
+   @param[in,out] msg_len  Pointer to the length of the message buffer.
+
+   @return
+   #QURT_EOK -- Message successfully received.\n
+   #QURT_EINVALID -- Message pointer or msg_len pointer is NULL. \n
+   #QURT_EBADR -- Message queue descriptor (mqd) is invalid. \n
+   #QURT_EBADF -- Sender closed the message queue.
+
+   @dependencies
+   None.
+*/
+int qurt_mq_recv(qurt_mqd_t mqd, unsigned char *msg_ptr, size_t *msg_len);
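A round-trip sketch pairing `qurt_mq_send()` with `qurt_mq_recv()` (illustrative; buffer sizes and the umbrella include `<qurt.h>` are assumptions):

```c
#include <stddef.h>
#include <qurt.h>  /* assumed umbrella include */

/* Hypothetical sender: post a short datagram. */
static int mq_ping(qurt_mqd_t mqd)
{
    static const char ping[] = "ping";
    return qurt_mq_send(mqd, ping, sizeof ping);
}

/* Hypothetical receiver: drain one message. On entry, len is the buffer
 * capacity; on return, it holds the received size. */
static int mq_drain_one(qurt_mqd_t mqd)
{
    unsigned char buf[64];
    size_t len = sizeof buf;
    return qurt_mq_recv(mqd, buf, &len);
}
```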
+
+ /**@ingroup qurt_mq_recv_timed
+   Receives a message from the message queue. \n
+   - If the message queue is empty, the calling thread shall be
+     suspended until a message is enqueued in the message queue or until the timeout is reached.\n
+   - If a thread is suspended on a full queue waiting to
+     send a message, qurt_mq_recv_timed shall resume that thread.\n
+   - If the timeout is reached, qurt_mq_recv_timed shall return #QURT_ETIMEDOUT.
+
+   @datatypes
+   #qurt_mqd_t
+
+   @param[in]     mqd       Message queue descriptor.
+   @param[out]    msg_ptr   Pointer to the message buffer.
+   @param[in]     duration  Interval (in microseconds); the duration value must be
+                            between #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+   @param[in,out] msg_len   Pointer to the length of the message buffer.
+
+   @return
+   #QURT_EOK -- Message successfully received.\n
+   #QURT_EINVALID -- Message pointer or msg_len pointer is NULL. \n
+   #QURT_EBADR -- Message queue descriptor (mqd) is invalid.\n
+   #QURT_EBADF -- Sender closed the message queue. \n
+   #QURT_ETIMEDOUT -- Timeout.
+
+   @dependencies
+   None.
+*/
+int qurt_mq_recv_timed(qurt_mqd_t mqd, unsigned char *msg_ptr, unsigned long long int duration, size_t *msg_len);
+
+ /**@ingroup qurt_mq_close
+   Closes the message queue and disassociates the calling process (client) from the message queue
+   under this descriptor. Marks the queue as closed for the receiver.
+   This function is expected to be called from the client side. If called
+   from the server side, the function reduces to a no-op and returns success.
+
+   @datatypes
+   #qurt_mqd_t
+
+   @param[in] mqd  Message queue descriptor.
+
+   @return
+   #QURT_EOK -- Message queue closed successfully.\n
+   #QURT_EBADR -- Invalid descriptor.\n
+   #QURT_ENOTALLOWED -- qurt_mq_close() was not called from the client side.
+
+   @dependencies
+   None.
+*/
+int qurt_mq_close(qurt_mqd_t mqd);
+
+ /**@ingroup qurt_mq_destroy
+   Destroys the message queue. This function must be
+   called from the process that called qurt_mq_create().
+
+   @datatypes
+   #qurt_mqd_t
+
+   @param[in] mqd  Message queue descriptor.
+
+   @return
+   #QURT_EOK -- Message queue destroyed successfully.\n
+   #QURT_EBADR -- Invalid descriptor.\n
+   #QURT_ENOTALLOWED -- qurt_mq_destroy() was not called from the creator process.
+
+   @dependencies
+   None.
+*/
+int qurt_mq_destroy(qurt_mqd_t mqd);
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+#endif //QURT_MQ_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mutex.h
new file mode 100755
index 0000000000000..4ad6b270cdde6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_mutex.h
@@ -0,0 +1,211 @@
+#ifndef QURT_MUTEX_H
+#define QURT_MUTEX_H
+/**
+  @file qurt_mutex.h
+  @brief Prototypes of the mutex API.
+  This is mostly a user space mutex, but it calls the
+  kernel to block if the mutex is taken.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup mutex_types
+@{ */
+/*=============================================================================
+        TYPEDEFS
+=============================================================================*/
+
+/** QuRT mutex type.
+
+    Both non-recursive mutex lock and unlock, and recursive
+    mutex lock and unlock can be applied to this type.
+ */
+typedef union qurt_mutex_aligned8{
+    /** @cond */
+    struct {
+        unsigned int holder;
+        unsigned int count;
+        unsigned int queue;
+        unsigned int wait_count;
+    };
+    unsigned long long int raw;
+    /** @endcond */
+} qurt_mutex_t;
+/** @} */ /* end_addtogroup mutex_types */
+/*=============================================================================
+        CONSTANTS AND MACROS
+=============================================================================*/
+/** @addtogroup mutex_const_macros
+@{ */
+#define MUTEX_MAGIC 0xfe /**< */
+#define QURTK_FUTEX_FREE_MAGIC 0x1F // 11111 /**< */
+#define QURT_MUTEX_INIT {{MUTEX_MAGIC, 0, QURTK_FUTEX_FREE_MAGIC,0}} /**< Suitable as an initializer for a
+                                                                          variable of type qurt_mutex_t. */
+/** @} */ /* end_addtogroup mutex_const_macros */
+/*=============================================================================
+        FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_mutex_init
+   Initializes a mutex object.
+   The mutex is initially unlocked.
+
+   @note1hang Each mutex-based object has one or more kernel resources associated with it;
+   to prevent resource leaks, call qurt_mutex_destroy()
+   when this object is no longer used.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[out] lock  Pointer to the mutex object. Returns the initialized object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_mutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_mutex_destroy
+   Destroys the specified mutex.
+
+   @note1hang Mutexes must be destroyed when they are no longer in use. Failure to do this
+   causes resource leaks in the QuRT kernel.\n
+   @note1cont Mutexes must not be destroyed while they are still in use. If this occurs, the
+   behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock  Pointer to the mutex object to destroy.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_mutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_mutex_lock
+   Locks the specified mutex.
+   If a thread performs a lock operation on a mutex that is not in use, the thread gains
+   access to the shared resource that is protected by the mutex, and continues executing.
+
+   If a thread performs a lock operation on a mutex that is already in use by another
+   thread, the thread is suspended. When the mutex becomes available again (because the
+   other thread has unlocked it), the thread is awakened and given access to the shared
+   resource.
+
+   @note1hang A thread is suspended indefinitely if it locks a mutex that it has already
+   locked. Avoid this by using recursive mutexes (Section @xref{dox:recursive_mutexes}).
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock  Pointer to the mutex object. Specifies the mutex to lock.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_mutex_lock(qurt_mutex_t *lock); /* blocking */
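A minimal usage sketch for the functions above (illustrative; `<qurt.h>` is an assumed umbrella include):

```c
#include <qurt.h>  /* assumed umbrella include */

/* Sketch: a statically initialized mutex guarding a shared counter. */
static qurt_mutex_t g_lock = QURT_MUTEX_INIT;
static unsigned int g_count;

static void bump(void)
{
    qurt_mutex_lock(&g_lock);
    g_count++;                  /* critical section */
    qurt_mutex_unlock(&g_lock);
}
```

Per the notes above, a mutex set up with `qurt_mutex_init()` should be paired with `qurt_mutex_destroy()` once it is no longer in use.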
+
+/**@ingroup func_qurt_mutex_lock_timed
+   Locks the specified mutex.
+   When a thread performs a lock operation on a mutex that is not in use, the thread gains
+   access to the shared resource that is protected by the mutex, and continues executing.
+
+   When a thread performs a lock operation on a mutex that is already in use by another
+   thread, the thread is suspended. When the mutex becomes available again (because the
+   other thread has unlocked it), the thread is awakened and given access to the shared
+   resource. If the duration of the suspension exceeds the timeout duration, the wait is
+   terminated and no access to the mutex is granted.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock      Pointer to the mutex object; specifies the mutex to lock.
+   @param[in] duration  Interval (in microseconds); the duration value must be between #QURT_TIMER_MIN_DURATION and
+                        #QURT_TIMER_MAX_DURATION.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_ETIMEDOUT -- Timeout.
+
+   @dependencies
+   None.
+ */
+int qurt_mutex_lock_timed (qurt_mutex_t * lock, unsigned long long int duration);
+
+/**@ingroup func_qurt_mutex_unlock
+   Unlocks the specified mutex. \n
+   More than one thread can be suspended on a mutex. When the mutex is unlocked, only the
+   highest-priority thread waiting on the mutex is awakened. If the awakened thread has
+   higher priority than the current thread, a context switch occurs.
+
+   @note1hang The behavior of QuRT is undefined if a thread unlocks a mutex it did not first
+   lock.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock  Pointer to the mutex object. Specifies the mutex to unlock.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_mutex_unlock(qurt_mutex_t *lock); /* unlock */
+
+/**@ingroup func_qurt_mutex_try_lock
+   @xreflabel{hdr:qurt_mutex_try_lock}
+   Attempts to lock the specified mutex.
+   If a thread performs a try_lock operation on a mutex that is not in use, the thread gains
+   access to the shared resource that is protected by the mutex, and continues executing.
+
+   @note1hang If a thread performs a try_lock operation on a mutex that it has already locked
+   or that is in use by another thread, qurt_mutex_try_lock immediately returns with a
+   nonzero result value.
+
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock  Pointer to the mutex object. Specifies the mutex to lock.
+
+   @return
+   0 -- Success. \n
+   Nonzero -- Failure.
+
+   @dependencies
+   None.
+ */
+int qurt_mutex_try_lock(qurt_mutex_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_MUTEX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_os_services.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_os_services.h
new file mode 100755
index 0000000000000..cbc4c239e9620
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_os_services.h
@@ -0,0 +1,24 @@
+/*=============================================================================
+
+                 qurt_os_services.h
+
+GENERAL DESCRIPTION
+
+EXTERNAL FUNCTIONS
+        None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+        None.
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+=============================================================================*/
+
+#define QURT_OS_SERVICE_THREAD     "/os/thread"      /**< Thread service. */
+#define QURT_OS_SERVICE_FS_HUB     "/os/fs_hub"      /**< File system hub service. */
+#define QURT_OS_SERVICE_CALLBACK   "/os/callback"    /**< QDI callback service. */
+#define QURT_OS_SERVICE_INTERRUPTS "/os/interrupt"   /**< Interrupt service. */
+#define QURT_OS_SERVICE_PROXY      "/os/proxy"       /**< QDI proxy service. */
+#define QURT_OS_SERVICE_MEMORY     "/os/memory"      /**< Memory management service. */
+#define QURT_OS_SERVICE_MEMPOOL    "/os/mempool"     /**< Pool management service. */
+#define QURT_OS_SERVICE_PROCESS    "/os/process"     /**< Process management service. */
+#define QURT_OS_SERVICE_MMAP       "/os/mem_mapper"  /**< Memory mapper service. */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pimutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pimutex.h
new file mode 100755
index 0000000000000..61aee5cba7ce8
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pimutex.h
@@ -0,0 +1,200 @@
+#ifndef QURT_PIMUTEX_H
+#define QURT_PIMUTEX_H 1
+/**
+  @file qurt_pimutex.h
+  @brief Prototypes of the qurt_pimutex API.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_pimutex_init
+   Initializes a priority inheritance mutex object.
+   The priority inheritance mutex is initially unlocked.
+
+   This function works the same as qurt_mutex_init().
+
+   @note1hang Each pimutex-based object has one or more kernel resources associated with it;
+   to prevent resource leaks, call qurt_pimutex_destroy()
+   when this object is no longer used.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[out] lock  Pointer to the priority inheritance mutex object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_pimutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_destroy
+   Destroys the specified priority inheritance mutex.
+
+   @note1hang Priority inheritance mutexes must be destroyed when they are no longer in
+   use. Failure to do this causes resource leaks in the QuRT kernel.\n
+   @note1cont Priority inheritance mutexes must not be destroyed while they are still in use.
+   If this occurs, the behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock  Pointer to the priority inheritance mutex object to destroy.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_pimutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_lock
+   Requests access to a shared resource. If a thread performs a lock operation on a mutex
+   that is not in use, the thread gains access to the shared resource that the mutex protects,
+   and continues executing.
+
+   If a thread performs a lock operation on a mutex that is already in use by another
+   thread, the thread is suspended. When the mutex becomes available again (because the
+   other thread has unlocked it), the thread is awakened and given access to the shared resource.
+
+   If a thread is suspended on a priority inheritance mutex, and the priority of the suspended
+   thread is higher than the priority of the thread that has locked the mutex, the thread
+   with the mutex acquires the higher priority of the suspended thread. The locker thread blocks
+   until the lock is available.
+
+   @note1hang A thread is not suspended if it locks a priority inheritance mutex that it has
+   already locked. However, the mutex does not become available to other
+   threads until the thread performs a balanced number of unlocks on the mutex.\n
+   @note1cont When multiple threads compete for a mutex, the lock operation for a priority
+   inheritance mutex is slower than it is for a recursive mutex.
+   In particular, it is about 10 times slower when the mutex is available for locking,
+   and slower (with greatly varying times) when the mutex is already locked.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock  Pointer to the priority inheritance mutex object to lock.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_pimutex_lock(qurt_mutex_t *lock);
+
+
+/**@ingroup func_qurt_pimutex_lock_timed
+   Locks a priority inheritance mutex with a timeout.
+
+   A thread can lock a priority inheritance mutex multiple times. The mutex is not
+   available to other threads until the thread performs the same number of mutex unlock
+   operations.
+
+   If a thread performs a lock operation on a mutex that is already locked by another thread,
+   the thread is moved to the waiting state. When the mutex becomes available again (because the
+   other thread has unlocked the mutex), the thread is awakened and tries to lock the mutex.
+
+   If a thread is waiting on a priority inheritance mutex, and the priority of the waiting thread
+   is higher than the priority of the thread that has locked the mutex, the priority of the thread
+   that has locked the mutex is raised to the priority of the waiting thread.
+
+   If the duration of the wait exceeds the timeout duration, the wait is terminated, and
+   the function returns #QURT_ETIMEDOUT as a failure of the mutex lock.
+
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock      Pointer to the mutex object to lock.
+   @param[in] duration  Duration (in microseconds) to wait. The duration value must be between
+                        #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_ETIMEDOUT -- Timeout. \n
+   #QURT_EINVALID -- Duration is out of range.
+
+   @dependencies
+   None.
+
+ */
+int qurt_pimutex_lock_timed(qurt_mutex_t *lock, unsigned long long int duration);
+
+
+/**@ingroup func_qurt_pimutex_unlock
+   Releases access to a shared resource; unlocks the specified priority inheritance mutex. \n
+   More than one thread can be suspended on a priority inheritance mutex. When the mutex
+   is unlocked, only the highest-priority thread waiting on the mutex is awakened. If the
+   awakened thread has higher priority than the current thread, a context switch occurs.
+
+   When a thread unlocks a priority inheritance mutex, its thread priority is restored to its
+   original value from any higher priority value that it acquired from another thread
+   suspended on the mutex.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock  Pointer to the priority inheritance mutex object to unlock.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_pimutex_unlock(qurt_mutex_t *lock);
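A bounded-wait sketch for the timed variant above (illustrative; the 5 ms deadline is arbitrary and `<qurt.h>` is an assumed umbrella include):

```c
#include <qurt.h>  /* assumed umbrella include */

/* Sketch: bounded wait on a PI mutex. The 5000 us deadline must lie within
 * QURT_TIMER_MIN_DURATION..QURT_TIMER_MAX_DURATION. */
static int with_deadline(qurt_mutex_t *m)
{
    int rc = qurt_pimutex_lock_timed(m, 5000ULL);
    if (rc == QURT_EOK) {
        /* ...critical section... */
        qurt_pimutex_unlock(m);
    }
    return rc;  /* QURT_ETIMEDOUT if the mutex never became available */
}
```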
+
+/**@ingroup func_qurt_pimutex_try_lock
+   Requests access to a shared resource (without suspending).
+   Attempts to lock the specified priority inheritance mutex.\n
+   If a thread performs a try_lock operation on a priority inheritance mutex that is not in
+   use, the thread gains access to the shared resource that is protected by the mutex, and
+   continues executing.
+   If a thread performs a try_lock operation on a priority inheritance mutex that is already
+   in use by another thread, qurt_pimutex_try_lock immediately returns with a
+   nonzero result value.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock  Pointer to the priority inheritance mutex object to lock.
+
+   @return
+   0 -- Success. \n
+   Nonzero -- Failure.
+
+   @dependencies
+   None.
+ */
+int qurt_pimutex_try_lock(qurt_mutex_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PIMUTEX_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pimutex2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pimutex2.h
new file mode 100755
index 0000000000000..b809f163cbfd2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pimutex2.h
@@ -0,0 +1,162 @@
+#ifndef QURT_PIMUTEX2_H
+#define QURT_PIMUTEX2_H
+/**
+  @file qurt_pimutex2.h
+  @brief Prototypes of the pimutex2 API.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+#include
+#include
+
+/*=============================================================================
+        FUNCTIONS
+=============================================================================*/
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_pimutex2_init
+   Initializes a recursive mutex object.
+
+   @deprecated Use #qurt_pimutex_init instead.
+
+   The recursive mutex is initially unlocked.
+
+   Objects of type pimutex2 solve a potential race condition between
+   unlock() and destroy() operations.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[out] lock  Pointer to the recursive mutex object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_pimutex2_init(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_pimutex2_destroy
+
+   @deprecated Use #qurt_pimutex_destroy instead.
+
+   Destroys the specified recursive mutex. \n
+   @note1cont Recursive mutexes must not be destroyed while they are still in use. If this
+   occurs, the behavior of QuRT is undefined.
+   @note1cont In general, application code should destroy a pimutex2 object prior to
+   deallocating it; calling qurt_pimutex2_destroy() before deallocating it ensures
+   that all qurt_pimutex2_unlock() calls complete.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[in] lock  Pointer to the recursive mutex object to destroy.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_pimutex2_destroy(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_pimutex2_lock
+
+   @deprecated Use #qurt_pimutex_lock instead.
+
+   Locks the specified recursive mutex. \n
+
+   If a thread performs a lock operation on a recursive mutex that is not being used, the
+   thread gains access to the shared resource that is protected by the mutex, and continues
+   executing.
+
+   If a thread performs a lock operation on a recursive mutex that is already being used by
+   another thread, the thread is suspended.
When the mutex becomes available again
+   (because the other thread has unlocked it), the thread is awakened and given access to the
+   shared resource.
+
+   @note1hang A thread is not suspended if it locks a recursive mutex that it has already
+   locked, but the mutex does not become available until the thread performs a
+   balanced number of unlocks on the mutex.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[in] lock  Pointer to the recursive mutex object to lock.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_pimutex2_lock(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_pimutex2_unlock
+
+   @deprecated Use #qurt_pimutex_unlock instead.
+
+   Unlocks the specified recursive mutex. \n
+   More than one thread can be suspended on a recursive mutex. When the mutex is
+   unlocked, only the highest-priority thread waiting on the mutex is awakened. If the
+   awakened thread has higher priority than the current thread, a context switch occurs.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[in] lock  Pointer to the recursive mutex object to unlock.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_pimutex2_unlock(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_pimutex2_try_lock
+
+   @deprecated Use #qurt_pimutex_try_lock instead.
+
+   Attempts to lock the specified recursive mutex.\n
+
+   Non-blocking version of qurt_pimutex2_lock(). If a call to qurt_pimutex2_lock() would
+   succeed immediately, this function behaves similarly, and returns 0 for success.
+   If a call to qurt_pimutex2_lock() would not succeed immediately, this function has
+   no effect and returns non-zero for failure.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[in] lock  Pointer to the recursive mutex object to lock.
+
+   @return
+   0 -- Success. \n
+   Nonzero -- Failure.
+
+ */
+int qurt_pimutex2_try_lock(qurt_rmutex2_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PIMUTEX2_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pipe.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pipe.h
new file mode 100755
index 0000000000000..6bdaa044f8640
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pipe.h
@@ -0,0 +1,479 @@
+#ifndef QURT_PIPE_H
+#define QURT_PIPE_H
+/**
+  @file qurt_pipe.h
+  @brief Prototypes of the pipe interface API.
+  This is a pipe or message queue.
+  It blocks when too full (send) or empty (receive).
+  Unless a nonblocking option is used, all datagrams are 64 bits.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+  Copyright (c) 2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup pipe_types
+@{ */
+/*=============================================================================
+        CONSTANTS AND MACROS
+=============================================================================*/
+#define QURT_PIPE_MAGIC 0xF1FEF1FE           /**< Magic. */
+#define QURT_PIPE_ATTR_MEM_PARTITION_RAM 0   /**< RAM. */
+#define QURT_PIPE_ATTR_MEM_PARTITION_TCM 1   /**< TCM. */
+
+/*=============================================================================
+        TYPEDEFS
+=============================================================================*/
+/** QuRT pipe data values type.
*/ +typedef unsigned long long int qurt_pipe_data_t; + +/** QuRT pipe type.*/ +typedef struct { + /** @cond */ + qurt_mutex_t pipe_lock; + qurt_sem_t senders; + qurt_sem_t receiver; + unsigned int size; + unsigned int sendidx; + unsigned int recvidx; + void (*lock_func)(qurt_mutex_t *); + void (*unlock_func)(qurt_mutex_t *); + int (*try_lock_func)(qurt_mutex_t *); + void (*destroy_lock_func)(qurt_mutex_t *); + unsigned int magic; + qurt_pipe_data_t *data; + /** @endcond */ +} qurt_pipe_t; + +/** QuRT pipe attributes type. */ +typedef struct { + /** @cond */ + qurt_pipe_data_t *buffer; + unsigned int elements; + unsigned char mem_partition; + /** @endcond */ +} qurt_pipe_attr_t; + +/** @} */ /* end_addtogroup pipe_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_pipe_attr_init + @xreflabel{hdr:qurt_pipe_attr_init} + Initializes the structure that sets the pipe attributes when a pipe is created. + + After an attribute structure is initialized, the individual attributes in the structure are + explicitly set using the pipe attribute operations. + + The attribute structure is assigned the following default values: \n + - buffer -- 0 \n + - elements -- 0 \n + - mem_partition -- #QURT_PIPE_ATTR_MEM_PARTITION_RAM + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_init(qurt_pipe_attr_t *attr) +{ + attr->buffer = NULL; + attr->elements = 0; + attr->mem_partition = QURT_PIPE_ATTR_MEM_PARTITION_RAM; +} + +/**@ingroup func_qurt_pipe_attr_set_buffer + @xreflabel{sec:qurt_pipe_attr_set_buffer} + Sets the pipe buffer address attribute.\n + Specifies the base address of the memory area to use for the data buffer of a pipe. + + The base address and size (Section @xref{sec:qurt_pipe_attr_set_elements}) specify the + memory area used as a pipe data buffer. The user is responsible for allocating the + memory area used for the buffer. + + @datatypes + #qurt_pipe_attr_t \n + #qurt_pipe_data_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] buffer Pointer to the buffer base address. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_buffer(qurt_pipe_attr_t *attr, qurt_pipe_data_t *buffer) +{ + attr->buffer = buffer; +} + +/**@ingroup func_qurt_pipe_attr_set_elements + @xreflabel{sec:qurt_pipe_attr_set_elements} + Specifies the length of the memory area to use for the data buffer of a pipe. + + The length is expressed in terms of the number of 64-bit data elements that + can be stored in the buffer. + + The base address (Section @xref{sec:qurt_pipe_attr_set_buffer}) and size specify + the memory area used as a pipe data buffer. The user is responsible for + allocating the memory area used for the buffer. + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] elements Pipe length (64-bit elements). + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_elements(qurt_pipe_attr_t *attr, unsigned int elements) +{ + attr->elements = elements; +} + +/**@ingroup func_qurt_pipe_attr_set_buffer_partition + @xreflabel{sec:qurt_pipe_attr_set_buffer_partition} + Specifies the memory type where a pipe's buffer is allocated. + Allocate pipes in RAM or TCM/LPM. 
+
+   @note1hang If a pipe is specified as allocated in TCM/LPM, it must be created
+   with the qurt_pipe_init() operation. The qurt_pipe_create() operation results in an error.
+
+   @datatypes
+   #qurt_pipe_attr_t
+
+   @param[in,out] attr           Pointer to the pipe attribute structure.
+   @param[in]     mem_partition  Pipe memory partition. Values: \n
+         - #QURT_PIPE_ATTR_MEM_PARTITION_RAM -- Pipe resides in RAM \n
+         - #QURT_PIPE_ATTR_MEM_PARTITION_TCM -- Pipe resides in TCM/LPM @tablebulletend
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline void qurt_pipe_attr_set_buffer_partition(qurt_pipe_attr_t *attr, unsigned char mem_partition)
+{
+    attr->mem_partition = mem_partition;
+}
+
+/**@ingroup func_qurt_pipe_create
+   Creates a pipe.\n
+   Allocates a pipe object and its associated data buffer, and initializes the pipe object.
+
+   @note1hang The buffer address and size stored in the attribute structure specify how the
+   pipe data buffer is allocated.
+
+   @note1cont If a pipe is specified as allocated in TCM/LPM, it must be created
+   using the qurt_pipe_init() operation. The qurt_pipe_create() operation results in an error.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_attr_t
+
+   @param[out] pipe  Returns a pointer to the created pipe object.
+   @param[in]  attr  Pointer to the attribute structure used to create the pipe.
+
+   @return
+   #QURT_EOK -- Pipe created. \n
+   #QURT_EFAILED -- Pipe not created. \n
+   #QURT_ENOTALLOWED -- Pipe cannot be created in TCM/LPM.
+
+   @dependencies
+   None.
+ */
+int qurt_pipe_create(qurt_pipe_t **pipe, qurt_pipe_attr_t *attr);
+
+/**@ingroup func_qurt_pipe_init
+   Initializes a pipe object using an existing data buffer.
+
+   @note1hang The buffer address and size stored in the attribute structure must
+   specify a data buffer that the user has already allocated.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_attr_t
+
+   @param[out] pipe  Pointer to the pipe object to initialize.
+   @param[in]  attr  Pointer to the pipe attribute structure used to initialize the pipe.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EFAILED -- Failure.
+
+   @dependencies
+   None.
+ */
+int qurt_pipe_init(qurt_pipe_t *pipe, qurt_pipe_attr_t *attr);
+
+/**@ingroup func_qurt_pipe_destroy
+   @xreflabel{sec:qurt_pipe_destroy}
+   Destroys the specified pipe.
+
+   @note1hang Pipes must be destroyed when they are no longer in use. Failure
+   to do this causes resource leaks in the QuRT kernel.
+   Pipes must not be destroyed while they are still in use. If this
+   occurs, the behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_pipe_t
+
+   @param[in] pipe  Pointer to the pipe object to destroy.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_pipe_destroy(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_delete
+   Deletes the pipe.\n
+   Destroys the specified pipe (Section @xref{sec:qurt_pipe_destroy}) and deallocates the pipe object and its
+   associated data buffer.
+
+   @note1hang Delete pipes only if they were created using qurt_pipe_create
+   (and not qurt_pipe_init). Otherwise the behavior of QuRT is undefined. \n
+   @note1cont Pipes must be deleted when they are no longer in use. Failure to do this
+   causes resource leaks in the QuRT kernel.\n
+   @note1cont Pipes must not be deleted while they are still in use. If this occurs, the
+   behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_pipe_t
+
+   @param[in] pipe  Pointer to the pipe object to destroy.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_pipe_delete(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_send
+   Writes a data item to the specified pipe. \n
+   If a thread writes to a full pipe, it is suspended on the pipe. When another thread reads
+   from the pipe, the suspended thread is awakened and can then write data to the pipe.
+
+   Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+   pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_data_t
+
+   @param[in] pipe  Pointer to the pipe object to write to.
+   @param[in] data  Data item to write.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_pipe_send(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_receive
+   Reads a data item from the specified pipe.
+
+   If a thread reads from an empty pipe, it is suspended on the pipe. When another thread
+   writes to the pipe, the suspended thread is awakened and can then read data from the pipe.
+   Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+   pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t
+
+   @param[in] pipe  Pointer to the pipe object to read from.
+
+   @return
+   Integer containing the 64-bit data item from the pipe.
+
+   @dependencies
+   None.
+*/
+qurt_pipe_data_t qurt_pipe_receive(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_try_send
+   Writes a data item to the specified pipe (without suspending the thread if the pipe is full).\n
+
+   If a thread writes to a full pipe, the operation returns -1 immediately.
+   Otherwise, it returns 0 to indicate a successful write operation.
+
+   Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+   pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_data_t
+
+   @param[in] pipe  Pointer to the pipe object to write to.
+   @param[in] data  Data item to write.
+
+   @return
+   0 -- Success. \n
+   -1 -- Failure (pipe full).
+
+   @dependencies
+   None.
+*/
+int qurt_pipe_try_send(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_try_receive
+   Reads a data item from the specified pipe (without suspending the thread if the pipe is
+   empty).\n
+   If a thread reads from an empty pipe, the operation returns immediately with success set
+   to -1. Otherwise, success is always set to 0 to indicate a successful read operation.\n
+
+   Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+   pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t
+
+   @param[in]  pipe     Pointer to the pipe object to read from.
+   @param[out] success  Pointer to the operation status result.
+
+   @return
+   Integer containing a 64-bit data item from the pipe.
+
+   @dependencies
+   None.
+*/
+qurt_pipe_data_t qurt_pipe_try_receive(qurt_pipe_t *pipe, int *success);
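A non-blocking producer/consumer sketch for the two try variants above (illustrative; `<qurt.h>` is an assumed umbrella include and the datagram value is made up):

```c
#include <qurt.h>  /* assumed umbrella include */

/* Sketch: poll a pipe of 64-bit datagrams without ever suspending. */
static void pipe_poll(qurt_pipe_t *p)
{
    int ok;
    qurt_pipe_data_t d;

    if (qurt_pipe_try_send(p, 0x1234ULL) != 0) {
        /* pipe full: drop, or retry later */
    }

    d = qurt_pipe_try_receive(p, &ok);
    if (ok == 0) {
        (void)d;  /* d holds one 64-bit datagram */
    }
}
```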
+
+/**@ingroup func_qurt_pipe_receive_cancellable
+   Reads a data item from the specified pipe (with suspend), cancellable.
+
+   If a thread reads from an empty pipe, it is suspended on the pipe. When another thread
+   writes to the pipe, the suspended thread is awakened and can then read data from the pipe.
+   The operation is canceled if the user process of the calling thread is killed,
+   or if the calling thread must finish its current QDI invocation and return to user space.
+   A root PD thread can use this API to wait on the pipe for receiving; it is resumed with
+   #QURT_EDESTROY if the pipe is destroyed.
+   Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+   pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_data_t
+
+   @param[in]  pipe    Pointer to the pipe object to read from.
+   @param[out] result  Pointer to the location where the received 64-bit data item is stored.
+
+   @return
+   #QURT_EOK -- Receive completed. \n
+   #QURT_ECANCEL -- Receive canceled. \n
+   #QURT_EDESTROY -- Pipe was destroyed. \n
+   #QURT_ENOTALLOWED -- Pipe is not initialized.
+
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_pipe_receive_cancellable(qurt_pipe_t *pipe, qurt_pipe_data_t *result);
+
+/**@ingroup func_qurt_pipe_send_cancellable
+   @xreflabel{hdr:qurt_pipe_send_cancellable}
+   Writes a data item to the specified pipe (with suspend), cancellable. \n
+   If a thread writes to a full pipe, it is suspended on the pipe. When another thread reads
+   from the pipe, the suspended thread is awakened and can then write data to the pipe.
+   The operation is canceled if the user process of the calling thread is killed, or if the
+   calling thread must finish its current QDI invocation and return to user space.
+   A root PD thread can use this API to wait on the pipe for sending; it is resumed with
+   #QURT_EDESTROY if the pipe is destroyed.
+
+   Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+   64-bit data item per operation.
+
+   @note1hang Transfer data items larger than 64 bits by reading and writing
+   pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+   @datatypes
+   #qurt_pipe_t \n
+   #qurt_pipe_data_t
+
+   @param[in] pipe  Pointer to the pipe object to write to.
+   @param[in] data  Data item to write.
+
+   @return
+   #QURT_EOK -- Send completed. \n
+   #QURT_ECANCEL -- Send canceled. \n
+   #QURT_EDESTROY -- Pipe was destroyed. \n
+   #QURT_ENOTALLOWED -- Pipe is not initialized.
+
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_pipe_send_cancellable(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_is_empty
+   Returns a value indicating whether the specified pipe contains any data.
+
+   @datatypes
+   #qurt_pipe_t
+
+   @param[in] pipe  Pointer to the pipe object to read from.
+
+   @return
+   1 -- Pipe contains no data. \n
+   0 -- Pipe contains data.
+
+   @dependencies
+   None.
+*/
+int qurt_pipe_is_empty(qurt_pipe_t *pipe);
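A service-loop sketch for the cancellable receive above (illustrative; the teardown handling reflects the documented return codes, and `<qurt.h>` is an assumed umbrella include):

```c
#include <qurt.h>  /* assumed umbrella include */

/* Sketch: a loop that survives process teardown while blocked on the pipe. */
static void pipe_service(qurt_pipe_t *p)
{
    for (;;) {
        qurt_pipe_data_t d;
        int rc = qurt_pipe_receive_cancellable(p, &d);
        if (rc != QURT_EOK) {
            break;  /* QURT_ECANCEL or QURT_EDESTROY: exit cleanly */
        }
        /* ...handle d... */
    }
}
```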
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pmem_manager.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pmem_manager.h
new file mode 100755
index 0000000000000..8c8da985228b9
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pmem_manager.h
@@ -0,0 +1,82 @@
+#ifndef QURT_PMEM_MANAGER_H
+#define QURT_PMEM_MANAGER_H
+/**
+  @file qurt_pmem_manager.h
+  Prototypes of kernel physical memory manager APIs
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Constants and macros
+ ======================================================================*/
+
+/* physical memory API return error code */
+#define QURT_PMEM_SUCCESS               0
+#define QURT_PMEM_NO_PRIV               1
+#define QURT_PMEM_RETRY                 2
+#define QURT_PMEM_OVERLAP               3
+#define QURT_PMEM_NOT_EXIST             4
+#define QURT_PMEM_INIT_FAILURE          5
+#define QURT_PMEM_OUTSTANDING_MAPPING   6
+#define QURT_PMEM_GENERIC_FAILURE       7
+#define QURT_PMEM_ENTRY_FOUND           8
+#define QURT_PMEM_REACH_END             9
+#define QURT_PMEM_UNCLAIMED             10
+#define QURT_PMEM_ALREADY_CLAIMED       11
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_pmem_acquire
+  Acquires ownership of a specific physical memory region.
+
+  @note1hang Ownership is assigned to the caller.
+
+  @param[in] ppage  Starting physical page number
+  @param[in] pnum   Number of physical pages
+
+  @return
+  #QURT_PMEM_NO_PRIV -- Caller has no privilege to claim ownership. \n
+  #QURT_PMEM_OVERLAP -- All or part of the range is already owned. \n
+  #QURT_PMEM_SUCCESS -- Ownership claimed successfully.
+
+  @dependencies
+  None.
+*/
+int qurt_pmem_acquire(unsigned int ppage, unsigned int pnum);
+
+/**@ingroup func_qurt_pmem_release
+  Releases ownership of a specific physical memory region.
+
+  @param[in] ppage  Starting physical page number
+  @param[in] pnum   Number of physical pages
+
+  @return
+  #QURT_PMEM_NO_PRIV -- Caller has no privilege to release ownership. \n
+  #QURT_PMEM_NOT_EXIST -- The physical memory range is not usable. \n
+  #QURT_PMEM_OUTSTANDING_MAPPING -- There is an outstanding mapping in this range. \n
+  #QURT_PMEM_SUCCESS -- Ownership released successfully.
+
+  @dependencies
+  None.
+ */
+int qurt_pmem_release(unsigned int ppage, unsigned int pnum);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PMEM_MANAGER_H */
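[Editor's usage sketch, not part of the patch: claim a physical range, use it, and release it, checking the documented return codes. The ppage/pnum values are caller-supplied and purely illustrative.]

    #include "qurt_pmem_manager.h"

    int claim_use_release(unsigned int ppage, unsigned int pnum)
    {
        int rc = qurt_pmem_acquire(ppage, pnum);
        if (rc != QURT_PMEM_SUCCESS) {
            return rc;   /* e.g. QURT_PMEM_NO_PRIV or QURT_PMEM_OVERLAP */
        }

        /* ... create mappings and use the memory here ... */

        /* Release fails with QURT_PMEM_OUTSTANDING_MAPPING while any
           mapping into this range still exists. */
        return qurt_pmem_release(ppage, pnum);
    }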
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pmu.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pmu.h
new file mode 100755
index 0000000000000..73ea8eba04abf
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_pmu.h
@@ -0,0 +1,121 @@
+#ifndef QURT_PMU_H
+#define QURT_PMU_H
+/**
+  @file qurt_pmu.h
+  Prototypes of the performance monitoring unit (PMU) API.
+
+  EXTERNAL FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021 Qualcomm Technologies, Inc.
+  All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+  FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_pmu_set
+   Sets the value of the specified PMU register.
+
+   @note1hang Setting PMUEVTCFG automatically clears the PMU registers PMUCNT0
+              through PMUCNT3.
+
+   @param[in] reg_id  PMU register. Values:
+                      - #QURT_PMUCNT0
+                      - #QURT_PMUCNT1
+                      - #QURT_PMUCNT2
+                      - #QURT_PMUCNT3
+                      - #QURT_PMUCFG
+                      - #QURT_PMUEVTCFG
+                      - #QURT_PMUCNT4
+                      - #QURT_PMUCNT5
+                      - #QURT_PMUCNT6
+                      - #QURT_PMUCNT7
+                      - #QURT_PMUEVTCFG1 @tablebulletend
+
+   @param[in] reg_value  Register value.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_pmu_set (int reg_id, unsigned int reg_value);
+
+/**@ingroup func_qurt_pmu_get
+   Gets the PMU register.\n
+   Returns the current value of the specified PMU register.
+
+   @param[in] reg_id  PMU register. Values:
+                      - #QURT_PMUCNT0
+                      - #QURT_PMUCNT1
+                      - #QURT_PMUCNT2
+                      - #QURT_PMUCNT3
+                      - #QURT_PMUCFG
+                      - #QURT_PMUEVTCFG
+                      - #QURT_PMUCNT4
+                      - #QURT_PMUCNT5
+                      - #QURT_PMUCNT6
+                      - #QURT_PMUCNT7
+                      - #QURT_PMUEVTCFG1 @tablebulletend
+
+   @return
+   Integer -- Current value of the specified PMU register.
+
+   @dependencies
+   None.
+ */
+unsigned int qurt_pmu_get (int reg_id);
+
+/**@ingroup func_qurt_pmu_enable
+   Enables or disables the Hexagon processor PMU.
+   Profiling is disabled by default.
+
+   @note1hang Enabling profiling does not automatically reset the count registers -- this must
+              be done explicitly before starting event counting.
+
+   @param[in] enable  Performance monitor. Values: \n
+                      - 0 -- Disable performance monitor \n
+                      - 1 -- Enable performance monitor @tablebulletend
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_pmu_enable (int enable);
+
+/**@ingroup func_qurt_pmu_get_pmucnt
+   Reads PMU counters in a single trap.
+
+   @param[out] buf  Pointer to a buffer to save values read from PMU counters.
+                    Buffer size must be at least 32 bytes to read all eight PMU counters.
+
+   @return
+   #QURT_EOK -- Successful read.\n
+   #QURT_EFATAL -- Failure.
+
+   @dependencies
+   None.
 */
+int qurt_pmu_get_pmucnt (void * buf);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PMU_H */
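[Editor's usage sketch, not part of the patch: program an event, count a workload, then read all eight counters in one trap. The event_cfg value is a placeholder; valid encodings are chip-specific.]

    #include "qurt_pmu.h"

    void sample_pmu(unsigned int event_cfg)
    {
        unsigned int counters[8];   /* 32 bytes: all eight PMU counters */

        /* Per the note above, writing PMUEVTCFG clears PMUCNT0..PMUCNT3. */
        qurt_pmu_set(QURT_PMUEVTCFG, event_cfg);
        qurt_pmu_enable(1);

        /* ... run the workload under measurement ... */

        qurt_pmu_enable(0);
        (void)qurt_pmu_get_pmucnt(counters);   /* QURT_EOK on success */
    }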
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_power.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_power.h
new file mode 100755
index 0000000000000..2ee4d29a73976
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_power.h
@@ -0,0 +1,140 @@
+#ifndef QURT_POWER_H
+#define QURT_POWER_H
+/**
+  @file qurt_power.h
+  @brief Prototypes of power API
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+/*=============================================================================
+
+                        EDIT HISTORY FOR MODULE
+
+ This section contains comments describing changes made to the module.
+ Notice that changes are listed in reverse chronological order.
+
+
+when     who what, where, why
+-------- --- ------------------------------------------------------------
+03/03/11 op  Add header file
+12/12/12 cm  (Tech Pubs) Edited/added Doxygen comments and markup.
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @cond */
+/**@ingroup func_qurt_power_shutdown_fail_exit
+   Returns from Power Collapse mode when power collapse cannot proceed.
+
+   This function unmasks the global interrupt. This operation is used only when the thread is
+   recovering from a failed power collapse operation (Section @xref{sec:powerShutdownEnter}).
+
+   @return
+   #QURT_EOK -- Operation was successfully performed.
+
+   @dependencies
+   None.
+ */
+#define qurt_power_shutdown_fail_exit qurt_power_exit
+
+/**@ingroup func_qurt_power_shutdown_exit
+   Undoes state changes made preparing for power collapse.\n
+   This function unmasks the global interrupts.
+
+   @return
+   #QURT_EOK -- Operation was successfully performed.
+
+   @dependencies
+   None.
+ */
+#define qurt_power_shutdown_exit qurt_power_exit
+/**@endcond */
+
+/**@ingroup func_qurt_system_ipend_get
+   Gets the IPEND register.\n
+
+   @note1hang Returns the current value of the Hexagon processor IPEND register. The return value
+   is a mask value that identifies the individual interrupts that are pending. \n
+
+   @note1hang The bit order of the mask value is identical to the order defined for the IPEND register. A
+   mask bit value of 1 indicates that the corresponding interrupt is pending, and 0 indicates that the
+   corresponding interrupt is not pending. \n
+
+   @return
+   Returns the IPEND register value.
+
+   @dependencies
+   None.
 */
+unsigned int qurt_system_ipend_get (void);
+
+
+/**@ingroup func_qurt_system_vid_get
+   Gets the VID register. \n
+
+   @note1hang Returns the current value of the Hexagon processor VID register. The return value is
+   the vector number of a second-level interrupt that has been accepted by the Hexagon
+   processor core.\n
+
+   @return
+   Returns the VID register value, that is, the L2 VIC interrupt number accepted by the processor.
+   Valid range is 0 to 1023.
+
+   @dependencies
+   None.
 */
+unsigned int qurt_system_vid_get(void);
+
+/**@ingroup func_qurt_power_shutdown_get_pcycles
+   Gets the number of power collapses and processor cycles for entering and exiting the most recent
+   power collapse.
+
+   @note1hang If no power collapse has occurred yet, processor cycle numbers are zero.
+
+   @param[out] enter_pcycles  Number of processor cycles for entering the most
+                              recent power collapse.
+   @param[out] exit_pcycles   Number of processor cycles for exiting the most
+                              recent power collapse.
+   @return
+   Zero -- No power collapses have occurred. \n
+   Nonzero -- Number of power collapses that have occurred since
+              the processor was reset.
+
+   @dependencies
+   None.
 */
+int qurt_power_shutdown_get_pcycles( unsigned long long *enter_pcycles, unsigned long long *exit_pcycles );
+
+/**@ingroup func_qurt_system_tcm_set_size
+   Sets the size of TCM to save during full power collapse.
+
+   @note1hang The size is aligned to 32 bytes. If the size passed is greater than the maximum size
+              defined in the XML, the size is truncated to the size defined in the XML.
+
+   @param[in] new_size  Size of TCM to save.
+
+   @return
+   Zero -- Size successfully set \n
+   -1 -- Size of 0 passed
+
+   @dependencies
+   None.
 */
+int qurt_system_tcm_set_size(unsigned int new_size);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_POWER_H */
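[Editor's usage sketch, not part of the patch: count the pending interrupts reported by the IPEND mask, using only the getter documented above.]

    #include "qurt_power.h"

    unsigned int count_pending_interrupts(void)
    {
        unsigned int ipend = qurt_system_ipend_get();
        unsigned int n = 0;

        /* One mask bit per pending interrupt; clear the lowest set bit
           each iteration. */
        while (ipend != 0) {
            ipend &= ipend - 1;
            n++;
        }
        return n;
    }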
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_printf.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_printf.h
new file mode 100755
index 0000000000000..a775d8a815918
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_printf.h
@@ -0,0 +1,44 @@
+#ifndef QURT_PRINTF_H
+#define QURT_PRINTF_H
+
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+  @file qurt_printf.h
+  Prototypes of printf API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+/*=============================================================================
+  CONSTANTS AND MACROS
+=============================================================================*/
+/** @addtogroup chapter_function_tracing
+@{ */
+
+int qurt_printf(const char* format, ...);
+
+int qurt_vprintf(const char* format, va_list args);
+
+/** @} */ /* end_addtogroup chapter_function_tracing */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PRINTF_H */
+
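[Editor's usage sketch, not part of the patch: a variadic logging wrapper forwarding to qurt_vprintf, which is why this header needs <stdarg.h> (restored above, since qurt_vprintf takes a va_list). The "LOG: " prefix and function name are illustrative.]

    #include <stdarg.h>
    #include "qurt_printf.h"

    int log_msg(const char *fmt, ...)
    {
        va_list args;
        int n;

        qurt_printf("LOG: ");
        va_start(args, fmt);
        n = qurt_vprintf(fmt, args);   /* forward the caller's arguments */
        va_end(args);
        return n;
    }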
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_process.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_process.h
new file mode 100755
index 0000000000000..0df9ddc2d4a70
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_process.h
@@ -0,0 +1,995 @@
+#ifndef QURT_PROCESS_H
+#define QURT_PROCESS_H
+/**
+  @file qurt_process.h
+  @brief Prototypes of QuRT process control APIs.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2009-2013, 2021-2023 Qualcomm Technologies, Inc.
+  All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_callback.h"
+#include "qurt_consts.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup process_types
+@{ */
+#define QURT_PROCESS_ATTR_NAME_MAXLEN QURT_MAX_NAME_LEN /**< Maximum length of the process name. */
+#define QURT_PROCESS_ATTR_BIN_PATH_MAXLEN 128 /**< Maximum length of the path of binary/ELF for this process. */
+#define QURT_PROCESS_ATTR_CAP_MAXLEN 128 /**< Maximum length for a resource name. */
+
+/** QuRT process capability wildcard strings */
+#define QURT_PROCESS_ATTR_CAP_ALLOW_ALL "ALLOW_ALL" /**< Capability wild-card for full access */
+#define QURT_PROCESS_ATTR_CAP_ALLOW_NONE "ALLOW_NONE" /**< Capability wild-card for no access */
+
+/** QuRT process capability states */
+#define QURT_PROCESS_ATTR_CAP_ENABLED 0x1 /**< Capability enabled*/
+#define QURT_PROCESS_ATTR_CAP_DISABLED 0x0 /**< Capability disabled*/
+
+/* QuRT process thread attributes. */
+#define QURT_PROCESS_DEFAULT_CEILING_PRIO 0 /**< Default ceiling priority of the threads in the new process. */
+#define QURT_PROCESS_DEFAULT_MAX_THREADS -1 /**< Default number of threads in the new process.
+                                                 -1 indicates that the limit is set to the maximum supported by the system. */
+
+/* QuRT process flags. */
+#define QURT_PROCESS_SUSPEND_ON_STARTUP (1U) /**< Suspends the new process just before calling main(). */
+#define QURT_PROCESS_NON_SYSTEM_CRITICAL (1u << 1) /**< Starts the new process as non system-critical. */
+#define QURT_PROCESS_ISLAND_RESIDENT (1u << 2) /**< Process is island resident. */
+#define QURT_PROCESS_RESTARTABLE (1u << 3) /**< Indicates that the process is restartable. */
+#define QURT_PROCESS_UNTRUSTED (1u << 7) /**< Starts the new process as an unsigned process. */
+
+/* QuRT process debugging session status.*/
+#define QURT_DEBUG_NOT_START 0 /**< Debug is not started. */
+#define QURT_DEBUG_START 1 /**< Debug has started. */
+
+/** Process Suspend Options */
+#define QURT_PROCESS_SUSPEND_DEFAULT 0
+
+/** Process Resume Options */
+#define QURT_PROCESS_RESUME_DEFAULT 0
+
+
+/* QuRT process types. */
+typedef enum {
+    QURT_PROCESS_TYPE_RESERVED, /**< Process type is reserved. \n */
+    QURT_PROCESS_TYPE_KERNEL, /**< Kernel process. \n*/
+    QURT_PROCESS_TYPE_SRM, /**< SRM process. \n*/
+    QURT_PROCESS_TYPE_SECURE, /**< Secure process. \n*/
+    QURT_PROCESS_TYPE_ROOT, /**< Root process. \n*/
+    QURT_PROCESS_TYPE_USER, /**< User process. */
+}qurt_process_type_t;
+
+/** QuRT process callback types. */
+typedef enum {
+    QURT_PROCESS_DUMP_CB_ROOT, /**< Register the callback that executes in the
+                                    root process context. \n */
+    QURT_PROCESS_DUMP_CB_ERROR, /**< Register the user process callback that is
+                                     called after threads in the process are frozen. \n */
+    QURT_PROCESS_DUMP_CB_PRESTM, /**< Register the user process callback that is
+                                      called before threads in the process are frozen. \n*/
+    QURT_PROCESS_DUMP_CB_MAX /**< Reserved for error checking. */
+}qurt_process_dump_cb_type_t;
+
+/** QuRT process dump attributes. */
+typedef struct _qurt_pd_dump_attr{
+    /** @cond */
+    unsigned int enabled; /**< Process dump is enabled. */
+    const char *path; /**< Process dump path. */
+    unsigned int path_len; /**< Length of process dump path. */
+    /** @endcond */
+}qurt_pd_dump_attr_t;
+
+/** QuRT process capability resource type */
+enum qurt_process_cap_type_t {
+    QURT_PROCESS_CAP_TYPE_NUM_ENTRIES=0, /**< Number of entries in the capability structure*/
+    QURT_PROCESS_CAP_TYPE_DRIVER=1, /**< Driver resource */
+    QURT_PROCESS_CAP_TYPE_MAX /**< Maximum identifier */
+};
+
+/** QuRT process capability structure */
+typedef struct _qurt_capability {
+    enum qurt_process_cap_type_t type; /**< Resource type */
+    char name[QURT_PROCESS_ATTR_CAP_MAXLEN]; /**< Resource name*/
+    unsigned long long cap; /**< Capabilities allowed for this resource */
+}qurt_capability_t;
+
+/** QuRT process attributes. */
+typedef struct _qurt_process_attr {
+    /** @cond */
+    char name[QURT_PROCESS_ATTR_NAME_MAXLEN]; /**< Name of the new process. */
+    char path[QURT_PROCESS_ATTR_BIN_PATH_MAXLEN]; /**< Path of the binary for the new process. */
+    char dtb_path[QURT_PROCESS_ATTR_BIN_PATH_MAXLEN]; /**< Path of the DTB ELF for the new process. */
+    int flags; /**< Flags as indicated by QuRT process flags. */
+    unsigned int sw_id; /**< Software ID of the process to be loaded. */
+    unsigned sid; /**< Stream ID of the process being spawned. */
+    unsigned max_threads; /**< Maximum number of threads that the new process can create. */
+    unsigned short ceiling_prio; /**< Maximum priority at which threads can be
+                                      created by the new process. */
+    qurt_process_type_t type; /**< Process type as indicated by
+                                   #qurt_process_type_t. */
+    qurt_pd_dump_attr_t dump_attr; /**< Process dump attributes for the new process
+                                        as indicated by #qurt_pd_dump_attr_t. */
+    qurt_capability_t *capabilities; /**< Pointer to array of structures of type
+                                          qurt_capability_t */
+    /** @endcond */
+} qurt_process_attr_t;
+
+/** @} */ /* end_addtogroup process_types */
+
+/*=============================================================================
+FUNCTIONS
+=============================================================================*/
+ /** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_create
+   Creates a process with the specified attributes, and starts the process.
+
+   The process executes the code in the specified executable ELF file.
+
+   @datatypes
+   #qurt_process_attr_t
+
+   @param[in] attr  Pointer to an initialized process attribute structure, which specifies
+                    the attributes of the created process.
+
+   @return
+   Positive return value -- Process ID. \n
+   Negative return value -- One of the following errors: \n
+   #-QURT_EPRIVILEGE -- Caller does not have privilege for this operation \n
+   #-QURT_EMEM -- Not enough memory to perform the operation \n
+   #-QURT_EFAILED -- Operation failed \n
+   #-QURT_ENOTALLOWED -- Operation not allowed \n
+   #-QURT_ENOREGISTERED -- Not registered \n
+   #-QURT_ENORESOURCE -- Resource exhaustion \n
+   #-QURT_EINVALID -- Invalid argument value \n
+   #QURT_EFATAL -- attr is NULL
+
+   @dependencies
+   None.
+*/
+int qurt_process_create (qurt_process_attr_t *attr);
+
+/**@ingroup func_qurt_process_get_id
+   Returns the process identifier for the current thread.
+
+   @return
+   Process identifier for the current thread.
+
+   @dependencies
+   None.
+*/
+int qurt_process_get_id (void);
+/** @endcond */
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_get_uid
+   Returns the user identifier for the current thread.
+
+   @return
+   User identifier for the current thread.
+
+   @dependencies
+   None.
+*/
+int qurt_process_get_uid (void);
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_attr_init
+   Initializes the structure that sets the process attributes when a thread is created.
+
+   After an attribute structure is initialized, the individual attributes in the structure can
+   be explicitly set using the process attribute operations.
+
+   Table @xref{tbl:processAttrDefaults} lists the default attribute values set by the initialize
+   operation.
+
+   @inputov{table_process_attribute_defaults}
+
+   @datatypes
+   #qurt_process_attr_t
+
+   @param[out] attr  Pointer to the structure to initialize.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline void qurt_process_attr_init (qurt_process_attr_t *attr)
+{
+    attr->name[0] = '\0';
+    attr->path[0] = '\0';
+    attr->dtb_path[0] = '\0';
+    attr->flags = 0;
+    attr->sw_id = 0;
+    attr->sid = 0;
+    attr->max_threads = (unsigned)QURT_PROCESS_DEFAULT_MAX_THREADS;
+    attr->ceiling_prio = QURT_PROCESS_DEFAULT_CEILING_PRIO;
+    attr->type = QURT_PROCESS_TYPE_RESERVED;
+    attr->dump_attr.enabled = 0;
+    attr->dump_attr.path = NULL;
+    attr->dump_attr.path_len = 0;
+    attr->capabilities = NULL;
+}
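[Editor's usage sketch, not part of the patch: create a user process that starts suspended before main(), using qurt_process_attr_init and the attribute setters documented below. The ELF name and software ID are placeholders.]

    #include "qurt_process.h"

    int spawn_suspended(void)
    {
        qurt_process_attr_t attr;
        int pid;

        qurt_process_attr_init(&attr);
        qurt_process_attr_set_executable(&attr, "my_app.elf");     /* placeholder name */
        qurt_process_attr_set_flags(&attr, QURT_PROCESS_SUSPEND_ON_STARTUP);
        qurt_process_attr_set_sw_id(&attr, 0x1234u);                /* placeholder ID */

        pid = qurt_process_create(&attr);
        return pid;   /* >= 0: process ID; < 0: negated QURT_E* error code */
    }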
+
+/**@ingroup func_qurt_process_attr_set_executable
+   Sets the process name in the specified process attribute structure.
+
+   Process names identify process objects that are already
+   loaded in memory as part of the QuRT system.
+
+   @note1hang Process objects are incorporated into the QuRT system at build time.
+
+   @note1hang Maximum length of the name string is limited to QURT_PROCESS_ATTR_NAME_MAXLEN - 1.
+
+   @datatypes
+   #qurt_process_attr_t
+
+   @param[in] attr  Pointer to the process attribute structure.
+   @param[in] name  Pointer to the process name.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_process_attr_set_executable (qurt_process_attr_t *attr, const char *name);
+
+/**@ingroup func_qurt_process_attr_set_binary_path
+   Sets the binary path for the process loading in the specified process attribute structure.
+
+   Path specifies the binary to load for this process.
+
+   @note1hang Maximum length of the path string is limited to QURT_PROCESS_ATTR_BIN_PATH_MAXLEN-1.
+
+   @datatypes
+   #qurt_process_attr_t
+
+   @param[in] attr  Pointer to the process attribute structure.
+   @param[in] path  Pointer to the binary path.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_process_attr_set_binary_path(qurt_process_attr_t *attr, char *path);
+
+/**@ingroup func_qurt_process_attr_set_dtb_path
+   Sets the DTB binary path for the process loading in the specified process attribute structure.
+
+   Path specifies the DTB binary to load for this process.
+
+   @note1hang Maximum length of the path string is limited to QURT_PROCESS_ATTR_BIN_PATH_MAXLEN-1.
+
+   @datatypes
+   #qurt_process_attr_t
+
+   @param[in] attr  Pointer to the process attribute structure.
+   @param[in] path  Pointer to the binary path.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_process_attr_set_dtb_path(qurt_process_attr_t *attr, char *path);
+
+/**@ingroup func_qurt_process_attr_set_flags
+Sets the process properties in the specified process attribute structure.
+Process properties are represented as defined symbols that map into bits
+0 through 31 of the 32-bit flag value. Multiple properties are specified by OR'ing
+together the individual property symbols.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr  Pointer to the process attribute structure.
+@param[in] flags QURT_PROCESS_NON_SYSTEM_CRITICAL -- Process is considered non system-critical.
+                     Error services use this attribute to decide whether to kill the
+                     user PD or the whole subsystem.
+                 QURT_PROCESS_ISLAND_RESIDENT -- Process is marked as island resident.
+                 QURT_PROCESS_RESTARTABLE -- Process is marked as restartable.
+                 QURT_PROCESS_UNTRUSTED -- Process is marked as an unsigned process.
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_flags (qurt_process_attr_t *attr, int flags)
+{
+    attr->flags = flags;
+}
+/** @endcond */
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_attr_set_sid
+Sets the process streamID in the specified process attribute structure.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr  Pointer to the process attribute structure.
+@param[in] sid   streamID to set for this process.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_sid (qurt_process_attr_t *attr, unsigned sid)
+{
+    attr->sid = sid;
+}
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_attr_set_max_threads
+Sets the maximum number of threads allowed in the specified process attribute structure.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr         Pointer to the process attribute structure.
+@param[in] max_threads  Maximum number of threads allowed for this process.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_max_threads (qurt_process_attr_t *attr, unsigned max_threads)
+{
+    attr->max_threads = max_threads;
+}
+
+/**@ingroup func_qurt_process_attr_set_sw_id
+Sets the software ID of the process to load in the specified process attribute structure.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr   Pointer to the process attribute structure.
+@param[in] sw_id  Software ID of the process, used in authentication.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_sw_id(qurt_process_attr_t *attr, unsigned int sw_id)
+{
+    attr->sw_id = sw_id;
+}
+
+/**@ingroup func_qurt_process_attr_set_ceiling_prio
+Sets the highest thread priority allowed in the specified process attribute structure.
+Refer to qurt_thread.h for priority ranges.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr  Pointer to the process attribute structure.
+@param[in] prio  Priority.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_ceiling_prio (qurt_process_attr_t *attr, unsigned short prio)
+{
+    attr->ceiling_prio = prio;
+}
+/** @endcond */
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_attr_set_dump_status
+Sets the process domain dump-enabled field in the process domain dump attributes.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr     Pointer to the process attribute structure.
+@param[in] enabled  1 -- Process domain dump is collected \n
+                    0 -- Process domain dump is not collected
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_dump_status(qurt_process_attr_t *attr, unsigned int enabled)
+{
+    attr->dump_attr.enabled = enabled;
+}
+
+/**@ingroup func_qurt_process_attr_set_dump_path
+Sets the process domain dump path and type.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr      Pointer to the process attribute structure.
+@param[in] path      Path where the process domain dumps must be saved.
+@param[in] path_len  Length of the path string.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_dump_path(qurt_process_attr_t *attr, const char *path, int path_len)
+{
+    attr->dump_attr.path = path;
+    attr->dump_attr.path_len = (unsigned int)path_len;
+}
+
+/**@ingroup func_qurt_process_attr_set_capabilities
+Sets the list of capabilities available to this process.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr          Pointer to the process attribute structure.
+@param[in] capabilities  Pointer to array of structures of type qurt_capability_t defining
+                         resources and capabilities
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_capabilities(qurt_process_attr_t *attr, qurt_capability_t *capabilities)
+{
+    attr->capabilities = capabilities;
+}
+
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_cmdline_get
+Gets the command line string associated with the current process.
+This function retrieves the Hexagon simulator command line arguments,
+provided that the call is made in the process of the QuRT installation
+and that the program runs in a simulation environment.
+
+If the function modifies the provided buffer, it zero-terminates
+the string. It is possible that the function does not modify the
+provided buffer, so the caller must set buf[0] to a NULL
+byte before making the call. A truncated command line is returned when
+the command line is longer than the provided buffer.
+
+@param[in] buf      Pointer to a character buffer that must be filled in.
+@param[in] buf_siz  Size (in bytes) of the buffer pointed to by the buf argument.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+void qurt_process_cmdline_get(char *buf, unsigned buf_siz);
+
+/**@ingroup func_qurt_process_get_thread_count
+Gets the number of threads present in the process indicated by the PID.
+
+@param[in] pid  PID of the process for which the information is required.
+
+@return
+Positive value -- Number of threads in the process indicated by the PID. \n
+Negative error code on failure: \n
+ -QURT_EFATAL -- Invalid PID \n
+ -QURT_ENOTALLOWED -- The current process does not have access to the target process indicated by the PID
+
+@dependencies
+None.
+*/
+int qurt_process_get_thread_count(unsigned int pid);
+
+/**@ingroup func_qurt_process_get_thread_ids
+Gets the thread IDs for a process indicated by PID.
+
+@param[in] pid         PID of the process for which the information is required.
+@param[in] ptr         Pointer to a user passed buffer that must be filled in with thread IDs.
+@param[in] thread_num  Number of thread IDs requested.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EFATAL -- Failed, ptr is NULL
+
+@dependencies
+None.
+ */
+int qurt_process_get_thread_ids(unsigned int pid, unsigned int *ptr, unsigned thread_num);
+/** @endcond */
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_dump_get_mem_mappings_count
+Gets the number of mappings present in the process indicated by the PID.
+
+@param[in] pid  PID of the process for which the information is required.
+
+@return
+Number of mappings for the process indicated by the PID.
+
+@dependencies
+None.
+*/
+int qurt_process_dump_get_mem_mappings_count(unsigned int pid);
+
+/**@ingroup func_qurt_process_dump_get_mappings
+Gets the mappings for a specified PID.
+
+@note1hang This API skips device type mappings or mappings created by setting the #QURT_PERM_NODUMP attribute.
+
+@param[in] pid    PID of the process for which the information is required.
+@param[in] ptr    Pointer to a buffer that must be filled in with mappings.
+@param[in] count  Count of mappings requested.
+
+@return
+Number of mappings filled in the buffer passed by the user.
+
+@dependencies
+None.
+*/
+int qurt_process_dump_get_mappings(unsigned int pid, unsigned int *ptr, unsigned count);
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_attr_get
+Gets the attributes with which the process was created.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] pid       PID of the process for which the information is required.
+@param[in,out] attr  Pointer to the user allocated attribute structure.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EINVALID -- Invalid PID \n
+#QURT_EFATAL -- attr is NULL
+
+@dependencies
+None.
+*/
+int qurt_process_attr_get(unsigned int pid, qurt_process_attr_t *attr);
+
+/**@ingroup func_qurt_process_dump_register_cb
+Registers the process domain dump callback.
+
+@datatypes
+#qurt_cb_data_t \n
+#qurt_process_dump_cb_type_t
+
+@param[in] cb_data   Pointer to the callback information.
+@param[in] type      Callback type; these callbacks are called in the context of the user process domain: \n
+                     #QURT_PROCESS_DUMP_CB_PRESTM -- Before threads of the exiting process are frozen. \n
+                     #QURT_PROCESS_DUMP_CB_ERROR -- After threads are frozen and captured. \n
+                     #QURT_PROCESS_DUMP_CB_ROOT -- After threads are frozen and captured, and CB_ERROR type of callbacks
+                     are called.
+@param[in] priority  Priority.
+ +@return +#QURT_EOK -- Success \n +Other values -- Failure + QURT_EFATAL if cb_data is NULL + QURT_EINVALID If invalid cb_type + QURT_EFAILED If invalid cb_data + +@dependencies +None. +*/ +int qurt_process_dump_register_cb(qurt_cb_data_t *cb_data, qurt_process_dump_cb_type_t type, unsigned short priority); + +/**@ingroup func_qurt_process_dump_deregister_cb +Deregisters the process domain dump callback. + +@datatypes +#qurt_cb_data_t \n +#qurt_process_dump_cb_type_t + +@param[in] cb_data Pointer to the callback information to deregister. +@param[in] type Callback type. + +@return +#QURT_EOK -- Success.\n +Other values -- Failure. + QURT_EFATAL if cb_data is NULL + QURT_EINVALID If invalid cb_type + QURT_EFAILED If invalid cb_data + +@dependencies +None. +*/ +int qurt_process_dump_deregister_cb(qurt_cb_data_t *cb_data,qurt_process_dump_cb_type_t type); + +/** @endcond */ +/** @cond internal_only*/ +/**@ingroup func_qurt_process_set_rtld_debug +Sets rtld_debug for a process. + +@param[in] pid PID of the process for which rtld_debug must be set. +@param[in] address rtld_debug address. + +@return +#QURT_EOK - Success +#QURT_EINVALID - Invalid PID +#QURT_EFATAL - Invalid address + +@dependencies +None. +*/ +int qurt_process_set_rtld_debug(unsigned int pid,unsigned int address); + +/**@ingroup func_qurt_process_get_rtld_debug +Gets rtld_debug for a process. + +@param[in] pid PID of the process for which rtld_debug must be set. +@param[in,out] address Pointer to the user passed address in which the rtld_debug address must be returned. + +@return +#QURT_EOK - Success +#QURT_EINVALID - Invalid PID +#QURT_EFATAL - Invalid address + +@dependencies +None. +*/ +int qurt_process_get_rtld_debug(unsigned int pid,unsigned int *address); +/** @endcond */ +/**@ingroup func_qurt_process_exit +Exits the current user process with an exit code. + +@param[in] exitcode Exit code. + +@return +#QURT_EFATAL -- No client found with the specified PID value \n +#QURT_EINVALID -- Invalid client \n +#QURT_ENOTALLOWED -- User does not have permission to perform this operation \n +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_process_exit(int exitcode); + +/**@ingroup func_qurt_process_kill +Kills the process represented by the PID with the exit code. + +@param[in] pid PID of the process to kill. +@param[in] exitcode Exit code. + +@return +#QURT_EFATAL -- No client found with the specified PID value \n +#QURT_EINVALID -- Invalid client \n +#QURT_ENOTALLOWED -- User does not have permission to perform this operation \n +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_process_kill(int pid, int exitcode); + + +/**@ingroup func_qurt_debugger_register_process +Registers the process indicated by the PID with the debug monitor. + +@param[in] pid PID of the process. +@param[in] adr Address. + +@return +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_debugger_register_process(int pid, unsigned int adr); + + +/**@ingroup func_qurt_debugger_deregister_process +Deregister the process indicated by the PID with the debug monitor. + +@param[in] pid PID of the process. + +@return +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_debugger_deregister_process(int pid); + +/**@ingroup func_qurt_process_exec_callback +Executes callbacks in the user process as indicated by the client_handle argument. + +@param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1). +@param[in] callback_fn Callback function to execute. 
+@param[in] stack_base     Stack address to use.
+@param[in] stack_size     Stack size.
+
+@return
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_process_exec_callback(int client_handle,
+                               unsigned callback_fn,
+                               unsigned stack_base,
+                               unsigned stack_size);
+
+/**@ingroup func_qurt_process_get_pid
+Gets the process ID of the process that the client_handle argument represents.
+
+@note1hang This API is not supported for an unsigned PD. For an unsigned PD, use qurt_process_get_id().
+
+@param[in] client_handle  Client handle obtained from the current invocation function (Section 3.4.1).
+@param[in] pid            Pointer to the address to store the PID.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EFATAL -- pid pointer passed as NULL
+
+@dependencies
+None.
+*/
+int qurt_process_get_pid(int client_handle, int * pid);
+
+/**@ingroup func_qurt_process_get_dm_status
+Gets the debugging session status on the process represented by the pid argument.
+
+@param[in] pid         Process ID
+@param[in,out] status  Address to store the status: \n
+                       #QURT_DEBUG_NOT_START \n
+                       #QURT_DEBUG_START
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EINVALID -- Error
+
+@dependencies
+None.
+*/
+int qurt_process_get_dm_status( unsigned int pid, unsigned int *status);
+
+
+/**@ingroup func_qurt_process_suspend_threads
+   Suspends user threads in a user process with its process identifier.
+   The target user process can be a signed user process or an unsigned user process.
+   The caller is from a thread in the GuestOS/root process.
+   After the user threads in the target user process are suspended, they cannot be scheduled to run by the kernel
+   until they resume later.
+
+   This function has one optional argument with one default option.
+   #QURT_PROCESS_SUSPEND_DEFAULT suspends user threads in the target user process.
+
+   This is a synchronous call; the function returns after the relevant threads are
+   completely suspended.
+
+   If some user threads in the target user process are set as non-suspendable, this function call does
+   not suspend these threads.
+
+   If the target user process is already suspended, this function call returns success as the
+   confirmation on the user process suspending.
+
+   QuRT debugger monitor threads in the target user process are non-suspendable; this function call does
+   not suspend those threads.
+
+   If the target user process is a secure user process, or a CPZ process, this function call returns an error
+   without suspending the target user process.
+
+   If a user thread in the target user process runs in the guest OS/root process via a QDI call, this function call
+   does not suspend the thread in the guest OS, but instead marks the thread as pending-suspend. The thread is suspended
+   when it exits the guest OS, before executing the first instruction in the user process.
+   In this case, the function returns success while the user thread can be running in the GuestOS, and is suspended
+   when exiting the guest OS.
+
+   @param[in] process_id  Process identifier.
+   @param[in] option      Default option #QURT_PROCESS_SUSPEND_DEFAULT suspends user threads in the target user process.
+
+   @return
+   #QURT_EOK -- Success \n
+   #QURT_EINVALID -- Failure because of invalid process_id input \n
+   #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, on a secure process/CPZ process.
+
+   @dependencies
+   None.
+ */
+int qurt_process_suspend_threads (unsigned int process_id, unsigned int option);
+
+
+/**@ingroup func_qurt_process_resume_threads
+   Resumes a user process with its process identifier.
+   The target user process can be a signed user process or an unsigned user process.
+   The caller is from a thread in the guest OS/root process.
+   After the user threads in the target user process resume, the kernel scheduler
+   can schedule the user threads to run based on their thread priorities.
+
+   This function has an optional argument, #QURT_PROCESS_RESUME_DEFAULT, which
+   resumes user threads in the target user process.
+
+   This is an asynchronous function; it returns after the kernel moves the user threads from
+   suspended state to runnable state. The threads are scheduled to run based on their thread priorities.
+
+   This function call does not resume threads in the target user process that have been set as non-resumable.
+
+   If the target user process has already resumed, this function call returns success as confirmation
+   that the user process is resumed.
+
+   If the target user process is a secure user process or a CPZ process, this function call returns an error without
+   a resuming operation.
+
+   If user threads in the target user process run in the guest OS/root process via a QDI call, this function
+   call clears the mark of suspend-pending on these threads, so that the threads are not suspended when they exit
+   the guest OS.
+
+   @param[in] process_id  Process identifier.
+   @param[in] option      Default option #QURT_PROCESS_RESUME_DEFAULT resumes user threads in the target user process.
+
+   @return
+   #QURT_EOK -- Success \n
+   #QURT_EINVALID -- Failure because of invalid process_id input. \n
+   #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, on a secure process/CPZ process.
+
+   @dependencies
+   None.
+ */
+int qurt_process_resume_threads (unsigned int process_id, unsigned int option);
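[Editor's usage sketch, not part of the patch: freeze and thaw a user process from the root PD using the two calls above; pid is assumed to come from qurt_process_create(). Error constants are from the QuRT error headers.]

    #include "qurt_process.h"

    int freeze_then_thaw(unsigned int pid)
    {
        /* Synchronous: returns only after suspendable threads are frozen. */
        int rc = qurt_process_suspend_threads(pid, QURT_PROCESS_SUSPEND_DEFAULT);
        if (rc != QURT_EOK) {
            return rc;   /* QURT_EINVALID or QURT_ENOTALLOWED */
        }

        /* ... inspect or checkpoint the frozen process here ... */

        /* Asynchronous: threads become runnable and are scheduled by priority. */
        return qurt_process_resume_threads(pid, QURT_PROCESS_RESUME_DEFAULT);
    }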
+
+/**@ingroup func_qurt_process_vtcm_window_set
+   Sets a VTCM access window for a process.
+   The calling thread must be in the SRM process.
+
+   This is a synchronous function; it ensures that all running threads of the process have the requested
+   window in effect. The requested view for all non-running threads takes effect when they are
+   scheduled.
+
+   @param[in] pid          Process identifier.
+   @param[in] enable       QURT_VTCM_WINDOW_ENABLE enforces the VTCM access window defined by the high and low offsets.
+                           QURT_VTCM_WINDOW_DISABLE ignores the high and low offsets and fully disables VTCM access
+                           for the process.
+   @param[in] high_offset  Specifies the high window offset, in 4K increments, from the base address of the VTCM.
+                           QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT restores the high offset to its reset value.
+   @param[in] low_offset   Specifies the low window offset, in 4K increments, from the base address of the VTCM.
+                           QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT restores the low offset to its reset value.
+
+   @note1hang
+   When high_offset is set to QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT and low_offset is set to
+   QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT, the full VTCM range is accessible. Access to VTCM is controlled
+   via the MMU mapping for the process.
+
+   @return
+   #QURT_EOK -- Success \n
+   #QURT_EVAL -- Failure because of invalid inputs. \n
+   #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+   #QURT_ENOTSUPPORTED -- Failure because the operation is not supported due to limitations in HW capabilities
+
+   @dependencies
+   None.
+ */
+int qurt_process_vtcm_window_set(int pid, unsigned int enable, unsigned int high_offset, unsigned int low_offset);
+
+/**@ingroup func_qurt_process_vtcm_window_get
+   Gets the VTCM window for a process.
+   The calling thread must be in the SRM process.
+
+   @param[in] pid           Process identifier.
+   @param[out] enable       Address to store the enable status.
+   @param[out] high_offset  Address to return the high window offset, in 4K increments, from the base address of the VTCM.
+   @param[out] low_offset   Address to return the low window offset, in 4K increments, from the base address of the VTCM.
+
+   @note1hang
+   The user must first check the returned enable value before checking the high and low offsets.
+
+   @return
+   #QURT_EOK -- Success \n
+   #QURT_EVAL -- Failure because of invalid inputs. \n
+   #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+   #QURT_ENOTSUPPORTED -- Failure because the operation is not supported due to limitations in HW capabilities
+
+   @dependencies
+   None.
+ */
+int qurt_process_vtcm_window_get(int pid, unsigned int *enable, unsigned int *high_offset, unsigned int *low_offset);
+
+/**@ingroup func_qurt_process_set_group_config
+   Enables thread groups in the process with the specified ceiling priorities.
+
+   @param[in] process_id          Process identifier.
+   @param[in] group_bitmask       64-bit mask of active thread groups
+   @param[in] ceiling_priorities  Array of ceiling priorities for the thread groups
+
+   @note1hang
+   This API can only be called by the root PD and can only be called once for each process; otherwise it is
+   rejected. Group 0 must be enabled in group_bitmask, otherwise QuRT returns an error. After this API, all
+   existing threads are moved to group 0, and any thread whose priority is higher than the ceiling
+   priority of group 0 is lowered to the ceiling value.
+   Example 1:
+     group_bitmask = 0xD7; //'b11010111
+     ceiling_priorities[] = {100, 128, 200, 0, 196, 0, 240, 20}; // 0 - does not care
+   Example 2:
+     group_bitmask = 0x5; //'b101
+     ceiling_priorities[] = {240, 0, 20}; // 0 - does not care
+
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_EVAL -- Failure because of invalid inputs. \n
+   #QURT_ENOTALLOWED -- The group has been configured already.
+
+   @dependencies
+   None.
+ */
+int qurt_process_set_group_config(unsigned int process_id, unsigned long long group_bitmask,
+                                  unsigned char *ceiling_priorities);
+
+
+/**@ingroup func_qurt_process_stid_set
+   Sets the specified stid for a process or for a thread group within a process.
+
+   @param[in] pid       Process identifier.
+   @param[in] group_id  Group identifier
+   @param[in] stid      stid to be set
+
+   @note1hang
+   The user can pass the default group_id (QURT_THREAD_DEFAULT_GROUP_ID) if the stid must be set at the process level.
+   All threads within a process that have the default stid (QURT_STID_DEFAULT) inherit the stid set for the process.
+   When a non-default group_id is specified, the stid is set only for that thread group.
+
+   @return
+   #QURT_EOK -- Success \n
+   #QURT_EFATAL -- Invalid PID \n
+   #QURT_EVAL -- Failure because of invalid inputs. \n
+   #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+
+   @dependencies
+   None.
+ */
+int qurt_process_stid_set(unsigned int pid, unsigned int group_id, unsigned int stid);
+
+/**@ingroup func_qurt_process_stid_get
+   Gets the stid for a process or for a thread group within a process.
+
+   @param[in] pid       Process identifier.
+   @param[in] group_id  Group identifier
+   @param[out] stid     Pointer to a variable to return the stid
+
+   @note1hang
+   The user can pass the default group_id (QURT_THREAD_DEFAULT_GROUP_ID) to return the process-level stid.
+   When a non-default group_id is specified, the stid is returned only for that thread group.
+
+   @return
+   #QURT_EOK -- Success \n
+   #QURT_EFATAL -- Invalid PID \n
+   #QURT_EVAL -- Failure because of invalid inputs. \n
+   #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+
+   @dependencies
+   None.
+ */
+int qurt_process_stid_get(unsigned int pid, unsigned int group_id, unsigned int *stid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_profile.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_profile.h
new file mode 100755
index 0000000000000..2a50c461440f6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_profile.h
@@ -0,0 +1,98 @@
+#ifndef QURT_PROFILE_H
+#define QURT_PROFILE_H
+/**
+  @file qurt_profile.h
+  QuRT profiling support.
+
+EXTERNAL FUNCTIONS
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup profiling_macros
+@{ */
+#define QURT_PROFILE_DISABLE 0 /**< Disable profiling. */
+#define QURT_PROFILE_ENABLE  1 /**< Enable profiling. */
+
+typedef unsigned int qurt_profile_param_t;
+
+#define QURT_PROFILE_PARAM_THREAD_READY_TIME 0U /**< Profile thread ready time. */
+
+/** @} */ /* end_addtogroup profiling_macros */
+
+/** @addtogroup profiling_types
+  @{ */
+/** Profiling results. */
+typedef union
+{
+    /** Result associated with #QURT_PROFILE_PARAM_THREAD_READY_TIME. */
+    struct
+    {
+        unsigned int ticks; /**< Cumulative ticks the thread was ready. */
+    } thread_ready_time;
+
+} qurt_profile_result_t;
+/** @} */ /* end_addtogroup profiling_types */
+
+/**@ingroup func_qurt_profile_enable2
+ * Starts profiling of a specific parameter on a specific thread (as applicable).
+ *
+ * @param[in] param      Profiling parameter.
+ * @param[in] thread_id  ID of the thread (if applicable) for which the specified
+ *                       parameter must be profiled.
+ * @param[in] enable     #QURT_PROFILE_DISABLE -- disable \n #QURT_PROFILE_ENABLE --
+ *                       enable
+ *
+ * @return
+ * #QURT_EOK -- Success \n
+ * #QURT_EALREADY -- Measurement already in progress or already stopped \n
+ * #QURT_ENOTHREAD -- Thread does not exist \n
+ * #QURT_EINVALID -- Invalid profiling parameter \n
+ *
+ * @dependencies
+ * None.
+ */
+extern int qurt_profile_enable2 (
+    qurt_profile_param_t param,
+    qurt_thread_t thread_id,
+    int enable
+);
+
+/**@ingroup func_qurt_profile_get
+ * Gets the value of the profiling parameter that was previously enabled.
+ *
+ * @param[in] param      Profiling parameter.
+ * @param[in] thread_id  ID of thread (if applicable) for which the specified
+ *                       profiling parameter must be retrieved.
+ * @param[out] result    Profiling result associated with the parameter for the specified
+ *                       thread (if applicable).
+ *
+ * @return
+ * #QURT_EOK -- Success \n
+ * #QURT_EFAILED -- Operation failed; profiling was not enabled \n
+ * #QURT_ENOTHREAD -- Thread does not exist \n
+ * #QURT_EINVALID -- Invalid profiling parameter \n
+ *
+ * @dependencies
+ * None.
+ */
+extern int qurt_profile_get (
+    qurt_profile_param_t param,
+    qurt_thread_t thread_id,
+    qurt_profile_result_t * result
+);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+#endif
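[Editor's usage sketch, not part of the patch: measure how long a thread sits in the ready state over an interval of interest. thread_id is assumed to come from a QuRT thread API such as qurt_thread_get_id().]

    #include "qurt_profile.h"

    int ready_time_ticks(qurt_thread_t thread_id, unsigned int *ticks)
    {
        qurt_profile_result_t result;
        int rc;

        rc = qurt_profile_enable2(QURT_PROFILE_PARAM_THREAD_READY_TIME,
                                  thread_id, QURT_PROFILE_ENABLE);
        if (rc != QURT_EOK) return rc;

        /* ... let the thread run for the interval of interest ... */

        rc = qurt_profile_get(QURT_PROFILE_PARAM_THREAD_READY_TIME,
                              thread_id, &result);
        if (rc == QURT_EOK) *ticks = result.thread_ready_time.ticks;

        (void)qurt_profile_enable2(QURT_PROFILE_PARAM_THREAD_READY_TIME,
                                   thread_id, QURT_PROFILE_DISABLE);
        return rc;
    }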
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_ptrace.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_ptrace.h
new file mode 100755
index 0000000000000..622304dd92865
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_ptrace.h
@@ -0,0 +1,37 @@
+/*=============================================================================
+
+                 qurt_ptrace.h
+
+GENERAL DESCRIPTION
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2013 by Qualcomm Technologies, Inc. All Rights Reserved.
+=============================================================================*/
+#ifndef __SYS_PTRACE_H__
+#define __SYS_PTRACE_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum __ptrace_request
+{
+    /**
+      Indicates that the process making this request is requesting to be traced.
+    */
+    PTRACE_TRACEME = 0,
+    PTRACE_EXT_IS_DEBUG_PERMITTED = 500
+};
+
+long ptrace(enum __ptrace_request request, unsigned int pid, void *addr, void *data);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //__SYS_PTRACE_H__
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi.h
new file mode 100755
index 0000000000000..705408e5cfc6f
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi.h
@@ -0,0 +1,185 @@
+#ifndef QDI_H
+#define QDI_H
+
+/**
+  @file qurt_qdi.h
+  @brief Prototypes of QuRT Driver Invocation API functions
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2021, 2023 Qualcomm Technologies, Inc.
+  All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+
+#include "qurt_qdi_constants.h"
+#include "qurt_qdi_imacros.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_qdi_open
+   Opens the specified driver for subsequent operations.
+   qurt_qdi_open() is the primary mechanism by which a driver user can
+   obtain a QDI handle. The user provides the name of the driver to the
+   qurt_qdi_open call, and gets back a handle referencing
+   the named driver. \n
+   @note1hang For reasons related to the Hexagon standard for varargs functions, the
+              qurt_qdi_open function prototype is not actually defined as a varargs.
+
+
+   @param[in] p    Driver name.
+   @param[in] ...  Up to nine additional device-specific arguments can be passed as parameters,
+                   and should follow the POSIX open() convention. \n
+                   - flags -- Optional second parameter (POSIX flags), the handle
+                              access requested (read-only, write-only, or read-write,
+                              for instance) and other flags such as whether the call
+                              should create a new device or only open an existing
+                              device. \n
+                   - mode -- Optional third parameter (POSIX mode); permissions to
+                             configure when a new device is created. @tablebulletend
+
+   @return
+   Negative value -- Error. \n
+   Non-negative value -- Success, this result value serves as a handle to the
+                         opened driver.
+   @dependencies
+   None.
+ */
+// int qurt_qdi_open();
+#define qurt_qdi_open(p,...) \
+        qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,QDI_OPEN,(p),##__VA_ARGS__)
+
+#define qurt_qdi_open_dt(p,q,...) \
+        qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,QDI_OPEN_FROM_DT,(p),(q),##__VA_ARGS__)
+
+/**@ingroup func_qurt_qdi_handle_invoke
+   Performs a generic driver operation, which (depending on the specified operation) can
+   either be one of the predefined operations listed in @xhyperref{tbl:functionMapping,QDI function mapping}
+   or a driver-specific operation.
+   The user provides a QDI handle and an integer
+   method number, along with 0 to 8 optional 32-bit arguments.
+   The device driver invocation function is invoked with the
+   same method number and 0 to 8 optional arguments. The
+   return value from the invocation function is passed back to
+   the user as the return value of qurt_qdi_handle_invoke.
+
+   @note1hang For reasons related to the Hexagon standard for varargs functions, the
+              qurt_qdi_handle_invoke() function prototype is not actually defined as a
+              varargs function (and would break if it were defined this way).
+
+   @param[in] h    Driver handle.
+   @param[in] m    Integer number for the operation to perform.
+   @param[in] ...  Up to eight optional arguments can be passed to the device driver as operation-specific parameters: \n
+                   arg1 -- First parameter \n
+                   arg2 -- Second parameter \n
+                   arg3 -- Third parameter \n
+                   arg4 -- Fourth parameter \n
+                   arg5 -- Fifth parameter \n
+                   arg6 -- Sixth parameter \n
+                   arg7 -- Seventh parameter \n
+                   arg8 -- Eighth parameter
+
+   @return
+   Integer value defined by the device driver. \n
+   -1 -- Error.
+
+   @dependencies
+   None.
+ */
+// int qurt_qdi_handle_invoke();
+#define qurt_qdi_handle_invoke(h,m,...) \
+        _QDMPASTE(_QDMHI,_QDMCNT(QDI_HANDLE_LOCAL_CLIENT,h,m,##__VA_ARGS__))(QDI_HANDLE_LOCAL_CLIENT,h,m,##__VA_ARGS__)
+#define _QDMHI3(a,b,c) qurt_qdi_qhi3(0,b,c)
+#define _QDMHI4(a,b,c,d) qurt_qdi_qhi4(0,b,c,(int)(d))
+#define _QDMHI5(a,b,c,d,e) qurt_qdi_qhi5(0,b,c,(int)(d),(int)(e))
+#define _QDMHI6(a,b,c,d,e,f) qurt_qdi_qhi6(0,b,c,(int)(d),(int)(e),(int)(f))
+#define _QDMHI7(a,b,c,d,e,f,g) qurt_qdi_qhi7(8,b,c,(int)(d),(int)(e),(int)(f),(int)(g))
+#define _QDMHI8(a,b,c,d,e,f,g,h) qurt_qdi_qhi8(8,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h))
+#define _QDMHI9(a,b,c,d,e,f,g,h,i) qurt_qdi_qhi9(16,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i))
+#define _QDMHI10(a,b,c,d,e,f,g,h,i,j) qurt_qdi_qhi10(16,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j))
+#define _QDMHI11(a,b,c,d,e,f,g,h,i,j,k) qurt_qdi_qhi11(24,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k))
+#define _QDMHI12(a,b,c,d,e,f,g,h,i,j,k,l) qurt_qdi_qhi12(24,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k),(int)(l))
+int qurt_qdi_qhi3(int,int,int);
+int qurt_qdi_qhi4(int,int,int,int);
+int qurt_qdi_qhi5(int,int,int,int,int);
+int qurt_qdi_qhi6(int,int,int,int,int,int);
+int qurt_qdi_qhi7(int,int,int,int,int,int,int);
+int qurt_qdi_qhi8(int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi9(int,int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi10(int,int,int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi11(int,int,int,int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi12(int,int,int,int,int,int,int,int,int,int,int,int);
+
+/**@ingroup func_qurt_qdi_write
+   Writes data to the specified driver.
+   A predefined invocation routine for drivers that
+   support a POSIX-like write functionality.
+   qurt_qdi_write(handle, buf, len) is equivalent to
+   qurt_qdi_handle_invoke(handle, QDI_WRITE, handle, buf, len);
+
+   @param[in] handle  Driver handle.
+   @param[in] buf     Pointer to the memory address where the data to write is stored.
+   @param[in] len     Number of bytes of data to write.
+
+   @return
+   Non-negative integer -- Number of bytes written. \n
+   Negative error code -- Write could not take place.
+
+   @dependencies
+   None.
+ */
+int qurt_qdi_write(int handle, const void *buf, unsigned len);
+
+/**@ingroup func_qurt_qdi_read
+   User-visible API to read data from a QDI handle.
+   A predefined invocation routine for drivers that
+   support a POSIX-like read functionality.
+   qurt_qdi_read(handle, buf, len) is equivalent to:
+   qurt_qdi_handle_invoke(handle, QDI_READ, handle, buf, len);
+
+   @param[in] handle  Driver handle.
+   @param[in] buf     Pointer to the memory address where the data read is stored.
+   @param[in] len     Number of bytes of data to read.
+
+   @return
+   Non-negative integer number -- Bytes read. \n
+   Negative error code -- Read could not take place.
+
+   @dependencies
+   None.
 */
+int qurt_qdi_read(int handle, void *buf, unsigned len);
+
+/**@ingroup func_qurt_qdi_close
+   Closes the specified driver, releasing any resources associated with the open driver.
+   User-visible API to close a QDI handle.
+
+   This API should be called when the user is done using a
+   QDI-based handle. When this function is called, the driver can release
+   any resources held and perform other necessary cleanup
+   operations. qurt_qdi_close(handle) is equivalent to
+   qurt_qdi_handle_invoke(handle, QDI_CLOSE, handle)
+
+   @param[in] handle  Driver handle.
+
+   @return
+   0 -- Success.\n
+   Negative error code -- Failure.
+
+   @dependencies
+   None.
 */
+int qurt_qdi_close(int handle);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_constants.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_constants.h
new file mode 100755
index 0000000000000..4866fada067f0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_constants.h
@@ -0,0 +1,193 @@
+#ifndef QDI_CONSTANTS_H
+#define QDI_CONSTANTS_H
+
+/**
+  @file qurt_qdi_constants.h
+  @brief Predefined invocation methods for drivers.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2013-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+|| Method numbers used for QDI.
+||
+|| Intended grouping of method numbers for QDI
+|| including future usage:
+||
+|| Method 0 should always be unused and not responded to by
+|| any driver.
+|| Methods 1 and 2 are reserved for name registration and
+|| name lookup.
+|| Methods 3 through 31 are reserved for POSIX-type operations
+|| on open handles.
+|| Methods 32 through 127 are reserved for the QDI infrastructure
+|| and may be extended in the future to provide standard
+|| driver debug services, management services, and system
+|| notifications.
+|| Methods 128 through 255 are reserved for the use of automatically
+|| generated methods such as might be generated by an IDL (interface
+|| definition language).
The infrastructure may be extended to +|| perform services on these methods based on information provided +|| by the IDL, such as automatic buffer validation, etc. These +|| method numbers should not be used for any "ad hoc" methods. +|| Methods with number >= 256 are "private" method numbers that are +|| outside the scope of the QDI infrastructure. Drivers that want +|| to generate and consume their own "ad hoc" methods are free to +|| use these method numbers as they wish. The infrastructure does +|| not generate these method numbers or respond to them, but +|| passes them on unmolested. +|| +|| All driver implementations *should* return a value of +|| -1 when called with an unsupported method. The standard error +|| return value for POSIX APIs is -1, so we emulate that behavior +|| here. +*/ +/** @cond */ +#define QDI_UNUSED 0 +#define QDI_DEVNAME_REGISTER 1 +#define QDI_OPEN 2 +#define QDI_CLOSE 3 +#define QDI_READ 4 +#define QDI_WRITE 5 +#define QDI_IOCTL 6 +#define QDI_MMAP 7 +#define QDI_OS_FILEOPEN 8 +#define QDI_FLEN 9 +#define QDI_UNLINK 10 +#define QDI_FTELL 22 +#define QDI_SEEK 23 +#define QDI_FSTAT 24 + +#define QDI_FSNAME_REGISTER 150 +#define QDI_FS_OPEN 151 +#define QDI_MMAP2 153 +#define QDI_MPROTECT2 154 +#define QDI_MUNMAP2 155 + +#define QDI_CLIENT_HANDLE_OBJREF_GET 10 + +#define QDI_OS_PROCESS_LOAD 12 +#define QDI_OS_PROCESS_CHOOSE_ASID 13 + +#define QDI_OS_SET_GP 26 +#define QDI_CLIENT_HANDLE_CALLBACK 27 + +#define QDI_CLIENT_HANDLE_ISLAND_HANDLE_CREATE_FROM_OBJ_T 19 //reused +#define QDI_CLIENT_HANDLE_HANDLE_CREATE_FROM_OBJ_T 80 +#define QDI_CLIENT_HANDLE_HANDLE_RELEASE 81 +#define QDI_CLIENT_HANDLE_COPY_FROM_USER 82 +#define QDI_CLIENT_HANDLE_COPY_TO_USER 83 +#define QDI_CLIENT_HANDLE_SIGNAL_GROUP_CREATE 86 +#define QDI_CLIENT_HANDLE_SAFE_CACHE_OPS 87 + +#define QDI_CLIENT_HANDLE_BUFFER_LOCK 41 +#define QDI_CLIENT_HLOSPOOL_INFO_GET 90 +#define QDI_CLIENT_HLOSPOOL2_INFO_GET 96 + +#define QDI_CLIENT_PID 44 +#define QDI_CLIENT_ASID QDI_CLIENT_PID + +#define QDI_OS_CLIENT_INFO_GET 48 + +#define QDI_OS_MEM_LOOKUP_PHYSADDR 57 + +#define QDI_OS_THREAD_ITERATOR_CREATE 68 +#define QDI_OS_THREAD_ITERATOR_NEXT 69 + +#define QDI_OS_SYSENV 78 + +#define QDI_REGION_USERMALLOC_INIT 180 // This method is for generic handle + + +#define QDI_CLIENT_HANDLE_USER_MALLOC 84 +#define QDI_CLIENT_HANDLE_USER_FREE 85 + +#define QDI_SIGNAL_GROUP_SIGNAL_CREATE 96 +#define QDI_SIGNAL_GROUP_WAIT 98 +#define QDI_SIGNAL_GROUP_POLL 99 +#define QDI_SIGNAL_SET 96 +#define QDI_SIGNAL_CLEAR 97 +#define QDI_SIGNAL_WAIT 98 +#define QDI_SIGNAL_POLL 99 + +#define QDI_OS_WAIT_FOR_MAIN_REAPER 104 + +#define QDI_CLIENT_HANDLE_REFPROXY_INSTALL 105 +#define QDI_CLIENT_HANDLE_REFPROXY_ADD 106 +#define QDI_CLIENT_HANDLE_REFPROXY_REMOVE 107 + +#define QDI_CLIENT_HANDLE_DETACH 116 + +#define QDI_OS_RESERVED1 139 + +#define QDI_CLIENT_HANDLE_BUFFER_LOCK2 142 + +#define QDI_DT_REGISTER 158 +#define QDI_OPEN_DEVICE 159 +#define QDI_OPEN_FROM_DT 160 + +#define QDI_PRIVATE 256 /* Method numbers beginning at 256 + are private method numbers, which + are device-specific and available + for use by device implementors. */ +/* +|| Permission bitmasks for use with qurt_qdi_lock_buffer(). +|| +|| Make sure these match with permission values from qurt_perm_t. +*/ +/** @endcond */ + +/** @addtogroup driver_support_constants +@{ */ +#define QDI_PERM_W 2 /**< Write access. */ +#define QDI_PERM_R 1 /**< Read access. */ +#define QDI_PERM_RW (QDI_PERM_R | QDI_PERM_W) /**< Read/write access. 
*/ + +#define QDI_HANDLE_LOCAL_CLIENT 3 /**< Local client. */ +#define QDI_HANDLE_GENERIC 4 /**< Generic. */ + +#define QDI_REFCNT_BASE 0x510000 /**< */ +#define QDI_REFCNT_MAXED 0x51FFFD /**< */ +#define QDI_REFCNT_INIT 0x51FFFE /**< Driver object is temporary and is eventually deleted.*/ +#define QDI_REFCNT_PERM 0x51FFFF /**< Driver object is permanent and is never deleted. */ +/** @} */ /* end_addtogroup driver_support_constants */ + +/** @cond */ +/* +|| Flags used by process loaders. +*/ + +#define QDI_OS_PROCESS_FLAGS_ISLAND_RESIDENT 0x1 /* Set this flag to request the loaded process + to have island residency. */ +#define QDI_OS_PROCESS_FLAGS_ROOT_RESIDENT 0x2 /* Set this flag to request the loaded process + to have root residency, for example, DL Pager. */ +/* +|| Constants used for qurt_event register API, type field. +*/ + +#define QURT_PROCESS_EXIT 1 + +/* +|| Constants used by QDI extensions. +*/ + +#define QURT_QDI_SINGLETON_TYPE_TRUE 0 +#define QURT_QDI_SINGLETON_TYPE_FALSE 1 +#define QURT_QDI_SINGLETON_TYPE_PER_PROCESS 2 +/** @endcond */ +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QDI_CONSTANTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_driver.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_driver.h new file mode 100755 index 0000000000000..e044e25f1bb72 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_driver.h @@ -0,0 +1,868 @@ +#ifndef QURT_QDI_DRIVER_H +#define QURT_QDI_DRIVER_H + +/** + @file qurt_qdi_driver.h + @brief Definitions, macros, and prototypes used when writing a + QDI driver. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2018, 2019-2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include "stddef.h" +#include "qurt_qdi.h" +#include "qurt_types.h" +#include "qurt_callback.h" +#include "qurt_qdi_constants.h" +#include "qurt_qdi_imacros.h" +#include "qurt_mutex.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| This gives the canonical form for the arguments to a QDI +|| driver invocation function. The arguments are as follows: +|| +|| int client_handle (R0) QDI handle that represents the client +|| that made this QDI request. If the +|| client is remote, this is a +|| variable handle; if the client is local +|| (same thread and process), this is +|| set to QDI_HANDLE_LOCAL_CLIENT. +|| +|| qurt_qdi_obj_t *obj (R1) Points at the qdi_object_t structure +|| on which this QDI request is being made. +|| The qdi_object_t structure is usually +|| the first element of a larger structure +|| that contains state associated with the +|| object; because it is usually the first +|| element, the object pointers can be freely +|| interchanged through casts. +|| +|| int method (R2) Integer QDI method that represents +|| the request type. +|| +|| qurt_qdi_arg_t arg1 (R3) First three general purpose arguments +|| qurt_qdi_arg_t arg2 (R4) to the invocation function are passed in +|| qurt_qdi_arg_t arg3 (R5) these slots. +|| +|| qurt_qdi_arg_t arg4 (SP+0) Arguments beyond the first three are +|| qurt_qdi_arg_t arg5 (SP+4) passed on the stack. 
+|| qurt_qdi_arg_t arg6 (SP+8) +|| qurt_qdi_arg_t arg7 (SP+12) +|| qurt_qdi_arg_t arg8 (SP+16) +|| qurt_qdi_arg_t arg9 (SP+20) +|| +|| The canonical form of the invocation function takes a +|| total of 12 arguments, but not all of them are used. In general, +|| the QDI infrastructure only passes those arguments provided by +|| the caller; if the invocation function accesses additional +|| arguments beyond those provided by the caller, the values are not +|| useful. +*/ +/** @cond */ +#define QDI_INVOKE_ARGS \ + int, struct qdiobj *, int, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t + +#define QDI_EXT_INVOKE_ARGS \ + int, qurt_qdi_man_obj_t*, int, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t + +#define BUFFER_LOCK 1 +#define BUFFER_UNLOCK 0 + +struct qdiobj; +/** @endcond */ +/** @addtogroup driver_support_types +@{ */ +typedef union { + void *ptr; /**< Pointer to the driver handle. */ + int num; /**< Method number. */ +} qurt_qdi_arg_t; +/** @} */ /* end_addtogroup driver_support_types */ +/** @cond */ +/** QuRT QDI driver version */ +typedef union { + int num; + struct { + short major; /** Driver major version number. */ + short minor; /** Driver minor version number. */ + }; +} qurt_qdi_version_t; + +typedef int (*qurt_qdi_pfn_invoke_t)(QDI_INVOKE_ARGS); +typedef void (*qurt_qdi_pfn_release_t)(struct qdiobj *); +/** @endcond */ +/** @addtogroup driver_support_types +@{ */ +typedef struct qdiobj { + qurt_qdi_pfn_invoke_t invoke; /**< Invocation function that implements the driver methods.*/ + int refcnt; /**< Reference count, an integer value maintained by the QDI infrastructure that tracks the number of + references to a driver instance. 
*/
+   qurt_qdi_pfn_release_t release;  /**< Release function that performs the cleanup associated with deleting an instance
+                                         of the driver object.*/
+} qurt_qdi_obj_t;
+/** @} */ /* end_addtogroup driver_support_types */
+/** @cond */
+/** QuRT QDI managed object */
+typedef struct qurt_qdi_man_obj
+{
+   qurt_qdi_obj_t qdi_obj;
+   union
+   {
+      struct qurt_qdi_ext_driver * opener_obj;
+      struct qurt_qdi_ext_device * device_obj;
+   };
+}qurt_qdi_man_obj_t;
+
+typedef int (*qurt_qdi_ext_pfn_create_t)(int client_id, const char *name, qurt_qdi_version_t version, qurt_qdi_man_obj_t **qdi_obj);
+typedef int (*qurt_qdi_ext_pfn_create_device_t)(int client_id, const char *name, qurt_qdi_version_t version, struct qurt_qdi_ext_device * device, qurt_qdi_man_obj_t **qdi_obj);
+typedef int (*qurt_qdi_ext_pfn_invoke_t)(QDI_EXT_INVOKE_ARGS);
+typedef void (*qurt_qdi_ext_pfn_destroy_t)(qurt_qdi_man_obj_t *qdi_obj);
+typedef int (*qurt_qdi_ext_pfn_probe_t)(void *handle, struct qurt_qdi_ext_device **device);
+
+typedef struct qurt_qdi_ext_obj_info{
+   qurt_qdi_man_obj_t *obj;
+   int qdi_client_id;
+   struct qurt_qdi_ext_obj_info *next;
+}qurt_qdi_ext_obj_info_t;
+typedef struct qurt_qdi_ext_obj_info *qurt_qdi_ext_obj_info_ptr;
+
+/** QuRT QDI device */
+//temporarily add this back while there are still drivers that statically define this structure
+struct qurt_qdi_device {
+   qurt_qdi_obj_t opener_obj;
+   const char* name;
+   char island_resident;
+   unsigned char singleton;
+   qurt_qdi_ext_pfn_create_t create;
+   qurt_qdi_ext_pfn_invoke_t invoke;
+   qurt_qdi_ext_pfn_destroy_t destroy;
+   qurt_mutex_t qurt_qdi_ext_list_lock;
+   qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head;
+};
+typedef struct qurt_qdi_device qurt_qdi_man_device;
+
+struct qurt_qdi_ext_driver {
+   qurt_qdi_obj_t opener_obj;
+   const char* name;
+   char island_resident;
+   unsigned char singleton;
+   qurt_qdi_ext_pfn_create_t create;
+   qurt_qdi_ext_pfn_invoke_t invoke;
+   qurt_qdi_ext_pfn_destroy_t destroy;
+   qurt_mutex_t qurt_qdi_ext_list_lock;
+   qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head;
+   qurt_qdi_ext_pfn_create_device_t create_device;
+   qurt_qdi_version_t version;
+   qurt_qdi_ext_pfn_probe_t probe;
+   const char* compatible;
+   struct qurt_qdi_ext_device * device_list;
+   //qurt_qdi_ext_device_ptr device_list;
+};
+typedef struct qurt_qdi_ext_driver qurt_qdi_ext_driver_t;
+//above replaces qurt_qdi_man_device
+
+extern int qurt_qdi_obj_ref_inc(qurt_qdi_obj_t *);
+extern int qurt_qdi_obj_ref_dec(qurt_qdi_obj_t *);
+
+extern int qurt_qdi_ext_opener (QDI_INVOKE_ARGS);
+/** @endcond */
+/**@ingroup func_qurt_qdi_method_default
+   Processes a method that is unrecognized or unsupported in the driver invocation function.
+   All arguments passed to the current invocation function (Section @xref{sec:invocationFunction}) must be forwarded
+   to this function.
+
+   @note1hang Invocation functions must process all unrecognized or unsupported methods
+              by calling this function.
+
+   @return
+   Integer result of the default method processing; the invocation function must return this value.
+
+   @dependencies
+   None.
+*/
+extern int qurt_qdi_method_default(QDI_INVOKE_ARGS);
+
+/**@ingroup func_qurt_qdi_handle_create_from_obj_t
+   Allocates a new device handle for use with the specified driver object.
+
+   @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+   @param[in] obj Pointer to the driver object.
+
+   @return
+   Non-negative integer -- Success; this value is the new handle. \n
+   Negative value -- Error.
+
+   @dependencies
+   None.
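+
+   For illustration only (not part of the SDK header), an open handler might
+   allocate a driver object whose first member is the qurt_qdi_obj_t and return
+   a new handle for it to the client; the my_dev* names are hypothetical:
+   @code
+   struct my_dev {
+       qurt_qdi_obj_t qdiobj;   // first member, so object pointers can be cast freely
+       int state;
+   };
+
+   // Inside the opener's invocation function, handling QDI_OPEN:
+   struct my_dev *dev = malloc(sizeof(*dev));   // any driver-private allocator works
+   if (dev == NULL) return -1;
+   dev->qdiobj.invoke  = my_dev_invoke;         // the driver's invocation function
+   dev->qdiobj.refcnt  = QDI_REFCNT_INIT;
+   dev->qdiobj.release = my_dev_release;        // runs when the last reference is gone
+   dev->state = 0;
+   return qurt_qdi_handle_create_from_obj_t(client_handle, &dev->qdiobj);
+   @endcode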
+*/ +static __inline int qurt_qdi_handle_create_from_obj_t(int client_handle, qurt_qdi_obj_t *obj) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_HANDLE_CREATE_FROM_OBJ_T, + obj); +} + +/**@ingroup func_qurt_qdi_handle_invoke + Allocates a new island device handle for use with the specified driver object. + + @param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1). + @param[in] obj Pointer. + + @return + Non-negative integer value that is the new handle -- Success. \n + Negative return value -- Error. + + @dependencies + None. +*/ +static __inline int qurt_qdi_island_handle_create_from_obj_t(int client_handle, qurt_qdi_obj_t *obj) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_ISLAND_HANDLE_CREATE_FROM_OBJ_T, + obj); +} + +/**@ingroup func_qurt_qdi_handle_release + Deallocates the specified device handle. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] handle_to_release Handle to release. + + @return + 0 -- Success. \n + Negative value -- Error. + + @dependencies + None. +*/ +static __inline int qurt_qdi_handle_release(int client_handle, int handle_to_release) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_HANDLE_RELEASE, + handle_to_release); +} + +static __inline qurt_qdi_obj_t * +qurt_qdi_objref_get_from_handle(int client_handle, int object_handle) +{ + qurt_qdi_obj_t *ret; + + ret = NULL; + + qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_OBJREF_GET, + object_handle, + &ret); + + return ret; +} + +/**@ingroup func_qurt_client_add_memory + Adds a physical address range to the HLOS physpool of the caller user PD. + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[in] phys_addr Starting address of the physical address range. + @param[in] size Size. + + @return + #QURT_EOK -- Pages successfully added. + + @dependencies + None. +*/ +int qurt_client_add_memory(int client_handle, qurt_addr_t phys_addr, qurt_size_t size); + +/**@ingroup func_qurt_client_add_memory2 + Adds a physical address range to the HLOS physpool of the caller user PD. + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[in] phys_addr Starting 36-bit address of the physical address range. + @param[in] size Size. + + @return + #QURT_EOK -- Pages successfully added. + + @dependencies + None. +*/ +int qurt_client_add_memory2(int user_client_handle, qurt_paddr_64_t phys_addr, qurt_size_t size); + +static __inline qurt_qdi_obj_t * +qurt_qdi_objref_get_from_pointer(qurt_qdi_obj_t *objptr) +{ + qurt_qdi_obj_t * ret = NULL; + + if (qurt_qdi_obj_ref_inc(objptr) < 0) { + ret = NULL; + } else { + ret = objptr; + } + + return ret; +} + +static __inline void +qurt_qdi_objref_release(qurt_qdi_obj_t *objptr) +{ + if (qurt_qdi_obj_ref_dec(objptr) == 1) { + (*objptr->release)(objptr); + } +} + +/**@ingroup func_qurt_qdi_copy_from_user + Copies the contents of a user memory buffer into the current driver. + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] dest Base address of the driver buffer. + @param[in] src Base address of the user buffer. + @param[in] len Number of bytes to copy. 
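+
+   For illustration only (not part of the SDK header), a driver method that
+   receives a pointer to a request structure in its first argument might copy
+   it in before use; struct my_req is hypothetical, and a1 follows the
+   invocation-function argument convention described earlier in this header:
+   @code
+   struct my_req req;
+   if (qurt_qdi_copy_from_user(client_handle, &req, a1.ptr, sizeof(req)) < 0) {
+       return -1;   // the client has no right to read that memory
+   }
+   @endcode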
+
+   @return
+   Negative value -- Indicates a privilege or security violation; the copy operation
+   has crossed a privilege boundary.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_copy_from_user(int client_handle, void *dest, const void *src, unsigned len)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_COPY_FROM_USER,
+                                 dest, src, len);
+}
+
+/**@ingroup func_qurt_qdi_copy_string_from_user
+   Copies the contents of a user memory buffer into the current driver.
+
+   @note1hang User buffer addresses are valid only for the duration of the current driver
+              invocation.
+
+   @param client_handle Obtained from the current invocation function (Section 3.4.1).
+   @param dest          Base address of the driver buffer.
+   @param src           Base address of the user buffer.
+   @param len           Number of bytes to copy. NOTE: This is the destination buffer length.
+
+   @return
+   Negative error result -- Privilege or security violation; the copy operation
+   has crossed a privilege boundary.
+
+   @dependencies
+   None.
+*/
+int qurt_qdi_copy_string_from_user(int client_handle, char *dest, const char *src, unsigned len);
+
+/**@ingroup func_qurt_qdi_copy_to_user
+   Copies the contents of a driver memory buffer to user memory.
+
+   @note1hang User buffer addresses are valid only for the duration of the current driver
+              invocation.
+
+   @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+   @param[in] dest          Base address of the user buffer.
+   @param[in] src           Base address of the driver buffer.
+   @param[in] len           Number of bytes to copy.
+
+   @return
+   Negative value -- Indicates a privilege or security violation; the copy operation has crossed a
+   privilege boundary.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_copy_to_user(int client_handle, void *dest, const void *src, unsigned len)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_COPY_TO_USER,
+                                 dest, src, len);
+}
+
+/**@ingroup func_qurt_qdi_safe_cache_ops
+   Performs cache operations on user memory.
+
+   @note1hang User buffer addresses are valid only for the duration of the current driver
+              invocation.
+
+   @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+   @param[in] addr          Base address of the user memory.
+   @param[in] size          Size of the user memory.
+   @param[in] opcode        Cache operation (QURT_MEM_CACHE_FLUSH, QURT_MEM_CACHE_INVALIDATE...).
+   @param[in] type          Cache type (QURT_MEM_ICACHE, QURT_MEM_DCACHE).
+
+   @return
+   Negative value -- Indicates a privilege or security violation; the operation has crossed a
+   privilege boundary.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_safe_cache_ops(int client_handle, qurt_addr_t addr, qurt_size_t size,
+                                            qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_SAFE_CACHE_OPS,
+                                 addr, size, opcode, type);
+}
+
+
+/**@ingroup func_qurt_qdi_buffer_lock
+   Prepares for the direct manipulation of a potentially untrusted buffer provided by a QDI
+   client.
+
+   This function is used to permit a trusted driver to safely access memory that is
+   provided by a potentially untrusted client. A driver calls this function to obtain a safe buffer
+   pointer for accessing the memory.
+
+   This function performs the following security checks: \n
+   - Verifies that the entire buffer is accessible to the client.
\n + - Ensures that the pointer remains valid for the remainder of the QDI driver + operation. \n + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] buf Pointer to the base address of the client buffer address. + @param[in] len Buffer length (in bytes). + @param[in] perms Bitmask value that specifies the read or write access to perform on the + client buffer: \n + - #QDI_PERM_R -- Read access \n + - #QDI_PERM_W -- Write access \n + - #QDI_PERM_RW -- Read/write access @tablebulletend + @param[out] obuf Pointer to the buffer address that the driver must use to access the buffer. + + @return + Negative value -- Error; the operation crosses a privilege boundary, indicating a privilege or security violation. \n + Nonzero value -- User passed a buffer that does not fulfill the requested read/write access permission. + In this case the QDI driver call must be terminated cleanly, with an appropriate error code + returned to the client. \n + Zero -- Success; when this occurs the QDI driver must use the pointer at *obuf to access memory, and not the + pointer passed in as buf -- even if the user process changes the mapping of memory at buf, + the mapping of memory at *obuf remains valid until the driver invocation completes. + + @dependencies + None. +*/ +static __inline int qurt_qdi_buffer_lock(int client_handle, void *buf, unsigned len, + unsigned perms, void **obuf) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_BUFFER_LOCK, + buf, len, perms, obuf); +} + +/**@ingroup func_qurt_qdi_buffer_lock2 + Prepares for the direct manipulation of a possibly-untrusted buffer provided by a QDI + client. + This API permits a trusted driver to safely access memory + provided by a possibly-untrusted client. A driver calls this function to obtain a safe buffer + pointer for accessing the memory. + This function performs the following security checks: \n + -- Entire buffer is accessible to the client. \n + -- Entire buffer is mapped with permissions passed in perms field \n + -- Entire buffer is physically contiguous \n + In addition to the security checks, the API also locks the client mapping such that the client + cannot remove the mapping while the physical memory is used by the trusted + driver. \n + + @note1 Drivers are responsible for calling qurt_qdi_buffer_unlock() at appropriate time. Not + pairing qurt_qdi_buffer_unlock() with this API leads to resource leakages and + process exit failures. Drivers can keep track of which buffers are locked for + a particular client. If the client exits abruptly, the buffers can be + unlocked on driver release invocation for the exiting client. + + @note2 This API is supported in limited capacity when called from Island mode. Safe buffer + unmapping or user buffer unlock is not supported in Island mode. + + @param client_handle Obtained from the current invocation function (Section 3.4.1). + @param buf Pointer to the base address of the client buffer address. + @param len Buffer length (in bytes). + @param perms Bitmask value that specifies the read or write access to perform on the + client buffer: \n + -- #QDI_PERM_R -- Read access \n + -- #QDI_PERM_W -- Write access \n + -- #QDI_PERM_RW -- Read/write access \n + @param obuf Optional parameter that returns a pointer to the buffer address that + the driver must use to access the buffer. 
If NULL is passed, the API
+               only performs security checks and does not create a mapping to access the user buffer in
+               a safe way.
+
+   @return
+   QURT_EINVALID -- Arguments passed to the API are invalid. User buffer pointer is NULL or length of the
+                    buffer is 0. \n
+   QURT_EPRIVILEGE -- One of the security checks on the user buffer failed. \n
+   QURT_EFAILED -- Mapping cannot be created for the trusted driver. \n
+   QURT_EOK -- Lock operation was successful. When this occurs, the QDI driver must use the
+               pointer at *obuf to perform its memory accesses, and not the
+               pointer passed in as buf.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_buffer_lock2(int client_handle, void *buf, unsigned len,
+                                          unsigned perms, void **obuf)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_BUFFER_LOCK2,
+                                 BUFFER_LOCK, buf, len, perms, obuf);
+}
+
+/**@ingroup func_qurt_qdi_buffer_unlock
+   This API is paired with qurt_qdi_buffer_lock2(). A temporary overlapping mapping
+   created for the driver is removed. Client mapping for the user buffer is
+   unlocked.
+
+   @note1 Drivers are responsible for pairing this with qurt_qdi_buffer_lock2(). Not
+          pairing qurt_qdi_buffer_lock2() with this API leads to resource leakages and
+          process exit failures. Drivers can keep track of which buffers are locked for
+          a particular client, and if the client exits abruptly, all the buffers can be
+          unlocked on driver release invocation for the exiting client.
+
+   @note2 This API is supported in limited capacity when called from Island mode. Actual
+          unmapping of driver accessible memory or unlocking of the buffer is not
+          supported in Island mode.
+
+   @param client_handle Obtained from the current invocation function (Section 3.4.1).
+   @param buf           Pointer to the base address of the client buffer address.
+   @param len           Buffer length (in bytes).
+   @param obuf          Safe buffer address that was returned in the obuf field after calling
+                        qurt_qdi_buffer_lock2().
+
+   @return
+   QURT_EINVALID -- Arguments passed to the API are invalid. User buffer pointer is NULL or length of the
+                    buffer is 0. \n
+   QURT_EOK -- Unlock operation was successful. \n
+   Other results -- Safe buffer unmapping failed or unlocking of the user buffer failed. \n
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_buffer_unlock(int client_handle, void *buf, unsigned len,
+                                           void *obuf)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_BUFFER_LOCK2,
+                                 BUFFER_UNLOCK, buf, len, obuf);
+}
+
+/**@ingroup func_qurt_qdi_user_malloc
+   Allocates memory area in the QDI heap that is read/write accessible to both the driver and
+   the client. \n
+   @note1hang The QDI heap has a limited amount of memory available, and only the
+              device driver can free the allocated memory.
+
+   @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+   @param size          Size (in bytes) of the memory area to allocate.
+
+   @return
+   Non-zero -- Success; this returned value points to the allocated memory area. \n
+   Zero -- Error.
+
+   @dependencies
+   None.
+*/
+void *qurt_qdi_user_malloc(int client_handle, unsigned size);
+
+/**@ingroup func_qurt_qdi_user_free
+   Deallocates memory area in the QDI heap.
+
+   @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+   @param ptr           Pointer to the memory area to deallocate.
+
+   @dependencies
+   None.
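+
+   For illustration only (not part of the SDK header), a driver might use the
+   shared QDI heap for a result area that the client reads back:
+   @code
+   char *area = qurt_qdi_user_malloc(client_handle, 256);   // visible to driver and client
+   if (area != NULL) {
+       // ... fill in results for the client ...
+       qurt_qdi_user_free(client_handle, area);   // only the driver can free this memory
+   }
+   @endcode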
+*/
+void qurt_qdi_user_free(int client_handle, void *ptr);
+
+/**@ingroup func_qurt_qdi_client_detach
+   Detaches a client (a process), indicating that the client does not
+   participate in the qurt_wait() mechanism. This behavior
+   is opt-in and irrevocable. When a client is detached, it cannot
+   be un-detached.
+
+   @param client_handle Handle of the client to detach.
+
+   @return
+   Zero -- Success; detachable clients always return success. \n
+   Nonzero value -- client_handle did not refer to a
+                    detachable user client.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_client_detach(int client_handle)
+{
+   return qurt_qdi_handle_invoke(client_handle, QDI_CLIENT_HANDLE_DETACH);
+}
+
+/**@ingroup func_qurt_qdi_signal_group_create
+   Creates a new signal group for use in a device driver.
+   A QDI signal group contains up to 32 signals, which can be operated on either
+   individually (using the qurt_qdi_signal_* functions) or as a group (using the
+   qurt_qdi_signal_group_* functions). \n
+   @note1hang Driver implementation is responsible for using the proper signal group
+              handle in any given situation. \n
+   For more information on signals, see the Hexagon QuRT RTOS User Guide (80-VB419-78).
+
+   @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+   @param p_signal_group_handle_local Returns a handle intended for use by code that
+          resides in the same context and process as the created signal group
+          (for example, the device driver implementation that allocated the
+          signal group).
+   @param p_signal_group_handle_remote Returns a handle intended for use by code
+          that resides in a different context and process than the created signal group
+          (for example, the user-mode client of an OS driver).
+
+   @return
+   Zero return value indicates success.\n
+   Negative return value indicates that the signal group could not be created.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_signal_group_create(int client_handle,
+                                                 int *p_signal_group_handle_local,
+                                                 int *p_signal_group_handle_remote)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_SIGNAL_GROUP_CREATE,
+                                 p_signal_group_handle_local,
+                                 p_signal_group_handle_remote);
+}
+
+/**@ingroup func_qurt_qdi_signal_group_wait
+   Suspends the current thread until any of the signals are set in the specified signal group.
+
+   If a signal is set in a signal group object, and a thread waits on the signal group object,
+   the thread is awakened. If the awakened thread has higher priority than the current
+   thread, a context switch can occur.
+
+   @param signal_group_handle Handle of the signal group.
+
+   @return
+   If the client is remote:
+   QURT_EOK -- Wait complete. \n
+   QURT_ECANCEL -- Wait cancelled.\n
+   If the client is local, returns a 32-bit word with current signals.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_signal_group_wait(int signal_group_handle)
+{
+   return qurt_qdi_handle_invoke(signal_group_handle,
+                                 QDI_SIGNAL_GROUP_WAIT);
+}
+
+/**@ingroup func_qurt_qdi_signal_group_poll
+   Returns a value that indicates if any of the signals are set in the specified signal group.
+
+   @param signal_group_handle Handle of the signal group.
+
+   @return
+   1 -- At least one of the signals in the signal group is set.\n
+   0 -- None of the signals are set.
+
+   @dependencies
+   None.
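+
+   For illustration only (not part of the SDK header), a driver might wire a
+   signal group up as follows; error handling is omitted:
+   @code
+   int grp_local, grp_remote, sig_local, sig_remote;
+   qurt_qdi_signal_group_create(client_handle, &grp_local, &grp_remote);
+   qurt_qdi_signal_create(grp_local, &sig_local, &sig_remote);
+   qurt_qdi_signal_set(sig_local);                // driver side raises the signal
+   if (qurt_qdi_signal_group_poll(grp_local)) {   // nonblocking check
+       qurt_qdi_signal_clear(sig_local);
+   }
+   @endcode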
+*/
+static __inline int qurt_qdi_signal_group_poll(int signal_group_handle)
+{
+   return qurt_qdi_handle_invoke(signal_group_handle,
+                                 QDI_SIGNAL_GROUP_POLL);
+}
+
+
+/**@ingroup func_qurt_qdi_signal_create
+   Creates a new signal in the specified signal group.
+   For more information on signals, see the Hexagon QuRT RTOS User Guide (80-VB419-78).
+
+   @note1hang Driver implementation is responsible for using the proper signal handle in
+              any given situation.
+
+   @param signal_group_handle Handle of an existing signal group.
+   @param p_signal_handle_local Returns a handle intended for use by code that resides in
+          the same context and process as the created signal (for example,
+          the device driver implementation that allocated the signal).
+   @param p_signal_handle_remote Returns a handle intended for use by code that resides in
+          a different context and process than the created signal (for
+          example, the user-mode client of an OS driver).
+
+   @return
+   0 -- Success. \n
+   Nonzero value -- No more signals can be created in the specified
+                    signal group.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_signal_create(int signal_group_handle,
+                                           int *p_signal_handle_local,
+                                           int *p_signal_handle_remote)
+{
+   return qurt_qdi_handle_invoke(signal_group_handle,
+                                 QDI_SIGNAL_GROUP_SIGNAL_CREATE,
+                                 p_signal_handle_local,
+                                 p_signal_handle_remote);
+}
+
+/**@ingroup func_qurt_qdi_signal_set
+   Sets the signal in the specified signal object.
+
+   @param signal_handle Handle of the signal.
+
+   @return
+   Always returns 0.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_signal_set(int signal_handle)
+{
+   return qurt_qdi_handle_invoke(signal_handle,
+                                 QDI_SIGNAL_SET);
+}
+
+/**@ingroup func_qurt_qdi_signal_clear
+   Clears the signal in the specified signal object.
+
+   @param signal_handle Handle of the signal.
+
+   @return
+   Always returns 0.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_signal_clear(int signal_handle)
+{
+   return qurt_qdi_handle_invoke(signal_handle,
+                                 QDI_SIGNAL_CLEAR);
+}
+
+/**@ingroup func_qurt_qdi_signal_wait
+   Suspends the current thread until the specified signal is set.
+   If a signal is set in a signal object, and a thread waits on the signal object, the
+   thread is awakened. If the awakened thread has higher priority than the current thread, a
+   context switch may occur.
+
+   @param signal_handle Handle of the signal.
+
+   @return
+   If the client is remote:
+   QURT_EOK -- Wait complete. \n
+   QURT_ECANCEL -- Wait cancelled.\n
+   If the client is local, returns a 32-bit word with current signals.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_signal_wait(int signal_handle)
+{
+   return qurt_qdi_handle_invoke(signal_handle,
+                                 QDI_SIGNAL_WAIT);
+}
+
+/**@ingroup func_qurt_qdi_signal_poll
+   Returns a value that indicates if the specified signal is set.
+
+   @param signal_handle Handle of the signal.
+
+   @return
+   1 -- Signal is set. \n
+   0 -- Signal is not set.
+
+   @dependencies
+   None.
+*/
+static __inline int qurt_qdi_signal_poll(int signal_handle)
+{
+   return qurt_qdi_handle_invoke(signal_handle,
+                                 QDI_SIGNAL_POLL);
+}
+
+/**@ingroup func_qurt_qdi_devname_register
+   Registers a QDI device with the generic QDI object in the
+   current QDI context.
+
+   This function registers an exact name or a directory prefix with a QDI opener object.
+   Future invocations of qurt_qdi_open() in the context of the caller invoke the
+   opener object if a match is detected.
+
+   Directory prefix names are specified by ending the name with a forward slash character.
+
+   Example of an exact name:
+   @code qurt_qdi_devname_register("/dev/foobar", foobar_opener);@endcode
+
+   Example of a directory prefix:
+   @code qurt_qdi_devname_register("/pipedev/", pipedev_opener);@endcode
+
+   Given the two registrations shown above, the only qurt_qdi_open() requests
+   directed to the foobar_opener object are requests for the exact name
+   "/dev/foobar". Any request beginning with "/pipedev/" is directed to the
+   pipedev_opener object.
+
+   The pipedev invocation function presumably examines the name argument to
+   determine exactly how to handle the request. The name is passed to the invocation
+   function in the a1.ptr argument (Section @xref{sec:invocationFunction}).
+
+   @param name   Device name or device name prefix.
+   @param opener Pointer to the opener object for the device.
+
+   @return
+   0 -- Device was successfully registered. \n
+   Negative error code -- Device was not registered.
+
+   @dependencies
+   None.
+ */
+static __inline int qurt_qdi_devname_register(const char *name,
+                                              qurt_qdi_obj_t *opener)
+{
+   return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,
+                                 QDI_DEVNAME_REGISTER,
+                                 name,
+                                 opener);
+}
+
+// Macros for backward compatibility with deprecated APIs
+// (These will go away soon)
+
+#define qurt_qdi_register_devname(name, opener) \
+   qurt_qdi_devname_register((name), (void *)(opener))
+#define qurt_qdi_new_handle_from_obj_t(handle, obj) \
+   qurt_qdi_handle_create_from_obj_t((handle), (obj))
+#define qurt_qdi_release_handle(client_handle, handle) \
+   qurt_qdi_handle_release((client_handle), (handle))
+#define qurt_qdi_lock_buffer(handle, buf, len, perms, obuf) \
+   qurt_qdi_buffer_lock((handle), (buf), (len), (perms), (obuf))
+#define qurt_qdi_usermalloc(handle, size) \
+   qurt_qdi_user_malloc((handle), (size))
+#define qurt_qdi_userfree(handle, ptr) \
+   qurt_qdi_user_free((handle), (ptr))
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_ext.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_ext.h
new file mode 100755
index 0000000000000..383e1799a15d6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_ext.h
@@ -0,0 +1,58 @@
+#ifndef QURT_QDI_EXT_H
+#define QURT_QDI_EXT_H
+
+/**
+  @file qurt_qdi_ext.h
+  @brief Definitions, macros, and prototypes used when writing a
+  QDI driver
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2018, 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_qdi_driver.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct qurt_qdi_ext_device {
+   qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head;
+   struct qurt_qdi_ext_device * next;
+   char * instance;
+   fdt_node_handle context;
+};
+typedef struct qurt_qdi_ext_device *qurt_qdi_ext_device_ptr;
+
+/**@ingroup func_qurt_qdi_dt_register
+   Registers a QDI device with the generic QDI object in the current QDI context,
+   if and only if a compatible device node is found in the device tree. This
+   function serves as a device tree aware wrapper for qurt_qdi_devname_register().
+
+   @param name   Device name or device name prefix.
+   @param opener Pointer to QDI ext specialized opener object for the driver.
+
+   @return
+   0 -- Device was successfully registered.
\n + Negative error code -- Device was not registered. +*/ +static __inline int qurt_qdi_dt_register(const char *name, qurt_qdi_obj_t *opener) +{ + return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, QDI_DT_REGISTER, name, opener); +} + +static inline void qurt_qdi_ext_deviceobj_set_name (struct qurt_qdi_ext_device * device, char * name) +{ + device->instance = name; +} + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_imacros.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_imacros.h new file mode 100755 index 0000000000000..c0a8448ac87f8 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_imacros.h @@ -0,0 +1,34 @@ +#ifndef QURT_QDI_IMACROS_H +#define QURT_QDI_IMACROS_H + +/** + @file qurt_qdi_imacros.h + @brief Internal macros used for QDI. Mostly consists of tricky (and ugly) + preprocessor hacks that permit us to do varargs function invocations + where we pass optional arguments in registers and where we can do + type casting and checking automatically. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define _QDMPASTE(a,b) _QDMPASTE_(a,b) +#define _QDMPASTE_(a,b) a##b +#define _QDMCNT(...) _QDMCNT_(__VA_ARGS__,12,11,10,9,8,7,6,5,4,3,2,1,0) +#define _QDMCNT_(a,b,c,d,e,f,g,h,i,j,k,l,cnt,...) cnt + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_proxy.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_proxy.h new file mode 100755 index 0000000000000..f1d8992ea8811 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_qdi_proxy.h @@ -0,0 +1,55 @@ +/*============================================================================= + + qurt_qdi_proxy.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. 
+=============================================================================*/
+#ifndef _QURT_QDI_PROXY_H
+#define _QURT_QDI_PROXY_H
+
+#include "qurt_qdi_driver.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* APIs allowing operation on the proxy object directly */
+int qurt_qdi_proxy_ref_create(void);
+
+/* APIs that operate on a proxy given a known proxy handle
+ * 1) using the QDI handle of the object
+ *    successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_qdi_proxy_ref_add_by_handle(int proxy_handle, int qdi_handle);
+int qurt_qdi_proxy_ref_sub_by_handle(int proxy_handle, int qdi_handle);
+
+/* 2) using an object reference
+ *    successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_qdi_proxy_ref_add_by_object(int proxy_handle, qurt_qdi_obj_t *obj_ptr);
+int qurt_qdi_proxy_ref_sub_by_object(int proxy_handle, qurt_qdi_obj_t *obj_ptr);
+
+/* API that associates a proxy object with a particular client, given a client handle
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_client_proxy_ref_install (int client_handle, int proxy_handle);
+
+/* APIs that operate on a proxy object from a user client
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_client_proxy_ref_add(int qdi_handle);
+int qurt_client_proxy_ref_remove(int qdi_handle);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_QDI_PROXY_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_rmutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_rmutex.h
new file mode 100755
index 0000000000000..a013a0bbddb1d
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_rmutex.h
@@ -0,0 +1,200 @@
+#ifndef QURT_RMUTEX_H
+#define QURT_RMUTEX_H
+/**
+  @file qurt_rmutex.h
+  Prototypes of rmutex API.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2013 - 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+#include
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_rmutex_init
+   Initializes a recursive mutex object.
+   The recursive mutex is initialized in the unlocked state.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[out] lock Pointer to the recursive mutex object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_rmutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_destroy
+   Destroys the specified recursive mutex. \n
+   @note1hang Recursive mutexes must not be destroyed while they are still in use. If this
+              occurs, the behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock Pointer to the recursive mutex object to destroy.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_rmutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_lock
+   Locks the specified recursive mutex. \n
+
+   If a thread performs a lock operation on a mutex that is not in use, the thread
+   gains access to the shared resource that the mutex protects, and continues executing.
+
+   If a thread performs a lock operation on a mutex that is already in use by another
+   thread, the thread is suspended.
When the mutex becomes available again (because the
+   other thread has unlocked it), the thread is awakened and given access to the shared resource.
+
+   @note1hang A thread is not suspended if it locks a recursive mutex that it has already
+              locked. However, the mutex does not become available to other threads until the
+              thread performs a balanced number of unlocks on the mutex.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock Pointer to the recursive mutex object to lock.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_rmutex_lock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_lock_timed
+   Locks the specified recursive mutex. The wait is terminated when the specified timeout expires.\n
+
+   If a thread performs a lock operation on a mutex that is not in use, the thread
+   gains access to the shared resource that the mutex is protecting, and continues executing.
+
+   If a thread performs a lock operation on a mutex that is already in use by another
+   thread, the thread is suspended. When the mutex becomes available again (because the
+   other thread has unlocked it), the thread is awakened and given access to the shared resource.
+
+   @note1hang A thread is not suspended if it locks a recursive mutex that it has already
+              locked. However, the mutex does not become available to other threads until the
+              thread performs a balanced number of unlocks on the mutex.
+              If the timeout expires, the wait is terminated and no access to the mutex is granted.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock     Pointer to the recursive mutex object to lock.
+   @param[in] duration Interval (in microseconds); the duration value must be between #QURT_TIMER_MIN_DURATION and
+                       #QURT_TIMER_MAX_DURATION.
+
+   @return
+   #QURT_EOK -- Success. \n
+   #QURT_ETIMEDOUT -- Timeout.
+
+   @dependencies
+   None.
+
+ */
+int qurt_rmutex_lock_timed(qurt_mutex_t *lock, unsigned long long int duration);
+
+/**@ingroup func_qurt_rmutex_unlock
+   Unlocks the specified recursive mutex. \n
+   More than one thread can be suspended on a mutex. When the mutex is
+   unlocked, the thread waiting on the mutex awakens. If the awakened
+   thread has higher priority than the current thread, a context switch occurs.
+
+   @note1hang When a thread unlocks a recursive mutex, the mutex is not available until
+              the balanced number of locks and unlocks has been performed on the mutex.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock Pointer to the recursive mutex object to unlock.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+
+ */
+void qurt_rmutex_unlock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_try_lock
+   Attempts to lock the specified recursive mutex.\n
+
+   If a thread performs a try_lock operation on a recursive mutex that is not in use, the
+   thread gains access to the shared resource that is protected by the mutex, and continues
+   executing.\n
+   If a thread performs a try_lock operation on a recursive mutex that another thread has
+   already locked, qurt_rmutex_try_lock immediately returns with a nonzero result
+   value.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock Pointer to the recursive mutex object to lock.
+
+   @return
+   0 -- Success. \n
+   Nonzero -- Failure.
+
+ */
+int qurt_rmutex_try_lock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_try_lock_block_once
+   Attempts to lock a mutex object recursively. If the mutex is available,
+   it locks the mutex. If the mutex is held by the current thread,
+   it increases the internal counter and returns 0. If not, it returns a
+   nonzero value.
+   If the mutex is already locked by another thread, the caller thread is
+   suspended. When the mutex becomes available again (because the other
+   thread has unlocked it), the caller thread is awakened and tries to lock
+   the mutex; if that attempt fails, this function returns a nonzero value,
+   and if it succeeds, it returns zero.
+
+   @datatypes
+   #qurt_mutex_t
+
+   @param[in] lock Pointer to the qurt_mutex_t object.
+
+   @return
+   0 -- Success. \n
+   Nonzero -- Failure.
+
+   @dependencies
+   None.
+ */
+int qurt_rmutex_try_lock_block_once(qurt_mutex_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_RMUTEX_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_rmutex2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_rmutex2.h
new file mode 100755
index 0000000000000..a37e7e4458c4b
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_rmutex2.h
@@ -0,0 +1,183 @@
+#ifndef QURT_RMUTEX2_H
+#define QURT_RMUTEX2_H
+/**
+  @file qurt_rmutex2.h
+  @brief Prototypes of rmutex2 API
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup mutex_types
+@{ */
+/*=============================================================================
+                        TYPEDEFS
+=============================================================================*/
+
+/** QuRT rmutex2 type.
+    Mutex type used with rmutex2 APIs.
+ */
+typedef struct {
+   /** @cond */
+   unsigned int holder __attribute__((aligned(8)));  /* UGP value of the mutex holder. */
+   unsigned short waiters;      /* Number of waiting threads. */
+   unsigned short refs;         /* Number of references to this mutex. */
+   unsigned int queue;          /* Kernel-maintained futex queue value. */
+   unsigned int excess_locks;   /* Number of excess times the holder has locked the mutex. */
+   /** @endcond */
+} qurt_rmutex2_t;
+/** @} */ /* end_addtogroup mutex_types */
+/** @cond internal_only*/
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_rmutex2_init
+
+   @deprecated use #qurt_rmutex_init instead.
+
+   Initializes a recursive mutex object.
+
+   The recursive mutex is initially unlocked.
+
+   Objects of type rmutex2 solve a potential race condition between
+   unlock() and destroy() operations.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[out] lock Pointer to the recursive mutex object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_rmutex2_init(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_rmutex2_destroy
+
+   @deprecated use #qurt_rmutex_destroy instead.
+
+   Destroys the specified recursive mutex. \n
+   @note1hang Recursive mutexes must not be destroyed while they are still in use. If this
+              occurs, the behavior of QuRT is undefined.
+   @note1cont In general, application code must destroy an rmutex2 object prior to
+              deallocating it; calling qurt_rmutex2_destroy() before deallocating it ensures
+              that all qurt_rmutex2_unlock() calls complete.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[in] lock Pointer to the recursive mutex object to destroy.
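+
+   For illustration only (not part of the SDK header), the typical lifecycle,
+   including the balanced lock/unlock rule for recursion, looks like this:
+   @code
+   qurt_rmutex2_t m;
+   qurt_rmutex2_init(&m);
+   qurt_rmutex2_lock(&m);      // first lock: thread owns the mutex
+   qurt_rmutex2_lock(&m);      // same thread: does not suspend, counts an excess lock
+   qurt_rmutex2_unlock(&m);    // still held; unlock count not yet balanced
+   qurt_rmutex2_unlock(&m);    // now released to other threads
+   qurt_rmutex2_destroy(&m);   // only after all users are done
+   @endcode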
+ + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_destroy(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_lock + + @deprecated use #qurt_rmutex_lock instead. + + Locks the specified recursive mutex. \n + + If a thread performs a lock operation on a recursive mutex that is not in use, the + thread gains access to the shared resource that the mutex protects, and continues + to execute. + + If a thread performs a lock operation on a recursive mutex that another thread is using, + the thread is suspended. When the mutex becomes available again + (because the other thread has unlocked it), the thread is awakened and given access to the + shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked, but the mutex does not become available until the thread performs a + balanced number of unlocks on the mutex. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_lock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_unlock + + @deprecated use #qurt_rmutex_unlock instead. + + Unlocks the specified recursive mutex. \n + More than one thread can be suspended on a recursive mutex. When the mutex is + unlocked, only the highest-priority thread waiting on the mutex awakens. If the + awakened thread has higher priority than the current thread, a context switch occurs. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to unlock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_unlock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_try_lock + + @deprecated use #qurt_rmutex_try_lock instead. + + Attempts to lock the specified recursive mutex.\n + + Non-blocking version of qurt_rmutex2_lock(). When a call to qurt_rmutex2_lock() + succeeds immediately, this function behaves similarly, returning 0 for success. + When a call to qurt_rmutex2_lock() does not succeed immediately, this function has + no effect and returns nonzero for failure. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + */ +int qurt_rmutex2_try_lock(qurt_rmutex2_t *lock); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_RMUTEX2_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_sclk.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_sclk.h new file mode 100755 index 0000000000000..a83cf5f1db889 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_sclk.h @@ -0,0 +1,145 @@ +#ifndef QURT_SCLK_H +#define QURT_SCLK_H +/** + @file qurt_sclk.h + @brief Header file describing the APIs supported by QuRT system SCLK + feature. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2018-2021, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+
+=============================================================================*/
+
+
+
+
+/*=============================================================================
+
+                        INCLUDE FILES
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+
+/**
+   Conversion from microseconds to sleep timer ticks (19.2 ticks per microsecond).
+ */
+#define QURT_SYSCLOCK_TIMETICK_FROM_US(us) ((us) * 192ULL / 10UL)
+#define qurt_sysclock_timetick_from_us(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+
+/**
+   Conversion from timer ticks to microseconds at the nominal frequency.
+*/
+#define QURT_SYSCLOCK_TIMETICK_TO_US(ticks) qurt_timer_timetick_to_us(ticks)
+
+/**
+   Maximum Qtimer duration, 1,042,499 hours, expressed in microseconds.
+*/
+#define QURT_SYSCLOCK_MAX_DURATION (1042499uLL * 3600uLL * 1000uLL * 1000uLL)
+#define qurt_sysclock_max_duration() QURT_SYSCLOCK_MAX_DURATION
+/**
+   Maximum Qtimer duration expressed in ticks; the Qtimer clock runs at 19.2 MHz.
+*/
+#define QURT_SYSCLOCK_MAX_DURATION_TICKS (1042499uLL * 3600uLL * 19200000uLL)
+#define qurt_sysclock_max_duration_ticks() QURT_SYSCLOCK_MAX_DURATION_TICKS
+/**
+   Sleep timer error margin for Qtimer: 192 ticks, approximately 10 us.
+*/
+#define QURT_SYSCLOCK_ERROR_MARGIN 192U //QURT_TIMER_MIN_DURATION*timer_freq;
+#define qurt_sysclock_error_margin() QURT_SYSCLOCK_ERROR_MARGIN
+
+/*=============================================================================
+
+                        DATA DECLARATIONS
+
+=============================================================================*/
+
+/**@ingroup func_qurt_sysclock_get_hw_ticks
+   @xreflabel{sec:qurt_sysclock_get_hw_ticks}
+   Gets the hardware tick count.\n
+   Returns the current value of a 64-bit hardware counter. The value wraps around to zero
+   when it exceeds the maximum value.
+
+   @note1hang This operation must be used with care because of the wrap-around behavior.
+
+   @return
+   Integer -- Current value of the 64-bit hardware counter.
+
+   @dependencies
+   None.
+ */
+unsigned long long qurt_sysclock_get_hw_ticks (void);
+
+
+/**@ingroup func_qurt_sysclock_get_hw_ticks_32
+   @xreflabel{sec:qurt_sysclock_get_hw_ticks_32}
+   Gets the hardware tick count in 32 bits.\n
+   Returns the current value of a 32-bit hardware counter. The value wraps around to zero
+   when it exceeds the maximum value.
+
+   @note1hang This operation is implemented as an inline C function, and should be called from a C/C++ program.
+              The returned 32 bits are the lower 32 bits of the Qtimer counter.
+
+   @return
+   Integer -- Current value of the 32-bit timer counter.
+
+   @dependencies
+   None.
+ */
+static inline unsigned long qurt_sysclock_get_hw_ticks_32 (void)
+{
+    //Beginning with v61 there is a HW register that can be read directly.
+    unsigned long count;
+    __asm__ __volatile__ (" %0 = c30 " : "=r"(count));
+    return count;
+}
+
+
+/**@ingroup func_qurt_sysclock_get_hw_ticks_16
+   @xreflabel{sec:qurt_sysclock_get_hw_ticks_16}
+   Gets the hardware tick count in 16 bits.\n
+   Returns the current value of a 16-bit timer counter. The value wraps around to zero
+   when it exceeds the maximum value.
+
+   @note1hang This operation is implemented as an inline C function, and should be called from a C/C++ program.
+              The returned 16 bits are based on the value of the lower 32 bits in the Qtimer
+              counter, right shifted by 16 bits.
+
+   @return
+   Integer -- Current value of the 16-bit timer counter, calculated from the lower 32 bits in the
+              Qtimer counter, right shifted by 16 bits.
+
+   @dependencies
+   None.
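+
+   For illustration only (not part of the SDK header), elapsed time over a code
+   region can be measured with the 64-bit counter and converted to microseconds:
+   @code
+   unsigned long long t0 = qurt_sysclock_get_hw_ticks();
+   // ... work being timed ...
+   unsigned long long t1 = qurt_sysclock_get_hw_ticks();
+   unsigned long long us = qurt_sysclock_timetick_to_us(t1 - t0);   // 19.2 MHz ticks to us
+   @endcode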
+ */
+
+
+static inline unsigned short qurt_sysclock_get_hw_ticks_16 (void)
+{
+    unsigned long ticks;
+
+    //Beginning with v61 there is a HW register that can be read directly.
+    __asm__ __volatile__ (" %0 = c30 " : "=r"(ticks));
+    __asm__ __volatile__ ( "%0 = lsr(%0, #16) \n" :"+r"(ticks));
+
+    return (unsigned short)ticks;
+}
+unsigned long long qurt_timer_timetick_to_us(unsigned long long ticks);
+#define qurt_sysclock_timetick_to_us(ticks) qurt_timer_timetick_to_us(ticks)
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif /* __cplusplus */
+
+#endif /* QURT_SCLK_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_secure_proc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_secure_proc.h
new file mode 100755
index 0000000000000..f40c7deb9bca1
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_secure_proc.h
@@ -0,0 +1,53 @@
+#ifndef QURT_SECURE_PROC_H
+#define QURT_SECURE_PROC_H
+
+/**
+  @file qurt_secure_proc.h
+  @brief Definitions, macros, and prototypes used for handling a secure process
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2015, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup qurt_process_migrate_secure_process
+   Migrates the user process to a QuRT secure process.
+
+   @param secure_phy_address Physical starting address of the secure memory.
+   @param secure_memory_size Size of the secure memory.
+   @param entry              Entry function of the secure process.
+
+   @return
+   EOK -- Success. \n
+   Negative return value -- Error.
+
+   @dependencies
+   None.
+*/
+int qurt_process_migrate_secure_process(unsigned long long secure_phy_address, unsigned int secure_memory_size, void entry(unsigned));
+
+/**@ingroup qurt_process_get_migration_mem_size
+   Gets the size of all writable memory regions in a user PD, in preparation for
+   secure process migration.
+
+   @return
+   Size of all writable memory regions in a user PD.
+
+   @dependencies
+   None.
+*/
+int qurt_process_get_migration_mem_size(void);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_sem.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_sem.h
new file mode 100755
index 0000000000000..ee5ce4b2d94ab
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_sem.h
@@ -0,0 +1,252 @@
+#ifndef QURT_SEM_H
+#define QURT_SEM_H
+/**
+  @file qurt_sem.h
+  Prototypes of semaphore API.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        TYPEDEFS
+=============================================================================*/
+/** @addtogroup semaphore_types
+@{ */
+
+/** QuRT semaphore type.
*/ +typedef union { + /** @cond */ + unsigned int raw[2] __attribute__((aligned(8))); + struct { + unsigned short val; /**< */ + unsigned short n_waiting; /**< */ + unsigned int reserved1; /**< */ + unsigned int queue; /**< */ + unsigned int reserved2; /**< */ + }X; /** @endcond */ +} qurt_sem_t; +/** @} */ /* end_addtogroup semaphore_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_sem_add + Releases access to a shared resource (the specified amount increments the semaphore count value).\n + When a thread performs an add operation on a semaphore, the specified value increments the semaphore count. + The result depends on the number of threads waiting + on the semaphore: \n + - When no threads are waiting, the current thread releases access to the shared resource + and continues executing. \n + - When one or more threads are waiting and the semaphore count value is nonzero, + the kernel repeatedly awakens the highest-priority waiting thread and decrements + the semaphore count value until either no waiting threads remain or the + semaphore count value is zero. If any of the awakened threads has higher priority + than the current thread, a context switch can occur. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + @param[in] amt Amount to increment the semaphore count value. + + @return + Unused integer value. + + @dependencies + None. + + */ +int qurt_sem_add(qurt_sem_t *sem, unsigned int amt); + +/**@ingroup func_qurt_sem_up + Releases access to a shared resource. When a thread performs an up operation on a semaphore, + the semaphore count value increments. The result depends on the number of threads waiting + on the semaphore: \n + - When no threads are waiting, the current thread releases access to the shared resource + and continues executing.\n + - When one or more threads are waiting and the semaphore count value is nonzero, + the kernel awakens the highest-priority waiting thread and decrements the + semaphore count value. If the awakened thread has higher priority than the current + thread, a context switch can occur. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + Unused integer value. + + @dependencies + None. + */ +static inline int qurt_sem_up(qurt_sem_t *sem) { return qurt_sem_add(sem,1); } + +/**@ingroup func_qurt_sem_down + Requests access to a shared resource. When a thread performs a down operation on a + semaphore, the result depends on the semaphore count value: \n + - When the count value is nonzero, it is decremented, and the thread gains access to the + shared resource and continues executing.\n + - When the count value is zero, it is not decremented, and the thread is suspended on the + semaphore. When the count value becomes nonzero (because another thread + released the semaphore) it is decremented, and the suspended thread is awakened + and gains access to the shared resource. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + Unused integer value. + + @dependencies + None. 
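+
+  A minimal producer/consumer sketch (illustrative only; assumes the
+  semaphore is shared between two threads, and error handling is omitted):
+
+  @code
+  qurt_sem_t work_ready;
+
+  // Initialization (once): start with a count of 0 so the consumer blocks.
+  qurt_sem_init_val(&work_ready, 0);
+
+  // Producer thread: publish one work item, then release the semaphore.
+  qurt_sem_up(&work_ready);
+
+  // Consumer thread: block until at least one work item is available.
+  qurt_sem_down(&work_ready);
+
+  // Teardown (once both threads are done with the semaphore).
+  qurt_sem_destroy(&work_ready);
+  @endcode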
+ */
+int qurt_sem_down(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_down_timed
+  When a thread performs a down operation on a semaphore, the result depends on the
+  semaphore count value: \n
+  - When the count value is nonzero, it is decremented, and the thread gains access to the
+    shared resource and continues executing.\n
+  - When the count value is zero, it is not decremented, and the thread is suspended on the
+    semaphore. When the count value becomes nonzero (because another thread
+    released the semaphore) it is decremented, and the suspended thread is awakened
+    and gains access to the shared resource. The wait is terminated when the specified
+    timeout expires; in that case the thread is awakened without gaining access to the
+    shared resource.
+
+  @datatypes
+  #qurt_sem_t
+
+  @param[in] sem       Pointer to the semaphore object to access.
+  @param[in] duration  Interval (in microseconds); the duration value must be between
+                       #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_ETIMEDOUT -- Timeout
+
+  @dependencies
+  None.
+ */
+int qurt_sem_down_timed(qurt_sem_t *sem, unsigned long long int duration);
+
+/**@ingroup func_qurt_sem_try_down
+  @xreflabel{hdr:qurt_sem_try_down}
+  Requests access to a shared resource (without suspend). When a thread performs a try down
+  operation on a semaphore, the result depends on the semaphore count value: \n
+  - The count value is decremented when it is nonzero. The down operation returns 0 as
+    the function result, and the thread gains access to the shared resource and is free to
+    continue executing.\n
+  - The count value is not decremented when it is zero. The down operation returns -1
+    as the function result, and the thread does not gain access to the shared resource
+    and should not continue executing.
+
+  @datatypes
+  #qurt_sem_t
+
+  @param[in] sem Pointer to the semaphore object to access.
+
+  @return
+  0 -- Success. \n
+  -1 -- Failure.
+
+  @dependencies
+  None.
+
+ */
+int qurt_sem_try_down(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_init
+  Initializes a semaphore object.
+  The default initial value of the semaphore count value is 1.
+
+  @param[out] sem Pointer to the initialized semaphore object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_sem_init(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_destroy
+  Destroys the specified semaphore.\n
+  @note1hang Semaphores must be destroyed when they are no longer in use. Failure to do
+             this causes resource leaks in the QuRT kernel.\n
+  @note1cont Semaphores must not be destroyed while they are still in use. If this occurs,
+             the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_sem_t
+
+  @param[in] sem Pointer to the semaphore object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_sem_destroy(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_init_val
+  Initializes a semaphore object with the specified value.
+
+  @datatypes
+  #qurt_sem_t
+
+  @param[out] sem Pointer to the initialized semaphore object.
+  @param[in]  val Initial value of the semaphore count value.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_sem_init_val(qurt_sem_t *sem, unsigned short val);
+
+/**@ingroup func_qurt_sem_get_val
+  Gets the semaphore count value.\n
+  Returns the current count value of the specified semaphore.
+
+  @datatypes
+  #qurt_sem_t
+
+  @param[in] sem Pointer to the semaphore object to access.
+
+  @return
+  Integer semaphore count value.
+
+  @dependencies
+  None.
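+
+  The non-blocking and timed down operations declared above follow the same
+  pattern; a combined sketch (illustrative only; the duration must respect the
+  #QURT_TIMER_MIN_DURATION/#QURT_TIMER_MAX_DURATION range, and #QURT_ETIMEDOUT
+  is assumed to come from the QuRT error headers):
+
+  @code
+  qurt_sem_t sem;
+  qurt_sem_init(&sem);                     // count defaults to 1
+
+  if (qurt_sem_try_down(&sem) == 0) {
+      // Acquired without blocking.
+  }
+
+  // Wait up to 1000 microseconds for the resource.
+  if (qurt_sem_down_timed(&sem, 1000ULL) == QURT_ETIMEDOUT) {
+      // Timed out; the semaphore was not acquired.
+  }
+  @endcode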
+ */
+static inline unsigned short qurt_sem_get_val(qurt_sem_t *sem){return sem->X.val;}
+int qurt_sem_down_cancellable(qurt_sem_t *sem);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SEM_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_shmem.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_shmem.h
new file mode 100755
index 0000000000000..980557323708a
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_shmem.h
@@ -0,0 +1,89 @@
+#ifndef QURT_SHMEM_H
+#define QURT_SHMEM_H
+
+/**
+  @file qurt_shmem.h
+
+  @brief
+  Prototypes of QuRT inter-process shared memory APIs
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef MODE_T
+#define MODE_T
+typedef unsigned int mode_t;
+#endif //MODE_T
+
+/**
+ * The shm_open() function establishes a connection between a shared memory object and a file descriptor.
+ * The file descriptor is used by other functions such as mmap() to refer to that shared memory object.
+ *
+ *
+ * @param name  Pointer to a string naming the shared memory object. The name must start with "/shm/".
+ * @param oflag File status flags and file access modes of the open file description. The following
+ *              flags are defined and supported:
+ *              O_RDONLY: Open for read access only
+ *              O_RDWR: Open for read or write access
+ *              O_CREAT: If the shared memory object does not exist, create it.
+ * @param mode  Permission flags (currently ignored)
+ *
+ * @return      File descriptor (positive number) if the operation is successful.
+ *              Negative error code if failed
+ *
+*/
+
+int shm_open(const char * name, int oflag, mode_t mode);
+
+/**
+ * The shm_mmap() function creates a shared memory mapping in the virtual address space of
+ * the calling process.
+ *
+ * @param addr   The starting address for the new mapping is specified in addr.
+ * @param len    Specifies the length of the shared memory region.
+ * @param prot   Describes the desired memory protection of the mapping. Same as in POSIX mmap().
+ * @param flags  Determines whether updates to the mapping are visible to other processes. Same as
+ *               in POSIX mmap().
+ * @param fd     File descriptor of the shared memory object to map.
+ * @param offset Unused.
+ *
+ * @return       The starting address for the new mapping is returned.
+ *               Negative error code if failed
+ *
+*/
+
+void *shm_mmap(void *addr, unsigned int len, int prot, int flags, int fd, unsigned int offset);
+
+/**
+ * The shm_close() function removes a connection between a shared memory object and a file descriptor.
+ * If no file descriptor remains connected to a shared memory object, the shared memory object is
+ * deleted automatically. A shared memory object has the same virtual address in every process; this
+ * is a restriction of the single virtual address space.
+ *
+ *
+ * @param fd File descriptor of shared memory object
+ *
+ * @return 0 if operation successful.
+ * negative error code if failed + * +*/ + + +int shm_close(int fd); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_signal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_signal.h new file mode 100755 index 0000000000000..3a89c53394ad5 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_signal.h @@ -0,0 +1,518 @@ +#ifndef QURT_SIGNAL_H +#define QURT_SIGNAL_H + +/** + @file qurt_signal.h + @brief Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup signals_types +@{ */ +#define QURT_SIGNAL_ATTR_WAIT_ANY 0x00000000 /**< Wait any. */ +#define QURT_SIGNAL_ATTR_WAIT_ALL 0x00000001 /**< Wait all. */ + +/*===================================================================== + Typedefs + ======================================================================*/ + + +/** QuRT signal type. + */ +typedef union { + /** @cond */ + unsigned long long int raw; + struct { + unsigned int signals; + unsigned int waiting; + unsigned int queue; + unsigned int attribute; + }X; + /** @endcond */ +} qurt_signal_t; + + +/** QuRT 64-bit signal type. + */ +typedef struct { + /** @cond */ + qurt_signal_t signal_sum; + unsigned long long signals; + unsigned long long waiting; + /** @endcond */ +} qurt_signal_64_t; +/** @} */ /* end_addtogroup signals_types */ +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_signal_init + Initializes a signal object. + Signal returns the initialized object. + The signal object is initially cleared. + + @note1hang Each signal-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_signal_destroy() + when this object is not used anymore + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the initialized object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_init(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_destroy + Destroys the specified signal object. + + @note1hang Signal objects must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_destroy(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_wait + @xreflabel{hdr:qurt_signal_wait} + Suspends the current thread until the specified signals are set. 
+ + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + waiting on a signal, and 0 indicates not waiting on the signal. + + If a thread is waiting on a signal object for any of the specified set of signals to set, + and one or more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, + and all of those signals are set in the signal object, the thread is awakened. + + The specified set of signals can be cleared when the signal is set. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread waits to set any of the signals, or to set all of + them. \n + @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + + @return + A 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal_wait(qurt_signal_t *signal, unsigned int mask, + unsigned int attribute); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_wait_timed + @xreflabel{hdr:qurt_signal_wait} + Suspends the current thread until the specified signals are set or until timeout. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + waiting on a signal, and 0 indicates not waiting. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, + and one or more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, + and all of those signals are set in the signal object, the thread is awakened. + + The specified set of signals can be cleared after the signal is set. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value that identifies the individual signals in the signal object to wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. \n + @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @param[out] signals Bitmask of signals that are set + @param[in] duration Duration (microseconds) to wait. Must be in the range + [#QURT_TIMER_MIN_DURATION ... #QURT_TIMER_MAX_DURATION] + + @return + #QURT_EOK -- Success; one or more signals were set \n + #QURT_ETIMEDOUT -- Timed-out \n + #QURT_EINVALID -- Duration out of range + + @dependencies + Timed-waiting support in the kernel. 
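+
+   A usage sketch for the timed wait (illustrative only; assumes EVENT_RX is
+   an application-defined bit and that the duration is within the documented
+   range):
+
+   @code
+   #define EVENT_RX (1u << 0)
+
+   qurt_signal_t sig;
+   unsigned int fired;
+   int rc;
+
+   qurt_signal_init(&sig);
+
+   // Wait up to 5000 microseconds for EVENT_RX, which another thread
+   // sets with qurt_signal_set(&sig, EVENT_RX).
+   rc = qurt_signal_wait_timed(&sig, EVENT_RX, QURT_SIGNAL_ATTR_WAIT_ANY,
+                               &fired, 5000ULL);
+   if (rc == QURT_EOK) {
+       qurt_signal_clear(&sig, fired);   // waits do not clear the bits
+   } else if (rc == QURT_ETIMEDOUT) {
+       // No signal arrived within the timeout.
+   }
+   @endcode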
+*/
+/* ======================================================================*/
+int qurt_signal_wait_timed(qurt_signal_t *signal, unsigned int mask,
+                unsigned int attribute, unsigned int *signals, unsigned long long int duration);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_wait_any
+   Suspends the current thread until any of the specified signals are set.
+
+   Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+   to wait on a signal, and 0 indicates not to wait on it.
+
+   If a thread is waiting on a signal object for any of the specified set of signals to be set,
+   and one or more of those signals is set in the signal object, the thread is awakened.
+
+   @note1hang At most, one thread can wait on a signal object at any given time.
+
+   @datatypes
+   #qurt_signal_t
+
+   @param[in] signal Pointer to the signal object to wait on.
+   @param[in] mask   Mask value identifying the individual signals in the signal object to
+                     wait on.
+
+   @return
+   A 32-bit word with current signals.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+static inline unsigned int qurt_signal_wait_any(qurt_signal_t *signal, unsigned int mask)
+{
+    return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_wait_all
+   Suspends the current thread until all of the specified signals are set.
+
+   Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+   to wait on a signal, and 0 indicates not to wait on it.
+
+   If a thread is waiting on a signal object for all of the specified set of signals to be set,
+   and all of those signals are set in the signal object, the thread is awakened.
+
+   @note1hang At most, one thread can wait on a signal object at any given time.
+
+   @datatypes
+   #qurt_signal_t
+
+   @param[in] signal Pointer to the signal object to wait on.
+   @param[in] mask   Mask value identifying the individual signals in the signal object to
+                     wait on.
+
+   @return
+   A 32-bit word with current signals.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+static inline unsigned int qurt_signal_wait_all(qurt_signal_t *signal, unsigned int mask)
+{
+    return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ALL);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_set
+   Sets signals in the specified signal object.
+
+   Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+   to set the signal, and 0 indicates not to set it.
+
+   @datatypes
+   #qurt_signal_t
+
+   @param[in] signal Pointer to the signal object to modify.
+   @param[in] mask   Mask value identifying the individual signals to set in the signal
+                     object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_signal_set(qurt_signal_t *signal, unsigned int mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_get
+   Gets a signal from a signal object.
+
+   Returns the current signal values of the specified signal object.
+
+   @datatypes
+   #qurt_signal_t
+
+   @param[in] *signal Pointer to the signal object to access.
+ + @return + A 32-bit word with current signals + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal_get(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_clear + Clear signals in the specified signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear it. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_clear(qurt_signal_t *signal, unsigned int mask); + +/**@ingroup func_qurt_signal_wait_cancellable + @xreflabel{hdr:qurt_signal_wait_cancellable} + Suspends the current thread until either the specified signals are set or the wait operation is cancelled. + The operation is cancelled if the user process of the calling thread is killed, or if the calling thread + must finish its current QDI invocation and return to user space. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be waited on, and 0 indicates not to wait on it. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, and one or + more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, and all of + those signals are set in the signal object, the thread is awakened. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @note1cont When the operation is cancelled, the caller must assume that the signal is never set. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @param[out] return_mask Pointer to the 32-bit mask value that was originally passed to the function. + + + @return + #QURT_EOK -- Wait completed. \n + #QURT_ECANCEL -- Wait cancelled. + + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_signal_wait_cancellable(qurt_signal_t *signal, unsigned int mask, + unsigned int attribute, + unsigned int *return_mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_init + Initializes a 64-bit signal object.\n + The signal argument returns the initialized object. + The signal object is initially cleared. + + @note1hang Each signal-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_signal_destroy() + when this object is not used anymore. + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the initialized object. 
+
+   @return
+   None.
+
+   @dependencies
+   None.
+  */
+/* ======================================================================*/
+void qurt_signal_64_init(qurt_signal_64_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_destroy
+   Destroys the specified signal object.
+
+   @note1hang 64-bit signal objects must be destroyed when they are no longer in use. Failure
+              to do this causes resource leaks in the QuRT kernel.\n
+   @note1cont Signal objects must not be destroyed while they are still in use. If this
+              occurs, the behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_signal_64_t
+
+   @param[in] signal Pointer to the signal object to destroy.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+  */
+/* ======================================================================*/
+void qurt_signal_64_destroy(qurt_signal_64_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_wait
+   Suspends the current thread until the specified signals are set.
+
+   Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 indicates
+   that a signal must be waited on, and 0 indicates not to wait on it.
+
+   If a thread is waiting on a signal object for any of the specified set of signals to be set,
+   and one or more of those signals is set in the signal object, the thread is awakened.
+
+   If a thread is waiting on a signal object for all of the specified set of signals to be set,
+   and all of those signals are set in the signal object, the thread is awakened.
+
+   @note1hang At most, one thread can wait on a signal object at any given time.
+
+   @datatypes
+   #qurt_signal_64_t
+
+   @param[in] signal    Pointer to the signal object to wait on.
+   @param[in] mask      Mask value, which identifies the individual signals in the signal object to
+                        wait on.
+   @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of
+                        them are set. \n
+                        @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n
+                        - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+                        - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+
+   @return
+   A 64-bit double word with current signals.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+unsigned long long qurt_signal_64_wait(qurt_signal_64_t *signal, unsigned long long mask,
+                unsigned int attribute);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_set
+   Sets signals in the specified signal object.
+
+   Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 indicates
+   that a signal must be set, and 0 indicates not to set it.
+
+   @datatypes
+   #qurt_signal_64_t
+
+   @param[in] signal Pointer to the signal object to modify.
+   @param[in] mask   Mask value identifying the individual signals to set in the signal
+                     object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_signal_64_set(qurt_signal_64_t *signal, unsigned long long mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_get
+   Gets a signal from a signal object.
+
+   Returns the current signal values of the specified signal object.
+
+   @datatypes
+   #qurt_signal_64_t
+
+   @param[in] *signal Pointer to the signal object to access.
+
+   @return
+   A 64-bit double word with current signals.
+
+   @dependencies
+   None.
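+
+   The 64-bit variants follow the same pattern as the 32-bit API, with mask
+   bits 0 through 63; a sketch (illustrative only, not from the SDK docs):
+
+   @code
+   qurt_signal_64_t sig64;
+   qurt_signal_64_init(&sig64);
+
+   // Use a bit above position 31, which a 32-bit signal cannot express.
+   qurt_signal_64_set(&sig64, 1ULL << 40);
+
+   unsigned long long cur = qurt_signal_64_wait(&sig64, 1ULL << 40,
+                                                QURT_SIGNAL_ATTR_WAIT_ANY);
+   qurt_signal_64_clear(&sig64, cur);   // waits do not clear the bits
+   qurt_signal_64_destroy(&sig64);
+   @endcode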
+*/ +/* ======================================================================*/ +unsigned long long qurt_signal_64_get(qurt_signal_64_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_clear + Clears signals in the specified signal object. + + Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear it. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_64_clear(qurt_signal_64_t *signal, unsigned long long mask); + +#ifdef __cplusplus +} +#endif + +#endif /* QURT_SIGNAL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_signal2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_signal2.h new file mode 100755 index 0000000000000..43975100cbf75 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_signal2.h @@ -0,0 +1,340 @@ +#ifndef QURT_SIGNAL2_H +#define QURT_SIGNAL2_H + +/** + @file qurt_signal2.h + @brief Prototypes of kernel signal2 API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define QURT_SIGNAL_ATTR_WAIT_ANY 0x00000000 +#define QURT_SIGNAL_ATTR_WAIT_ALL 0x00000001 + +/*===================================================================== + Typedefs + ======================================================================*/ + +/** @addtogroup signals2_types +@{ */ +/** qurt_signal2 type. + */ +typedef union { + /** @cond */ + struct{ + unsigned int cur_mask; /* Current set of signal bits that are set. */ + unsigned int sig_state; /* Current state. */ + /* Bit 0 -- in anysignal wait. */ + /* Bit 1 -- in allsignal wait. */ + /* Bit 2 -- in interrupt wait. */ + /* Bits 31-3 -- reference count field. */ + unsigned int queue; /* Kernel-maintained futex queue value. */ + unsigned int wait_mask; /* When sig_state indicates a waiter is present, this is the wait mask. */ + }; + unsigned long long int raw; + /** @endcond */ +} qurt_signal2_t; +/* @} */ /* end_addtogroup signals2_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_init + + @deprecated use #qurt_signal_init instead. + + Initializes a signal2 object. + Signal returns the initialized object. + The signal object is initially cleared. + + Objects of type signal2 solve a potential race condition between + set() and destroy() operations. + + @datatypes + #qurt_signal2_t + + @param[in] *signal Pointer to the initialized object. + + @return + None. 
+
+   @dependencies
+   Each signal-based object has an associated
+   kernel resource(s), therefore users must call qurt_signal2_destroy()
+   when this object is no longer in use.
+ */
+/* ======================================================================*/
+void qurt_signal2_init(qurt_signal2_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal2_destroy
+
+   @deprecated use #qurt_signal_destroy instead.
+
+   Destroys the specified signal object.
+
+   @note1cont Signal objects must not be destroyed while they are still in use. If this
+              occurs, the behavior of QuRT is undefined.
+   @note1cont Application code should destroy a signal2 object prior to deallocating it.
+              Calling qurt_signal2_destroy() before deallocating a
+              signal2 object ensures completion of all qurt_signal2_set() calls.
+
+   @datatypes
+   #qurt_signal2_t
+
+   @param[in] signal Pointer to the signal object to destroy.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+/* ======================================================================*/
+void qurt_signal2_destroy(qurt_signal2_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal2_wait
+
+   @deprecated use #qurt_signal_wait instead.
+
+   Suspends the current thread until the specified signals are set.
+
+   Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates
+   a signal to wait on.
+
+   If a thread calls this API with QURT_SIGNAL_ATTR_WAIT_ANY, the thread will be awakened when
+   any of the signals specified in the mask are set.
+
+   If a thread calls this API with QURT_SIGNAL_ATTR_WAIT_ALL, the thread will be awakened only
+   when all the signals specified in the mask are set.
+
+   @note1hang At most, one thread can wait on a signal object at any given time.
+
+   @datatypes
+   #qurt_signal2_t
+
+   @param[in] signal    Pointer to the signal object to wait on.
+   @param[in] mask      Mask value identifying the individual signals in the signal object to wait on.
+   @param[in] attribute Specifies whether the thread waits for any of the signals to be set, or for all of
+                        them to be set. Values:\n
+                        - QURT_SIGNAL_ATTR_WAIT_ANY \n
+                        - QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+   @return
+   A 32-bit word with current signals.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+unsigned int qurt_signal2_wait(qurt_signal2_t *signal, unsigned int mask,
+                unsigned int attribute);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal2_wait_any
+
+   @deprecated use #qurt_signal_wait_any instead.
+
+   Suspends the current thread until any of the specified signals are set.
+
+   Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates
+   a signal to wait on.
+
+   The thread will be awakened when any of the signals specified in the mask are set.
+
+   @note1hang At most, one thread can wait on a signal object at any given time.
+
+   @datatypes
+   #qurt_signal2_t
+
+   @param[in] signal Pointer to the signal object to wait on.
+   @param[in] mask   Mask value identifying the individual signals in the signal object to
+                     wait on.
+
+   @return
+   A 32-bit word with current signals.
+
+   @dependencies
+   None.
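+
+   Since this wrapper is deprecated, new code should call the qurt_signal
+   equivalent directly; the substitution is one-for-one (a sketch, assuming
+   qurt_signal.h is included):
+
+   @code
+   unsigned int mask = 1u << 0;
+
+   // Deprecated:
+   // cur = qurt_signal2_wait_any(&sig2, mask);
+
+   // Preferred replacement:
+   qurt_signal_t sig;
+   qurt_signal_init(&sig);
+   unsigned int cur = qurt_signal_wait_any(&sig, mask);
+   @endcode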
+*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal2_wait_any(qurt_signal2_t *signal, unsigned int mask) +{ + return qurt_signal2_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_wait_all + + @deprecated use #qurt_signal_wait_all instead. + + Suspends the current thread until all of the specified signals are set. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + a signal to wait on. + + The thread will be awakened only when all the signals specified in the mask are set. + + @note1hang At most one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal2_wait_all(qurt_signal2_t *signal, unsigned int mask) +{ + return qurt_signal2_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ALL); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_set + + @deprecated use #qurt_signal_set instead. + + Sets signals in the specified signal object. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be set, and 0 indicates not to set the signal. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to set in the signal + object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_signal2_set(qurt_signal2_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_get + + @deprecated use #qurt_signal_get instead. + + Gets a signal from a signal object. + + Returns the current signal values of the specified signal object. + + @datatypes + #qurt_signal2_t + + @param[in] *signal Pointer to the signal object to access. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal2_get(qurt_signal2_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_clear + + @deprecated use #qurt_signal_clear instead. + + Clear signals in the specified signal object. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear the signal. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. 
+ */
+/* ======================================================================*/
+void qurt_signal2_clear(qurt_signal2_t *signal, unsigned int mask);
+
+/**@ingroup func_qurt_signal2_wait_cancellable
+
+   @deprecated use #qurt_signal_wait_cancellable instead.
+
+   Suspends the current thread until either the specified signals are set or the wait operation is cancelled.
+   The operation is cancelled if the user process of the calling thread is killed, or if the calling thread
+   must finish its current QDI invocation and return to user space.
+
+   Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+   that a signal must be waited on, and 0 indicates not to wait on it.
+
+   If a thread is waiting on a signal object for any of the specified set of signals to be set, and one or
+   more of those signals is set in the signal object, the thread is awakened.
+
+   If a thread is waiting on a signal object for all of the specified set of signals to be set, and all of
+   those signals are set in the signal object, the thread is awakened.
+
+   @note1hang At most, one thread can wait on a signal object at any given time.
+
+   @note1cont When the operation is cancelled, the caller must assume that the signal is never set.
+
+   @datatypes
+   #qurt_signal2_t
+
+   @param[in]  signal       Pointer to the signal object to wait on.
+   @param[in]  mask         Mask value identifying the individual signals in the signal object to
+                            wait on.
+   @param[in]  attribute    Indicates whether the thread must wait until any of the signals are set, or until all of
+                            them are set. Values:\n
+                            - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+                            - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+   @param[out] p_returnmask Pointer to the 32-bit mask value that was originally passed to the function.
+
+
+   @return
+   #QURT_EOK -- Wait completed. \n
+   #QURT_ECANCEL -- Wait cancelled.
+
+
+   @dependencies
+   None.
+*/
+int qurt_signal2_wait_cancellable(qurt_signal2_t *signal,
+                                  unsigned int mask,
+                                  unsigned int attribute,
+                                  unsigned int *p_returnmask);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SIGNAL2_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_space.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_space.h
new file mode 100755
index 0000000000000..2c3f9e4496697
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_space.h
@@ -0,0 +1,230 @@
+#ifndef QURT_SPACE_H
+#define QURT_SPACE_H
+/**
+  @file qurt_space.h
+  @brief Prototypes of QuRT process control APIs
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** This flag is a request to the OS to suspend the process just before calling main().
+It is deprecated and replaced by QURT_PROCESS_SUSPEND_ON_STARTUP. */
+#define SPAWNN_FLAG_SUSPEND_ON_STARTUP QURT_PROCESS_SUSPEND_ON_STARTUP
+
+/**
+ * Creates and starts a process from an ELF of the specified name. The slash symbols
+ * "/" or "\" are ignored. Do not include the directory name in the input. This function
+ * accepts the SPAWN flags. Multiple SPAWN flags can be specified by OR'ing the flags.
+ *
+ * @param name  ELF name of the executable.
Name shall not contain directories,
+ *              use "dsp2.elf", instead of "/prj/qct/.../dsp2.elf".
+ *
+ * @return
+     Process ID -- Success \n
+     Negative error code -- Failure \n
+     #QURT_EPRIVILEGE -- Caller does not have enough privilege for this operation\n
+     #QURT_EMEM -- Not enough memory to perform the operation \n
+     #QURT_EFAILED -- Operation failed \n
+     #QURT_ENOTALLOWED -- Operation not allowed \n
+     #QURT_ENOREGISTERED -- Not registered \n
+     #QURT_ENORESOURCE -- Resource exhaustion \n
+     #QURT_EINVALID -- Invalid argument value
+*/
+
+int qurt_spawn_flags(const char * name, int flags);
+
+/**
+   Creates and starts a process from an ELF of the specified name. The slash symbols
+   "/" or "\" are ignored. Do not include the directory name in the input.
+
+   @param name ELF name of the executable. Name shall not contain directories,
+               use "dsp2.elf", instead of "/prj/qct/.../dsp2.elf".
+
+   @return
+   Process ID -- Success. \n
+   Negative error code -- Failure.
+
+*/
+static inline int qurt_spawn(const char *name)
+{
+    return qurt_spawn_flags(name,0);
+}
+
+/**
+ * Returns the process ID of the current process.
+ *
+ * @return
+ * Process ID
+ *
+*/
+#define qurt_getpid qurt_process_get_id
+
+/**
+ * The qurt_wait() function waits for a status change in a child process. A parent
+ * process can use it to block until any child process terminates.
+ *
+ * This API returns an error if there are no user processes or if all user processes are detached.
+ *
+ * @param status Pointer to the status variable. The variable receives the status value of the child
+ *               process. The value comes from the exit() system call made by the child process.
+ *
+ * @return
+       Process ID of the child process that changes status -- Success \n
+ *     Negative error code -- Failure
+ *
+*/
+
+int qurt_wait(int *status);
+
+
+/** @cond */
+/* APIs that allow registering callbacks on spawn of user pd */
+typedef void (*QURT_SPAWN_PFN)(int client_handle, void *data_ptr); //no return, since we won't be error checking it in spawn
+typedef int (*QURT_CB_PFN)(int client_handle, void *user_data, void *info);
+typedef union {
+    QURT_SPAWN_PFN spawn_pfn;
+    QURT_CB_PFN cb_pfn;
+} qurt_process_callback_pfn_t;
+/** @endcond */
+
+/** @cond internal_only */
+
+/**@ingroup func_qurt_event_register
+Sets the bits specified by mask in the signal passed by the caller. The signal is set
+when the client handle indicated by value goes away (at process exit). Multiple clients can register for the signal
+to be set.
+
+@datatypes
+#qurt_signal_t
+
+@param[in]  type      QURT_PROCESS_EXIT is the only event that can be registered for.
+@param[in]  value     Indicates the client handle of the process for which the event is registered.
+@param[in]  psig      Pointer to the signal object to set when the event occurs.
+@param[in]  mask      Mask bits to set in the signal.
+@param[out] data      Pointer to the variable that receives the exit code of the exiting process.
+@param[in]  data_size Size of the data variable.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EMEM -- Not enough memory to allocate resources \n
+#QURT_EVAL -- Invalid values passed to the API
+
+@dependencies
+None.
+*/
+int qurt_event_register(int type, int value, qurt_signal_t *psig, unsigned int mask, void *data, unsigned int data_size);
+
+/**@ingroup func_qurt_callback_register_onspawn
+Allows registering for a callback on spawn of any user process.
+
+@datatypes
+#QURT_SPAWN_PFN
+
+@param[in] pFn       Callback function to call when any user process is spawned.
+@param[in] user_data Pointer to the argument that the callback must be called with.
+
+
+@return A positive value is the handle to use when deregistering the callback.
+        Multiple clients can register for the callback on spawn, and some clients might choose to deregister.
+
+        On failure, QURT_EFATAL is returned.
+
+@dependencies
+None.
+*/
+int qurt_callback_register_onspawn(QURT_SPAWN_PFN pFn, void *user_data);
+
+/**@ingroup func_qurt_callback_deregister_onspawn
+Allows deregistering the callback on spawn.
+
+@param[in] callback_handle Handle returned by qurt_callback_register_onspawn.
+
+@return
+#QURT_EOK -- Deregistering was successful
+
+@dependencies
+None.
+*/
+int qurt_callback_deregister_onspawn(int callback_handle);
+
+/**@ingroup func_qurt_process_callback_register
+Allows registering for a callback during or after image loading.
+Generic callback types:
+    Function similarly to qurt_callback_register_onspawn(). The callback is called after the process is
+    loaded, before the process thread starts. The callback has no return value and receives no info
+    from the OS.
+    pFn  - QURT_SPAWN_PFN
+    type - QURT_PROCESS_CB_GENERIC
+    arg1 - not used
+    arg2 - not used
+    arg3 - not used
+Note callback types:
+    The callback is called during process loading: before segment loading (QURT_PROCESS_NOTE_CB_PRE_MAP),
+    or after segment loading (QURT_PROCESS_NOTE_CB_POST_MAP). The OS provides info to the callback. The info
+    argument in the callback is populated with a pointer to the mapped note corresponding to the callback.
+    The callback has a return value; the loader fails if the callback returns a value that is not QURT_EOK.
+    pFn  - QURT_CB_PFN
+    type - QURT_PROCESS_NOTE_CB_PRE_MAP or QURT_PROCESS_NOTE_CB_POST_MAP
+    arg1 - note type (ex: NOTE_TYPE_POOL_INFO, NOTE_TYPE_SEGMENT_INFO, NOTE_TYPE_ARB_INFO)
+    arg2 - note name
+    arg3 - not used
+
+@datatypes
+
+@param[in] pFn       Callback function to call
+@param[in] type      Callback type
+@param[in] user_data Pointer to the argument that the callback must be called with.
+@param[in] arg1      Argument interpreted by the OS based on callback type
+@param[in] arg2      Argument interpreted by the OS based on callback type
+@param[in] arg3      Argument interpreted by the OS based on callback type (currently not used)
+
+
+@return A positive value is the handle to use when deregistering the callback.
+        Multiple clients can register for the callback, and some clients might choose to deregister.
+
+        On failure, QURT_EFATAL is returned.
+
+@dependencies
+None.
+*/
+int qurt_process_callback_register(qurt_process_callback_pfn_t pFn,
+                                   qurt_process_cb_type_t type,
+                                   void *user_data,
+                                   qurt_process_callback_arg_t arg1,
+                                   qurt_process_callback_arg_t arg2,
+                                   qurt_process_callback_arg_t arg3);
+
+
+
+/**@ingroup func_qurt_process_callback_deregister
+Allows deregistering a callback for image loading.
+@param[in] callback_handle Handle returned by qurt_process_callback_register.
+
+@return
+#QURT_EOK -- Deregistering was successful
+
+@dependencies
+None.
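+
+A registration/deregistration sketch for the spawn callback (illustrative
+only; the callback body is a placeholder):
+
+@code
+static void on_spawn(int client_handle, void *data_ptr)
+{
+    // Called when a user process is spawned; no return value is expected.
+    (void)client_handle;
+    (void)data_ptr;
+}
+
+int cb_handle = qurt_callback_register_onspawn(on_spawn, NULL);
+if (cb_handle > 0) {
+    // ... later, when notifications are no longer needed:
+    (void)qurt_callback_deregister_onspawn(cb_handle);
+}
+@endcode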
+*/ +int qurt_process_callback_deregister(int callback_handle); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SPACE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_srm_consts.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_srm_consts.h new file mode 100755 index 0000000000000..48a8b6a38c402 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_srm_consts.h @@ -0,0 +1,32 @@ +#ifndef QURT_SRM_CONSTS_H +#define QURT_SRM_CONSTS_H +/** + @file qurt_srm_consts.h + @brief Type definitions for srm + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2020-2021, 2022 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @cond */ +#define QURT_SRM_WAKEUP_REQUEST 1U << 0 /**< Value = 1: Send wakeup request to the SRM server. */ +#define QURT_SRM_SET_HANDLE 1U << 1 /**< Value = 2: Set the client handle for a new SRM client. */ +#define QURT_SRM_ALLOC_KERNEL_PAGES 1U << 2 /**< Value = 4: Allocate pages from the kernel VA space. */ +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SRM_CONSTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_srm_driver.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_srm_driver.h new file mode 100755 index 0000000000000..5489e3dddbcca --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_srm_driver.h @@ -0,0 +1,140 @@ +#ifndef QURT_SRM_DRIVER_H +#define QURT_SRM_DRIVER_H +/** + @file qurt_srm_driver.h + @brief Definitions, macros, and prototypes used by SRM drivers. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + + =============================================================================*/ +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| Define qurt_srm_driver_t structure, which represents +|| the "registration" object for an SRM driver. +*/ +/** @cond internal_only */ +struct _qurt_srm_driver { + const char *name; + qurt_qdi_obj_t *obj; +}; + +typedef struct _qurt_srm_driver qurt_srm_driver_t; + +/* +|| qurt_srm_object_invoke() is an internal equivalent to qurt_qdi_handle_invoke(). +|| It behaves the same, but it takes a QDI object pointer instead of a handle. +*/ + +#define qurt_srm_object_invoke(o,m,...) 
\ + _QDMPASTE(_QDMSOI,_QDMCNT(QDI_HANDLE_LOCAL_CLIENT,o,m,##__VA_ARGS__))(QDI_HANDLE_LOCAL_CLIENT,o,m,##__VA_ARGS__) +#define _QDMSOI3(a,b,c) qurt_srm_oi3(a,b,c) +#define _QDMSOI4(a,b,c,d) qurt_srm_oi4(a,b,c,(int)(d)) +#define _QDMSOI5(a,b,c,d,e) qurt_srm_oi5(a,b,c,(int)(d),(int)(e)) +#define _QDMSOI6(a,b,c,d,e,f) qurt_srm_oi6(a,b,c,(int)(d),(int)(e),(int)(f)) +#define _QDMSOI7(a,b,c,d,e,f,g) qurt_srm_oi7(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g)) +#define _QDMSOI8(a,b,c,d,e,f,g,h) qurt_srm_oi8(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h)) +#define _QDMSOI9(a,b,c,d,e,f,g,h,i) qurt_srm_oi9(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i)) +#define _QDMSOI10(a,b,c,d,e,f,g,h,i,j) qurt_srm_oi10(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j)) +#define _QDMSOI11(a,b,c,d,e,f,g,h,i,j,k) qurt_srm_oi11(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k)) +#define _QDMSOI12(a,b,c,d,e,f,g,h,i,j,k,l) qurt_srm_oi12(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k),(int)(l)) + +int qurt_srm_oi3(int, qurt_qdi_obj_t *, int); +int qurt_srm_oi4(int, qurt_qdi_obj_t *, int, int); +int qurt_srm_oi5(int, qurt_qdi_obj_t *, int, int, int); +int qurt_srm_oi6(int, qurt_qdi_obj_t *, int, int, int, int); +int qurt_srm_oi7(int, qurt_qdi_obj_t *, int, int, int, int, int); +int qurt_srm_oi8(int, qurt_qdi_obj_t *, int, int, int, int, int, int); +int qurt_srm_oi9(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int); +int qurt_srm_oi10(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int); +int qurt_srm_oi11(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int, int); +int qurt_srm_oi12(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int, int, int); + +#define QDI_SRM_INIT 192 + +/* +|| QURT_SRM_DECLARE_DRIVER() declares an SRM driver to the SRM infrastructure. +|| +|| The three arguments are: +|| unique_id -- Unique C identifier, unused but must be a unique global symbol. +|| name -- Name of the driver by which an SRM client attempts to open it. +|| obj -- Pointer to the singleton object of the driver, which handles things such as +|| initialization and QDI_OPEN requests. +*/ + +#define QURT_SRM_DECLARE_DRIVER(unique_id, xname, xobj) \ + __attribute__((section(".srm.rodata.user.main.DECL"))) const qurt_srm_driver_t unique_id = \ + { .name = xname, .obj = xobj } + + +/*@ingroup func_qurt_srm_mapping_create + Creates a memory mapping in pagetable with specified attributes + + @param[in] client_handle Client handle representing the process for which + mapping would be created. + @param[in] pageno_virt pointer to the virtual page. NULL indicates SRM + would indicate the virtual memory. + @param[in] pageno_phys physical page to be used for the mapping + @param[in] page_count number of 4k pages to be mapped + @param[in] cache_attr cache attributes for the mapping + @param[in] perm permissions to be used for the mapping + + @return value greater than 0 indicates a handle which can be passed to + qdi_close() to remove the mapping. Negative value indicates + an error. + + @dependencies + None. +*/ +int qurt_srm_mapping_create(int client_handle, + unsigned *pageno_virt, + unsigned pageno_phys, + unsigned page_count, + qurt_mem_cache_mode_t cache_attr, + qurt_perm_t perm); + + +/**@ingroup func_qurt_srm_get_pid + Gets the PID for the client_handle that is passed. + + @param[in] client_handle Client handle for which PID is required. 
+
+   @return PID of the client. \n
+           A negative PID value of -1 is returned in case of error.
+
+   @dependencies
+   None.
+*/
+unsigned qurt_srm_get_pid(int client_handle);
+
+
+/*@ingroup func_qurt_srm_get_thread_id
+   Gets the thread ID of the client requesting a service from the SRM.
+
+   @param[in] None.
+
+   @return Thread ID of the client thread.
+
+   @dependencies
+   None.
+*/
+qurt_thread_t qurt_srm_get_client_thread_id(void);
+
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SRM_DRIVER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_stid.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_stid.h
new file mode 100755
index 0000000000000..379f46aaa4b80
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_stid.h
@@ -0,0 +1,73 @@
+#ifndef QURT_STID_H
+#define QURT_STID_H
+/**
+  @file qurt_stid.h
+  Prototypes of the software thread identifier (stid) interface APIs.
+  An stid is an 8-bit identifier that can be assigned to a software thread.
+  The performance monitor logic uses the stid as a counting match criterion
+  for maskable events. The stid is also used by the hardware debugger
+  (ISDB) to match breakpoints.
+
+  EXTERNAL FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2024 Qualcomm Technologies, Inc.
+  All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_stid_alloc
+  Allocates a unique stid.
+
+  @param[in]  pid  Process identifier
+  @param[out] stid Pointer to a variable in which to return the stid
+
+  @return
+  QURT_EOK -- Allocation success \n
+  QURT_ENORESOURCE -- No stid available for allocation \n
+  QURT_EINVALID -- Invalid input
+
+  @dependencies
+  None.
+ */
+int qurt_stid_alloc(unsigned int pid, unsigned int *stid);
+
+/**@ingroup func_qurt_stid_release
+  Releases the stid.
+
+
+  @param[in] pid  Process identifier
+  @param[in] stid The stid to release
+
+  @note1hang
+  Users must clear the released stid from the process or thread(s), that is,
+  reset it to the default value (QURT_STID_DEFAULT), before releasing that stid.
+
+  @return
+  QURT_EOK -- Release success \n
+  QURT_ENOTALLOWED -- Operation not allowed for a pid \n
+  QURT_EINVALID -- Invalid stid
+
+  @dependencies
+  None.
+ */
+int qurt_stid_release(unsigned int pid, unsigned int stid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_STID_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_thread.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_thread.h
new file mode 100755
index 0000000000000..499699e7c72e2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_thread.h
@@ -0,0 +1,1260 @@
+#ifndef QURT_THREAD_H
+#define QURT_THREAD_H
+/**
+  @file qurt_thread.h
+  @brief Prototypes of Thread API
+
+  EXTERNAL FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018, 2020-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+/* The following are for C code only */
+#ifndef __ASSEMBLER__
+#include <string.h>
+#include "qurt_pmu.h"
+#include "qurt_api_version.h"
+#endif /* __ASSEMBLER__ */
+#include "qurt_consts.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        CONSTANTS AND MACROS
+=============================================================================*/
+
+
+/*
+   Bitmask configuration to select DSP hardware threads.
+   To select all the hardware threads, use #QURT_THREAD_CFG_BITMASK_ALL
+   and the following: \n
+   - For QDSP6 V2/V3, all six hardware threads are selected \n
+   - For QDSP6 V3L, all four hardware threads are selected \n
+   - For QDSP6 V4, all three hardware threads are selected
+ */
+
+#define QURT_THREAD_CFG_BITMASK_HT0    0x00000001   /**< HT0. */
+#define QURT_THREAD_CFG_BITMASK_HT1    0x00000002   /**< HT1. */
+#define QURT_THREAD_CFG_BITMASK_HT2    0x00000004   /**< HT2. */
+#define QURT_THREAD_CFG_BITMASK_HT3    0x00000008   /**< HT3. */
+#define QURT_THREAD_CFG_BITMASK_HT4    0x00000010   /**< HT4. */
+#define QURT_THREAD_CFG_BITMASK_HT5    0x00000020   /**< HT5. */
+/** @cond rest_reg_dist */
+/** @addtogroup thread_macros
+@{ */
+/** @xreflabel{sec:qurt_thread_cfg} */
+
+#define QURT_THREAD_CFG_BITMASK_ALL    0x000000ffU  /**< Select all the hardware threads. */
+/** @} */ /* end_addtogroup thread_macros */
+/** @endcond */
+
+#define QURT_THREAD_CFG_USE_RAM        0x00000000   /**< Use RAM. */
+#define QURT_THREAD_CFG_USE_TCM        0x00000100   /**< Use TCM. */
+/** @cond rest_reg_dist */
+/** @addtogroup thread_macros
+@{ */
+#define QURT_THREAD_BUS_PRIO_DISABLED  0   /**< Thread internal bus priority disabled. */
+#define QURT_THREAD_BUS_PRIO_ENABLED   1   /**< Thread internal bus priority enabled. */
+/** @} */ /* end_addtogroup thread_macros */
+/** @endcond */
+
+#define QURT_THREAD_AUTOSTACK_DISABLED 0   /**< Thread has the autostack v2 feature disabled. */
+#define QURT_THREAD_AUTOSTACK_ENABLED  1   /**< Thread has the autostack v2 feature enabled. */
+
+/*
+   Macros for QuRT thread attributes.
+ */
+#define QURT_HTHREAD_L1I_PREFETCH      0x1     /**< Enables hardware L1 instruction cache prefetching. */
+#define QURT_HTHREAD_L1D_PREFETCH      0x2     /**< Enables hardware L1 data cache prefetching. */
+#define QURT_HTHREAD_L2I_PREFETCH      0x4     /**< Enables hardware L2 instruction cache prefetching. */
+#define QURT_HTHREAD_L2D_PREFETCH      0x8     /**< Enables hardware L2 data cache prefetching. */
+#define QURT_HTHREAD_DCFETCH           0x10    /**< Enables DC fetch to the provided virtual address.
+                                                    DC fetch indicates to the hardware that a data memory access is likely.
+                                                    Instructions are dropped when there is high bus utilization. */
+/** @addtogroup thread_macros
+@{ */
+/** @xreflabel{hdr:partition_tcm} */
+/*
+   The value below is used to create legacy QuRT threads by default.
+   If a thread has this as the detach_state, the thread can be joined
+   on until it exits. When we are able to change the default behavior of all
+   QuRT threads to JOINABLE (POSIX default), we can remove this legacy
+   behavior.
+*/
+#define QURT_THREAD_ATTR_CREATE_LEGACY 0U /**< Create a legacy QuRT thread by default. If a thread has this as a detach state, the thread can be joined on until it exits. */
+#define QURT_THREAD_ATTR_CREATE_JOINABLE 1U /**< Create a joinable thread. */
+#define QURT_THREAD_ATTR_CREATE_DETACHED 2U /**< Create a detached thread.
*/
+/** @} */ /* end_addtogroup thread_macros */
+
+
+#define QURT_THREAD_ATTR_NAME_MAXLEN           16   /**< Maximum name length. */
+#define QURT_THREAD_ATTR_TCB_PARTITION_RAM     0    /**< Creates threads in RAM/DDR. */
+#define QURT_THREAD_ATTR_TCB_PARTITION_TCM     1    /**< Creates threads in TCM. */
+/** @cond rest_reg_dist */
+/** @addtogroup thread_macros
+@{ */
+#define QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT QURT_THREAD_ATTR_TCB_PARTITION_RAM /**< Backward compatibility. */
+#define QURT_THREAD_ATTR_PRIORITY_DEFAULT      254   /**< Priority.*/
+#define QURT_THREAD_ATTR_ASID_DEFAULT          0     /**< ASID. */
+#define QURT_THREAD_ATTR_AFFINITY_DEFAULT      (-1)  /**< Affinity. */
+#define QURT_THREAD_ATTR_BUS_PRIO_DEFAULT      255   /**< Bus priority. */
+#define QURT_THREAD_ATTR_AUTOSTACK_DEFAULT     0     /**< Default autostack v2 disabled thread. */
+#define QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT   (-2)  /**< Timetest ID. */
+#define QURT_THREAD_ATTR_STID_DEFAULT          QURT_STID_DEFAULT /**< STID. */
+#define QURT_THREAD_ATTR_STID_ENABLE           1     /**< Indicates to allocate an STID during thread creation. */
+
+#define QURT_PRIORITY_FLOOR_DEFAULT            255U  /**< Default floor. */
+/** @} */ /* end_addtogroup thread_macros */
+
+// Option for suspending thread
+#define QURT_THREAD_SUSPEND_SYNCHRONOUS   0x0U  // bit#0
+#define QURT_THREAD_SUSPEND_ASYNCHRONOUS  0x1U  // bit#0
+#define QURT_THREAD_SUSPEND_KEEP_HMX      0x0U  // bit#1
+#define QURT_THREAD_SUSPEND_DETACH_HMX    0x2U  // bit#1
+
+// Option for resuming thread
+#define QURT_THREAD_RESUME_DEFAULT        0x0
+
+// Thread property IDs
+#define QURT_THREAD_PROPERTY_SUSPENDABLE  0x0U
+#define QURT_THREAD_PROPERTY_RESUMABLE    0x1
+
+// Thread group
+#define QURT_THREAD_DEFAULT_GROUP_ID      0x0U
+#define QURT_THREAD_GROUP_ID_MASK         0x3FU
+
+/** @endcond*/
+
+
+/* The following are for C code only */
+#ifndef __ASSEMBLER__
+/*=============================================================================
+        TYPEDEFS
+=============================================================================*/
+/** @addtogroup thread_types
+@{ */
+/** @cond rest_reg_dist */
+typedef unsigned int qurt_cache_partition_t; /**< QuRT cache partition type. */
+
+#define CCCC_PARTITION    0U  /**< Use the CCCC page attribute bits to determine the main or auxiliary partition. */
+#define MAIN_PARTITION    1U  /**< Use the main partition. */
+#define AUX_PARTITION     2U  /**< Use the auxiliary partition. */
+#define MINIMUM_PARTITION 3U  /**< Use the minimum. Allocates the least amount of cache (no-allocate policy possible) for this thread. */
+/** @endcond */
+
+/** Thread ID type. */
+typedef unsigned int qurt_thread_t;
+
+/** @cond rest_reg_dist */
+/** Thread attributes. */
+typedef struct _qurt_thread_attr {
+
+    char name[QURT_THREAD_ATTR_NAME_MAXLEN]; /**< Thread name. */
+    unsigned char tcb_partition;  /**< Indicates whether the thread TCB resides in RAM or
+                                       on-chip memory (TCM). */
+    unsigned char stid;           /**< Software thread ID used to configure the stid register
+                                       for profiling purposes. */
+    unsigned short priority;      /**< Thread priority. */
+    unsigned char autostack:1;    /**< Autostack v2 enabled thread. */
+    unsigned char group_id:6;     /**< Group ID. */
+    unsigned char reserved:1;    /**< Reserved bits. */
+    unsigned char bus_priority;   /**< Internal bus priority. */
+    unsigned short timetest_id;   /**< Timetest ID. */
+    unsigned int stack_size;      /**< Thread stack size. */
+    void *stack_addr;             /**< Pointer to the stack address base. The range of the stack is
+                                       (stack_addr, stack_addr+stack_size-1). */
+    unsigned short detach_state;  /**< Detach state of the thread.
*/
+
+} qurt_thread_attr_t;
+/** @endcond */
+
+/** @cond rest_reg_dist */
+/** Dynamic TLS attributes. */
+typedef struct qurt_tls_info {
+    unsigned int module_id;      /**< Module ID of the loaded dynamic linked library. */
+    unsigned int tls_start;      /**< Start address of the TLS data. */
+    unsigned int tls_data_end;   /**< End address of the TLS RW data. */
+    unsigned int tls_end;        /**< End address of the TLS data. */
+}qurt_tls_info;
+/** @endcond */
+
+/** @} */ /* end_addtogroup thread_types */
+
+/*=============================================================================
+        FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_thread_attr_init
+  Initializes the structure used to set the thread attributes when a thread is created.
+  After an attribute structure is initialized, explicitly set the individual attributes in the structure
+  using the thread attribute operations.
+
+  The initialize operation sets the following default attribute values: \n
+  - Name -- NULL string \n
+  - TCB partition -- QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT
+  - Priority -- QURT_THREAD_ATTR_PRIORITY_DEFAULT \n
+  - Autostack -- QURT_THREAD_ATTR_AUTOSTACK_DEFAULT \n
+  - Bus priority -- QURT_THREAD_ATTR_BUS_PRIO_DEFAULT \n
+  - Timetest ID -- QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT \n
+  - stack_size -- 0 \n
+  - stack_addr -- NULL \n
+  - detach state -- #QURT_THREAD_ATTR_CREATE_LEGACY \n
+  - STID -- #QURT_THREAD_ATTR_STID_DEFAULT
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_init (qurt_thread_attr_t *attr)
+{
+
+    attr->name[0] = '\0';
+    attr->tcb_partition = QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT;
+    attr->priority = QURT_THREAD_ATTR_PRIORITY_DEFAULT;
+    attr->autostack = QURT_THREAD_ATTR_AUTOSTACK_DEFAULT; /* Default attribute for autostack v2 */
+    attr->bus_priority = QURT_THREAD_ATTR_BUS_PRIO_DEFAULT;
+    attr->timetest_id = (unsigned short)QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT;
+    attr->stack_size = 0;
+    attr->stack_addr = NULL;
+    attr->detach_state = QURT_THREAD_ATTR_CREATE_LEGACY;
+    attr->stid = QURT_THREAD_ATTR_STID_DEFAULT;
+    attr->group_id = QURT_THREAD_DEFAULT_GROUP_ID;
+}
+
+/**@ingroup func_qurt_thread_attr_set_name
+  Sets the thread name attribute.\n
+  This function specifies the name used by a thread.
+  Thread names identify a thread during debugging or profiling.
+  The maximum name length is 16 characters. \n
+  @note1hang Thread names differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in] name Pointer to the character string containing the thread name.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_name (qurt_thread_attr_t *attr, const char *name)
+{
+    strlcpy (attr->name, name, QURT_THREAD_ATTR_NAME_MAXLEN);
+    attr->name[QURT_THREAD_ATTR_NAME_MAXLEN - 1] = '\0';
+}
+
+
+/**@ingroup func_qurt_thread_attr_set_tcb_partition
+  Sets the thread TCB partition attribute.
+  Specifies the memory type in which the TCB of a thread is allocated.
+  Allocates TCBs in RAM or TCM/LPM.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in] tcb_partition TCB partition.
Values:\n
+         - 0 -- TCB resides in RAM \n
+         - 1 -- TCB resides in TCM/LPM @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_tcb_partition (qurt_thread_attr_t *attr, unsigned char tcb_partition)
+{
+    attr->tcb_partition = tcb_partition;
+}
+
+/**@ingroup func_qurt_thread_attr_set_priority
+  Sets the thread priority to assign to a thread.
+  Thread priorities are specified as numeric values in the range 1 to 254, with 1 representing
+  the highest priority.
+  Priorities 0 and 255 are used internally by the kernel for special purposes.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in] priority Thread priority.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_priority (qurt_thread_attr_t *attr, unsigned short priority)
+{
+    attr->priority = priority;
+}
+
+/**@ingroup func_qurt_thread_attr_set_detachstate
+  Sets the detach state with which the thread is created.
+  The thread detach state is either joinable or detached; it is specified by the following values:
+  - #QURT_THREAD_ATTR_CREATE_JOINABLE \n
+  - #QURT_THREAD_ATTR_CREATE_DETACHED \n
+
+  When a detached thread is created (QURT_THREAD_ATTR_CREATE_DETACHED), its thread
+  ID and other resources are reclaimed as soon as the thread exits. When a joinable thread
+  is created (QURT_THREAD_ATTR_CREATE_JOINABLE), it is assumed that some
+  thread waits to join on it using a qurt_thread_join() call.
+  By default, the detach state is QURT_THREAD_ATTR_CREATE_LEGACY.
+  If the detach state is QURT_THREAD_ATTR_CREATE_LEGACY, another thread
+  can join on it before it exits, but the exiting thread does not wait for another thread to join.
+
+  @note1hang For a joinable thread (QURT_THREAD_ATTR_CREATE_JOINABLE), it is very
+             important that some thread joins on it after it terminates, otherwise
+             the resources of that thread are not reclaimed, causing memory leaks.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in] detachstate Thread detach state.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_detachstate (qurt_thread_attr_t *attr, unsigned short detachstate)
+{
+    if(detachstate == QURT_THREAD_ATTR_CREATE_JOINABLE || detachstate == QURT_THREAD_ATTR_CREATE_DETACHED){
+        attr->detach_state = detachstate;
+    }
+}
+
+
+/**@ingroup func_qurt_thread_attr_set_timetest_id
+  Sets the thread timetest attribute.\n
+  Specifies the timetest identifier used by a thread.
+
+  Timetest identifiers are used to identify a thread during debugging or profiling. \n
+  @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in] timetest_id Timetest identifier value.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_thread_attr_set_timetest_id (qurt_thread_attr_t *attr, unsigned short timetest_id)
+{
+    attr->timetest_id = timetest_id;
+}
+
+/**@ingroup func_qurt_thread_attr_set_stack_size
+  @xreflabel{sec:set_stack_size}
+  Sets the thread stack size attribute.\n
+  Specifies the size of the memory area to use for the call stack of a thread.
+
+  The thread stack address (Section @xref{sec:set_stack_addr}) and stack size specify the memory area used as a
+  call stack for the thread.
The user is responsible for allocating the memory area used for + the stack. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] stack_size Size (in bytes) of the thread stack. + + @return + None. + + @dependencies + None. +*/ + +static inline void qurt_thread_attr_set_stack_size (qurt_thread_attr_t *attr, unsigned int stack_size) +{ + attr->stack_size = stack_size; +} + +/**@ingroup func_qurt_thread_attr_set_stack_size2 + @xreflabel{sec:set_stack_size} + Sets the thread stack size attribute for island threads that require a higher guest OS stack size than the stack size + defined in the configuration XML.\n + Specifies the size of the memory area to use for a call stack of an island thread in User and Guest mode. + + The thread stack address (Section @xref{sec:set_stack_addr}) and stack size specify the memory area used as a + call stack for the thread. The user is responsible for allocating the memory area used for + the stack. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] user_stack_size Size (in bytes) of the stack usage in User mode. + @param[in] root_stack_size Size (in bytes) of the stack usage in Guest mode. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_stack_size2 (qurt_thread_attr_t *attr, unsigned short user_stack_size, unsigned short root_stack_size) +{ + union qurt_thread_stack_info{ + unsigned int raw_size; + struct{ + unsigned short user_stack; + unsigned short root_stack; + }; + }user_root_stack_size; + user_root_stack_size.user_stack = user_stack_size; + user_root_stack_size.root_stack = root_stack_size; + + attr->stack_size = user_root_stack_size.raw_size; +} + +/**@ingroup func_qurt_thread_attr_set_stack_addr + @xreflabel{sec:set_stack_addr} + Sets the thread stack address attribute. \n + Specifies the base address of the memory area to use for a call stack of a thread. + + stack_addr must contain an address value that is 8-byte aligned. + + The thread stack address and stack size (Section @xref{sec:set_stack_size}) specify the memory area used as a + call stack for the thread. \n + @note1hang The user is responsible for allocating the memory area used for the thread + stack. The memory area must be large enough to contain the stack that the thread + creates. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] stack_addr Pointer to the 8-byte aligned address of the thread stack. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_stack_addr (qurt_thread_attr_t *attr, void *stack_addr) +{ + attr->stack_addr = stack_addr; +} + +/**@ingroup func_qurt_thread_attr_set_bus_priority + Sets the internal bus priority state in the Hexagon core for this software thread attribute. + Memory requests generated by the thread with bus priority enabled are + given priority over requests generated by the thread with bus priority disabled. + The default value of bus priority is disabled. + + @note1hang Sets the internal bus priority for Hexagon processor version V60 or greater. + The priority is not propagated to the bus fabric. + + @datatypes + #qurt_thread_attr_t + + @param[in] attr Pointer to the thread attribute structure. + + @param[in] bus_priority Enabling flag. Values: \n + - #QURT_THREAD_BUS_PRIO_DISABLED \n + - #QURT_THREAD_BUS_PRIO_ENABLED @tablebulletend + + @return + None + + @dependencies + None. 
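+
+  An illustrative usage sketch (not part of the SDK documentation; the stack
+  buffer and entry function are hypothetical placeholders):
+  @code
+  static char stack[4096] __attribute__((aligned(8)));
+  extern void worker_entry(void *arg);   /* hypothetical thread entry point */
+
+  qurt_thread_attr_t attr;
+  qurt_thread_t tid;
+  qurt_thread_attr_init(&attr);
+  qurt_thread_attr_set_name(&attr, "worker");
+  qurt_thread_attr_set_stack_addr(&attr, stack);
+  qurt_thread_attr_set_stack_size(&attr, sizeof(stack));
+  qurt_thread_attr_set_bus_priority(&attr, QURT_THREAD_BUS_PRIO_ENABLED);
+  (void)qurt_thread_create(&tid, &attr, worker_entry, NULL);
+  @endcode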
+*/
+static inline void qurt_thread_attr_set_bus_priority ( qurt_thread_attr_t *attr, unsigned short bus_priority)
+{
+    attr->bus_priority = (unsigned char)bus_priority;
+}
+
+/**@ingroup func_qurt_thread_attr_set_autostack
+  Enables the autostack v2 feature in the thread attributes.
+
+  When autostack is enabled by the subsystem and an autostack-enabled
+  thread takes a framelimit exception, the kernel allocates more stack
+  for the thread and returns it to normal execution.
+
+  If autostack is not enabled by the subsystem, or it is not enabled
+  for the thread, the framelimit exception is fatal.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] autostack Autostack enable or disable flag. Values: \n
+         - #QURT_THREAD_AUTOSTACK_DISABLED \n
+         - #QURT_THREAD_AUTOSTACK_ENABLED @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_autostack ( qurt_thread_attr_t *attr, unsigned short autostack)
+{
+    attr->autostack = (unsigned char)autostack;
+}
+/**@ingroup qurt_thread_attr_enable_stid
+  Sets the STID in the thread attributes.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] enable_stid STID to set. Values: \n
+         - #QURT_THREAD_ATTR_STID_DEFAULT (0): Default STID. \n
+         - #QURT_THREAD_ATTR_STID_ENABLE (1): QuRT assigns an STID that is not already in use \n
+         - #2 through #255 : User-provided STID. @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_enable_stid ( qurt_thread_attr_t *attr, char enable_stid)
+{
+    if (enable_stid != '\0') {
+        attr->stid = enable_stid;
+    }
+    else
+    {
+        attr->stid = QURT_THREAD_ATTR_STID_DEFAULT;
+    }
+}
+
+/**@ingroup func_qurt_thread_attr_set_stid
+  Sets the stid thread attribute.
+  The default stid value is QURT_THREAD_ATTR_STID_DEFAULT.
+
+  @note1hang When a thread is created with a non-default stid,
+             the stid set in the thread attribute is assigned to the thread.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] stid Stid to set for a thread.
+
+  @return
+  None
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_stid( qurt_thread_attr_t *attr, unsigned int stid){
+    attr->stid = stid;
+}
+
+/**@ingroup func_qurt_thread_attr_set_group_id
+  Sets the group ID in the thread attributes.
+  The primordial/first thread has group ID 0.
+  If a new thread is created without assigning a group_id, it
+  inherits the group ID from its parent thread.
+
+  @note1hang
+  1) The group ID can only be set before creating a thread. It cannot be
+     changed after the thread is created.
+  2) If a non-activated group_id is passed, thread creation fails.
+  3) Only a thread with group ID #0 can set the group ID for its child threads.
+  4) If a thread with a non-zero group ID sets the group ID for its child threads,
+     QuRT ignores this parameter and the child threads inherit the parent
+     thread's group ID. However, if the passed group ID is not activated, thread creation
+     still fails.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] group_id Group identifier. Its valid range is 0 ~ 63.
+
+  @return
+  None.
+
+  @dependencies
+  None.
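+
+  An illustrative sketch (not part of the SDK documentation; assumes group
+  ID 5 has been activated in the system configuration and that the caller
+  has group ID 0):
+  @code
+  qurt_thread_attr_t attr;
+  qurt_thread_attr_init(&attr);
+  qurt_thread_attr_set_group_id(&attr, 5U);  /* threads created with this
+                                                attr get group ID 5 */
+  @endcode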
+*/
+static inline void qurt_thread_attr_set_group_id(qurt_thread_attr_t *attr, unsigned int group_id)
+{
+    attr->group_id = group_id & QURT_THREAD_GROUP_ID_MASK;
+}
+
+/**@ingroup func_qurt_thread_set_autostack
+  Sets autostack enable in the TCB.
+
+  @param[in] Pointer to UGP
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+
+void qurt_thread_set_autostack(void *);
+
+
+/**@ingroup func_qurt_thread_get_name
+  Gets the thread name of the current thread.\n
+  Returns the thread name of the current thread.
+  Thread names are assigned to threads as thread attributes, see qurt_thread_attr_set_name(). Thread names
+  identify a thread during debugging or profiling.
+
+  @param[out] name Pointer to a character string, which specifies the address where the returned thread name is stored.
+  @param[in] max_len Maximum length of the character string that can be returned.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_thread_get_name (char *name, unsigned char max_len);
+
+/**@ingroup func_qurt_thread_create
+  @xreflabel{hdr:qurt_thread_create}
+  Creates a thread with the specified attributes, and makes it executable.
+
+  @datatypes
+  #qurt_thread_t \n
+  #qurt_thread_attr_t
+
+  @param[out] thread_id Returns a pointer to the thread identifier if the thread was
+                        successfully created.
+  @param[in] attr Pointer to the initialized thread attribute structure that specifies
+                  the attributes of the created thread.
+  @param[in] entrypoint C function pointer, which specifies the main function of a thread.
+  @param[in] arg Pointer to a thread-specific argument structure.
+
+
+  @return
+  #QURT_EOK -- Thread created. \n
+  #QURT_EFAILED -- Thread not created.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_create (qurt_thread_t *thread_id, qurt_thread_attr_t *attr, void (*entrypoint) (void *), void *arg);
+
+/**@ingroup func_qurt_thread_stop
+  Stops the current thread, frees the kernel TCB, and yields to the next highest ready thread.
+
+  @return
+  void
+
+  @dependencies
+  None.
+ */
+void qurt_thread_stop(void);
+
+/** @cond internal_only */
+/**@ingroup func_qurt_thread_resume
+  When a demand-loading paging solution is enabled, this function
+  resumes the execution of a thread that was suspended due to
+  a page miss.
+
+  @param[in] thread_id Thread identifier.
+
+  @return
+  #QURT_EOK -- Thread successfully resumed. \n
+  #QURT_EFATAL -- Resume operation failed.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_resume(unsigned int thread_id);
+/** @endcond */
+
+/**@ingroup func_qurt_thread_get_id
+  Gets the identifier of the current thread.\n
+  Returns the thread identifier for the current thread.
+
+  @return
+  Thread identifier -- Identifier of the current thread.
+
+  @dependencies
+  None.
+ */
+qurt_thread_t qurt_thread_get_id (void);
+
+
+/**@ingroup func_qurt_thread_get_l2cache_partition
+  Returns the current value of the L2 cache partition assigned to the caller thread.\n
+
+  @return
+  Value of the #qurt_cache_partition_t data type.
+
+  @dependencies
+  None.
+ */
+qurt_cache_partition_t qurt_thread_get_l2cache_partition (void);
+
+/**@ingroup func_qurt_thread_set_timetest_id
+  Sets the timetest identifier of the current thread.
+  Timetest identifiers are used to identify a thread during debugging or profiling.\n
+  @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @param[in] tid Timetest identifier.
+
+  @return
+  None.
+
+  @dependencies
+  None.
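+
+  An illustrative sketch (not part of the SDK documentation):
+  @code
+  qurt_thread_set_timetest_id(0x1234);               /* tag the current thread */
+  unsigned short tt = qurt_thread_get_timetest_id(); /* reads back 0x1234 */
+  @endcode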
+ */
+void qurt_thread_set_timetest_id (unsigned short tid);
+
+/**@ingroup func_qurt_thread_set_cache_partition
+  Sets the cache partition for the current thread. This function uses the qurt_cache_partition_t type
+  to select the cache partition of the current thread for the L1 Icache, L1 Dcache, and L2 cache.
+
+  @datatypes
+  #qurt_cache_partition_t
+
+  @param[in] l1_icache L1 I cache partition.
+  @param[in] l1_dcache L1 D cache partition.
+  @param[in] l2_cache  L2 cache partition.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_thread_set_cache_partition(qurt_cache_partition_t l1_icache, qurt_cache_partition_t l1_dcache, qurt_cache_partition_t l2_cache);
+
+
+/**@ingroup func_qurt_thread_get_timetest_id
+  Gets the timetest identifier of the current thread.\n
+  Returns the timetest identifier of the current thread.\n
+  Timetest identifiers are used to identify a thread during debugging or profiling. \n
+  @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @return
+  Integer -- Timetest identifier.
+
+  @dependencies
+  None.
+ */
+unsigned short qurt_thread_get_timetest_id (void);
+
+/**@ingroup func_qurt_thread_exit
+  @xreflabel{sec:qurt_thread_exit}
+  Stops the current thread, awakens threads joined to it, then destroys the stopped
+  thread.
+
+  Threads that are suspended on the current thread (by performing a thread join,
+  Section @xref{sec:thread_join}) are awakened and passed a user-defined status value
+  that indicates the status of the stopped thread.
+
+  @note1hang Exit must be called in the context of the thread to stop.
+
+  @param[in] status User-defined thread exit status value.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_thread_exit(int status);
+
+/**@ingroup func_qurt_thread_join
+  @xreflabel{sec:thread_join}
+  Waits for a specified thread to finish; the specified thread is another thread within
+  the same process.
+  The caller thread is suspended until the specified thread exits. When the specified thread
+  exits, the caller thread is awakened. \n
+  @note1hang If the specified thread has already exited, this function returns immediately
+             with the result value #QURT_ENOTHREAD. \n
+  @note1cont Two threads cannot call qurt_thread_join to wait for the same thread to finish.
+             If this occurs, QuRT generates an exception (see Section @xref{sec:exceptionHandling}).
+
+  @param[in] tid Thread identifier.
+  @param[out] status Destination variable for thread exit status. Returns an application-defined
+                     value that indicates the termination status of the specified thread.
+
+  @return
+  #QURT_ENOTHREAD -- Thread has already exited. \n
+  #QURT_EOK -- Thread successfully joined with valid status value.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_join(unsigned int tid, int *status);
+
+/**@ingroup qurt_thread_detach
+  @xreflabel{sec:thread_detach}
+  Detaches a joinable thread. The specified thread is another thread within the
+  same process. Create the thread as a joinable thread; only joinable threads
+  can be detached.
+  If a joinable thread is detached, it finishes execution and exits.
+
+  @param[in] tid Thread identifier.
+
+  @return
+  #QURT_ENOTHREAD -- Thread specified by TID does not exist. \n
+  #QURT_EOK -- Thread successfully detached.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_detach(unsigned int tid);
+
+
+/**@ingroup func_qurt_thread_get_priority
+  Gets the priority of the specified thread.
\n + Returns the thread priority of the specified thread.\n + Thread priorities are specified as numeric values in a range as large as 1 through 254, with lower + values representing higher priorities. 1 represents the highest possible thread priority. \n + Priority 0 and 255 are internally used by the kernel for special purposes. + + @note1hang QuRT can be configured to have different priority ranges. + + @datatypes + #qurt_thread_t + + @param[in] threadid Thread identifier. + + @return + -1 -- Invalid thread identifier. \n + 1 through 254 -- Thread priority value. + + @dependencies + None. + */ +int qurt_thread_get_priority (qurt_thread_t threadid); + +/**@ingroup func_qurt_thread_set_priority + Sets the priority of the specified thread.\n + Thread priorities are specified as numeric values in a range as large as 1 through 254, with lower + values representing higher priorities. 1 represents the highest possible thread priority. + Priority 0 and 255 are internally used by the kernel for special purposes. + + @note1hang QuRT can be configured to have different priority ranges. For more + information, see Section @xref{sec:AppDev}. + + @datatypes + #qurt_thread_t + + @param[in] threadid Thread identifier. + @param[in] newprio New thread priority value. + + @return + 0 -- Priority successfully set. \n + -1 -- Invalid thread identifier. \n + + @dependencies + None. + */ +int qurt_thread_set_priority (qurt_thread_t threadid, unsigned short newprio); + + + +/**@ingroup func_qurt_thread_attr_get + Gets the attributes of the specified thread. + + @datatypes + #qurt_thread_t \n + #qurt_thread_attr_t + + @param[in] thread_id Thread identifier. + @param[out] attr Pointer to the destination structure for thread attributes. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Invalid argument. + + @dependencies + None. + */ +int qurt_thread_attr_get (qurt_thread_t thread_id, qurt_thread_attr_t *attr); + + + +/**@ingroup func_qurt_thread_get_tls_base + Gets the base address of thread local storage (TLS) of a dynamically loaded module + for the current thread. + + @datatypes + #qurt_tls_info + + @param[in] info Pointer to the TLS information for a module. + + @return + Pointer to the TLS object for the dynamically loaded module.\n + NULL -- TLS information is invalid. + + @dependencies + None. + */ +void * qurt_thread_get_tls_base(qurt_tls_info* info); + +/**@ingroup func_qurt_thread_pktcount_get + Gets the PKTCOUNT of a specified thread. + + @datatypes + #qurt_thread_t + + @param[in] thread_id Thread identifier. + + @return + PKTCOUNT + + @dependencies + None. + */ + +long long int qurt_thread_pktcount_get (qurt_thread_t thread_id); + +/**@ingroup func_qurt_thread_pktcount_set + Sets the PKTCOUNT for the current QuRT thread. + + @return + Value to which pktcount is set. + + @dependencies + None. + */ + +long long int qurt_thread_pktcount_set (long long int); + +/**@ingroup func_qurt_thread_stid_get + Gets the STID for a specified thread. + + @datatypes + #qurt_thread_t + + @param[in] thread_id Thread identifier. + + @return + STID + + @dependencies + None. + */ + +char qurt_thread_stid_get(qurt_thread_t thread_id); + +/**@ingroup func_qurt_thread_stid_get2 + Returns the set stid for a thread + + @param[in] thread_id thread identifier + @param[out] stid Pointer to a variable to return stid + + @return + QURT_EOK - success + QURT_ENOTALLOWED - operation not allowed for a thread + QURT_EINVALID - Invalid input + + @dependencies + None. 
+ */
+int qurt_thread_stid_get2(unsigned int thread_id, unsigned int *stid);
+
+/**@ingroup func_qurt_thread_stid_set
+  Sets the STID for the current thread.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] stid STID to set for the thread.
+
+  @return
+  #QURT_EOK -- STID successfully set. \n
+  #QURT_EFAILED -- STID not set.
+
+  @dependencies
+  None.
+ */
+
+int qurt_thread_stid_set(char stid);
+
+/**@ingroup qurt_thread_stid_set2
+  Sets the stid for a specified thread.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] thread_id Thread identifier.
+  @param[in] stid Stid to set for a thread.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+  #QURT_EVAL -- Failure because of invalid inputs.
+
+  @dependencies
+  None.
+*/
+int qurt_thread_stid_set2(unsigned int thread_id, unsigned int stid);
+
+/** @cond internal_only */
+/**@ingroup func_qurt_thread_get_running_ids
+  Returns the thread IDs of the running threads in the system; use only during fatal error handling.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in,out] * Array of thread identifiers of size #QURT_MAX_HTHREAD_LIMIT + 1.
+
+  @return
+  #QURT_EINVALID -- Incorrect argument \n
+  #QURT_ENOTALLOWED -- API not called during error handling \n
+  #QURT_EOK -- Success, returns a NULL-terminated array of thread_id
+
+  @dependencies
+  None.
+ */
+int qurt_thread_get_running_ids(qurt_thread_t *);
+/** @endcond */
+
+
+/**@ingroup func_qurt_thread_get_thread_id
+  Gets the thread identifier of the thread with the matching name in the same process
+  as the caller.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[out] thread_id Pointer to the thread identifier.
+  @param[in] name Pointer to the name of the thread.
+
+  @return
+  #QURT_EINVALID -- No thread with matching name in the process of the caller \n
+  #QURT_EOK -- Success
+
+  @dependencies
+  None.
+ */
+int qurt_thread_get_thread_id (qurt_thread_t *thread_id, char *name);
+
+/**@ingroup func_qurt_sleep
+  Suspends the current thread for the specified amount of time.
+
+  @note1hang Because QuRT timers are deferrable, this call is guaranteed to block
+             at least for the specified amount of time. If power collapse is
+             enabled, the maximum amount of time this call can block depends on
+             the earliest wakeup from power collapse past the specified duration.
+
+  @param[in] duration Duration (in microseconds) for which the thread is suspended.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_sleep (unsigned long long int duration);
+
+
+/**@ingroup func_qurt_system_set_priority_floor
+  Sets a priority floor to move threads with thread priority lower than the floor out of the running state.
+  Running threads with thread priority lower than the priority floor are moved into the kernel ready queue, and they
+  are not scheduled to run while their thread priority is lower than the floor.
+  Later, the caller should reset the priority floor back to the default value of QURT_PRIORITY_FLOOR_DEFAULT.
+  Threads in the kernel ready queue are scheduled to run when their thread priority is higher than the floor.
+
+  The priority floor is set and associated with the user process of the caller. When the caller gets into QuRTOS and
+  sets a new floor, the new floor is associated with its original user process, not the QuRTOS process.
+  The floor associated with the user process is reset when the user process exits or is killed, but not at the time
+  when the user thread of the caller exits.
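+
+  An illustrative usage sketch (not part of the SDK documentation; the floor
+  value 100 is a placeholder, subject to the constraints that follow):
+  @code
+  if (qurt_system_set_priority_floor(100U) == QURT_EOK) {
+      /* ... latency-critical work ... */
+      (void)qurt_system_set_priority_floor(QURT_PRIORITY_FLOOR_DEFAULT);
+  }
+  @endcode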
+
+  The priority floor cannot be set to a priority higher than the thread priority of the caller.
+
+  The priority floor cannot be set to a priority lower than the default #QURT_PRIORITY_FLOOR_DEFAULT system floor.
+
+  This function is not supported in Island mode.
+
+  After the system floor is set above QURT_PRIORITY_FLOOR_DEFAULT, power collapse is skipped, and the sleep task
+  is not scheduled to run.
+
+  @param[in] priority_floor Priority floor.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_ENOTALLOWED -- Floor setting is not allowed
+
+  @dependencies
+  None.
+ */
+int qurt_system_set_priority_floor (unsigned int priority_floor);
+
+
+/**@ingroup func_qurt_thread_suspend_thread
+  Suspends a QuRT thread with its thread identifier.
+  The target thread can be in a signed user process or an unsigned user process.
+  The caller thread can be a thread from the same user process as the target thread, or from its parent process.
+  After the target thread is suspended, the kernel does not schedule it to run until it is resumed later.
+
+  If the target thread is set as non-suspendable, this function call returns an error without suspending
+  the target thread.
+
+  If the target thread is already suspended, this function call returns success to confirm
+  the target thread suspend.
+
+  If the target thread is in a secure user process, or CPZ process, this function call returns an error without
+  suspending the target thread.
+
+  If the target thread is running in the guest OS/root process via a QDI call, this function call does not suspend
+  the target thread in the guest OS, but marks the target thread as suspend-pending. The target thread is
+  suspended when it exits the guest OS, before executing the first instruction in the user process.
+  In this case, the function returns success even with the #QURT_THREAD_SUSPEND_SYNCHRONOUS option, while the target
+  thread can run in the guest OS, and is suspended when exiting the guest OS.
+
+  QuRT debug monitor threads that are in a user process are non-suspendable. This function does not suspend
+  those threads.
+
+  @param[in] thread_id Thread identifier.
+  @param[in] option  Optional argument, multiple options can be ORed. \n
+         #QURT_THREAD_SUSPEND_SYNCHRONOUS (default) -- set to synchronous function call,
+         the function returns after the thread is completely suspended.\n
+         #QURT_THREAD_SUSPEND_ASYNCHRONOUS -- set to asynchronous function call, the function returns
+         after the kernel acts to suspend the target thread. The target thread
+         might still be running before it is completely suspended. \n
+         #QURT_THREAD_SUSPEND_KEEP_HMX (default) -- keep the HMX attachment on the target thread
+         if it locks the HMX with qurt_hmx_lock(). In this case, the HMX cannot be re-used by other threads. \n
+         #QURT_THREAD_SUSPEND_DETACH_HMX -- detach HMX from the target thread if it locks the HMX with qurt_hmx_lock().
+         Later, when the target thread resumes, the HMX is re-attached to the thread. Note that this option is only
+         supported for a caller from the same user process as the target thread, not for a caller from the parent
+         process of the target thread, or other processes. With the HMX detach option, QuRT does not save the HMX
+         context. Thus, the HMX context state is lost. It is the responsibility of the caller to ensure HMX operations
+         and its context state saving when calling qurt_thread_suspend_thread() with the HMX detach option.
+         If a thread from another process uses this detach option, QURT_EHMXNOTDETACHABLE is returned; in this
+         case, if the caller is qualified to suspend the target thread, the target thread is moved to the suspended
+         state without the HMX detached.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EINVALID -- Failure because of invalid thread_id input \n
+  #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process. \n
+  #QURT_EHMXNOTDETACHABLE -- Failure because HMX is not detachable from the target thread.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_suspend_thread (unsigned int thread_id, unsigned int option);
+
+
+/**@ingroup func_qurt_thread_resume_thread
+  Resumes a QuRT thread with its thread identifier.
+  The target thread can be in a signed user process or an unsigned user process.
+  The caller thread can be a thread from the same user process as the target thread, or from its parent
+  process. After the target thread resumes, the kernel scheduler can schedule the thread to run based on
+  the thread priority.
+
+  There is an option argument in this function, with only one default option as of now,
+  QURT_THREAD_RESUME_DEFAULT: resume the target thread in the default way.
+
+  By default, this is an asynchronous function. The function returns after the kernel moves the
+  target thread from the suspended state to the runnable state. The thread is scheduled to run based on its
+  thread priority.
+
+  If the target thread is set as non-resumable, this function call does not resume the target thread.
+
+  If the target thread has already resumed, this function confirms that the target thread is resumed
+  by returning success.
+
+  If the target thread is in a secure user process or CPZ process, this function call returns an error without
+  resuming the target thread.
+
+  If the target thread runs in the guest OS/root process via a QDI call, this function call clears the mark of
+  suspend-pending on the target thread, and the target thread is not suspended when it exits the
+  guest OS.
+
+  @param[in] thread_id Thread identifier.
+  @param[in] option Optional argument, #QURT_THREAD_RESUME_DEFAULT, which resumes the target thread.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EINVALID -- Failure because of invalid thread_id input \n
+  #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process. \n
+  #QURT_EHMXNOTAVAIL -- Failure because, when resuming an HMX thread, the HMX is not available/free for the HMX thread to resume.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_resume_thread (unsigned int thread_id, unsigned int option);
+
+
+/**@ingroup func_qurt_thread_set_thread_property
+  Sets a QuRT thread property with its thread identifier.
+  The target thread can be in a signed user process or an unsigned user process.
+  The caller thread can be from the same user process as the target thread, or from its parent process.
+
+  If the target thread is in a secure user process, or CPZ process, this function call returns an error without
+  changing the property of the target thread.
+
+  @param[in] thread_id Thread identifier \n
+  @param[in] property_id Thread property identifier \n
+         #QURT_THREAD_PROPERTY_SUSPENDABLE -- thread is suspendable. Default is TRUE. \n
+         #QURT_THREAD_PROPERTY_RESUMABLE -- thread is resumable.
Default is TRUE
+  @param[in] value Property value: \n
+         TRUE(1) -- TRUE for the property \n
+         FALSE(0) -- FALSE for the property
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EINVALID -- Failure because of invalid thread_id input \n
+  #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_set_thread_property( unsigned int thread_id, unsigned int property_id, unsigned int value );
+
+/**@ingroup func_qurt_thread_get_group_id
+  Gets the group ID of the thread specified by thread_id.\n
+
+  @param[in] thread_id Thread identifier
+  @param[out] group_id Pointer to the variable for the group identifier
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EINVALID -- Thread ID is invalid, or the process has no groups enabled \n
+  #QURT_ENOTALLOWED -- Operation is not allowed \n
+
+  @dependencies
+  None.
+*/
+int qurt_thread_get_group_id(qurt_thread_t thread_id, unsigned int* group_id);
+
+#endif /* __ASSEMBLER__ */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_THREAD_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_thread_context.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_thread_context.h
new file mode 100755
index 0000000000000..bab09deec8889
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_thread_context.h
@@ -0,0 +1,234 @@
+#ifndef QURT_THREAD_CONTEXT_H
+#define QURT_THREAD_CONTEXT_H
+/**
+  @file qurt_thread_context.h
+  @brief Kernel thread context structure
+
+EXTERNAL FUNCTIONS
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @cond internal_only */
+
+#define THREAD_ITERATOR_END ((qurt_thread_t)(-1)) /**< Thread iterator is complete. */
+
+
+/**@ingroup func_qurt_thread_iterator_create
+Enables the caller to enumerate the threads in the system.
+
+@return
+Handle of the newly created iterator; this handle must be passed to
+subsequent operations on the iterator.
+
+@dependencies
+None.
+*/
+static inline int qurt_thread_iterator_create(void)
+{
+    return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, QDI_OS_THREAD_ITERATOR_CREATE);
+}
+
+/**@ingroup func_qurt_thread_iterator_next
+Iterates over the list of threads in the system.
+
+@datatypes
+#qurt_thread_t
+
+@param[in] iter Iterator handle returned by qurt_thread_iterator_create().
+
+@return
+#THREAD_ITERATOR_END -- iterator has reached the end of the thread list. \n
+Other values indicate a valid thread_id.
+
+@dependencies
+None.
+*/
+static inline qurt_thread_t qurt_thread_iterator_next(int iter)
+{
+    return (qurt_thread_t)qurt_qdi_handle_invoke(iter, QDI_OS_THREAD_ITERATOR_NEXT);
+}
+
+/**@ingroup func_qurt_thread_iterator_destroy
+Cleans up thread iterator resources.
+
+@param[in] iter Iterator handle returned by qurt_thread_iterator_create().
+
+@return
+#QURT_EOK -- Successful completion of operation \n
+#QURT_EFATAL -- Invalid handle passed
+
+@dependencies
+None.
+*/
+static inline int qurt_thread_iterator_destroy(int iter)
+{
+    return qurt_qdi_close(iter);
+}
+
+/**@ingroup func_qurt_thread_context_get_tname
+Gets the name of the thread from the specified thread ID.
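+
+An illustrative sketch combining this call with the thread iterator above
+(not part of the SDK documentation):
+@code
+int it = qurt_thread_iterator_create();
+qurt_thread_t t;
+char name[16];
+while ((t = qurt_thread_iterator_next(it)) != THREAD_ITERATOR_END) {
+    if (qurt_thread_context_get_tname((unsigned int)t, name, sizeof(name)) == QURT_EOK) {
+        /* name holds the NUL-terminated thread name */
+    }
+}
+(void)qurt_thread_iterator_destroy(it);
+@endcode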
+ +@param[in] thread_id Thread for which name is returned. +@param[in,out] name Pointer to the local buffer where name is copied back. +@param[in] max_len Size of the local buffer. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_tname(unsigned int thread_id, char *name, unsigned char max_len); + +/**@ingroup func_qurt_thread_context_get_prio +Gets the priority for the specified thread. + +@param[in] thread_id Thread for which priority is returned. +@param[in,out] prio Pointer to the local variable where priority is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_prio(unsigned int thread_id, unsigned char *prio); + +/**@ingroup func_qurt_thread_context_get_pcycles +Gets pcycles for the specified thread. + +@param[in] thread_id Thread for which processor cycles are returned. +@param[in,out] pcycles Pointer to the local variable where processor cycles are written. + +@return +#QURT_EOK -- Success \n +Failure otherwise. + +@dependencies +None. +*/ +int qurt_thread_context_get_pcycles(unsigned int thread_id, unsigned long long int *pcycles); + +/**@ingroup func_qurt_thread_context_get_stack_base +Gets the stack base address for the specified thread. + +@param[in] thread_id Thread for which stack base address is returned. +@param[in,out] sbase Pointer to the local variable where stack base address is written. + +@return +QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_stack_base(unsigned int thread_id, unsigned int *sbase); + +/**@ingroup func_qurt_thread_context_get_stack_size +Gets the stack size for the specified thread. + +@param[in] thread_id Thread for which stack size is returned. +@param[in,out] ssize Pointer to the local variable where stack size is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_stack_size(unsigned int thread_id, unsigned int *ssize); + +/**@ingroup func_qurt_thread_context_get_pid +Gets the process ID for the specified thread. + +@param[in] thread_id Thread for which process ID is returned. +@param[in,out] pid Pointer to the local variable where process id is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_pid(unsigned int thread_id, unsigned int *pid); + +/**@ingroup func_qurt_thread_context_get_pname +Gets the process name for the specified thread. + +@param[in] thread_id Represents the thread for which process name is returned. +@param[in, out] name Pointer to the local buffer where process name is copied back. +@param[in] len Length allocated to the local buffer. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_pname(unsigned int thread_id, char *name, unsigned int len); + +/** @addtogroup thread_types +@{ */ +/** Structure that defines how TCB is interpreted to crash dump tools.*/ +/* Keys are defined in consts.h */ +struct qurt_debug_thread_info { +/** @cond */ + char name[QURT_MAX_NAME_LEN]; /**< Name of the thread. */ + struct { + unsigned key; + unsigned val; + } os_info[40]; + unsigned gen_regs[32]; /**< General mode registers. */ + unsigned user_cregs[32]; /**< User mode registers. */ + unsigned guest_cregs[32]; /**< Guest mode registers. */ + unsigned monitor_cregs[64]; /**< Monitor mode registers. 
*/
+/** @endcond */
+}; /* should add up to 1K */
+/** @} */ /* end_addtogroup thread_types */
+
+
+/**@ingroup func_qurt_system_tcb_dump_get
+Gets the debug thread information (TCB dump) contents for the specified thread.
+
+@datatypes
+#qurt_thread_t
+
+@param[in] thread_id Thread on which the operation must be performed.
+@param[in, out] ptr Pointer to the local buffer where contents are written.
+@param[in] size Size of the debug thread information structure obtained by calling
+                qurt_system_tcb_dump_get_size().
+
+@return
+#QURT_EOK -- Success \n
+Failure otherwise
+
+@dependencies
+None.
+*/
+int qurt_system_tcb_dump_get(qurt_thread_t thread_id, void *ptr, size_t size);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_THREAD_CONTEXT_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_timer.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_timer.h
new file mode 100755
index 0000000000000..7bdfdb8f3c3df
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_timer.h
@@ -0,0 +1,560 @@
+#ifndef QURT_TIMER_H
+#define QURT_TIMER_H
+/**
+  @file qurt_timer.h
+  @brief Prototypes of the qurt_timer API
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+#include "qurt_anysignal.h"
+#include "qurt_signal2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        CONSTANTS AND MACROS
+=============================================================================*/
+/**@addtogroup timer_const_macros
+@{ */
+/**
+  Default values.
+*/
+/** @xreflabel{hdr:QURT_TIMER_ONESHOT}*/
+#define QURT_TIMER_DEFAULT_TYPE      QURT_TIMER_ONESHOT  /**< One shot.*/
+#define QURT_TIMER_DEFAULT_DURATION  1000uL              /**< Default duration. */
+#define QURT_TIMER_DEFAULT_EXPIRY    0uL                 /**< Default expiration. */
+
+/**
+  Conversion from microseconds to timer ticks.
+ */
+#define QURT_TIMER_TIMETICK_FROM_US(us)   QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+/**
+  Conversion from timer ticks to microseconds at the nominal frequency.
+*/
+#define QURT_TIMER_TIMETICK_TO_US(ticks)  qurt_timer_timetick_to_us(ticks)
+
+/** Minimum microseconds value is 100 microseconds (sleep timer).*/
+#define QURT_TIMER_MIN_DURATION  100uL
+
+/**
+  Maximum microseconds value for the Qtimer is 1,042,499 hours.
+*/
+#define QURT_TIMER_MAX_DURATION  QURT_SYSCLOCK_MAX_DURATION
+
+/**
+  Timer clock for the Qtimer is 19.2 MHz.
+*/
+#define QURT_TIMER_MAX_DURATION_TICKS  QURT_SYSCLOCK_MAX_DURATION_TICKS
+
+/**
+  Sleep timer error margin for the Qtimer is 1,000 ticks, approximately 52 us.
+*/
+#define QURT_TIMETICK_ERROR_MARGIN  QURT_SYSCLOCK_ERROR_MARGIN
+
+/*
+  qurt_timer group defines.
+*/
+#define QURT_TIMER_MAX_GROUPS     5U  /**< Maximum groups.*/
+#define QURT_TIMER_DEFAULT_GROUP  0U  /**< Default groups. */
+/** @} */ /* end_addtogroup timer_const_macros */
+
+/** @addtogroup timer_types
+@{ */
+/**
+  QuRT timer types.
+ */
+typedef enum
+{
+    QURT_TIMER_ONESHOT = 0,  /**< One shot.*/
+    /** @xreflabel{hdr:QURT_TIMER_PERIODIC}*/
+    QURT_TIMER_PERIODIC      /**< Periodic.
*/ +} qurt_timer_type_t; + + +/*============================================================================= + TYPEDEFS +=============================================================================*/ + +/** QuRT timer type.*/ +typedef unsigned int qurt_timer_t; + +/** QuRT timer duration type. */ +typedef unsigned long long qurt_timer_duration_t; + +/** QuRT timer time type. */ +typedef unsigned long long qurt_timer_time_t; + +typedef void (*pfn_t)(void); +/** QuRT timer attribute type. */ +typedef struct +{ + /** @cond */ + unsigned int magic; /**< Magic number to verify the qmsgq_attr_t pointer. */ + + qurt_timer_duration_t duration; /**< Specifies the duration of the new timer. */ + + qurt_timer_time_t expiry; /**< Specifies the absolute expiry of the new timer. */ + + qurt_timer_duration_t remaining; /**< Specifies the remaining time of an active timer. */ + + qurt_timer_type_t type; /**< Specifies the timer type; only #QURT_TIMER_ONESHOT and + #QURT_TIMER_PERIODIC are supported. */ + + unsigned int group; /**< Group number of the timer; the criterion used to disable or enable the set + of timers. */ + pfn_t pFn; /**< Callback other than the signal set */ + /** @endcond */ +} +qurt_timer_attr_t; + +/** @} */ /* end_addtogroup timer_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_timer_stop + @xreflabel{sec:qurt_timer_stop} + Stops a running timer. + The timer must be a one-shot timer. + + @note1hang Restart stopped timers with the timer restart operation, + see Section @xref{sec:qurt_timer_restart}. + + @datatypes + #qurt_timer_t + + @param[in] timer Timer object. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Invalid timer ID or duration value. \n + #QURT_ENOTALLOWED -- Timer is not a one shot timer. \n + #QURT_EMEM -- Out of memory error. + + @dependencies + None. + */ +int qurt_timer_stop (qurt_timer_t timer); + +/**@ingroup func_qurt_timer_restart + @xreflabel{sec:qurt_timer_restart} + Restarts a stopped timer with the specified duration. The timer must be a one-shot timer. + Timers stop after they have expired or after they are explicitly stopped with qurt_timer_stop(). + A restarted timer expires after the specified duration, the starting time is when the function is called. + + @note1hang Timers stop after they have expired or after they are explicitly + stopped with the timer stop operation, see Section @xref{sec:qurt_timer_stop}. + + @datatypes + #qurt_timer_t \n + #qurt_timer_duration_t + + @param[in] timer Timer object. + @param[in] duration Timer duration (in microseconds) before the restarted timer + expires again. + The valid range is #QURT_TIMER_MIN_DURATION to + #QURT_TIMER_MAX_DURATION. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Invalid timer ID or duration value. \n + #QURT_ENOTALLOWED -- Timer is not a one-shot timer. \n + #QURT_EMEM -- Out-of-memory error. + + @dependencies + None. + */ +int qurt_timer_restart (qurt_timer_t timer, qurt_timer_duration_t duration); + + +/**@ingroup func_qurt_timer_create + Creates a timer.\n + Allocates and initializes a timer object, and starts the timer. + + @note1hang A timer event handler must be defined to wait on the specified signal + to handle the timer event. + + @datatypes + #qurt_timer_t \n + #qurt_timer_attr_t \n + #qurt_anysignal_t + + @param[out] timer Pointer to the created timer object. 
+ @param[in] attr Pointer to the timer attribute structure. + @param[in] signal Pointer to the signal object set when timer expires. + @param[in] mask Signal mask, which specifies the signal to set in the signal object when the + time expires. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Not enough memory to create the timer. \n + #QURT_EINVALID -- One of the arguments in the attr field is invalid. \n + Other error code -- Operation failed. \n + + @dependencies + None. + */ +int qurt_timer_create (qurt_timer_t *timer, const qurt_timer_attr_t *attr, + const qurt_anysignal_t *signal, unsigned int mask); + +int qurt_timer_create_sig2 (qurt_timer_t *timer, const qurt_timer_attr_t *attr, + const qurt_signal2_t *signal, unsigned int mask); + +/**@ingroup func_qurt_timer_attr_init + Initializes the specified timer attribute structure with default attribute values: \n + - Timer duration -- #QURT_TIMER_DEFAULT_DURATION (Section @xref{dox:timers}) \n + - Timer type -- #QURT_TIMER_ONESHOT \n + - Timer group -- #QURT_TIMER_DEFAULT_GROUP + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the destination structure for the timer attributes. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_init(qurt_timer_attr_t *attr); + + +/*Tech Comm note: removed qurt_timer_attr_set_pfn from documentation 9/10/2020 +@ingroup func_qurt_timer_attr_set_pfn + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the destination structure for the timer attributes. + @param[in] pFn pFn. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_pfn(qurt_timer_attr_t *attr, pfn_t pFn); + + +/**@ingroup func_qurt_timer_attr_set_duration + Sets the timer duration in the specified timer attribute structure.\n + + The timer duration specifies the interval (in microseconds) between the creation of the + timer object and the generation of the corresponding timer event. + + The timer duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION (Section @xref{dox:timers}). Otherwise, the set operation is ignored. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] duration Timer duration (in microseconds). + Valid range is #QURT_TIMER_MIN_DURATION to + #QURT_TIMER_MAX_DURATION. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_duration(qurt_timer_attr_t *attr, qurt_timer_duration_t duration); + +/**@ingroup func_qurt_timer_attr_set_expiry + Sets the absolute expiry time in the specified timer attribute structure.\n + The timer expiry specifies the absolute time (in microseconds) of the generation of the + corresponding timer event.\n + Timer expiries are relative to when the system first began executing. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_time_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] time Timer expiry. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_expiry(qurt_timer_attr_t *attr, qurt_timer_time_t time); + +/**@ingroup func_qurt_timer_attr_get_duration + Gets the timer duration from the specified timer attribute structure. + The value returned is the duration that was originally set for the timer. + + @note1hang This function does not return the remaining time of an active timer; + use qurt_timer_attr_get_remaining() to get the remaining time. 
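+
+  An illustrative sketch (not part of the SDK documentation; my_timer is a
+  placeholder for a valid active timer):
+  @code
+  qurt_timer_attr_t attr;
+  qurt_timer_duration_t remaining = 0;
+  if (qurt_timer_get_attr(my_timer, &attr) == QURT_EOK) {
+      qurt_timer_attr_get_remaining(&attr, &remaining); /* microseconds left */
+  }
+  @endcode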
+ + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in] attr Pointer to the timer attributes object + @param[out] duration Pointer to the destination variable for timer duration. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_duration(qurt_timer_attr_t *attr, qurt_timer_duration_t *duration); + +/**@ingroup func_qurt_timer_attr_get_remaining + Gets the timer remaining duration from the specified timer attribute structure. \n + + The timer remaining duration indicates (in microseconds) how much time remains before + the generation of the next timer event on the corresponding timer. + In most cases this function assumes that the timer attribute structure was obtained by + calling qurt_timer_get_attr(). + + @note1hang This attribute is read-only and thus has no set operation defined for it. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in] attr Pointer to the timer attribute object. + @param[out] remaining Pointer to the destination variable for remaining time. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_remaining(qurt_timer_attr_t *attr, qurt_timer_duration_t *remaining); + +/**@ingroup func_qurt_timer_attr_set_type + Sets the timer type in the specified timer attribute structure. + + The timer type specifies the functional behavior of the timer: \n + - A one-shot timer (#QURT_TIMER_ONESHOT) waits for the specified timer duration + and then generates a single timer event. After this the timer is nonfunctional. \n + - A periodic timer (#QURT_TIMER_PERIODIC) repeatedly waits for the specified + timer duration and then generates a timer event. The result is a series of timer + events with interval equal to the timer duration. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_type_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] type Timer type. Values are: \n + - #QURT_TIMER_ONESHOT -- One-shot timer. \n + - #QURT_TIMER_PERIODIC -- Periodic timer. @tablebulletend + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_type(qurt_timer_attr_t *attr, qurt_timer_type_t type); + +/**@ingroup func_qurt_timer_attr_get_type + Gets the timer type from the specified timer attribute structure. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_type_t + + @param[in] attr Pointer to the timer attribute structure. + @param[out] type Pointer to the destination variable for the timer type. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_type(qurt_timer_attr_t *attr, qurt_timer_type_t *type); + +/**@ingroup func_qurt_timer_attr_set_group + Sets the timer group identifier in the specified timer attribute structure.\n + The timer group identifier specifies the group that the timer belongs to. Timer groups are + used to enable or disable one or more timers in a single operation. \n + The timer group identifier value must be between 0 and (#QURT_TIMER_MAX_GROUPS - 1). + See Section @xref{dox:timers}. + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the timer attribute object. + @param[in] group Timer group identifier; + Valid range is 0 to (#QURT_TIMER_MAX_GROUPS - 1). + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_group(qurt_timer_attr_t *attr, unsigned int group); + +/**@ingroup func_qurt_timer_attr_get_group + Gets the timer group identifier from the specified timer attribute structure. 
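+
+  As an illustrative sketch, a typical attribute setup using the setters
+  documented above (the duration and group values here are arbitrary
+  examples, not recommended settings):
+  @code
+  qurt_timer_attr_t attr;
+  unsigned int group;
+  qurt_timer_attr_init(&attr);                           // start from defaults
+  qurt_timer_attr_set_duration(&attr, 100000ULL);        // 100 ms
+  qurt_timer_attr_set_type(&attr, QURT_TIMER_PERIODIC);  // periodic timer
+  qurt_timer_attr_set_group(&attr, 1U);                  // assign to group 1
+  qurt_timer_attr_get_group(&attr, &group);              // group is now 1
+  @endcode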
+
+  @datatypes
+  #qurt_timer_attr_t
+
+  @param[in]  attr  Pointer to the timer attribute structure.
+  @param[out] group Pointer to the destination variable for the timer group identifier.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_timer_attr_get_group(qurt_timer_attr_t *attr, unsigned int *group);
+
+/**@ingroup func_qurt_timer_get_attr
+  @xreflabel{hdr:qurt_timer_get_attr}
+  Gets the attributes with which the specified timer was created.
+
+  @datatypes
+  #qurt_timer_t \n
+  #qurt_timer_attr_t
+
+  @param[in]  timer Timer object.
+  @param[out] attr  Pointer to the destination structure for timer attributes.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EVAL -- Argument passed is not a valid timer.
+
+  @dependencies
+  None.
+ */
+int qurt_timer_get_attr(qurt_timer_t timer, qurt_timer_attr_t *attr);
+
+/**@ingroup func_qurt_timer_delete
+  Deletes the timer.\n
+  Destroys the specified timer and deallocates the timer object.
+
+  @datatypes
+  #qurt_timer_t
+
+  @param[in] timer Timer object.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EVAL -- Argument passed is not a valid timer.
+
+  @dependencies
+  None.
+ */
+int qurt_timer_delete(qurt_timer_t timer);
+
+/**@ingroup func_qurt_timer_sleep
+  Suspends the current thread for the specified amount of time.
+  The sleep duration value must be between #QURT_TIMER_MIN_DURATION and
+  #QURT_TIMER_MAX_DURATION (Section @xref{dox:timers}).
+
+  @datatypes
+  #qurt_timer_duration_t
+
+  @param[in] duration Interval (in microseconds) between when the thread is suspended
+                      and when it is re-awakened.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EMEM -- Not enough memory to perform the operation.
+
+  @dependencies
+  None.
+ */
+
+int qurt_timer_sleep(qurt_timer_duration_t duration);
+
+/**@ingroup func_qurt_timer_group_disable
+  Disables all timers that are assigned to the specified timer group.
+  Timers in the group that are already disabled are ignored.
+  Timers in the group that have expired are not processed.
+  If the specified timer group is empty, no operation is performed.
+
+  @note1hang While a timer is disabled, its remaining time does not change, so it
+             cannot generate a timer event.
+
+  @param[in] group Timer group identifier.
+
+  @return
+  #QURT_EOK -- Success.
+
+  @dependencies
+  None.
+ */
+int qurt_timer_group_disable (unsigned int group);
+
+/**@ingroup func_qurt_timer_group_enable
+  Enables all timers that are assigned to the specified timer group.
+  Timers in the group that are already enabled are ignored.
+  Timers in the group that have expired are processed.
+  If the specified timer group is empty, no operation is performed.
+
+  @param[in] group Timer group identifier.
+
+  @return
+  #QURT_EOK -- Success.
+
+  @dependencies
+  None.
+ */
+int qurt_timer_group_enable (unsigned int group);
+
+
+/**
+  Notifies the timer server of recovery from power collapse. The server
+  must account for any interrupts missed during power collapse.
+ */
+void qurt_timer_recover_pc (void);
+
+/**
+  Determines whether the Qtimer is initialized.
+
+  @return
+  0 -- Not initialized. \n
+  Nonzero -- Initialized.
+ */
+static inline int qurt_timer_is_init (void) {return 1;}
+
+/**@ingroup func_qurt_timer_get_ticks
+  Gets the current tick count. Ticks are accumulated since the RTOS
+  started. Each tick is equal to a single timer clock
+  cycle, where the frequency is 32 kHz on RGPT or 19.2 MHz on Qtimer.
+
+  @return
+  Ticks since the system started.
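+
+  As an illustrative sketch, ticks can be compared against a microsecond
+  interval with the qurt_timer_timetick_from_us() macro defined below
+  (the 1 ms threshold is an arbitrary example):
+  @code
+  unsigned long long start = qurt_timer_get_ticks();
+  // ... do some work ...
+  unsigned long long elapsed = qurt_timer_get_ticks() - start;
+  if (elapsed >= qurt_timer_timetick_from_us(1000)) {
+      // more than 1 ms has passed
+  }
+  @endcode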
+ */
+unsigned long long qurt_timer_get_ticks (void);
+
+#define qurt_timer_timetick_from_us(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_TIMER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_tlb.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_tlb.h
new file mode 100755
index 0000000000000..b1b2d261d31c0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_tlb.h
@@ -0,0 +1,215 @@
+#ifndef QURT_TLB_H
+#define QURT_TLB_H
+
+/**
+  @file qurt_tlb.h
+  @brief Prototypes of TLB API
+  The TLB APIs allow explicit control of the portion of the TLB between TLB_first_replaceable and TLB_LAST_REPLACEABLE.
+  Both are nonconfigurable for the time being. This portion of the TLB is permanently assigned/locked unless manually removed
+  by qurt_tlb_remove. The implementation does not change depending on the configuration, such as whether CONFIG_STATIC is set or not.
+  With CONFIG_STATIC=y, TLB_LAST_REPLACEABLE is set to the last TLB index, which indicates that the entire TLB is permanently
+  assigned and is not backed by a page table (no page table exists). TLB indices are maintained through a 64-bit bitmask.
+  A new entry is placed in the first available slot.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2013, 2021, 2023
+All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_tlb_entry_create
+  Creates a new TLB entry with the specified mapping attributes in the TLB of the Hexagon processor. \n
+  @note1hang If the specified attributes are not valid (such as if the address is not aligned with the
+             size), the entry is not created and an error result is returned.\n
+  @note1cont To set the G bit in the new TLB entry, set the ASID argument to -1.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_paddr_t \n
+  #qurt_mem_cache_mode_t \n
+  #qurt_perm_t
+
+  @param[out] entry_id      TLB entry identifier.
+  @param[in]  vaddr         Virtual memory address.
+  @param[in]  paddr         Physical memory address.
+  @param[in]  size          Size of memory region to map (in bytes).
+  @param[in]  cache_attribs Cache mode (writeback, and so on).
+  @param[in]  perms         Access permissions.
+  @param[in]  asid          ASID (space ID).
+
+  @return
+  #QURT_EOK -- TLB entry successfully created.\n
+  #QURT_EFATAL -- Entry is not created; the TLB is full. \n
+  #QURT_ETLBCREATESIZE -- Entry is not created; an incorrect size was specified. \n
+  #QURT_ETLBCREATEUNALIGNED -- Entry is not created; an unaligned address was specified. \n
+  #QURT_EINVALID -- Invalid cache attributes / permissions provided.
+
+ */
+int qurt_tlb_entry_create (unsigned int *entry_id, qurt_addr_t vaddr, qurt_paddr_t paddr, qurt_size_t size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perms, int asid);
+
+/**@ingroup func_qurt_tlb_entry_create_64
+  Creates a new TLB entry with the specified mapping attributes in the TLB of the Hexagon processor.
\n + @note1hang If the specified attributes are not valid (the address is not aligned with the + size), the entry is not created, and an error result is returned.\n + @note1cont To set the G bit in the new TLB entry, set the asid argument to -1. + + @param[out] entry_id TLB entry identifier. + @param[in] vaddr Virtual memory address. + @param[in] paddr_64 64-bit physical memory address. + @param[in] size Size of memory region to map (in bytes). + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perms Access permissions. + @param[in] asid ASID (space ID). + + @return + #QURT_EOK -- TLB entry successfully created.\n + #QURT_EFATAL -- Entry was not created; the TLB is full. \n + #QURT_ETLBCREATESIZE -- Entry was not created; the incorrect size was specified. \n + #QURT_ETLBCREATEUNALIGNED -- Entry was not created; an unaligned address was specified. \n + #QURT_EINVALID -- Invalid cache attributes / permissions provided. + + */ +int qurt_tlb_entry_create_64 (unsigned int *entry_id, qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perms, int asid); + +/**@ingroup func_qurt_tlb_entry_delete + Deletes the specified TLB entry from the TLB of the Hexagon processor. + If the specified entry does not exist, no deletion occurs and an error result is returned. + + @param[in] entry_id TLB entry identifier. + + @return + #QURT_EOK -- TLB entry successfully deleted. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_delete (unsigned int entry_id); + +/**@ingroup func_qurt_tlb_entry_query + Searches for the specified TLB entry in the TLB of the Hexagon processor. + If the TLB entry is found, its entry identifier is returned. + + @datatypes + #qurt_addr_t + + @param[out] entry_id TLB entry identifier. + @param[in] vaddr Virtual memory address. + @param[in] asid ASID (space ID). + + @return + #QURT_EOK -- TLB entry successfully returned. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_query (unsigned int *entry_id, qurt_addr_t vaddr, int asid); + +/**@ingroup func_qurt_tlb_entry_set + Sets the TLB entry by storing an entry at the specified location + in the TLB of the Hexagon processor. + + @param[in] entry_id TLB entry identifier. + @param[in] entry 64-bit TLB entry to store. + + @return + #QURT_EOK -- Entry successfully stored in the TLB. \n + #QURT_EFATAL -- Entry not set at the specified location. + + @dependencies + None. + **/ +int qurt_tlb_entry_set (unsigned int entry_id, unsigned long long int entry); + +/**@ingroup func_qurt_tlb_entry_get + Gets the TLB entry. \n + Returns the specified 64-bit TLB entry in the TLB of the Hexagon processor. + + @param[in] entry_id TLB entry identifier. + @param[out] entry 64-bit TLB entry. + + @return + #QURT_EOK -- TLB entry successfully returned. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_get (unsigned int entry_id, unsigned long long int *entry); + +/**@ingroup func_qurt_tlb_get_pager_physaddrs + Searches the TLB of the Hexagon processor, and returns all physical addresses that belong to the pager. + Each returned address indicates the starting address of an active page. + +The function return value indicates the number of addresses returned. + + @param[out] pager_phys_addrs Pointer to the return array of pager physical addresses. + + @return + Integer -- Number of addresses returned in array. + + @dependencies + None. 
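+
+  Usage sketch (illustrative only; assumes the returned array is owned by
+  the kernel and remains valid while it is read):
+  @code
+  unsigned int *phys;
+  unsigned int n = qurt_tlb_get_pager_physaddr(&phys);
+  for (unsigned int i = 0U; i < n; i++) {
+      // phys[i] is the starting physical address of an active page
+  }
+  @endcode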
+*/ + +unsigned int qurt_tlb_get_pager_physaddr(unsigned int** pager_phys_addrs); + +/**@ingroup func_qurt_tlb_get_pager_virtaddr + Searches the TLB of the Hexagon processor, and returns all virtual addresses that belong to the pager. + Each returned address indicates the starting address of an active page. + +The function return value indicates the number of addresses returned. + + @param[out] pager_virt_addrs Pointer to the return array of pager virtual addresses. + + @return + Integer -- Number of addresses returned in the array. + + @dependencies + None. +*/ + +unsigned int qurt_tlb_get_pager_virtaddr(unsigned int** pager_virt_addrs); + + +/**@ingroup func_qurt_tlb_entry_set2 + Sets the TLB entry by storing an entry at the specified location + in the TLB of the Hexagon processor. An additional option can be passed + to lock the TLB entry in the TLB of the Hexagon processor. + + @param[in] id TLB entry identifier. + @param[in] tlb 64-bit TLB entry to store. + @param[in] lock Nonzero value indicates that the TLB entry must be locked in the hardware TLB. + + @return + #QURT_EOK -- Entry successfully stored in the TLB. \n + #QURT_EFATAL -- Entry not set at the specified location. + + @dependencies + None. + **/ +int qurt_tlb_entry_set2(unsigned id, unsigned long long tlb, unsigned lock); + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TLB_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_tls.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_tls.h new file mode 100755 index 0000000000000..6ec3b39ff5cb0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_tls.h @@ -0,0 +1,100 @@ +#ifndef QURT_TLS_H +#define QURT_TLS_H +/** + @file qurt_tls.h + @brief Prototypes of TLS APIs + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_tls_create_key + @xreflabel{sec:tls_create_key} + Creates a key for accessing a thread local storage data item.\n + Subsequent get and set operations use the key value. + + @note1hang The destructor function performs any clean-up operations needed by a thread + local storage item when its containing thread is deleted (Section @xref{sec:qurt_thread_exit}). + + @param[out] key Pointer to the newly created thread local storage key value. + @param[in] destructor Pointer to the key-specific destructor function. Passing NULL + specifies that no destructor function is defined for the key. + + @return + #QURT_EOK -- Key successfully created. \n + #QURT_ETLSAVAIL -- No free TLS key available. + + @dependencies + None. + */ +int qurt_tls_create_key (int *key, void (*destructor)(void *)); + +/**@ingroup func_qurt_tls_set_specific + Stores a data item to thread local storage along with the specified key. + + @param[in] key Thread local storage key value. + @param[in] value Pointer to user data value to store. + + @return + #QURT_EOK -- Data item successfully stored. \n + #QURT_EINVALID -- Invalid key. \n + #QURT_EFAILED -- Invoked from a non-thread context. 
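+
+  Minimal sketch of the TLS flow using the functions in this header
+  (no destructor is registered; error checks abbreviated):
+  @code
+  int key;
+  if (qurt_tls_create_key(&key, NULL) == QURT_EOK) {
+      static int value = 42;                     // example per-thread data
+      qurt_tls_set_specific(key, &value);        // store in the calling thread
+      int *p = (int *)qurt_tls_get_specific(key);
+      qurt_tls_delete_key(key);                  // no destructor is run
+  }
+  @endcode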
+ */ +int qurt_tls_set_specific (int key, const void *value); + +/**@ingroup func_qurt_tls_get_specific + Loads the data item from thread local storage. \n + Returns the data item that is stored in thread local storage with the specified key. + The data item is always a pointer to user data. + + @param[in] key Thread local storage key value. + + @return + Pointer -- Data item indexed by key in thread local storage. \n + 0 (NULL) -- Key out of range. + + @dependencies + None. + */ +void * __attribute__((section(".text.qurt_tls_get_specific "))) qurt_tls_get_specific (int key); + + +/**@ingroup func_qurt_tls_delete_key + Deletes the specified key from thread local storage. + + @note1hang Explicitly deleting a key does not execute any destructor function that is + associated with the key (Section @xref{sec:tls_create_key}). + + @param[in] key Thread local storage key value to delete. + + @return + #QURT_EOK -- Key successfully deleted. \n + #QURT_ETLSENTRY -- Key already free. + + @dependencies + None. + */ +int qurt_tls_delete_key (int key); + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TLS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_trace.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_trace.h new file mode 100755 index 0000000000000..541f8f1d34bf6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_trace.h @@ -0,0 +1,317 @@ +#ifndef QURT_TRACE_H +#define QURT_TRACE_H +/** + @file qurt_trace.h + @brief Prototypes of system call tracing helpers API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021-2023 by Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + GLOBAL VARIABLES +=============================================================================*/ +/** @cond internal_only */ +/** @addtogroup etm_macros +@{ */ +/* ETM trace types. */ +#define QURT_ETM_TYPE_PC_ADDR (1U<<0) /**< PC address.*/ +#define QURT_ETM_TYPE_MEMORY_ADDR (1U<<1) /**< Memory address. */ +#define QURT_ETM_TYPE_TESTBUS (1U<<2) /**< Test bus. */ +#define QURT_ETM_TYPE_CYCLE_ACCURATE (1U<<3) /**< Cycle accurate. */ +#define QURT_ETM_TYPE_CYCLE_COARSE (1U<<4) /**< Cycle coarse. */ +#define QURT_ETM_TYPE_PC_AND_MEMORY_ADDR (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_MEMORY_ADDR) /**< PC and memory address. */ +#define QURT_ETM_TYPE_PC_ADDR_AND_TESTBUS (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_TESTBUS) /**< PC address and test bus. */ +#define QURT_ETM_TYPE_MEMORY_ADDR_AND_TESTBUS (QURT_ETM_TYPE_MEMORY_ADDR|QURT_ETM_TYPE_TESTBUS) /**< Memory address and test bus.*/ +#define QURT_ETM_TYPE_PC_AND_MEMORY_ADDR_AND_TESTBUS (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_MEMORY_ADDR|QURT_ETM_TYPE_TESTBUS) /**< PC, memory address, and test bus. */ + +/* ETM routes. */ +#define QURT_ETM_ROUTE_TO_QDSS 0U /**< ETM route to QDSS. */ +#define QURT_ETM_ROUTE_TO_Q6ETB 1U /**< ETM route to Q6ETB. */ + +/* ETM filters. */ +#define QURT_ETM_TRACE_FILTER_ALL_DEFAULT 0U /*< Filter all as default. */ +#define QURT_ETM_TRACE_FILTER_HNUM0 (1U<<0) /*< Filter HNUM0. */ +#define QURT_ETM_TRACE_FILTER_HNUM1 (1U<<1) /*< Filter HNUM1. */ +#define QURT_ETM_TRACE_FILTER_HNUM2 (1U<<2) /*< Filter HNUM2. 
*/ +#define QURT_ETM_TRACE_FILTER_HNUM3 (1U<<3) /*< Filter HNUM3. */ +#define QURT_ETM_TRACE_FILTER_HNUM4 (1U<<4) /*< Filter HNUM4. */ +#define QURT_ETM_TRACE_FILTER_HNUM5 (1U<<5) /*< Filter HNUM5. */ +#define QURT_ETM_TRACE_FILTER_HNUM6 (1U<<6) /*< Filter HNUM6. */ +#define QURT_ETM_TRACE_FILTER_HNUM7 (1U<<7) /*< Filter HNUM7. */ +#define QURT_ETM_TRACE_FILTER_HNUM8 (1U<<8) /*< Filter HNUM8. */ +#define QURT_ETM_TRACE_FILTER_HNUM9 (1U<<9) /*< Filter HNUM9. */ +#define QURT_ETM_TRACE_FILTER_HNUM10 (1U<<10) /*< Filter HNUM10. */ +#define QURT_ETM_TRACE_FILTER_HNUM11 (1U<<11) /*< Filter HNUM11. */ +#define QURT_ETM_TRACE_FILTER_HNUM12 (1U<<12) /*< Filter HNUM12. */ +#define QURT_ETM_TRACE_FILTER_HNUM13 (1U<<13) /*< Filter HNUM13. */ +#define QURT_ETM_TRACE_FILTER_HNUM14 (1U<<14) /*< Filter HNUM14. */ +#define QURT_ETM_TRACE_FILTER_HNUM15 (1U<<15) /*< Filter HNUM15. */ +#define QURT_ETM_TRACE_FILTER_ALL QURT_ETM_TRACE_FILTER_ALL_DEFAULT + +#define QURT_ETM_TRACE_FILTER_CLUSTER0 (1<<16) /*< Filter trace cluster0 address. */ +#define QURT_ETM_TRACE_FILTER_CLUSTER1 (1<<17) /*< Filter trace cluster1 address. */ +#define QURT_ETM_TRACE_FILTER_PC_RANGE (1<<19) /*< Filter PC address range. */ + +/* ETM memory source - PC or data access */ +#define QURT_ETM_SOURCE_PC 0U /**< ETM memory source of SAC* is PC. */ +#define QURT_ETM_SOURCE_DATA 1U /**< ETM memory source of SAC* is data. */ + +/* Period between synchronization traces */ +#define QURT_ETM_ASYNC_PERIOD 0 /**< Async.*/ +#define QURT_ETM_ISYNC_PERIOD 1 /**< Isync.*/ +#define QURT_ETM_GSYNC_PERIOD 2 /**< Gsync. */ + +/* ETM enable flags */ +#define QURT_ETM_OFF 0U /**< ETM off. */ +#define QURT_ETM_ON 1U /**< ETM on. */ +/** @endcond */ +/** @} */ /* end_addtogroup etm_macros */ + +/** @addtogroup function_tracing_macro +@{ */ +/* ETM setup return values */ +#define QURT_ETM_SETUP_OK 0 /**< ETM setup OK. */ +#define QURT_ETM_SETUP_ERR 1 /**< ETM setup error. */ +/** @} */ /* end_addtogroup function_tracing_macro */ +/* ETM breakpoint types */ +#define QURT_ETM_READWRITE_BRKPT 0U /**< ETM read/write breakpoint. */ +#define QURT_ETM_READ_BRKPT 1U /**< ETM read breakpoint. */ +#define QURT_ETM_WRITE_BRKPT 2U /**< ETM write breakpoint. */ +#define QURT_ETM_BRKPT_INVALIDATE 3U /**< Invalidate breakpoint. */ +/** @addtogroup function_tracing_macro +@{ */ +/* ATB status flags */ +#define QURT_ATB_OFF 0 /**< ATB off. */ +#define QURT_ATB_ON 1 /**< ATB on. */ +/** @} */ /* end_addtogroup function_tracing_macro */ +/* DTM enable flags */ +#define QURT_DTM_OFF 0 /**< DTM off. */ +#define QURT_DTM_ON 1 /**< DTM on. */ + +/** @addtogroup function_tracing_datatypes +@{ */ +/**STM trace information. */ +typedef struct qurt_stm_trace_info { + /** @cond */ + unsigned int stm_port_addr[6]; /* STM port address to which trace data must be written.*/ + unsigned int thread_event_id; /* Event ID for context switches.*/ + unsigned int interrupt_event_id; /* Event ID for interrupts. */ + unsigned int marker; /* Marker value that must be written at the beginning of the trace. */ + /** @endcond */ +} qurt_stm_trace_info_t; +/** @} */ /* end_addtogroup function_tracing_datatypes */ +/*============================================================================= + GLOBAL FUNCTIONS +=============================================================================*/ + + +/**@ingroup func_qurt_trace_get_marker + Gets the kernel trace marker.\n + Returns the current value of the kernel trace marker. 
+  The marker consists of a hardware thread identifier and an index into the kernel trace
+  buffer. The trace buffer records kernel events.
+
+  @note1hang Using this function with qurt_trace_changed()
+             determines whether certain kernel events occurred in a block of code.
+
+  @return
+  Integer -- Kernel trace marker.
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_trace_get_marker(void);
+
+/**@ingroup func_qurt_trace_changed
+  Determines whether specific kernel events have occurred. \n
+  Returns a value that indicates whether the specified kernel events have been recorded in the
+  kernel trace buffer since the specified kernel trace marker was obtained.
+
+  The prev_trace_marker parameter specifies a kernel trace marker that was obtained by calling
+  qurt_trace_get_marker().
+  @cond rest_dist For more information on the mask value, see the description of the trace_mask element in
+  @xhyperref{80VB41992,80-VB419-92}. \n @endcond
+
+  @note1hang Used with qurt_trace_get_marker(), this function determines whether
+             certain kernel events occurred in a block of code.\n
+  @note1cont This function cannot determine whether a specific kernel event type has
+             occurred unless that event type has been enabled in the trace_mask element
+             of the system configuration file. \n
+  @note1cont QuRT supports the recording of interrupt and context switch events only (that is,
+             a trace_mask value of 0x3).
+
+  @param[in] prev_trace_marker Previous kernel trace marker.
+  @param[in] trace_mask        Mask value that indicates which kernel events to check for.
+
+  @returns
+  1 -- Kernel events of the specified type have occurred since the
+       specified trace marker was obtained.\n
+  0 -- No kernel events of the specified type have occurred since the
+       specified trace marker was obtained.
+
+  @dependencies
+  None.
+*/
+int qurt_trace_changed(unsigned int prev_trace_marker, unsigned int trace_mask);
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+/** @addtogroup function_tracing_macro
+@{ */
+#ifndef QURT_DEBUG
+#define QURT_TRACE(str, ...) __VA_ARGS__
+  /**< Function tracing is implemented with the QURT_TRACE debug macro, which
+   optionally generates printf statements both before and after every function call that is
+   passed as a macro argument.
+
+   For example, the following macro call in the source code:
+   @code
+   QURT_TRACE(myfunc, my_func(33))
+   @endcode
+   generates the following debug output:
+   @code
+   myfile:nnn: myfunc: >>> calling my_func(33)
+   myfile:nnn: myfunc: <<< my_func(33) returned
+   @endcode
+   The debug output includes the source file and line number of the function call, along with
+   the text of the call. Compile the client source file with -D __FILENAME__
+   set to its file name.
+
+   The library function qurt_printf() generates the debug output.
+   The QURT_DEBUG symbol controls generation of the debug output. If this symbol is
+   not defined, function tracing is not generated.\n
+   @note1hang The debug macro is accessed through the QuRT API header file.
+  */
+#else
+#define QURT_TRACE(str, ...) \
+    do { \
+        qurt_printf("%s:%d: %s: >>> calling %s\n",__FILENAME__,__LINE__,(str),#__VA_ARGS__); \
+        __VA_ARGS__; \
+        qurt_printf("%s:%d: %s: <<< %s returned\n",__FILENAME__,__LINE__,(str),#__VA_ARGS__); \
+    } while (0)
+#endif
+/** @} */ /* end_addtogroup function_tracing_macro */
+
+/**@ingroup func_qurt_etm_set_pc_range
+  Sets the PC address range for ETM filtering.
+  Depending on the Hexagon core design, a maximum of four PC ranges are supported.
+
+  @param[in] range_num 0 to 3.
+  @param[in] low_addr  Lower boundary of the PC address range.
+  @param[in] high_addr Higher boundary of the PC address range.
+
+  @returns
+  #QURT_ETM_SETUP_OK -- Success. \n
+  #QURT_ETM_SETUP_ERR -- Failure.
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_etm_set_pc_range(unsigned int range_num, unsigned int low_addr, unsigned int high_addr);
+
+/**@ingroup func_qurt_etm_set_range
+  Sets the address range for ETM filtering.
+  Allows the user to select the source type of the addresses: #QURT_ETM_SOURCE_PC or #QURT_ETM_SOURCE_DATA.
+
+  @param[in] addr_source_type Type of the address source:\n
+                              - #QURT_ETM_SOURCE_PC \n
+                              - #QURT_ETM_SOURCE_DATA @tablebulletend
+  @param[in] trig_block_num   0 to 3.
+  @param[in] pid              PID of the process: \n
+                              - Any valid PID number enables ASID-based trace filtering. \n
+                              - QURT_ETM_NO_PID disables ASID-based trace filtering.
+  @param[in] low_addr         Lower boundary of the address range.
+  @param[in] high_addr        Higher boundary of the address range.
+
+  @returns
+  #QURT_ETM_SETUP_OK -- Success. \n
+  #QURT_ETM_SETUP_ERR -- Failure.
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_etm_set_range(unsigned int addr_source_type, unsigned int trig_block_num, unsigned int pid, unsigned int low_addr, unsigned int high_addr);
+
+/**@ingroup func_qurt_etm_set_atb
+  Sets the advanced trace bus (ATB) state to notify QuRT that the ATB is actively enabled or disabled.
+  QuRT performs the corresponding actions at low power management.
+
+  @param[in] flag Values: \n
+                  #QURT_ATB_ON \n
+                  #QURT_ATB_OFF
+
+  @returns
+  #QURT_ETM_SETUP_OK -- Success. \n
+  #QURT_ETM_SETUP_ERR -- Failure.
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_etm_set_atb(unsigned int flag);
+
+/**@ingroup func_qurt_etm_set_sync_period
+  Sets the period for each type of synchronization trace packet. \n
+  ASYNC defines the period between alignment synchronization packets;
+  the period is in terms of bytes in the packet stream. \n
+  ISYNC defines the period between instruction synchronization packets;
+  the period is per thread and is defined as the bytes sent out for that thread. \n
+  GSYNC defines the period, in thread cycles, between GSYNC packets.
+
+  @param[in] sync_type Type of synchronization packets: \n
+                       #QURT_ETM_ASYNC_PERIOD \n
+                       #QURT_ETM_ISYNC_PERIOD \n
+                       #QURT_ETM_GSYNC_PERIOD
+  @param[in] period    Period value.
+
+  @return
+  #QURT_ETM_SETUP_OK -- Success. \n
+  #QURT_ETM_SETUP_ERR -- Failure.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_etm_set_sync_period(unsigned int sync_type, unsigned int period);
+
+/**@ingroup func_qurt_stm_trace_set_config
+  Sets up an STM port for tracing events.
+
+  @datatypes
+  #qurt_stm_trace_info_t
+
+  @param[in] stm_config_info Pointer to the STM trace information used to set up the trace
+                             in the kernel.
+                             The structure must contain the following:\n
+                             - One port address per hardware thread \n
+                             - Event ID for context switches \n
+                             - Event ID for interrupt tracing \n
+                             - Header or marker to identify the beginning of the trace. @tablebulletend
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINVALID -- Failure; possibly because the passed port address is not in the page table.
+
+  @dependencies
+  None.
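+
+  Illustrative sketch (the port addresses, event IDs, and marker below are
+  placeholder values, not real configuration data):
+  @code
+  qurt_stm_trace_info_t cfg = {
+      .stm_port_addr      = {0},       // one STM port address per hardware thread
+      .thread_event_id    = 1,         // event ID for context switches
+      .interrupt_event_id = 2,         // event ID for interrupts
+      .marker             = 0xC0FFEE,  // written at the beginning of the trace
+  };
+  unsigned int rc = qurt_stm_trace_set_config(&cfg);
+  // rc is #QURT_EOK on success, #QURT_EINVALID on failure
+  @endcode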
+ */ +unsigned int qurt_stm_trace_set_config(qurt_stm_trace_info_t *stm_config_info); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TRACE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_types.h new file mode 100755 index 0000000000000..bdb83a3fe2fb2 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_types.h @@ -0,0 +1,294 @@ +#ifndef QURT_TYPES_H +#define QURT_TYPES_H +/** + @file qurt_types.h + @brief Contains types common to all configurations + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +//#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define PGA_BITFIELD_MASK(hi,lo) (((~0u)>>(31U-((hi)-(lo))))<<(lo)) +#define PGA_BITFIELD_GET(x,hi,lo) (((x)&PGA_BITFIELD_MASK((hi),(lo)))>>(lo)) +#define PGA_BITFIELD_INS(hi,lo,v) (((v)<<(lo))&PGA_BITFIELD_MASK((hi),(lo))) +#define PGA_BITFIELD_SET(x,hi,lo,v) ((x)=((x)&~PGA_BITFIELD_MASK((hi),(lo)))|PGA_BITFIELD_INS((hi),(lo),(v))) +#define QURT_PGATTR_C_GET(pga) PGA_BITFIELD_GET((pga).pga_value, 3U, 0U) /* Bits 3-0: cache */ +#define QURT_PGATTR_A_GET(pga) PGA_BITFIELD_GET((pga).pga_value, 5U, 4U) /* Bits 5-4: bus attr */ +#define QURT_PGATTR_C_SET(pga,v) PGA_BITFIELD_SET((pga).pga_value, 3U, 0U, (v)) /* Bits 3-0: cache */ +#define QURT_PGATTR_A_SET(pga,v) PGA_BITFIELD_SET((pga).pga_value, 5U, 4U, (v)) /* Bits 5-4: bus attr */ +#define QURT_PGATTR_MKRAW(v) ((qurt_pgattr_t){.pga_value = (v)}) +#define QURT_PGATTR_MK(c,a) QURT_PGATTR_MKRAW(PGA_BITFIELD_INS(3U,0U,(c))|PGA_BITFIELD_INS(5U,4U,(a))) + +/*return types for qurt_island_get_status2*/ +#define QURT_ISLAND_MODE_NORMAL 0U /**< Normal operating mode */ +#define QURT_ISLAND_MODE_ISLAND 1U /**< Island mode */ +#define QURT_ISLAND_MODE_EXITING 2U /**< In transition from Island mode to Normal mode */ + +/*============================================================================= + FORWARD DECLARATIONS & TYPEDEFS +=============================================================================*/ +/** @addtogroup memory_management_types +@{ */ +typedef unsigned int qurt_addr_t; /**< QuRT address type.*/ +typedef unsigned int qurt_paddr_t; /**< QuRT physical memory address type. */ +/** @cond rest_reg_dist */ +typedef unsigned long long qurt_addr_64_t; /**< QuRT 64-bit memory address type. */ +typedef unsigned long long qurt_paddr_64_t; /**< QuRT 64-bit physical memory address type. */ +typedef unsigned int qurt_mem_region_t; /**< QuRT memory regions type. */ +typedef unsigned int qurt_mem_fs_region_t; /**< QuRT memory FS region type. */ +/**@endcond */ +typedef unsigned int qurt_mem_pool_t; /**< QuRT memory pool type.*/ +typedef unsigned int qurt_size_t; /**< QuRT size type. */ +/** @cond */ +typedef unsigned long long qurt_mmu_entry_t;/**< QuRT MMU entry type. 
*/
+#define QURT_PHYSPOOL_NAME_LEN (32)
+typedef char qurt_physpool_name_t[QURT_PHYSPOOL_NAME_LEN];
+
+
+/*
+ * Mapping type
+ *
+ * QMEM_MAPPING_VIRTUAL is the default mode, in which the system
+ * picks an available range of the virtual address space and maps it to
+ * available contiguous physical addresses. Physical-to-virtual mapping
+ * is not guaranteed to be 1:1; both the virtual and the physical memory
+ * are contiguous.
+ *
+ * In QMEM_MAPPING_IDEMPOTENT mode, the user provides the physical address;
+ * the kernel allocates virtual memory mapped 1:1 to it. The primary use
+ * of this mapping is to allocate memory with a 1:1 physical-to-virtual
+ * mapping.
+ *
+ * In QMEM_MAPPING_PHYS_CONTIGUOUS mode, the virtual address might
+ * not be the same as the physical address, but the physical address of the
+ * memory region is guaranteed to be contiguous, starting at the provided
+ * address; a fixed physical address must be provided. The primary
+ * use of this mapping is to allocate physical memory from a particular
+ * address, where a 1:1 physical-to-virtual mapping is not required.
+ *
+ * QMEM_MAPPING_NONE mode must be used to reserve a virtual memory
+ * area (VMA); no physical memory is reserved or mapped to this virtual
+ * space. All standard qmem_region APIs apply to a VMA; however, the physical
+ * address is always INVALID_ADDR. In this mode, qmem_region_create()
+ * returns a handle to the VMA; both virt_addr and phys_addr must
+ * be set to INVALID_ADDR, and the kernel allocates any available virtual
+ * memory of the specified size. Obtain the starting virtual address
+ * of the VMA through qmem_region_attr_getvirtaddr().
+ * The primary purpose of this mapping mode is to provide a mechanism for
+ * delayed binding in QuRT, for example, reserving virtual memory and mapping
+ * it at some later time to possibly discontiguous physical blocks. Thus, a
+ * single VMA can be partitioned among several physical-virtual mappings
+ * created via qmem_region_create() with QMEM_VIRTUAL_FIXED mapping mode.
+ * Each VMA keeps track of its associated mapped regions.
+ * Deletion of a VMA succeeds only if all associated "virtual_fixed"
+ * regions are freed prior to VMA deletion.
+ *
+ * Use QMEM_MAPPING_VIRTUAL_FIXED mode to create a region
+ * from virtual space that has been reserved via qmem_region_create()
+ * with QMEM_MAPPING_NONE mapping. A valid virt_addr is required; if
+ * phys_addr is specified, the kernel attempts to map it accordingly,
+ * and if no phys_addr is specified, the kernel maps any available physical
+ * memory. All standard qmem_region APIs apply to such a region. Remapping
+ * a virtual range without first freeing the region is not permitted.
+ * When such a region is deleted, its corresponding VMA remains intact.
+ *
+ * QMEM_MAPPING_PHYS_DISCONTIGUOUS mode obtains contiguous
+ * virtual memory, but the physical memory can be discontiguous. This method
+ * tries to combine small physical memory blocks to satisfy the requested
+ * size, and is useful when no contiguous block of the requested size is
+ * available. If the client does not need contiguous physical memory
+ * (for example, if the client does not use physical addressing), this helps
+ * use smaller physical memory blocks rather than contiguous memory.
+ * Note: When memory is allocated through this method, the physical address is
+ * not returned to the caller by the qurt_mem_region_attr_get() API, as there
+ * might not be a single physical address.
+ *
+ */
+/**@endcond */
+/** QuRT memory region mapping type. */
+typedef enum {
+        QURT_MEM_MAPPING_VIRTUAL=0, /**< Default mode.
The region virtual address range maps to an
+                                             available contiguous area of physical memory. For the most
+                                             efficient use of virtual memory, the QuRT system
+                                             chooses the base address in physical memory. This works for most memory
+                                             use cases.*/
+        QURT_MEM_MAPPING_PHYS_CONTIGUOUS = 1, /**< The region virtual address space must be mapped to a
+                                             contiguous area of physical memory. This is necessary when the
+                                             memory region is accessed by external devices that bypass Hexagon
+                                             virtual memory addressing. The base address in physical
+                                             memory must be explicitly specified.*/
+        QURT_MEM_MAPPING_IDEMPOTENT=2, /**< Region virtual address space maps
+                                             to the identical area of physical memory. */
+        QURT_MEM_MAPPING_VIRTUAL_FIXED=3, /**< Virtual address space of the region maps either to the
+                                             specified area of physical memory or (if no area is specified)
+                                             to available physical memory. Use this mapping to create
+                                             regions from virtual space that was reserved by calling
+                                             qurt_mem_region_create() with the #QURT_MEM_MAPPING_NONE mapping type. */
+        QURT_MEM_MAPPING_NONE=4, /**< Reserves a virtual memory area (VMA). Remapping a virtual range is not
+                                      permitted without first deleting the memory region. When such a region is
+                                      deleted, its corresponding virtual memory addressing remains intact. */
+        QURT_MEM_MAPPING_VIRTUAL_RANDOM=7, /**< System chooses a random virtual address and
+                                             maps it to available contiguous physical addresses.*/
+        QURT_MEM_MAPPING_PHYS_DISCONTIGUOUS=8, /**< While virtual memory is contiguous, allocates in discontiguous physical
+                                                    memory blocks. This helps when only contiguous blocks smaller
+                                                    than the requested size are available.
+                                                    The physical address is not provided as part of the get_attr call. */
+        QURT_MEM_MAPPING_INVALID=10, /**< Reserved as an invalid mapping type. */
+} qurt_mem_mapping_t;
+
+
+/** QuRT cache mode type. */
+typedef enum {
+        QURT_MEM_CACHE_WRITEBACK=7, /**< Write back. */
+        QURT_MEM_CACHE_NONE_SHARED=6, /**< Normal uncached memory that can be shared with other subsystems.*/
+        QURT_MEM_CACHE_WRITETHROUGH=5, /**< Write through. */
+        QURT_MEM_CACHE_WRITEBACK_NONL2CACHEABLE=0, /**< Write back non-L2-cacheable.*/
+        QURT_MEM_CACHE_WRITETHROUGH_NONL2CACHEABLE=1, /**< Write through non-L2-cacheable. */
+        QURT_MEM_CACHE_WRITEBACK_L2CACHEABLE=QURT_MEM_CACHE_WRITEBACK, /**< Write back L2 cacheable. */
+        QURT_MEM_CACHE_WRITETHROUGH_L2CACHEABLE=QURT_MEM_CACHE_WRITETHROUGH, /**< Write through L2 cacheable. */
+        QURT_MEM_CACHE_DEVICE = 4, /**< Volatile memory-mapped device. Access to device memory cannot be cancelled by interrupts, re-ordered, or replayed.*/
+        QURT_MEM_CACHE_NONE = 4, /**< Deprecated -- use #QURT_MEM_CACHE_DEVICE instead. */
+        QURT_MEM_CACHE_DEVICE_SFC = 2, /**< Enables placing limitations on the number of outstanding transactions. */
+        QURT_MEM_CACHE_INVALID=10, /**< Reserved as an invalid cache type. */
+} qurt_mem_cache_mode_t;
+
+/** Memory access permission. */
+#define QURT_PERM_NONE    0x0U /**< No permission. */
+#define QURT_PERM_READ    0x1U /**< Read permission. */
+#define QURT_PERM_WRITE   0x2U /**< Write permission. */
+#define QURT_PERM_EXECUTE 0x4U /**< Execution permission. */
+#define QURT_PERM_NODUMP  0x8U
+                               /**< Skip dumping the mapping. During a process domain dump, some mappings
+                                    on host memory must be skipped to avoid a race condition
+                                    where the memory is removed from the host and the DSP process
+                                    crashes before the mapping is removed. */
+#define QURT_PERM_FULL (QURT_PERM_READ | QURT_PERM_WRITE | QURT_PERM_EXECUTE) /**< Read, write, and execute permission.
*/ + +typedef unsigned char qurt_perm_t; + + +/** @cond rest_reg_dist*/ +/** QuRT cache type; specifies data cache or instruction cache. */ +typedef enum { + QURT_MEM_ICACHE, /**< Instruction cache.*/ + QURT_MEM_DCACHE /**< Data cache.*/ +} qurt_mem_cache_type_t; + +/** QuRT cache operation code type. */ +typedef enum { + QURT_MEM_CACHE_FLUSH, /**< Flush. */ + QURT_MEM_CACHE_INVALIDATE, /**< Invalidate */ + QURT_MEM_CACHE_FLUSH_INVALIDATE, /**< Flush invalidate. */ + QURT_MEM_CACHE_FLUSH_ALL, /**< Flush all. */ + QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL, /**< Flush invalidate all. */ + QURT_MEM_CACHE_TABLE_FLUSH_INVALIDATE, /**< Table flush invalidate. */ + QURT_MEM_CACHE_FLUSH_INVALIDATE_L2, /**< L2 flush invalidate.*/ +} qurt_mem_cache_op_t; + +/** QuRT memory region type. */ +typedef enum { + QURT_MEM_REGION_LOCAL=0, /**< Local. */ + QURT_MEM_REGION_SHARED=1, /**< Shared.*/ + QURT_MEM_REGION_USER_ACCESS=2, /**< User access. */ + QURT_MEM_REGION_FS=4, /**< FS. */ + QURT_MEM_REGION_INVALID=10, /**< Reserved as an invalid region type. */ +} qurt_mem_region_type_t; + +/* Cache and bus attributes are combined into a value of this type for convenience, + and macros for combining and extracting fields are defined here. */ +/** @cond */ +struct qurt_pgattr { + unsigned pga_value; /**< PGA value.*/ +}; +typedef struct qurt_pgattr qurt_pgattr_t; +/** @endcond */ +/** QuRT memory region attributes type.*/ +/* QMEM_MAPPING_IDEMPOTENT and QMEM_MAPPING_PHYS_CONTIGUOUS mode can specify physaddr. + virtaddr cannot be specified for a memory region, it can only be queried by the + qmem_attr_getvirtaddr() function. + */ +typedef struct { + /** @cond */ + qurt_mem_mapping_t mapping_type; + unsigned char perms; + unsigned short owner; + qurt_pgattr_t pga; + unsigned ppn; //physical page number (physical>>12) + qurt_addr_t virtaddr; + qurt_mem_region_type_t type; + qurt_size_t size; + /** @endcond */ +} qurt_mem_region_attr_t; + + +/** QuRT user physical memory pool type. */ +typedef struct { + /** @cond */ + char name[32]; + struct ranges{ + unsigned int start; + unsigned int size; + } ranges[MAX_POOL_RANGES]; + /** @endcond */ +} qurt_mem_pool_attr_t; + +/** QuRT memory pool status type.*/ +typedef struct _qurt_mem_pool_status { + + qurt_size_t contig_size; /**< Largest contiguous free memory in bytes. */ + qurt_size_t free_size; /**< Total free memory in bytes. */ + qurt_size_t total_size; /**< Total declared memory in bytes. */ + +} qurt_mem_pool_status_t; + +typedef enum { + HEXAGON_L1_I_CACHE = 0, /**< Hexagon L1 instruction cache. */ + HEXAGON_L1_D_CACHE = 1, /**< Hexagon L1 data cache. */ + HEXAGON_L2_CACHE = 2 /**< Hexagon L2 cache. */ +} qurt_cache_type_t; + +typedef enum { + FULL_SIZE = 0, /**< Fully shared cache, without partitioning. */ + HALF_SIZE = 1, /**< 1/2 for main, 1/2 for auxiliary. */ + THREE_QUARTER_SIZE = 2, /**< 3/4 for main, 1/4 for auxiliary. */ + SEVEN_EIGHTHS_SIZE = 3 /**< 7/8 for main, 1/8 for auxiliary; for L2 cache only. */ +} qurt_cache_partition_size_t; + +typedef enum { + QURT_PROCESS_CB_GENERIC, /**< generic unconditional cb called after image loading. */ + QURT_PROCESS_NOTE_CB_PRE_MAP, /**< note cb called before segment loading. */ + QURT_PROCESS_NOTE_CB_POST_MAP /**< note cb called after segment loading. 
*/ +} qurt_process_cb_type_t; + +typedef union { + void *ptr; + int num; +} qurt_process_callback_arg_t; + + +/**@endcond*/ + +/** @} */ /* end_addtogroup memory_management_types */ +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TYPES_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_user_dma.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_user_dma.h new file mode 100755 index 0000000000000..e05a6429fd703 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_user_dma.h @@ -0,0 +1,44 @@ +#ifndef QURT_USER_DMA_H +#define QURT_USER_DMA_H + +/** + @file qurt_user_dma.h + @brief Definitions, macros, and prototypes used for handling user DMA. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup qurt_user_dma_dmsyncht + Sends the DMSyncht command to the user DMA engine. + + Call this function to ensure all posted DMA memory operations are + complete. + + This stalls the current thread until the instruction + is complete and returns. + + @return + QURT_EOK - On dmsyncht completion \n + QURT_ENOTSUPPORTED - User DMA not supported + + @dependencies + None. +*/ +int qurt_user_dma_dmsyncht(void); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_vtlb.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_vtlb.h new file mode 100755 index 0000000000000..e064042e447ac --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/include/qurt/qurt_vtlb.h @@ -0,0 +1,76 @@ +/*============================================================================= + + qurt_vtlb.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2019, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +=============================================================================*/ +#ifndef QURT_VTLB_H +#define QURT_VTLB_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| Names starting with "qurt_i_vtlb" are the internal low-level functions. +|| These should be considered subject to change. 
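+||
+|| Illustrative sketch of the statistics query (internal API, subject to
+|| change as noted above):
+||
+||   unsigned stats[3];
+||   (void)qurt_i_vtlb_statistics(stats);
+||   // stats[0]: total VTLB entries, stats[1]: available entries,
+||   // stats[2]: max size of the VTLB tree since boot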
+*/ + +int qurt_i_vtlb_entry_create(unsigned *pIndex, + unsigned tlb_lo, + unsigned tlb_hi, + unsigned extension); + +int qurt_i_vtlb_entry_create_with_pid(unsigned *pIndex, + unsigned tlb_lo, + unsigned tlb_hi, + unsigned extension, + unsigned target_pid); + +int qurt_i_vtlb_entry_delete(unsigned index); + +int qurt_i_vtlb_entry_read(unsigned index, unsigned *tlbinfo); + +int qurt_i_vtlb_entry_write(unsigned index, unsigned tlb_lo, unsigned tlb_hi, unsigned extension); + +int qurt_i_vtlb_entry_write_with_pid(unsigned index, unsigned tlb_lo, unsigned tlb_hi, unsigned extension, unsigned target_pid); + +int qurt_i_vtlb_entry_probe(const void *vaddr, unsigned *tlbinfo, unsigned *pIndex); + +int qurt_i_vtlb_entry_probe_with_pid(const void *vaddr, unsigned *tlbinfo, unsigned *pIndex, unsigned target_pid); + + +int qurt_i_vtlb_statistics(unsigned *stats); // Returns stats[0] -- total number of VTLB entries + // stats[1] -- number of available VTLB entries + // stats[2] -- max size of VTLB tree since boot + +//can return index to an entry that was specialed, change it to take addresses instead of pages +int qurt_i_vtlb_set_special(int index, unsigned pageno, unsigned asid, unsigned size); + +int qurt_i_vtlb_queue_ppage(unsigned pageno, unsigned vtlb_index); + +#define QURT_VTLB_EXT_DEFAULT 0U +#define QURT_VTLB_EXT_LOCKED 1U +#define QURT_VTLB_EXT_EXCLUDE_DUMP 2U /* Temporary ability to skip certain mappings in pd dump */ +#define QURT_VTLB_EXT_FREELIST 0x800000u + +#define QURT_VTLB_ERR_OVERLAP -64 +#define QURT_VTLB_ERR_TREE_NO_SPACE -65 +#define QURT_VTLB_ERR_INVALID_SIZE -68 +#define QURT_VTLB_ERR_INVALID_EXT -69 +#define QURT_VTLB_ERR_DEL_PGT_LOCKED -70 +#define QURT_VTLB_ERR_PGT_LOCK_CNT -71 + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif // QURT_VTLB_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libposix.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libposix.a new file mode 100755 index 0000000000000..f338fbee708ef Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libposix.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libqurt.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libqurt.a new file mode 100755 index 0000000000000..e35606134ddfa Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libqurt.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libqurtcfs.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libqurtcfs.a new file mode 100755 index 0000000000000..02250fa425ac4 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libqurtcfs.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libtimer_island.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libtimer_island.a new file mode 100755 index 0000000000000..bce4fe8cc49b2 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libtimer_island.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libtimer_main.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libtimer_main.a new file mode 100755 index 0000000000000..041565908f9c6 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/libtimer_main.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libposix.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libposix.a new file mode 100755 
index 0000000000000..044c93bb65797
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libposix.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libqurt.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libqurt.a
new file mode 100755
index 0000000000000..a91e0fbb660b7
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libqurt.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libqurtcfs.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libqurtcfs.a
new file mode 100755
index 0000000000000..02250fa425ac4
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libqurtcfs.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libtimer.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libtimer.a
new file mode 100755
index 0000000000000..10bc3e63c2efc
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev73/lib/pic/libtimer.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/bits/confname.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/bits/confname.h
new file mode 100755
index 0000000000000..d9ca3135501e3
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/bits/confname.h
@@ -0,0 +1,528 @@
+#ifndef CONFNAME_H
+#define CONFNAME_H
+/**
+  @file confname.h
+  @brief Named literals for the 'name' argument of sysconf and pathconf
+
+EXTERNAL FUNCTIONS
+  None
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  Do not include this header directly; include unistd.h instead. For now, since the
+  toolchain does not provide a hook for including bits/confname.h, we stick this
+  header in QuRT's sys/types.h
+
+Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+/* Values for the NAME argument to `pathconf' and `fpathconf'. */
+enum
+{
+  _PC_LINK_MAX,
+#define _PC_LINK_MAX _PC_LINK_MAX
+  _PC_MAX_CANON,
+#define _PC_MAX_CANON _PC_MAX_CANON
+  _PC_MAX_INPUT,
+#define _PC_MAX_INPUT _PC_MAX_INPUT
+  _PC_NAME_MAX,
+#define _PC_NAME_MAX _PC_NAME_MAX
+  _PC_PATH_MAX,
+#define _PC_PATH_MAX _PC_PATH_MAX
+  _PC_PIPE_BUF,
+#define _PC_PIPE_BUF _PC_PIPE_BUF
+  _PC_CHOWN_RESTRICTED,
+#define _PC_CHOWN_RESTRICTED _PC_CHOWN_RESTRICTED
+  _PC_NO_TRUNC,
+#define _PC_NO_TRUNC _PC_NO_TRUNC
+  _PC_VDISABLE,
+#define _PC_VDISABLE _PC_VDISABLE
+  _PC_SYNC_IO,
+#define _PC_SYNC_IO _PC_SYNC_IO
+  _PC_ASYNC_IO,
+#define _PC_ASYNC_IO _PC_ASYNC_IO
+  _PC_PRIO_IO,
+#define _PC_PRIO_IO _PC_PRIO_IO
+  _PC_SOCK_MAXBUF,
+#define _PC_SOCK_MAXBUF _PC_SOCK_MAXBUF
+  _PC_FILESIZEBITS,
+#define _PC_FILESIZEBITS _PC_FILESIZEBITS
+  _PC_REC_INCR_XFER_SIZE,
+#define _PC_REC_INCR_XFER_SIZE _PC_REC_INCR_XFER_SIZE
+  _PC_REC_MAX_XFER_SIZE,
+#define _PC_REC_MAX_XFER_SIZE _PC_REC_MAX_XFER_SIZE
+  _PC_REC_MIN_XFER_SIZE,
+#define _PC_REC_MIN_XFER_SIZE _PC_REC_MIN_XFER_SIZE
+  _PC_REC_XFER_ALIGN,
+#define _PC_REC_XFER_ALIGN _PC_REC_XFER_ALIGN
+  _PC_ALLOC_SIZE_MIN,
+#define _PC_ALLOC_SIZE_MIN _PC_ALLOC_SIZE_MIN
+  _PC_SYMLINK_MAX,
+#define _PC_SYMLINK_MAX _PC_SYMLINK_MAX
+  _PC_2_SYMLINKS
+#define _PC_2_SYMLINKS _PC_2_SYMLINKS
+};
+
+/* Values for the argument to `sysconf'.
*/ +enum +{ + _SC_ARG_MAX, +#define _SC_ARG_MAX _SC_ARG_MAX + _SC_CHILD_MAX, +#define _SC_CHILD_MAX _SC_CHILD_MAX + _SC_CLK_TCK, +#define _SC_CLK_TCK _SC_CLK_TCK + _SC_NGROUPS_MAX, +#define _SC_NGROUPS_MAX _SC_NGROUPS_MAX + _SC_OPEN_MAX, +#define _SC_OPEN_MAX _SC_OPEN_MAX + _SC_STREAM_MAX, +#define _SC_STREAM_MAX _SC_STREAM_MAX + _SC_TZNAME_MAX, +#define _SC_TZNAME_MAX _SC_TZNAME_MAX + _SC_JOB_CONTROL, +#define _SC_JOB_CONTROL _SC_JOB_CONTROL + _SC_SAVED_IDS, +#define _SC_SAVED_IDS _SC_SAVED_IDS + _SC_REALTIME_SIGNALS, +#define _SC_REALTIME_SIGNALS _SC_REALTIME_SIGNALS + _SC_PRIORITY_SCHEDULING, +#define _SC_PRIORITY_SCHEDULING _SC_PRIORITY_SCHEDULING + _SC_TIMERS, +#define _SC_TIMERS _SC_TIMERS + _SC_ASYNCHRONOUS_IO, +#define _SC_ASYNCHRONOUS_IO _SC_ASYNCHRONOUS_IO + _SC_PRIORITIZED_IO, +#define _SC_PRIORITIZED_IO _SC_PRIORITIZED_IO + _SC_SYNCHRONIZED_IO, +#define _SC_SYNCHRONIZED_IO _SC_SYNCHRONIZED_IO + _SC_FSYNC, +#define _SC_FSYNC _SC_FSYNC + _SC_MAPPED_FILES, +#define _SC_MAPPED_FILES _SC_MAPPED_FILES + _SC_MEMLOCK, +#define _SC_MEMLOCK _SC_MEMLOCK + _SC_MEMLOCK_RANGE, +#define _SC_MEMLOCK_RANGE _SC_MEMLOCK_RANGE + _SC_MEMORY_PROTECTION, +#define _SC_MEMORY_PROTECTION _SC_MEMORY_PROTECTION + _SC_MESSAGE_PASSING, +#define _SC_MESSAGE_PASSING _SC_MESSAGE_PASSING + _SC_SEMAPHORES, +#define _SC_SEMAPHORES _SC_SEMAPHORES + _SC_SHARED_MEMORY_OBJECTS, +#define _SC_SHARED_MEMORY_OBJECTS _SC_SHARED_MEMORY_OBJECTS + _SC_AIO_LISTIO_MAX, +#define _SC_AIO_LISTIO_MAX _SC_AIO_LISTIO_MAX + _SC_AIO_MAX, +#define _SC_AIO_MAX _SC_AIO_MAX + _SC_AIO_PRIO_DELTA_MAX, +#define _SC_AIO_PRIO_DELTA_MAX _SC_AIO_PRIO_DELTA_MAX + _SC_DELAYTIMER_MAX, +#define _SC_DELAYTIMER_MAX _SC_DELAYTIMER_MAX + _SC_MQ_OPEN_MAX, +#define _SC_MQ_OPEN_MAX _SC_MQ_OPEN_MAX + _SC_MQ_PRIO_MAX, +#define _SC_MQ_PRIO_MAX _SC_MQ_PRIO_MAX + _SC_VERSION, +#define _SC_VERSION _SC_VERSION + _SC_PAGESIZE, +#define _SC_PAGESIZE _SC_PAGESIZE +#define _SC_PAGE_SIZE _SC_PAGESIZE + _SC_RTSIG_MAX, +#define _SC_RTSIG_MAX _SC_RTSIG_MAX + _SC_SEM_NSEMS_MAX, +#define _SC_SEM_NSEMS_MAX _SC_SEM_NSEMS_MAX + _SC_SEM_VALUE_MAX, +#define _SC_SEM_VALUE_MAX _SC_SEM_VALUE_MAX + _SC_SIGQUEUE_MAX, +#define _SC_SIGQUEUE_MAX _SC_SIGQUEUE_MAX + _SC_TIMER_MAX, +#define _SC_TIMER_MAX _SC_TIMER_MAX + + /* Values for the argument to `sysconf' + corresponding to _POSIX2_* symbols. 
*/ + _SC_BC_BASE_MAX, +#define _SC_BC_BASE_MAX _SC_BC_BASE_MAX + _SC_BC_DIM_MAX, +#define _SC_BC_DIM_MAX _SC_BC_DIM_MAX + _SC_BC_SCALE_MAX, +#define _SC_BC_SCALE_MAX _SC_BC_SCALE_MAX + _SC_BC_STRING_MAX, +#define _SC_BC_STRING_MAX _SC_BC_STRING_MAX + _SC_COLL_WEIGHTS_MAX, +#define _SC_COLL_WEIGHTS_MAX _SC_COLL_WEIGHTS_MAX + _SC_EQUIV_CLASS_MAX, +#define _SC_EQUIV_CLASS_MAX _SC_EQUIV_CLASS_MAX + _SC_EXPR_NEST_MAX, +#define _SC_EXPR_NEST_MAX _SC_EXPR_NEST_MAX + _SC_LINE_MAX, +#define _SC_LINE_MAX _SC_LINE_MAX + _SC_RE_DUP_MAX, +#define _SC_RE_DUP_MAX _SC_RE_DUP_MAX + _SC_CHARCLASS_NAME_MAX, +#define _SC_CHARCLASS_NAME_MAX _SC_CHARCLASS_NAME_MAX + + _SC_2_VERSION, +#define _SC_2_VERSION _SC_2_VERSION + _SC_2_C_BIND, +#define _SC_2_C_BIND _SC_2_C_BIND + _SC_2_C_DEV, +#define _SC_2_C_DEV _SC_2_C_DEV + _SC_2_FORT_DEV, +#define _SC_2_FORT_DEV _SC_2_FORT_DEV + _SC_2_FORT_RUN, +#define _SC_2_FORT_RUN _SC_2_FORT_RUN + _SC_2_SW_DEV, +#define _SC_2_SW_DEV _SC_2_SW_DEV + _SC_2_LOCALEDEF, +#define _SC_2_LOCALEDEF _SC_2_LOCALEDEF + + _SC_PII, +#define _SC_PII _SC_PII + _SC_PII_XTI, +#define _SC_PII_XTI _SC_PII_XTI + _SC_PII_SOCKET, +#define _SC_PII_SOCKET _SC_PII_SOCKET + _SC_PII_INTERNET, +#define _SC_PII_INTERNET _SC_PII_INTERNET + _SC_PII_OSI, +#define _SC_PII_OSI _SC_PII_OSI + _SC_POLL, +#define _SC_POLL _SC_POLL + _SC_SELECT, +#define _SC_SELECT _SC_SELECT + _SC_UIO_MAXIOV, +#define _SC_UIO_MAXIOV _SC_UIO_MAXIOV + _SC_IOV_MAX = _SC_UIO_MAXIOV, +#define _SC_IOV_MAX _SC_IOV_MAX + _SC_PII_INTERNET_STREAM, +#define _SC_PII_INTERNET_STREAM _SC_PII_INTERNET_STREAM + _SC_PII_INTERNET_DGRAM, +#define _SC_PII_INTERNET_DGRAM _SC_PII_INTERNET_DGRAM + _SC_PII_OSI_COTS, +#define _SC_PII_OSI_COTS _SC_PII_OSI_COTS + _SC_PII_OSI_CLTS, +#define _SC_PII_OSI_CLTS _SC_PII_OSI_CLTS + _SC_PII_OSI_M, +#define _SC_PII_OSI_M _SC_PII_OSI_M + _SC_T_IOV_MAX, +#define _SC_T_IOV_MAX _SC_T_IOV_MAX + + /* Values according to POSIX 1003.1c (POSIX threads). 
+   */
+  _SC_THREADS,
+#define _SC_THREADS _SC_THREADS
+  _SC_THREAD_SAFE_FUNCTIONS,
+#define _SC_THREAD_SAFE_FUNCTIONS _SC_THREAD_SAFE_FUNCTIONS
+  _SC_GETGR_R_SIZE_MAX,
+#define _SC_GETGR_R_SIZE_MAX _SC_GETGR_R_SIZE_MAX
+  _SC_GETPW_R_SIZE_MAX,
+#define _SC_GETPW_R_SIZE_MAX _SC_GETPW_R_SIZE_MAX
+  _SC_LOGIN_NAME_MAX,
+#define _SC_LOGIN_NAME_MAX _SC_LOGIN_NAME_MAX
+  _SC_TTY_NAME_MAX,
+#define _SC_TTY_NAME_MAX _SC_TTY_NAME_MAX
+  _SC_THREAD_DESTRUCTOR_ITERATIONS,
+#define _SC_THREAD_DESTRUCTOR_ITERATIONS _SC_THREAD_DESTRUCTOR_ITERATIONS
+  _SC_THREAD_KEYS_MAX,
+#define _SC_THREAD_KEYS_MAX _SC_THREAD_KEYS_MAX
+  _SC_THREAD_STACK_MIN,
+#define _SC_THREAD_STACK_MIN _SC_THREAD_STACK_MIN
+  _SC_THREAD_THREADS_MAX,
+#define _SC_THREAD_THREADS_MAX _SC_THREAD_THREADS_MAX
+  _SC_THREAD_ATTR_STACKADDR,
+#define _SC_THREAD_ATTR_STACKADDR _SC_THREAD_ATTR_STACKADDR
+  _SC_THREAD_ATTR_STACKSIZE,
+#define _SC_THREAD_ATTR_STACKSIZE _SC_THREAD_ATTR_STACKSIZE
+  _SC_THREAD_PRIORITY_SCHEDULING,
+#define _SC_THREAD_PRIORITY_SCHEDULING _SC_THREAD_PRIORITY_SCHEDULING
+  _SC_THREAD_PRIO_INHERIT,
+#define _SC_THREAD_PRIO_INHERIT _SC_THREAD_PRIO_INHERIT
+  _SC_THREAD_PRIO_PROTECT,
+#define _SC_THREAD_PRIO_PROTECT _SC_THREAD_PRIO_PROTECT
+  _SC_THREAD_PROCESS_SHARED,
+#define _SC_THREAD_PROCESS_SHARED _SC_THREAD_PROCESS_SHARED
+
+  _SC_NPROCESSORS_CONF,
+#define _SC_NPROCESSORS_CONF _SC_NPROCESSORS_CONF
+  _SC_NPROCESSORS_ONLN,
+#define _SC_NPROCESSORS_ONLN _SC_NPROCESSORS_ONLN
+  _SC_PHYS_PAGES,
+#define _SC_PHYS_PAGES _SC_PHYS_PAGES
+  _SC_AVPHYS_PAGES,
+#define _SC_AVPHYS_PAGES _SC_AVPHYS_PAGES
+  _SC_ATEXIT_MAX,
+#define _SC_ATEXIT_MAX _SC_ATEXIT_MAX
+  _SC_PASS_MAX,
+#define _SC_PASS_MAX _SC_PASS_MAX
+
+  _SC_XOPEN_VERSION,
+#define _SC_XOPEN_VERSION _SC_XOPEN_VERSION
+  _SC_XOPEN_XCU_VERSION,
+#define _SC_XOPEN_XCU_VERSION _SC_XOPEN_XCU_VERSION
+  _SC_XOPEN_UNIX,
+#define _SC_XOPEN_UNIX _SC_XOPEN_UNIX
+  _SC_XOPEN_CRYPT,
+#define _SC_XOPEN_CRYPT _SC_XOPEN_CRYPT
+  _SC_XOPEN_ENH_I18N,
+#define _SC_XOPEN_ENH_I18N _SC_XOPEN_ENH_I18N
+  _SC_XOPEN_SHM,
+#define _SC_XOPEN_SHM _SC_XOPEN_SHM
+
+  _SC_2_CHAR_TERM,
+#define _SC_2_CHAR_TERM _SC_2_CHAR_TERM
+  _SC_2_C_VERSION,
+#define _SC_2_C_VERSION _SC_2_C_VERSION
+  _SC_2_UPE,
+#define _SC_2_UPE _SC_2_UPE
+
+  _SC_XOPEN_XPG2,
+#define _SC_XOPEN_XPG2 _SC_XOPEN_XPG2
+  _SC_XOPEN_XPG3,
+#define _SC_XOPEN_XPG3 _SC_XOPEN_XPG3
+  _SC_XOPEN_XPG4,
+#define _SC_XOPEN_XPG4 _SC_XOPEN_XPG4
+
+  _SC_CHAR_BIT,
+#define _SC_CHAR_BIT _SC_CHAR_BIT
+  _SC_CHAR_MAX,
+#define _SC_CHAR_MAX _SC_CHAR_MAX
+  _SC_CHAR_MIN,
+#define _SC_CHAR_MIN _SC_CHAR_MIN
+  _SC_INT_MAX,
+#define _SC_INT_MAX _SC_INT_MAX
+  _SC_INT_MIN,
+#define _SC_INT_MIN _SC_INT_MIN
+  _SC_LONG_BIT,
+#define _SC_LONG_BIT _SC_LONG_BIT
+  _SC_WORD_BIT,
+#define _SC_WORD_BIT _SC_WORD_BIT
+  _SC_MB_LEN_MAX,
+#define _SC_MB_LEN_MAX _SC_MB_LEN_MAX
+  _SC_NZERO,
+#define _SC_NZERO _SC_NZERO
+  _SC_SSIZE_MAX,
+#define _SC_SSIZE_MAX _SC_SSIZE_MAX
+  _SC_SCHAR_MAX,
+#define _SC_SCHAR_MAX _SC_SCHAR_MAX
+  _SC_SCHAR_MIN,
+#define _SC_SCHAR_MIN _SC_SCHAR_MIN
+  _SC_SHRT_MAX,
+#define _SC_SHRT_MAX _SC_SHRT_MAX
+  _SC_SHRT_MIN,
+#define _SC_SHRT_MIN _SC_SHRT_MIN
+  _SC_UCHAR_MAX,
+#define _SC_UCHAR_MAX _SC_UCHAR_MAX
+  _SC_UINT_MAX,
+#define _SC_UINT_MAX _SC_UINT_MAX
+  _SC_ULONG_MAX,
+#define _SC_ULONG_MAX _SC_ULONG_MAX
+  _SC_USHRT_MAX,
+#define _SC_USHRT_MAX _SC_USHRT_MAX
+
+  _SC_NL_ARGMAX,
+#define _SC_NL_ARGMAX _SC_NL_ARGMAX
+  _SC_NL_LANGMAX,
+#define _SC_NL_LANGMAX _SC_NL_LANGMAX
+  _SC_NL_MSGMAX,
+#define _SC_NL_MSGMAX _SC_NL_MSGMAX
+  _SC_NL_NMAX,
+#define _SC_NL_NMAX _SC_NL_NMAX
+  _SC_NL_SETMAX,
+#define _SC_NL_SETMAX _SC_NL_SETMAX
+  _SC_NL_TEXTMAX,
+#define _SC_NL_TEXTMAX _SC_NL_TEXTMAX
+
+  _SC_XBS5_ILP32_OFF32,
+#define _SC_XBS5_ILP32_OFF32 _SC_XBS5_ILP32_OFF32
+  _SC_XBS5_ILP32_OFFBIG,
+#define _SC_XBS5_ILP32_OFFBIG _SC_XBS5_ILP32_OFFBIG
+  _SC_XBS5_LP64_OFF64,
+#define _SC_XBS5_LP64_OFF64 _SC_XBS5_LP64_OFF64
+  _SC_XBS5_LPBIG_OFFBIG,
+#define _SC_XBS5_LPBIG_OFFBIG _SC_XBS5_LPBIG_OFFBIG
+
+  _SC_XOPEN_LEGACY,
+#define _SC_XOPEN_LEGACY _SC_XOPEN_LEGACY
+  _SC_XOPEN_REALTIME,
+#define _SC_XOPEN_REALTIME _SC_XOPEN_REALTIME
+  _SC_XOPEN_REALTIME_THREADS,
+#define _SC_XOPEN_REALTIME_THREADS _SC_XOPEN_REALTIME_THREADS
+
+  _SC_ADVISORY_INFO,
+#define _SC_ADVISORY_INFO _SC_ADVISORY_INFO
+  _SC_BARRIERS,
+#define _SC_BARRIERS _SC_BARRIERS
+  _SC_BASE,
+#define _SC_BASE _SC_BASE
+  _SC_C_LANG_SUPPORT,
+#define _SC_C_LANG_SUPPORT _SC_C_LANG_SUPPORT
+  _SC_C_LANG_SUPPORT_R,
+#define _SC_C_LANG_SUPPORT_R _SC_C_LANG_SUPPORT_R
+  _SC_CLOCK_SELECTION,
+#define _SC_CLOCK_SELECTION _SC_CLOCK_SELECTION
+  _SC_CPUTIME,
+#define _SC_CPUTIME _SC_CPUTIME
+  _SC_THREAD_CPUTIME,
+#define _SC_THREAD_CPUTIME _SC_THREAD_CPUTIME
+  _SC_DEVICE_IO,
+#define _SC_DEVICE_IO _SC_DEVICE_IO
+  _SC_DEVICE_SPECIFIC,
+#define _SC_DEVICE_SPECIFIC _SC_DEVICE_SPECIFIC
+  _SC_DEVICE_SPECIFIC_R,
+#define _SC_DEVICE_SPECIFIC_R _SC_DEVICE_SPECIFIC_R
+  _SC_FD_MGMT,
+#define _SC_FD_MGMT _SC_FD_MGMT
+  _SC_FIFO,
+#define _SC_FIFO _SC_FIFO
+  _SC_PIPE,
+#define _SC_PIPE _SC_PIPE
+  _SC_FILE_ATTRIBUTES,
+#define _SC_FILE_ATTRIBUTES _SC_FILE_ATTRIBUTES
+  _SC_FILE_LOCKING,
+#define _SC_FILE_LOCKING _SC_FILE_LOCKING
+  _SC_FILE_SYSTEM,
+#define _SC_FILE_SYSTEM _SC_FILE_SYSTEM
+  _SC_MONOTONIC_CLOCK,
+#define _SC_MONOTONIC_CLOCK _SC_MONOTONIC_CLOCK
+  _SC_MULTI_PROCESS,
+#define _SC_MULTI_PROCESS _SC_MULTI_PROCESS
+  _SC_SINGLE_PROCESS,
+#define _SC_SINGLE_PROCESS _SC_SINGLE_PROCESS
+  _SC_NETWORKING,
+#define _SC_NETWORKING _SC_NETWORKING
+  _SC_READER_WRITER_LOCKS,
+#define _SC_READER_WRITER_LOCKS _SC_READER_WRITER_LOCKS
+  _SC_SPIN_LOCKS,
+#define _SC_SPIN_LOCKS _SC_SPIN_LOCKS
+  _SC_REGEXP,
+#define _SC_REGEXP _SC_REGEXP
+  _SC_REGEX_VERSION,
+#define _SC_REGEX_VERSION _SC_REGEX_VERSION
+  _SC_SHELL,
+#define _SC_SHELL _SC_SHELL
+  _SC_SIGNALS,
+#define _SC_SIGNALS _SC_SIGNALS
+  _SC_SPAWN,
+#define _SC_SPAWN _SC_SPAWN
+  _SC_SPORADIC_SERVER,
+#define _SC_SPORADIC_SERVER _SC_SPORADIC_SERVER
+  _SC_THREAD_SPORADIC_SERVER,
+#define _SC_THREAD_SPORADIC_SERVER _SC_THREAD_SPORADIC_SERVER
+  _SC_SYSTEM_DATABASE,
+#define _SC_SYSTEM_DATABASE _SC_SYSTEM_DATABASE
+  _SC_SYSTEM_DATABASE_R,
+#define _SC_SYSTEM_DATABASE_R _SC_SYSTEM_DATABASE_R
+  _SC_TIMEOUTS,
+#define _SC_TIMEOUTS _SC_TIMEOUTS
+  _SC_TYPED_MEMORY_OBJECTS,
+#define _SC_TYPED_MEMORY_OBJECTS _SC_TYPED_MEMORY_OBJECTS
+  _SC_USER_GROUPS,
+#define _SC_USER_GROUPS _SC_USER_GROUPS
+  _SC_USER_GROUPS_R,
+#define _SC_USER_GROUPS_R _SC_USER_GROUPS_R
+  _SC_2_PBS,
+#define _SC_2_PBS _SC_2_PBS
+  _SC_2_PBS_ACCOUNTING,
+#define _SC_2_PBS_ACCOUNTING _SC_2_PBS_ACCOUNTING
+  _SC_2_PBS_LOCATE,
+#define _SC_2_PBS_LOCATE _SC_2_PBS_LOCATE
+  _SC_2_PBS_MESSAGE,
+#define _SC_2_PBS_MESSAGE _SC_2_PBS_MESSAGE
+  _SC_2_PBS_TRACK,
+#define _SC_2_PBS_TRACK _SC_2_PBS_TRACK
+  _SC_SYMLOOP_MAX,
+#define _SC_SYMLOOP_MAX _SC_SYMLOOP_MAX
+  _SC_STREAMS,
+#define _SC_STREAMS _SC_STREAMS
+  _SC_2_PBS_CHECKPOINT,
+#define _SC_2_PBS_CHECKPOINT _SC_2_PBS_CHECKPOINT
+
+  _SC_V6_ILP32_OFF32,
+#define _SC_V6_ILP32_OFF32 _SC_V6_ILP32_OFF32
+  _SC_V6_ILP32_OFFBIG,
+#define _SC_V6_ILP32_OFFBIG _SC_V6_ILP32_OFFBIG
+  _SC_V6_LP64_OFF64,
+#define _SC_V6_LP64_OFF64 _SC_V6_LP64_OFF64
+  _SC_V6_LPBIG_OFFBIG,
+#define _SC_V6_LPBIG_OFFBIG _SC_V6_LPBIG_OFFBIG
+
+  _SC_HOST_NAME_MAX,
+#define _SC_HOST_NAME_MAX _SC_HOST_NAME_MAX
+  _SC_TRACE,
+#define _SC_TRACE _SC_TRACE
+  _SC_TRACE_EVENT_FILTER,
+#define _SC_TRACE_EVENT_FILTER _SC_TRACE_EVENT_FILTER
+  _SC_TRACE_INHERIT,
+#define _SC_TRACE_INHERIT _SC_TRACE_INHERIT
+  _SC_TRACE_LOG,
+#define _SC_TRACE_LOG _SC_TRACE_LOG
+
+  _SC_LEVEL1_ICACHE_SIZE,
+#define _SC_LEVEL1_ICACHE_SIZE _SC_LEVEL1_ICACHE_SIZE
+  _SC_LEVEL1_ICACHE_ASSOC,
+#define _SC_LEVEL1_ICACHE_ASSOC _SC_LEVEL1_ICACHE_ASSOC
+  _SC_LEVEL1_ICACHE_LINESIZE,
+#define _SC_LEVEL1_ICACHE_LINESIZE _SC_LEVEL1_ICACHE_LINESIZE
+  _SC_LEVEL1_DCACHE_SIZE,
+#define _SC_LEVEL1_DCACHE_SIZE _SC_LEVEL1_DCACHE_SIZE
+  _SC_LEVEL1_DCACHE_ASSOC,
+#define _SC_LEVEL1_DCACHE_ASSOC _SC_LEVEL1_DCACHE_ASSOC
+  _SC_LEVEL1_DCACHE_LINESIZE,
+#define _SC_LEVEL1_DCACHE_LINESIZE _SC_LEVEL1_DCACHE_LINESIZE
+  _SC_LEVEL2_CACHE_SIZE,
+#define _SC_LEVEL2_CACHE_SIZE _SC_LEVEL2_CACHE_SIZE
+  _SC_LEVEL2_CACHE_ASSOC,
+#define _SC_LEVEL2_CACHE_ASSOC _SC_LEVEL2_CACHE_ASSOC
+  _SC_LEVEL2_CACHE_LINESIZE,
+#define _SC_LEVEL2_CACHE_LINESIZE _SC_LEVEL2_CACHE_LINESIZE
+  _SC_LEVEL3_CACHE_SIZE,
+#define _SC_LEVEL3_CACHE_SIZE _SC_LEVEL3_CACHE_SIZE
+  _SC_LEVEL3_CACHE_ASSOC,
+#define _SC_LEVEL3_CACHE_ASSOC _SC_LEVEL3_CACHE_ASSOC
+  _SC_LEVEL3_CACHE_LINESIZE,
+#define _SC_LEVEL3_CACHE_LINESIZE _SC_LEVEL3_CACHE_LINESIZE
+  _SC_LEVEL4_CACHE_SIZE,
+#define _SC_LEVEL4_CACHE_SIZE _SC_LEVEL4_CACHE_SIZE
+  _SC_LEVEL4_CACHE_ASSOC,
+#define _SC_LEVEL4_CACHE_ASSOC _SC_LEVEL4_CACHE_ASSOC
+  _SC_LEVEL4_CACHE_LINESIZE,
+#define _SC_LEVEL4_CACHE_LINESIZE _SC_LEVEL4_CACHE_LINESIZE
+  /* Leave room here, maybe we need a few more cache levels some day. */
+
+  _SC_IPV6 = _SC_LEVEL1_ICACHE_SIZE + 50,
+#define _SC_IPV6 _SC_IPV6
+  _SC_RAW_SOCKETS,
+#define _SC_RAW_SOCKETS _SC_RAW_SOCKETS
+
+  _SC_V7_ILP32_OFF32,
+#define _SC_V7_ILP32_OFF32 _SC_V7_ILP32_OFF32
+  _SC_V7_ILP32_OFFBIG,
+#define _SC_V7_ILP32_OFFBIG _SC_V7_ILP32_OFFBIG
+  _SC_V7_LP64_OFF64,
+#define _SC_V7_LP64_OFF64 _SC_V7_LP64_OFF64
+  _SC_V7_LPBIG_OFFBIG,
+#define _SC_V7_LPBIG_OFFBIG _SC_V7_LPBIG_OFFBIG
+
+  _SC_SS_REPL_MAX,
+#define _SC_SS_REPL_MAX _SC_SS_REPL_MAX
+
+  _SC_TRACE_EVENT_NAME_MAX,
+#define _SC_TRACE_EVENT_NAME_MAX _SC_TRACE_EVENT_NAME_MAX
+  _SC_TRACE_NAME_MAX,
+#define _SC_TRACE_NAME_MAX _SC_TRACE_NAME_MAX
+  _SC_TRACE_SYS_MAX,
+#define _SC_TRACE_SYS_MAX _SC_TRACE_SYS_MAX
+  _SC_TRACE_USER_EVENT_MAX,
+#define _SC_TRACE_USER_EVENT_MAX _SC_TRACE_USER_EVENT_MAX
+
+  _SC_XOPEN_STREAMS,
+#define _SC_XOPEN_STREAMS _SC_XOPEN_STREAMS
+
+  _SC_THREAD_ROBUST_PRIO_INHERIT,
+#define _SC_THREAD_ROBUST_PRIO_INHERIT _SC_THREAD_ROBUST_PRIO_INHERIT
+  _SC_THREAD_ROBUST_PRIO_PROTECT
+#define _SC_THREAD_ROBUST_PRIO_PROTECT _SC_THREAD_ROBUST_PRIO_PROTECT
+
+};
+#endif
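The enum above only assigns selector names; the values a program actually sees come from sysconf() at run time. A minimal usage sketch in C (illustrative only; it assumes a target libc where sysconf() is linked in and returns -1 for unsupported selectors):

#include <stdio.h>
#include <unistd.h>

int main(void)
{
    long page = sysconf(_SC_PAGESIZE);          /* page size in bytes, or -1 */
    long cpus = sysconf(_SC_NPROCESSORS_ONLN);  /* online hw threads, or -1 */

    if (page > 0)
        printf("page size: %ld bytes\n", page);
    if (cpus > 0)
        printf("online processors: %ld\n", cpus);
    return 0;
}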
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/bits/posix1_lim.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/bits/posix1_lim.h
new file mode 100755
index 0000000000000..0739958c5a6c4
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/bits/posix1_lim.h
@@ -0,0 +1,34 @@
+#ifndef POSIX1_LIM_H
+#define POSIX1_LIM_H
+/**
+  @file posix1_lim.h
+  @brief POSIX Minimum values
+
+EXTERNAL FUNCTIONS
+  None
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+TODO
+  This header should ideally be relocated under api/posix/bits (something that
+  doesn't exist today) and be included from api/posix/bits/limits.h, which in
+  turn should be included from the toolchain's limits.h
+
+Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+
+#ifndef _POSIX_PATH_MAX
+/** @brief Maximum number of bytes in a pathname, including the terminating
+    null character */
+#define _POSIX_PATH_MAX 256
+#endif
+
+#ifndef _POSIX_SEM_NSEMS_MAX
+/** @brief Maximum number of semaphores that a process may have */
+#define _POSIX_SEM_NSEMS_MAX 16
+#endif
+#endif
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/common/time.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/common/time.h
new file mode 100755
index 0000000000000..76b0d39ab7039
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/common/time.h
@@ -0,0 +1 @@
+#include
\ No newline at end of file
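Note that _POSIX_PATH_MAX is the POSIX-mandated minimum, not the target's real limit, so code relying only on this header should size buffers from it and length-check explicitly. A small sketch (the include path and helper below are hypothetical):

#include <string.h>
#include "bits/posix1_lim.h"   /* hypothetical include path for this header */

static char path_buf[_POSIX_PATH_MAX];

/* Copy a path only if it fits, counting the terminating null. */
static int set_path(const char *p)
{
    if (strlen(p) >= sizeof path_buf)
        return -1;
    strcpy(path_buf, p);
    return 0;
}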
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/fcntl.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/fcntl.h
new file mode 100755
index 0000000000000..c80ec98a449b6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/fcntl.h
@@ -0,0 +1,51 @@
+#ifndef _FCNTL_H
+#define _FCNTL_H
+
+/*==========================================================================
+ * FILE:         fcntl.h
+ *
+ * SERVICES:     POSIX fcntl.h
+ *
+ * DESCRIPTION:  This header is needed by the open() and fcntl()
+ *               system calls, which have a variety of parameters and
+ *               flags. They are described here.
+ *
+ *               The formats of the calls to each of these are:
+ *
+ *               open(path, oflag [,mode])    open a file
+ *               fcntl(fd, cmd [,arg])        get or set file attributes
+ *
+ * Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Oflag values for open(). POSIX Table 6-4. */
+#define POSIX_O_CREAT    0x100  /* create file if it doesn't exist */
+#define POSIX_O_EXCL     0x200  /* exclusive use flag */
+#define POSIX_O_NOCTTY   0x400  /* do not assign a controlling terminal */
+#define POSIX_O_TRUNC    0x1000 /* truncate flag */
+
+/* File status flags for open() and fcntl(). POSIX Table 6-5. */
+#define POSIX_O_APPEND   0x2000 /* set append mode */
+#define POSIX_O_NONBLOCK 0x4000 /* no delay */
+
+/* File access modes for open() and fcntl(). POSIX Table 6-6. */
+#define POSIX_O_RDONLY   0      /* open(name, POSIX_O_RDONLY) opens read only */
+#define POSIX_O_WRONLY   1      /* open(name, POSIX_O_WRONLY) opens write only */
+#define POSIX_O_RDWR     2      /* open(name, POSIX_O_RDWR) opens read/write */
+
+/* Mask for use with file access modes. POSIX Table 6-7. */
+#define POSIX_O_ACCMODE  0x3    /* mask for file access modes */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FCNTL_H */
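As in a standard fcntl.h, an oflag is one access mode OR'ed with any number of status flags, and the access mode is recovered with the POSIX_O_ACCMODE mask. A short sketch (illustrative; it assumes an open()/fcntl() implementation that accepts these QuRT-prefixed flag names):

/* Build an oflag: write-only, create if missing, truncate if present. */
int oflag = POSIX_O_WRONLY | POSIX_O_CREAT | POSIX_O_TRUNC;

/* Test only the access-mode bits of a combined oflag. */
int is_write_only(int flags)
{
    return (flags & POSIX_O_ACCMODE) == POSIX_O_WRONLY;
}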
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/hooks/unistd.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/hooks/unistd.h
new file mode 100755
index 0000000000000..1c618bfe36b4f
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/hooks/unistd.h
@@ -0,0 +1,115 @@
+#ifndef UNISTD_H
+#define UNISTD_H
+/**
+  @file posix/hooks/unistd.h
+  @brief POSIX-related declarations in unistd.h that are missing in the
+         toolchain header
+
+EXTERNAL FUNCTIONS
+  None
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  Don't include this header directly! Instead include unistd.h.
+
+Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+#include  /* For various POSIX ID types from toolchain headers */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern long pathconf (char const * path, int name);
+
+/* Process */
+
+/** The getppid() function shall return the parent process ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return The parent process ID.
+ */
+pid_t getppid(void);
+
+/** The getpgid() function shall return the process group ID of the process whose process ID is equal to pid.
+ * Please refer to POSIX standard for details.
+ * @param pid [in] process ID
+ * @return The process group ID.
+ */
+pid_t getpgid(pid_t pid);
+
+/** The getpgrp() function shall return the process group ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return The process group ID of the calling process.
+ */
+pid_t getpgrp(void);
+
+/** The getuid() function shall return the real user ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return The real user ID of the calling process.
+ */
+uid_t getuid(void);
+
+/** The geteuid() function shall return the effective user ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return The effective user ID of the calling process.
+ */
+uid_t geteuid(void);
+
+/** The getegid() function shall return the effective group ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return The effective group ID of the calling process.
+ */
+gid_t getegid(void);
+
+/** The getgid() function shall return the real group ID of the calling process.
+ * Please refer to POSIX standard for details.
+ * @return The real group ID of the calling process.
+ */
+gid_t getgid(void);
+
+/** seteuid - set the effective user ID.
+ * Please refer to POSIX standard for details.
+ * @param uid [in] effective user ID
+ * @return Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+int seteuid(uid_t uid);
+
+/** setpgrp - set the process group ID.
+ * Please refer to POSIX standard for details.
+ * @return Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+pid_t setpgrp(void);
+
+/** setuid - set the user ID.
+ * Please refer to POSIX standard for details.
+ * @param uid [in] user ID
+ * @return Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+int setuid(uid_t uid);
+
+/** setpgid - set process group ID for job control.
+ * Please refer to POSIX standard for details.
+ * @param pid [in] PID of the process
+ * @param pgid [in] PGID to be set
+ * @return Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+int setpgid(pid_t pid, pid_t pgid);
+
+/** setsid - create a session and set the process group ID.
+ * Please refer to POSIX standard for details.
+ * @return Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error.
+ */
+pid_t setsid(void);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+
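These hooks mirror the single-process subset of the POSIX ID calls. A trivial calling sketch (illustrative; on a QuRT image several of these are effectively stubs):

#include <unistd.h>

void show_ids(void)
{
    pid_t pgrp = getpgrp();  /* process group of the caller */
    uid_t uid  = getuid();   /* real user ID */
    uid_t euid = geteuid();  /* effective user ID */
    (void)pgrp; (void)uid; (void)euid;
}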
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/mqueue.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/mqueue.h
new file mode 100755
index 0000000000000..74dcc2fa202c6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/mqueue.h
@@ -0,0 +1,203 @@
+#ifndef _POSIX_MQUEUE_H_
+#define _POSIX_MQUEUE_H_
+
+/*==========================================================================
+ * FILE:         mqueue.h
+ *
+ * SERVICES:     POSIX Message Queue API interface
+ *
+ * DESCRIPTION:  POSIX Message Queue API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016, 2023 Qualcomm Technologies, Inc.
+ * All Rights Reserved.
+ * Confidential and Proprietary - Qualcomm Technologies, Inc.
+ *==========================================================================*/
+
+#include  /* ssize_t */
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MQ_PRIO_MAX     255 /* max priority */
+#define MQ_PRIO_DEFAULT 0   /* default priority */
+
+typedef int mqd_t;
+
+struct mq_attr
+{
+  long mq_flags;   /* message queue flags */
+  long mq_maxmsg;  /* maximum number of messages */
+  long mq_msgsize; /* maximum message size */
+  long mq_curmsgs; /* number of messages currently queued */
+};
+
+typedef struct mq_attr mqueue_attr;
+
+/** \details
+ * This provides the POSIX Message Queue API.
+ *
+ * mq_notify is not supported.
+ *
+ * Since this implementation of POSIX kernel API is a subset of PSE51,
+ * it only supports message sending and receiving within one process.
+ * Message sending and receiving among processes are not supported.
+ */
+
+/** \defgroup mqueue POSIX Message Queue API */
+/** \ingroup mqueue */
+/** @{ */
+
+/** Open a message queue.
+ * Please refer to POSIX standard for details.
+ */
+mqd_t mq_open(const char *name, int oflag, /* mode_t mode, struct mq_attr *attr */...);
+
+/** Close a message queue.
+ * Please refer to POSIX standard for details.
+ */
+int mq_close(mqd_t mq_desc);
+
+/** Remove a message queue.
+ * Please refer to POSIX standard for details.
+ */
+int mq_unlink(const char *name);
+
+/** Send a message to a message queue.
+ * Please refer to POSIX standard for details.
+ *
+ * If the queue is full, instead of blocking the sender, this function
+ * will return -1 with errno EAGAIN, in this implementation. This behavior
+ * may change in the future.
+ */
+int mq_send(mqd_t mqdes, const char *msg_ptr, size_t msg_len, unsigned int msg_prio);
+
+/** Send a message to a message queue with timeout.
+ * Please refer to POSIX standard for details.
+ * @param abs_timeout [in] Only abs_timeout={0,0} is supported in this
+ *        implementation. This behavior may change in the future.
+ */
+int mq_timedsend(mqd_t mqdes, const char *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout);
+
+/** Receive a message from a message queue.
+ * Please refer to POSIX standard for details.
+ */
+ssize_t mq_receive(mqd_t mqdes, char *msg_ptr, size_t msg_len, unsigned int *msg_prio);
+
+/** Receive a message from a message queue with timeout.
+ * Please refer to POSIX standard for details.
+ * @param abs_timeout [in] Only abs_timeout={0,0} is supported in this
+ *        implementation. This behavior may change in the future.
+ */
+ssize_t mq_timedreceive(mqd_t mqdes, char *restrict msg_ptr, size_t msg_len, unsigned int *restrict msg_prio, const struct timespec *restrict abs_timeout);
+
+/** Get message queue attributes.
+ * Please refer to POSIX standard for details.
+ */
+int mq_getattr(mqd_t mqdes, struct mq_attr *mqstat);
+
+/** Set message queue attributes.
+ * Please refer to POSIX standard for details.
+ */
+int mq_setattr(mqd_t mqdes, const struct mq_attr *restrict mqstat, struct mq_attr *restrict omqstat);
+
+/** @} */
+
+#define NBBY 8U /* number of bits in a byte */
+
+/*
+ * Select uses bit masks of file descriptors in longs. These macros
+ * manipulate such bit fields (the filesystem macros use chars).
+ * FD_SETSIZE may be defined by the user, but the default here should
+ * be enough for most uses.
+ */
+#ifndef FD_SETSIZE
+#define FD_SETSIZE 256U
+#endif
+
+typedef unsigned long fd_mask;
+#define NFDBITS (sizeof(fd_mask) * (unsigned int)NBBY) /* bits per mask */
+
+#ifndef howmany
+#define howmany(x, y) (((x) + ((y) - 1U)) / (y))
+#endif
+
+// equivalent of fd_set for WINNT env
+typedef struct fd_set
+{
+  fd_mask fds_bits[howmany(FD_SETSIZE, NFDBITS)];
+} fd_set;
+
+/** \addtogroup mqueue */
+/** @{ */
+
+/** Sets the bit for the file descriptor fd in the file descriptor set fdset.
+ */
+#define FD_SET(n, p) ((p)->fds_bits[((unsigned int) (n)) / NFDBITS] |= (1UL << (((unsigned int) (n)) % NFDBITS)))
+
+/** Clears the bit for the file descriptor fd in the file descriptor set fdset.
+ */
+#define FD_CLR(n, p) ((p)->fds_bits[((unsigned int) (n)) / NFDBITS] &= ~(1UL << (((unsigned int) (n)) % NFDBITS)))
+
+/** Returns a non-zero value if the bit for the file descriptor fd is set in the file descriptor set pointed to by fdset, and 0 otherwise.
+ */
+#define FD_ISSET(n, p) ((unsigned long)(p)->fds_bits[((unsigned int) (n)) / NFDBITS] & (unsigned long)((unsigned)1U << (((unsigned int) (n)) % NFDBITS)))
+
+/** Copies the file descriptor set.
+ */
+#define FD_COPY(f, t) (void)(memcpy)((t), (f), sizeof(*(f)))
+
+/** Initializes the file descriptor set fdset to have zero bits for all file descriptors.
+ */
+#define FD_ZERO(p) (void)memset((p), 0, sizeof(*(p)))
+
+/** Error check the file descriptor set.
+ */
+#define FD_BAD(fd) ((fd) < 0 /*|| fd >= fd_arraylen || fd_array[fd].obj == 0*/)
+
+/*! Wait for both message queues and signals. In this implementation, only
+ *  message queue file descriptors are supported.
+ * @param nfds [in] This is an integer one more than the maximum of any file
+ *        descriptor in any of the sets. In other words, while you are busy
+ *        adding file descriptors to your sets, you must calculate the maximum
+ *        integer value of all of them, then increment this value by one, and
+ *        then pass this as nfds to select().
+ * @param readfds [in] the file descriptor set on all message queues.
+ * @param writefds [in] ignored in this implementation.
+ * @param errorfds [in] ignored in this implementation.
+ * @param timeout [in] Only timeout={0,0} is supported in this
+ *        implementation. This behavior may change in the future.
+ */
+int pselect(int nfds, fd_set *restrict readfds,
+            fd_set *restrict writefds, fd_set *restrict errorfds,
+            const struct timespec *restrict timeout,
+            const sigset_t *restrict sigmask);
+
+/*! Wait for multiple message queues. In this implementation, only
+ *  message queue file descriptors are supported.
+ * @param nfds [in] This is an integer one more than the maximum of any file
+ *        descriptor in any of the sets. In other words, while you are busy
+ *        adding file descriptors to your sets, you must calculate the maximum
+ *        integer value of all of them, then increment this value by one, and
+ *        then pass this as nfds to select().
+ * @param readfds [in] the file descriptor set on all message queues.
+ * @param writefds [in] ignored in this implementation.
+ * @param errorfds [in] ignored in this implementation.
+ * @param timeout [in] Only timeout={0,0} is supported in this
+ *        implementation. This behavior may change in the future.
+ */
+int select(int nfds, fd_set *restrict readfds,
+           fd_set *restrict writefds, fd_set *restrict errorfds,
+           struct timeval *restrict timeout);
+
+/** @} */
+
+/* this function is needed for the test framework, which needs to clean up memory at teardown */
+void _mq_teardown(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
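A minimal open/send/receive sketch for this queue API (illustrative only: the queue name and attribute values are made up, the oflag names are assumed to come from this SDK's fcntl.h, and error paths are abbreviated; per the header, a full queue fails with EAGAIN instead of blocking):

#include "fcntl.h"
#include "mqueue.h"

void mq_demo(void)
{
    struct mq_attr attr;
    attr.mq_flags   = 0;
    attr.mq_maxmsg  = 8;    /* queue depth */
    attr.mq_msgsize = 64;   /* bytes per message */
    attr.mq_curmsgs = 0;

    mqd_t q = mq_open("/demo", POSIX_O_CREAT | POSIX_O_RDWR, 0, &attr);
    if (q < 0)
        return;

    const char msg[] = "hello";
    (void)mq_send(q, msg, sizeof msg, MQ_PRIO_DEFAULT);

    char buf[64];
    unsigned int prio;
    (void)mq_receive(q, buf, sizeof buf, &prio);

    (void)mq_close(q);
    (void)mq_unlink("/demo");
}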
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/pthread.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/pthread.h
new file mode 100755
index 0000000000000..f64242e8dc683
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/pthread.h
@@ -0,0 +1,287 @@
+#ifndef QURT_PTHREAD_H
+#define QURT_PTHREAD_H
+
+/*==========================================================================
+ * FILE:         pthread.h
+ *
+ * SERVICES:     POSIX pthread API interface
+ *
+ * DESCRIPTION:  POSIX pthread API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013,2016,2023 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+ *==========================================================================
+ *
+ * EDIT HISTORY FOR MODULE
+ *
+ * This section contains comments describing changes made to the module.
+ * Notice that changes are listed in reverse chronological order.
+ *
+ * when       who     what, where, why
+ * --------   ---     -------------------------------------------------------
+ * 10/13/08   cz      Initial version.
+ *==========================================================================*/
+
+#include
+#include "sys/sched.h"  /* For struct sched_param */
+#include "sys/errno.h"  /* error values */
+#include
+#include
+#include
+#include
+#include
+#include "pthread_types.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* the range of the set supported by the kernel data type used to represent CPU sets. */
+#define CONFIG_NR_CPUS QURT_THREAD_CFG_BITMASK_ALL
+
+#define UNIMPLEMENTED(FUNC, RETURNTYPE, ARGS) static inline RETURNTYPE FUNC ARGS { qurt_printf("Unimplemented: %s... exiting\n", __FUNCTION__); exit(1); }
+
+/** @brief Magic (non-portable) value for a stack's address to enable usage
+    of the auto-stack feature (if available) */
+#define PTHREAD_AUTO_STACK_MAGIC_ADDR_NP ((void *)0xFFF)
+
+/** \details
+ * This provides the POSIX thread API.
+ *
+ */
+
+/** \defgroup pthread POSIX pthread API */
+/** \ingroup pthread */
+/** @{ */
+
+/** Compare Two Threads.
+ * Please refer to POSIX standard for details.
+ */
+static inline int pthread_equal(pthread_t t1, pthread_t t2)
+{
+  return (t1 == t2) ? 1 : 0;
+}
+
+/** Create Thread.
+ * Please refer to POSIX standard for details.
+ */
+int pthread_create(pthread_t * tid, const pthread_attr_t * attr, void *(*start)(void *), void *arg);
+
+/** Terminate Calling Thread.
+ * Please refer to POSIX standard for details.
+ */
+void pthread_exit(void *value_ptr);
+
+/** Wait for thread termination.
+ * Please refer to POSIX standard for details.
+ * @param thread [in] the thread to be joined
+ * @param value_ptr [out] the pointer to the exit status
+ */
+int pthread_join(pthread_t thread, void **value_ptr);
+
+/** Detach a joinable thread.
+ * Please refer to POSIX standard for details.
+ * @param id [in] ID of the thread to be detached.
+ */
+int pthread_detach(pthread_t id);
+
+/** Dynamic package initialisation.
+ * Please refer to POSIX standard for details.
+ */
+int pthread_once(pthread_once_t *once_control, void (*init_routine)(void));
+
+pthread_t pthread_self(void);
+int pthread_cancel(pthread_t thread);
+static inline void pthread_yield(void)
+{
+  return;
+}
+
+int pthread_kill(pthread_t thread, int sig);
+
+/**
+ * @brief Return the name of a thread
+ * @warning Do not call this in the error-handling path as it may cause deadlock
+ *          due to underlying OS calls
+ * @param thread [in] Thread whose name is to be retrieved
+ * @param name [out] Buffer used to return the thread name
+ * @param len [in] Number of bytes available in name
+ * @return 0 on success, ESRCH, ERANGE on failure
+ */
+extern int pthread_getname_np (pthread_t thread, char * name, size_t len);
+
+int pthread_getschedparam(pthread_t thread, int *restrict policy, struct sched_param *restrict param);
+int pthread_setschedparam(pthread_t thread, int policy, const struct sched_param *param);
+int pthread_setschedprio(pthread_t thread, int prio);
+int pthread_setcancelstate(int state, int *oldstate);
+int pthread_setcanceltype(int type, int *oldtype);
+
+/* Attribute functions */
+int pthread_attr_init(pthread_attr_t *attr);
+int pthread_attr_destroy(pthread_attr_t *attr);
+int pthread_attr_setschedparam(pthread_attr_t *restrict attr, const sched_param *restrict param);
+int pthread_attr_getschedparam(const pthread_attr_t *restrict attr, sched_param *restrict param);
+int pthread_attr_setstacksize(pthread_attr_t *attr, size_t stacksize);
+int pthread_attr_getstacksize(const pthread_attr_t *attr, size_t *stacksize);
+int pthread_attr_setstackaddr(pthread_attr_t *attr, void * stackaddr);
+int pthread_attr_getstackaddr(const pthread_attr_t *attr, void ** stackaddr);
+int pthread_attr_getdetachstate(const pthread_attr_t *attr, int *detachstate);
+int pthread_attr_setdetachstate(pthread_attr_t *attr, int detachstate);
+int pthread_attr_setstack(pthread_attr_t *attr, void *stackaddr, size_t stacksize);
+int pthread_attr_getstack(const pthread_attr_t *attr, void **stackaddr, size_t *stacksize);
+int pthread_attr_setscope(pthread_attr_t *attr, int scope);
+int pthread_attr_getscope(const pthread_attr_t *attr, int *scope);
+int pthread_attr_setinheritsched(pthread_attr_t *attr, int inheritsched);
+int pthread_attr_getinheritsched(const pthread_attr_t *attr, int *inheritsched);
+int pthread_attr_getguardsize(const pthread_attr_t * attr, size_t * guardsize);
+int pthread_attr_setautostack(pthread_attr_t *attr);
+int pthread_attr_setbuspriority(pthread_attr_t *attr, unsigned short bus_priority);
+
+/* Qualcomm additions to pthread get/set attribute functions */
+int pthread_attr_setthreadname(pthread_attr_t *attr, const char * name);
+int pthread_attr_getthreadname(const pthread_attr_t *attr, char * name, int size);
+int pthread_attr_settimetestid(pthread_attr_t *attr, unsigned int tid);
+int pthread_attr_gettimetestid(const pthread_attr_t *attr, unsigned int* tid);
+
+/* Mutexes */
+int pthread_mutex_init(pthread_mutex_t *mutex, pthread_mutexattr_t *attr);
+int pthread_mutex_lock(pthread_mutex_t *mutex);
+int pthread_mutex_unlock(pthread_mutex_t *mutex);
+int pthread_mutex_trylock(pthread_mutex_t *mutex);
+int pthread_mutex_destroy(pthread_mutex_t *mutex);
+int pthread_mutex_getprioceiling(const pthread_mutex_t *restrict mutex, int *restrict prioceiling);
+int pthread_mutex_setprioceiling(pthread_mutex_t *restrict mutex, int prioceiling, int *restrict old_ceiling);
+
+/* For a mutex of type PTHREAD_MUTEX_NORMAL, priority inheritance is not
+ * supported even if PTHREAD_PRIO_INHERIT is defined, since QuRT does not
+ * support this kind of mutex */
+int pthread_mutexattr_init(pthread_mutexattr_t *attr);
+int pthread_mutexattr_destroy(pthread_mutexattr_t *attr);
+int pthread_mutexattr_gettype(const pthread_mutexattr_t *restrict, int *restrict);
+int pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type);
+int pthread_mutexattr_getprotocol(const pthread_mutexattr_t *restrict, int *restrict);
+int pthread_mutexattr_setprotocol(pthread_mutexattr_t *attr, int protocol);
+int pthread_mutexattr_getpshared(const pthread_mutexattr_t *restrict, int *restrict);
+int pthread_mutexattr_setpshared(pthread_mutexattr_t *, int);
+int pthread_mutexattr_getprioceiling(const pthread_mutexattr_t *restrict attr, int *restrict prioceiling);
+int pthread_mutexattr_setprioceiling(pthread_mutexattr_t *attr, int prioceiling);
+
+/* Spinlocks */
+int pthread_spin_init(pthread_spinlock_t *lock, int pshared);
+int pthread_spin_destroy(pthread_spinlock_t *lock);
+int pthread_spin_lock(pthread_spinlock_t *lock);
+int pthread_spin_trylock(pthread_spinlock_t *lock);
+int pthread_spin_unlock(pthread_spinlock_t *lock);
+
+/* Condition variables */
+int pthread_condattr_init(pthread_condattr_t *attr);
+int pthread_condattr_destroy(pthread_condattr_t *attr);
+int pthread_condattr_setpshared(pthread_condattr_t *attr, int pshared);
+int pthread_condattr_getpshared(const pthread_condattr_t *restrict attr, int *restrict pshared);
+int pthread_condattr_setclock(pthread_condattr_t *attr, clockid_t clock);
+int pthread_condattr_getclock(const pthread_condattr_t *restrict attr, clockid_t *restrict clock);
+int pthread_cond_init(pthread_cond_t *cond, pthread_condattr_t *attr);
+int pthread_cond_destroy(pthread_cond_t *cond);
+int pthread_cond_signal(pthread_cond_t *cond);
+int pthread_cond_broadcast(pthread_cond_t *cond);
+int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex);
+int pthread_cond_timedwait(pthread_cond_t * cond, pthread_mutex_t * mutex, const struct timespec *time);
+
+/* Barriers */
+int pthread_barrier_init(pthread_barrier_t *restrict barrier, const pthread_barrierattr_t *restrict attr, unsigned count);
+int pthread_barrier_destroy(pthread_barrier_t *barrier);
+int pthread_barrier_wait(pthread_barrier_t *barrier);
+int pthread_barrierattr_init(pthread_barrierattr_t *attr);
+int pthread_barrierattr_destroy(pthread_barrierattr_t *attr);
+int pthread_barrierattr_getpshared(const pthread_barrierattr_t *restrict attr, int *restrict pshared);
+
+
+/* Read-write locks */
+int pthread_rwlock_init(pthread_rwlock_t *, const pthread_rwlockattr_t *);
+int pthread_rwlock_destroy(pthread_rwlock_t *);
+int pthread_rwlockattr_init(pthread_rwlockattr_t *);
+int pthread_rwlockattr_destroy(pthread_rwlockattr_t *);
+int pthread_rwlockattr_getpshared(const pthread_rwlockattr_t *, int *);
+int pthread_rwlockattr_setpshared(pthread_rwlockattr_t *, int);
+int pthread_rwlock_rdlock(pthread_rwlock_t *);
+int pthread_rwlock_tryrdlock(pthread_rwlock_t *);
+int pthread_rwlock_wrlock(pthread_rwlock_t *);
+int pthread_rwlock_trywrlock(pthread_rwlock_t *);
+int pthread_rwlock_unlock(pthread_rwlock_t *);
+
+
+/** Please refer to the POSIX standard document.
+ */
+int pthread_barrierattr_setpshared(pthread_barrierattr_t *attr, int pshared);
+
+/** Set the CPU affinity attribute in a thread attributes object.
+ * @param attr [in] pthread attributes
+ * @param cpusetsize [in] The argument cpusetsize is the length (in bytes)
+ *        of the buffer pointed to by cpuset. Typically, this argument
+ *        would be specified as sizeof(cpu_set_t).
+ * @param cpuset [in] This data set is a bitset where each bit represents
+ *        a CPU (hw thread). How the system's CPUs are mapped to bits in
+ *        the bitset is system dependent. For the QuRT kernel, bit 0
+ *        corresponds to hw thread 0, and so on. If the corresponding bit
+ *        is set to 1, then the software thread is eligible to run on this
+ *        hw thread. 0x3f means it can run on any hw thread; 0x0 also
+ *        means it can run on any hw thread.
+ * @return On success, this function returns 0; on error, it returns a
+ *         non-zero error number.
+ *         EINVAL - cpuset specified a CPU that was outside the set supported
+ *         by the kernel. (The kernel configuration option CONFIG_NR_CPUS
+ *         defines the range of the set supported by the kernel data type
+ *         used to represent CPU sets.)
+ * @note This function is a non-standard GNU extension; hence the suffix "_np"
+ *       (non-portable) in the name.
+ */
+int pthread_attr_setaffinity_np(pthread_attr_t *attr, size_t cpusetsize, const cpu_set_t *cpuset);
+
+/** Get the CPU affinity attribute from a thread attributes object.
+ * @param attr [in] pthread attributes
+ * @param cpusetsize [in] The argument cpusetsize is the length (in bytes)
+ *        of the buffer pointed to by cpuset. Typically, this argument
+ *        would be specified as sizeof(cpu_set_t).
+ * @param cpuset [out] This data set is a bitset where each bit represents
+ *        a CPU (hw thread). How the system's CPUs are mapped to bits in
+ *        the bitset is system dependent. For the QuRT kernel, bit 0
+ *        corresponds to hw thread 0, and so on. If the corresponding bit
+ *        is set to 1, then the software thread is eligible to run on this
+ *        hw thread. 0x3f means it can run on any hw thread; 0x0 also
+ *        means it can run on any hw thread.
+ * @return On success, this function returns 0; on error, it returns a
+ *         non-zero error number.
+ *         EINVAL - cpusetsize is smaller than the size of the affinity mask
+ *         used by the kernel.
+ * @note This function is a non-standard GNU extension; hence the suffix "_np"
+ *       (non-portable) in the name.
+ */
+int pthread_attr_getaffinity_np(pthread_attr_t *attr, size_t cpusetsize, cpu_set_t *cpuset);
+
+/* TLS */
+int pthread_key_create(pthread_key_t *key, void (*destructor)(void*));
+int pthread_key_delete(pthread_key_t key);
+int pthread_setspecific(pthread_key_t key, const void *value);
+void *pthread_getspecific(pthread_key_t key);
+int pthread_getattr_np(pthread_t thread, pthread_attr_t * restrict attr);
+
+/** @} */
+
+/* A non-pthread caller uses this function to create a pthread TCB without creating an actual thread */
+int pthread_fake(pthread_t * restrict thread, const pthread_attr_t * restrict attr);
+int pthread_fake_destroy(pthread_t thread);
+
+//amitkulk: move these to unistd.h after we move that header within qurt
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+void exit(int status);
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* QURT_PTHREAD_H */
+
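A minimal create/join sketch against this pthread API (illustrative; the stack size comes from pthread_types.h and is not an SDK recommendation, and error handling is abbreviated):

#include "pthread.h"

static void *worker(void *arg)
{
    (void)arg;
    return NULL;
}

int spawn_and_wait(void)
{
    pthread_t tid;
    pthread_attr_t attr;
    void *ret;

    if (pthread_attr_init(&attr) != 0)
        return -1;
    (void)pthread_attr_setstacksize(&attr, PTHREAD_DEFAULT_STACKSIZE);

    if (pthread_create(&tid, &attr, worker, NULL) != 0)
        return -1;
    (void)pthread_join(tid, &ret);   /* reaps the thread; ret is worker's return value */
    (void)pthread_attr_destroy(&attr);
    return 0;
}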
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/pthread_types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/pthread_types.h
new file mode 100755
index 0000000000000..51c3b9dbca243
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/pthread_types.h
@@ -0,0 +1,193 @@
+#ifndef _PTHREAD_TYPES_H_
+#define _PTHREAD_TYPES_H_
+
+/*==========================================================================
+ * FILE:         pthread_types.h
+ *
+ * SERVICES:     types used in POSIX API interface
+ *
+ * DESCRIPTION:  POSIX API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2016, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __GNUC__
+#define restrict __restrict__
+#else
+#define restrict
+#endif
+
+#define _SSIZE_T
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+#define PTHREAD_MAX_THREADS 512U
+
+#define PTHREAD_NAME_LEN          16
+#define PTHREAD_MIN_STACKSIZE     512 //4096
+#define PTHREAD_MAX_STACKSIZE     1048576
+#define PTHREAD_DEFAULT_STACKSIZE 16384
+
+#define PTHREAD_STACK_MIN        (4096U*2U)
+#define PTHREAD_MIN_PRIORITY     0U
+#define PTHREAD_MAX_PRIORITY     255U
+#define PTHREAD_DEFAULT_PRIORITY 1
+
+/* Mutex initialization status */
+#define PTHREAD_MUTEX_ATTR_UNINITIALIZED 0
+#define PTHREAD_MUTEX_ATTR_INITIALIZED   1
+
+/* Condition attributes initialization status */
+#define PTHREAD_COND_ATTR_UNINITIALIZED 0
+#define PTHREAD_COND_ATTR_INITIALIZED   1
+
+#define PTHREAD_DEFAULT_NAME "Anonymous"
+
+#define PTHREAD_MUTEX_INITIALIZER ((pthread_mutex_t) 0xFFFFFFFFU)
+
+#define PTHREAD_COND_INITIALIZER ((pthread_cond_t) 0xFFFFFFFFU)
+
+/* mutex and cond_var shared */
+#define PTHREAD_PROCESS_PRIVATE 0
+#define PTHREAD_PROCESS_SHARED  1
+
+/* mutex type */
+#define PTHREAD_MUTEX_ERRORCHECK 0
+#define PTHREAD_MUTEX_NORMAL     1
+#define PTHREAD_MUTEX_RECURSIVE  2
+#define PTHREAD_MUTEX_DEFAULT    3
+
+/* mutex protocol */
+#define PTHREAD_PRIO_NONE    0
+#define PTHREAD_PRIO_INHERIT 1
+#define PTHREAD_PRIO_PROTECT 2
+
+#define PTHREAD_SPINLOCK_UNLOCKED 0
+#define PTHREAD_SPINLOCK_LOCKED   1
+
+#define PTHREAD_ONCE_INIT (0)
+
+#define PTHREAD_MUTEX_OPAQUE //ToDo: amitkulk: debug
+
+typedef signed int ssize_t;
+
+/* detachstate of a pthread */
+#define PTHREAD_CREATE_JOINABLE 1
+#define PTHREAD_CREATE_DETACHED 0
+
+/* contention scope */
+#define PTHREAD_SCOPE_PROCESS 1
+#define PTHREAD_SCOPE_SYSTEM  0
+
+/* scheduler */
+#define PTHREAD_INHERIT_SCHED  1
+#define PTHREAD_EXPLICIT_SCHED 0
+
+/*
+ * Types and structure definitions
+ *
+ */
+typedef unsigned int cpu_set_t;
+
+typedef unsigned int pthread_t;
+
+typedef struct pthread_attr_t
+{
+  void *stackaddr;
+  int internal_stack; /* this flag==1 means the stack needs to be freed by posix */
+  size_t stacksize;
+  int priority;
+  unsigned short timetest_id;
+  /* This flag indicates whether the thread will be an autostack thread */
+  unsigned short autostack:1;
+  /* This flag indicates whether the thread's bus_priority is high or low:
+     bus_priority = 0 -- bus_priority is low
+     bus_priority = 1 -- bus_priority is high
+     bus_priority = 3 -- bus_priority is default (takes the default set for the process)
+  */
+  unsigned short bus_priority:2;
+  unsigned short reserved:13;
+  cpu_set_t cpumask;
+  char name[PTHREAD_NAME_LEN];
+  /* This flag indicates whether the pthread lib should create thread contexts for other OSALs */
+  /* This is used internally by POSIX and not available for general usage */
+  int ext_context;
+  int detachstate;
+} pthread_attr_t;
+
+// mutex attr
+typedef struct pthread_mutexattr_t pthread_mutexattr_t;
+struct pthread_mutexattr_t
+{
+  int is_initialized;
+  int type;
+  int pshared;
+  int protocol;
+};
+
+typedef unsigned int pthread_mutex_t;
+
+typedef unsigned int pthread_spinlock_t;
+
+typedef struct pthread_condattr_t
+{
+  int is_initialized;
+  int pshared;
+  clockid_t clock_id;
+} pthread_condattr_t;
+
+typedef unsigned int pthread_cond_t;
+
+typedef struct pthread_barrierattr_t
+{
+  int is_initialized;
+  int pshared;
+} pthread_barrierattr_t;
+
+typedef unsigned int pthread_barrier_t;
+
+typedef int pthread_key_t;
+
+typedef int pthread_once_t;
+
+
+/* Read-write locks */
+#define PTW32_RWLOCK_MAGIC 0xfacade2
+#define PTHREAD_RWLOCK_INITIALIZER ((pthread_rwlock_t)(size_t) -1)
+
+struct pthread_rwlockattr_t_
+{
+  int pshared;
+};
+
+struct pthread_rwlock_t_
+{
+  pthread_mutex_t mtxExclusiveAccess;
+  pthread_mutex_t mtxSharedAccessCompleted;
+  pthread_cond_t  cndSharedAccessCompleted;
+  int nSharedAccessCount;
+  int nExclusiveAccessCount;
+  int nCompletedSharedAccessCount;
+  int nMagic;
+};
+
+typedef struct pthread_rwlock_t_ * pthread_rwlock_t;
+typedef struct pthread_rwlockattr_t_ * pthread_rwlockattr_t;
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _PTHREAD_TYPES_H_ */
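Because pthread_mutex_t and pthread_cond_t are plain unsigned ints here, static initialization is just the sentinel values above. A small sketch (illustrative; lazy setup on first lock is assumed to be the library's job):

#include "pthread.h"

static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  g_cv   = PTHREAD_COND_INITIALIZER;

void critical_section(void)
{
    (void)pthread_mutex_lock(&g_lock);
    /* ... touch shared state ... */
    (void)pthread_cond_signal(&g_cv);
    (void)pthread_mutex_unlock(&g_lock);
}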
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sched.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sched.h
new file mode 100755
index 0000000000000..faf3365be9f82
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sched.h
@@ -0,0 +1,21 @@
+/*=============================================================================
+
+                      sched.h
+
+GENERAL DESCRIPTION
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved.
+=============================================================================*/
+#ifndef __SCHED_H__
+#define __SCHED_H__
+
+#include "sys/sched.h"
+
+#endif //__SCHED_H__
+
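Since this wrapper just forwards to sys/sched.h, the priority-range helpers below are the useful entry points. A sketch of picking a mid-range priority portably instead of hard-coding one (illustrative; SCHED_FIFO is the only policy the implementation documents as valid):

#include "sched.h"

int mid_fifo_priority(void)
{
    int lo = sched_get_priority_min(SCHED_FIFO);
    int hi = sched_get_priority_max(SCHED_FIFO);
    return lo + (hi - lo) / 2;
}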
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/semaphore.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/semaphore.h
new file mode 100755
index 0000000000000..d9145b295ae62
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/semaphore.h
@@ -0,0 +1,114 @@
+#ifndef SEMAPHORE_H
+#define SEMAPHORE_H
+
+/*==========================================================================
+ * FILE:         semaphore.h
+ *
+ * SERVICES:     POSIX semaphore API interface
+ *
+ * DESCRIPTION:  POSIX semaphore API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+#include            // Get all C sys types - includes POSIX specific
+#include "sys/errno.h"  // error values
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                                  TYPEDEFS
+=============================================================================*/
+/** User-facing semaphore container with an opaque pointer to the implementation */
+typedef struct
+{
+  unsigned int *opaque;
+} sem_t;
+#define _SEM_T
+
+/*=============================================================================
+                            CONSTANTS AND MACROS
+=============================================================================*/
+/* constant definitions */
+#define SEM_FAILED ((sem_t*) 0)
+
+/* @todo siqbal Should we put such configuration items in a common place
+   instead of this user-facing header? */
+#define SEM_VALUE_MAX ((unsigned int) 30) // If need be, increase this
+
+/*=============================================================================
+                                  FUNCTIONS
+=============================================================================*/
+
+/** \details
+ * The POSIX standard comes with two kinds of semaphores: named and unnamed
+ * semaphores.
+ *
+ * This implementation of POSIX kernel API provides unnamed and named
+ * semaphores.
+ *
+ * sem_timedwait() is not provided.
+ */
+
+/** \defgroup semaphore POSIX Semaphore API */
+
+/** \ingroup semaphore */
+/** @{ */
+
+/** Initialize an unnamed semaphore.
+ * Please refer to POSIX standard for details.
+ * @param pshared [in] This implementation does not support a non-zero value,
+ *        i.e., a semaphore cannot be shared between processes in this
+ *        implementation.
+ */
+int sem_init(sem_t *sem, int pshared, unsigned int value);
+
+/** Lock a semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_wait(sem_t *sem);
+
+/** Try to lock a semaphore without blocking.
+ * Please refer to POSIX standard for details.
+ */
+int sem_trywait(sem_t *sem);
+
+/** Unlock a semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_post(sem_t *sem);
+
+/** Get the value of a semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_getvalue(sem_t *sem, int *value);
+
+/** Destroy an unnamed semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_destroy(sem_t *sem);
+
+/** Create and initialize a named semaphore.
+ * Please refer to POSIX standard for details.
+ */
+sem_t * sem_open(const char* name , int oflag , ...);
+
+/** Close a semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_close(sem_t *sem);
+
+/** Unlink a named semaphore.
+ * Please refer to POSIX standard for details.
+ */
+int sem_unlink(const char *name);
+/** @} */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SEMAPHORE_H */
+
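A counting-semaphore sketch for the unnamed variant (illustrative; pshared must be 0 per the header's note, and the initial count of 4 is arbitrary):

#include "semaphore.h"

static sem_t slots;

int init_pool(void)
{
    return sem_init(&slots, 0, 4);   /* four available slots */
}

void use_slot(void)
{
    (void)sem_wait(&slots);          /* acquire a slot */
    /* ... use the resource ... */
    (void)sem_post(&slots);          /* release it */
}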
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/signal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/signal.h
new file mode 100755
index 0000000000000..35cb1f1a9a319
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/signal.h
@@ -0,0 +1,201 @@
+#ifndef _SIGNAL_H_
+#define _SIGNAL_H_
+
+/*==========================================================================
+ * FILE:         signal.h
+ *
+ * SERVICES:     POSIX Signal API interface
+ *
+ * DESCRIPTION:  POSIX Signal API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016, 2023 Qualcomm Technologies, Inc.
+ * All Rights Reserved.
+ * Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+ *==========================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* POSIX signal bits */
+
+#define POSIX_MSG   7 /* POSIX msg type used in Qube API */
+#define POSIX_NOTIF 8 /* POSIX msg type used in Qube API */
+#define SIGKILL     9 /* kill (cannot be caught or ignored) */
+
+#define SIGRTMIN 10
+#define SIGRTMAX 32
+
+/* Notification Types. */
+/* No asynchronous notification is delivered when the event of interest occurs. */
+#define SIGEV_NONE 0
+/* The signal specified in sigev_signo shall be generated for the process when
+   the event of interest occurs. */
+#define SIGEV_SIGNAL 1
+/* A notification function is called to perform notification. */
+#define SIGEV_THREAD 2
+#define SA_SIGINFO 1
+
+/*
+ * Flags for sigprocmask:
+ */
+#define SIG_BLOCK   1 /* block specified signal set */
+#define SIG_UNBLOCK 2 /* unblock specified signal set */
+#define SIG_SETMASK 3 /* set specified signal set */
+
+typedef unsigned long int sigset_t;
+
+union sigval
+{
+  int sival_int;   /* Integer signal value. */
+  void *sival_ptr; /* Pointer signal value. */
+};
+
+typedef struct sigevent sigevent;
+struct sigevent
+{
+  int sigev_notify;         /* Notification type. */
+  int sigev_signo;          /* Signal number. */
+  union sigval sigev_value; /* Signal value. */
+  void (*sigev_notify_function)(union sigval); /* Notification function. */
+  pthread_attr_t *sigev_notify_attributes;
+};
+
+typedef struct siginfo_t siginfo_t;
+struct siginfo_t
+{
+  int si_signo;
+  int si_code;
+  union sigval si_value;
+/* int si_errno;
+   pid_t si_pid;
+   uid_t si_uid;
+   void *si_addr;
+   int si_status;
+   long si_band;*/
+};
+struct sigaction
+{
+  void (*sa_handler)(int);
+  sigset_t sa_mask;
+  int sa_flags;
+  void (*sa_sigaction)(int, siginfo_t *, void *);
+};
+
+/* Signal functions */
+
+/** \details
+ * This provides the POSIX Signal API. Please note that this
+ * implementation does not fully comply with the POSIX standard.
+ *
+ * In the POSIX standard, a signal can be used as an 'interrupt', which means
+ * an incoming signal will interrupt a running thread. After the
+ * registered signal handler is executed, the thread will resume.
+ * This behavior cannot be implemented without modifying the L4 or QuRT kernel.
+ * On the other hand, applications need to be carefully written to avoid
+ * problems caused by 'interrupting' signals.
+ *
+ * Therefore, in this implementation of POSIX signals, a thread will
+ * only receive signals when it explicitly waits for signals, i.e., when
+ * the thread calls either sigwait() or sigsuspend().
+ *
+ * Therefore, pthread_sigmask(), which sets or gets the signal mask for a
+ * thread, is not supported, since the signal mask will be set by sigwait()
+ * and sigsuspend().
+ *
+ * Since this implementation of POSIX kernel API is a subset of PSE51,
+ * only threads can send and receive signals. The functions related to
+ * signal operations with processes, such as kill(), sigqueue(),
+ * sigprocmask(), are not provided.
+ *
+ * Queued signals are not supported.
+ *
+ * Applications will use signals from SIGRTMIN to SIGRTMAX.
+ *
+ * SIGEV_SIGNAL and SIGEV_THREAD are supported. SIGEV_NONE is not
+ * supported.
+ *
+ */
+
+/** \defgroup signal POSIX Signal API */
+/** \ingroup signal */
+/** @{ */
+
+/** Wait for signals. This implementation does not support queued signals.
+ *
+ * Please refer to POSIX standard for details.
+ */
+int sigwait(const sigset_t *restrict set, int *restrict sig);
+
+/** Examine and Change Signal Action.
+ * Please refer to POSIX standard for details.
+ *
+ * @param act [in] A pointer to the sigaction structure that describes the
+ *        action to be taken for the signal. Can be NULL.
+ *        The following flags for the sa_flags field in struct sigaction are
+ *        not supported: SA_NOCLDSTOP, SA_ONSTACK, SA_RESETHAND, SA_RESTART,
+ *        SA_NOCLDWAIT and SA_NODEFER. Only the flag SA_SIGINFO is supported.
+ *
+ * @note Define sigaction as a macro to avoid a warning when included from
+ *       C++ code - it's causing a "sigaction(...) hides constructor for
+ *       'struct sigaction'" warning.
+ */
+/*lint -esym(123,sigaction) Suppress "macro used with no arguments" */
+#define sigaction(sig,act,oact) _sigaction((sig),(act),(oact))
+
+/** Wait for signals.
+ * Please refer to POSIX standard for details.
+ */
+int sigsuspend(const sigset_t *sigmask);
+
+/** Add Signal to Signal Set.
+ * Please refer to POSIX standard for details.
+ */
+int sigaddset(sigset_t *set, int signo);
+
+/** Delete Signal from Signal Set.
+ * Please refer to POSIX standard for details.
+ */
+int sigdelset(sigset_t *set, int signo);
+
+/** Initialize and Empty Signal Set.
+ * Please refer to POSIX standard for details.
+ */
+int sigemptyset(sigset_t *set);
+
+/** Initialize and Fill Signal Set.
+ * Please refer to POSIX standard for details.
+ */
+int sigfillset(sigset_t *set);
+
+/** Test for Signal in Signal Set.
+ * Please refer to POSIX standard for details.
+ */
+int sigismember(const sigset_t *set, int signo);
+
+/** @} */
+
+/* this is not a public api function */
+int _sigaction(int sig, const struct sigaction *act, struct sigaction *oact);
+
+/* have to move #include here to solve circular include problems between time.h and signal.h */
+#include
+
+/** Wait for the time interval specified in the timespec structure referenced
+ * by timeout. This implementation does not support queued signals.
+ * For struct siginfo_t, si_code and si_value are ignored in this implementation.
+ *
+ * Please refer to POSIX standard for details.
+ */
+int sigtimedwait(const sigset_t *restrict set, siginfo_t *restrict info,
+                 const struct timespec *restrict timeout);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SIGNAL_H_ */
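Because this port only delivers signals synchronously, a receiver simply blocks on a set with sigwait(). A minimal sketch (illustrative; the choice of SIGRTMIN is arbitrary):

#include "signal.h"

int wait_for_rt_signal(void)
{
    sigset_t set;
    int sig = 0;

    (void)sigemptyset(&set);
    (void)sigaddset(&set, SIGRTMIN);
    if (sigwait(&set, &sig) != 0)   /* blocks until SIGRTMIN arrives */
        return -1;
    return sig;                     /* the signal number received */
}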
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sys/errno.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sys/errno.h
new file mode 100755
index 0000000000000..b9edf57bab6c3
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sys/errno.h
@@ -0,0 +1,20 @@
+#ifndef _SYS_ERRNO_H_
+#define _SYS_ERRNO_H_
+
+/*==========================================================================
+ * FILE:         errno.h
+ *
+ * SERVICES:     POSIX errno header file
+ *
+ * DESCRIPTION:  POSIX errno based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#include
+#ifndef EOK
+#define EOK 0
+#endif
+
+#endif /* _SYS_ERRNO_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sys/sched.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sys/sched.h
new file mode 100755
index 0000000000000..2acc34d821725
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sys/sched.h
@@ -0,0 +1,67 @@
+#ifndef _POSIX_SCHED_H_
+#define _POSIX_SCHED_H_
+
+/*==========================================================================
+ * FILE:         sched.h
+ *
+ * SERVICES:     POSIX Thread sched API interface
+ *
+ * DESCRIPTION:  POSIX Thread sched API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SCHED_FIFO     0 /* First in, first out (FIFO) scheduling policy. */
+#define SCHED_RR       1 /* Round robin scheduling policy. */
+#define SCHED_SPORADIC 2 /* Sporadic server scheduling policy. */
+#define SCHED_OTHER    3 /* Another scheduling policy. */
+
+typedef struct sched_param sched_param;
+struct sched_param
+{
+  void *unimplemented;
+  int sched_priority;
+};
+
+/** \details
+ * This provides the POSIX sched API.
+ */
+
+/** \defgroup sched POSIX sched API */
+/** \ingroup sched */
+/** @{ */
+
+/** Relinquish the CPU.
+ * Please refer to POSIX standard for details.
+ */
+static inline int sched_yield(void)
+{
+  return 0;
+}
+
+/** Get the maximum priority.
+ * Please refer to POSIX standard for details.
+ * @param policy [in] SCHED_FIFO is the only valid input for this implementation.
+ */
+int sched_get_priority_max(int policy);
+
+/** Get the minimum priority.
+ * Please refer to POSIX standard for details.
+ * @param policy [in] SCHED_FIFO is the only valid input for this implementation.
+ */
+int sched_get_priority_min(int policy);
+
+/** @} */
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _POSIX_SCHED_H_ */
+ */
+int sched_get_priority_min(int policy);
+
+/** @} */
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _POSIX_SCHED_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sys/types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sys/types.h
new file mode 100755
index 0000000000000..700026f9f9e4e
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/sys/types.h
@@ -0,0 +1,35 @@
+#ifndef _SYS_TYPES_H_
+#define _SYS_TYPES_H_
+
+/*==========================================================================
+ * FILE: types.h
+ *
+ * SERVICES: types used in POSIX API interface
+ *
+ * DESCRIPTION: POSIX API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#if !defined( _PID_T ) || !defined( __pid_t_defined )
+/* POSIX defines pid_t as a signed 32-bit type. The Hexagon toolchain's header
+   defines it as an unsigned 32-bit type, citing a conflict with the QuRT POSIX
+   compatibility layer. If any such conflicts exist, we should fix them.
+   pid_t is being defined *BEFORE* inclusion of generic/sys/types.h
+   *INTENTIONALLY* to fix this */
+typedef int pid_t;
+#define _PID_T
+#define __pid_t_defined
+#endif
+#include
+#include
+#include
+#include
+
+#ifndef __DEFINED_off_t
+typedef long off_t;
+#define __DEFINED_off_t
+#endif
+
+#endif /* _SYS_TYPES_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/time.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/time.h
new file mode 100755
index 0000000000000..13aeb1ea9920d
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/posix/time.h
@@ -0,0 +1,142 @@
+#ifndef _POSIX_TIME_H_
+#define _POSIX_TIME_H_
+
+/*==========================================================================
+ * FILE: time.h
+ *
+ * SERVICES: POSIX Timer API interface
+ *
+ * DESCRIPTION: POSIX Timer API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+ *==========================================================================*/
+
+
+#include
+
+typedef int clockid_t; /* ignored */
+#define _CLOCKID_T
+#define _PROVIDE_POSIX_TIME_DECLS 1
+#include
+/* @todo anandj sys/time.h has the definition for struct timeval but is not
+   included by generic/time.h */
+#include <sys/time.h>
+
+#define CLOCK_FREQ_NOT_DEFINED -1
+/* Frequency of the Sclk used */
+#define TIME_CONV_SCLK_FREQ 19200000
+
+#define RES_CONV_FACTOR1 1
+#define RES_CONV_FACTOR2 1000000000
+
+#if !defined(CLOCK_REALTIME)
+# define CLOCK_REALTIME 0
+#endif
+
+#if !defined(CLOCK_MONOTONIC)
+# define CLOCK_MONOTONIC 1
+#endif
+
+#if !defined(CLOCK_THREAD_CPUTIME_ID)
+# define CLOCK_THREAD_CPUTIME_ID 2
+#endif
+
+#if !defined(CLOCK_PROCESS_CPUTIME_ID)
+# define CLOCK_PROCESS_CPUTIME_ID 3
+#endif
+
+#if !defined(CLOCK_MONOTONIC_RAW)
+# define CLOCK_MONOTONIC_RAW 4
+#endif
+
+#if !defined(CLOCK_REALTIME_COARSE)
+# define CLOCK_REALTIME_COARSE 5
+#endif
+
+#if !defined(CLOCK_MONOTONIC_COARSE)
+# define CLOCK_MONOTONIC_COARSE 6
+#endif
+
+#if !defined(CLOCK_BOOTTIME)
+# define CLOCK_BOOTTIME 7
+#endif
+
+struct itimerspec
+{
+    struct timespec it_interval; /* Timer period. */
+    struct timespec it_value;    /* Timer expiration.
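+                                    When the timer is armed, it_value holds
+                                    the first expiration and it_interval the
+                                    reload period; a zero it_interval gives a
+                                    one-shot timer (see timer_settime() below).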
*/ +}; + +/* have to move #include here to solve circular include problems between time.h and signal.h */ +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Timer functions */ + +/** \details + * POSIX timers can be either of two types: a one-shot type or a periodic + * type. + * + * A one-shot is an armed timer that is set to an expiration time relative + * to either a current time or an absolute time. The timer expires once and + * is disarmed. + * + * A periodic timer is armed with an initial expiration time and a repetition + * interval. Every time the interval timer + * expires, the timer is reloaded with the repetition interval. The timer + * is then rearmed. + */ + +/** \defgroup timer POSIX Timer API */ + +/** \ingroup timer */ +/** @{ */ + +/** Create a POSIX timer. + * Please refer to POSIX standard for details. + * @param clockid [in] ignored in this implementation + * @param evp [in] if non-NULL, points to a sigevent structure. This + * structure, allocated by the application, defines the asynchronous + * notification to occur when the timer expires. If the evp argument is + * NULL, the effect is as if the evp argument pointed to a sigevent + * structure with the sigev_notify member having the value SIGEV_SIGNAL, + * the sigev_signo having a default signal number (SIGALRM), and the + * sigev_value member having the value of the timer ID. + */ +int timer_create(clockid_t clockid, struct sigevent *restrict evp, + timer_t *restrict timerid); + +/** Delete a POSIX timer. + * Please refer to POSIX standard for details. + */ +int timer_delete(timer_t timerid); + +/** Get the time remaining on a POSIX timer. + * Please refer to POSIX standard for details. + */ +int timer_gettime(timer_t timerid, struct itimerspec *value); + + +/** Set the time remaining on a POSIX timer. + * Please refer to POSIX standard for details. + * @param flags [in] ignored in this implementation + */ +int timer_settime(timer_t timerid, int flags, + const struct itimerspec *restrict value, + struct itimerspec *restrict ovalue); +/** Obtain ID of a process CPU-time clock + * @param pid [in] Process ID + * @param clock_id [out] Clock ID + * @return Error values as per POSIX standard + */ +int clock_getcpuclockid (pid_t pid, clockid_t * clock_id); +/** @} */ + +#ifdef __cplusplus +} +#endif + +#endif /* _POSIX_TIME_H_ */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qube/qube.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qube/qube.h new file mode 100755 index 0000000000000..1e31e2deedb38 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qube/qube.h @@ -0,0 +1,51 @@ +#ifndef QUBE_H +#define QUBE_H +/*============================================================================= + + qube.h -- H E A D E R F I L E + +GENERAL DESCRIPTION + Prototypes of qpd API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013 by Qualcomm Technologies, Inc. All Rights Reserved. 
+
+=============================================================================*/
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include
+
+/* Define error codes as the QuRT error codes prefixed with QURT_ */
+#ifndef EOK
+#define EOK QURT_EOK
+#endif /* EOK */
+#ifndef EVAL
+#define EVAL QURT_EVAL
+#endif /* EVAL */
+#ifndef EMEM
+#define EMEM QURT_EMEM
+#endif /* EMEM */
+#ifndef EINVALID
+#define EINVALID QURT_EINVALID
+#endif /* EINVALID */
+
+
+/*=============================================================================
+                        FUNCTION DECLARATIONS
+=============================================================================*/
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QUBE_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/atomic_ops.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/atomic_ops.h
new file mode 100755
index 0000000000000..0a9a9f8ba7db5
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/atomic_ops.h
@@ -0,0 +1,197 @@
+#ifndef ATOMIC_OPS_H
+#define ATOMIC_OPS_H
+/**
+  @file atomic_ops.h
+
+  @brief Backward-compatible type definitions.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+
+/*
+ * Australian Public Licence B (OZPLB)
+ *
+ * Version 1-0
+ *
+ * Copyright (c) 2007, Open Kernel Labs, Inc.
+ *
+ * All rights reserved.
+ *
+ * Developed by: Embedded, Real-time and Operating Systems Program (ERTOS)
+ *               National ICT Australia
+ *               http://www.ertos.nicta.com.au
+ *
+ * Permission is granted by National ICT Australia, free of charge, to
+ * any person obtaining a copy of this software and any associated
+ * documentation files (the "Software") to deal with the Software without
+ * restriction, including (without limitation) the rights to use, copy,
+ * modify, adapt, merge, publish, distribute, communicate to the public,
+ * sublicense, and/or sell, lend or rent out copies of the Software, and
+ * to permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimers.
+ *
+ *     * Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimers in the documentation and/or other materials provided
+ *       with the distribution.
+ *
+ *     * Neither the name of National ICT Australia, nor the names of its
+ *       contributors, may be used to endorse or promote products derived
+ *       from this Software without specific prior written permission.
+ *
+ * EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT
+ * PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS", AND
+ * NATIONAL ICT AUSTRALIA AND ITS CONTRIBUTORS MAKE NO REPRESENTATIONS,
+ * WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS
+ * REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE,
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT,
+ * THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF
+ * ERRORS, WHETHER OR NOT DISCOVERABLE.
+ *
+ * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL
+ * NATIONAL ICT AUSTRALIA OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL
+ * THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION OF CONTRACT,
+ * NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER
+ * LIABILITY, INCLUDING (WITHOUT LIMITATION) LOSS OF PRODUCTION OR
+ * OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS
+ * OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR
+ * OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT,
+ * CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN
+ * CONNECTION WITH THIS LICENCE, THE SOFTWARE OR THE USE OF OR OTHER
+ * DEALINGS WITH THE SOFTWARE, EVEN IF NATIONAL ICT AUSTRALIA OR ITS
+ * CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS,
+ * DAMAGES OR OTHER LIABILITY.
+ *
+ * If applicable legislation implies representations, warranties, or
+ * conditions, or imposes obligations or liability on National ICT
+ * Australia or one of its contributors in respect of the Software that
+ * cannot be wholly or partly excluded, restricted or modified, the
+ * liability of National ICT Australia or the contributor is limited, to
+ * the full extent permitted by the applicable legislation, at its
+ * option, to:
+ * a.  in the case of goods, any one or more of the following:
+ * i.   the replacement of the goods or the supply of equivalent goods;
+ * ii.  the repair of the goods;
+ * iii. the payment of the cost of replacing the goods or of acquiring
+ *      equivalent goods;
+ * iv.  the payment of the cost of having the goods repaired; or
+ * b.  in the case of services:
+ * i.   the supplying of the services again; or
+ * ii.  the payment of the cost of having the services supplied again.
+ *
+ * The construction, validity and performance of this licence is governed
+ * by the laws in force in New South Wales, Australia.
+ */
+
+/*
+ * Author: Malcolm Purvis
+ * Author: Carlos Dyonisio
+ */
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef unsigned int atomic_plain_word_t;
+
+/*-------------------------------------------------------------------------*/
+                           /* Atomic Ops API. */
+
+/*
+ * IMPORTANT!
+ * If you plan to change the structure atomic_word_t, please add the new
+ * elements after value. For more information, read the comment in
+ * arch/arm/libs/atomic_ops/v5/src/arm_atomic_ops.spp:66
+ */
+
+typedef struct {
+    volatile atomic_plain_word_t value;
+} atomic_word_t;
+
+#define ATOMIC_INIT(i) { (i) }
+
+static inline void
+atomic_init(atomic_word_t *a, atomic_plain_word_t v)
+{
+    a->value = v;
+}
+
+#if defined(ARCH_ARM) && defined(ARCH_VER) && (ARCH_VER < 6) && \
+    (!defined(__ATOMIC_OPS_IN_KERNEL__) || defined(MACHINE_SMP))
+
+/*
+ * If it is ARMv4/v5, the function declarations may change
+ * and are defined in the arch specific header file,
+ * as some of them cannot be declared static because of
+ * the assembler implementation.
+ */
+
+#else
+
+/* Arithmetic operations. */
+
+void atomic_sub(atomic_word_t *target, atomic_plain_word_t v);
+
+/* Architecture independent definitions.
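+
+   For instance (an illustrative sketch, not part of the original
+   documentation), the plain-read helpers below are used like this:
+
+       atomic_word_t counter = ATOMIC_INIT(0);
+       atomic_plain_word_t now = atomic_read(&counter);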
*/ + +static inline atomic_plain_word_t atomic_read(atomic_word_t *target) +{ + return target->value; +} + +typedef unsigned long long atomic64_plain_word_t; + +typedef struct { + volatile atomic64_plain_word_t value; +} atomic64_word_t; + +static inline void +atomic64_init(atomic64_word_t *a, atomic64_plain_word_t v) +{ + a->value = v; +} + +/********************* + Support 64-bit + *********************/ + +atomic64_plain_word_t atomic64_set(atomic64_word_t* target, + atomic64_plain_word_t value); + +void atomic64_xor(atomic64_word_t* target, + atomic64_plain_word_t mask); + +/*---------------------------------------------------------------------------*/ + +/* Architecture independent definitions. */ + +static inline atomic64_plain_word_t atomic64_read(atomic64_word_t *target) +{ + return target->value; +} + +#endif + + +/* Architecture dependent definitions. */ +#include + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* ATOMIC_OPS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/atomic_ops_plat.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/atomic_ops_plat.h new file mode 100755 index 0000000000000..b54b3ff83d978 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/atomic_ops_plat.h @@ -0,0 +1,86 @@ +#ifndef ATOMIC_OPS_PLAT_H +#define ATOMIC_OPS_PLAT_H +/** + @file atomic_ops_plat.h + + @brief Prototypes of atomic operations API backwards compatible. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. +=============================================================================*/ + + +#include + +#ifdef __cplusplus +extern "C" { +#endif +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define atomic_set(a,b) qurt_atomic_set((unsigned int *)(a),(unsigned int)(b)) +#define atomic_and(a,b) qurt_atomic_and((unsigned int *)(a),(unsigned int)(b)) +#define atomic_and_return(a,b) qurt_atomic_and_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_or(a,b) qurt_atomic_or((unsigned int *)(a),(unsigned int)(b)) +#define atomic_or_return(a,b) qurt_atomic_or_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_xor(a,b) qurt_atomic_xor((unsigned int *)(a),(unsigned int)(b)) +#define atomic_xor_return(a,b) qurt_atomic_xor_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_set_bit(a,b) qurt_atomic_set_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_clear_bit(a,b) qurt_atomic_clear_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_change_bit(a,b) qurt_atomic_change_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add(a,b) qurt_atomic_add((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add_return(a,b) qurt_atomic_add_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add_unless(a,b,c) qurt_atomic_add_unless((unsigned int *)(a),(unsigned int)(b),(unsigned int)(c)) +#define atomic_sub(a,b) qurt_atomic_sub((unsigned int *)(a),(unsigned int)(b)) +#define atomic_sub_return(a,b) qurt_atomic_sub_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_inc(a) qurt_atomic_inc((unsigned int *)(a)) +#define atomic_inc_return(a) qurt_atomic_inc_return((unsigned int *)(a)) +#define atomic_dec(a) qurt_atomic_dec((unsigned 
int *)(a)) +#define atomic_dec_return(a) qurt_atomic_dec_return((unsigned int *)(a)) +#define atomic_compare_and_set(a,b,c) qurt_atomic_compare_and_set((unsigned int *)(a),(unsigned int)(b),(unsigned int)(c)) +#define atomic_barrier qurt_atomic_barrier +#define atomic_barrier_write qurt_atomic_barrier_write +#define atomic_barrier_write_smp qurt_atomic_barrier_write_smp +#define atomic_barrier_read_smp qurt_atomic_barrier_read_smp +#define atomic_barrier_smp qurt_atomic_barrier_smp + +/*============================ + * 64 bits support + *============================ */ +#define atomic64_set(a,b) qurt_atomic64_set((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_and(a,b) qurt_atomic64_and((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_and_return(a,b) qurt_atomic64_and_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_or(a,b) qurt_atomic64_or((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_or_return(a,b) qurt_atomic64_or_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_xor(a,b) qurt_atomic64_xor((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_xor_return(a,b) qurt_atomic64_xor_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_set_bit(a,b) qurt_atomic64_set_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_clear_bit(a,b) qurt_atomic64_clear_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_change_bit(a,b) qurt_atomic64_change_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_add(a,b) qurt_atomic64_add((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_add_return(a,b) qurt_atomic64_add_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_sub(a,b) qurt_atomic64_sub((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_sub_return(a,b) qurt_atomic64_sub_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_inc(a) qurt_atomic64_inc((unsigned long long *)(a)) +#define atomic64_inc_return(a) qurt_atomic64_inc_return((unsigned long long *)(a)) +#define atomic64_dec(a) qurt_atomic64_dec((unsigned long long *)(a)) +#define atomic64_dec_return(a) qurt_atomic64_dec_return((unsigned long long *)(a)) +#define atomic64_compare_and_set(a,b,c) qurt_atomic64_compare_and_set((unsigned long long *)(a),(unsigned long long )(b),(unsigned long long )(c)) +#define atomic64_barrier qurt_atomic64_barrier +#define atomic64_barrier_write qurt_atomic64_barrier_write +#define atomic64_barrier_write_smp qurt_atomic64_barrier_write_smp +#define atomic64_barrier_read_smp qurt_atomic64_barrier_read_smp +#define atomic64_barrier_smp qurt_atomic64_barrier_smp + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* ATOMIC_OPS_PLAT_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt.h new file mode 100755 index 0000000000000..4d25c9b2b6243 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt.h @@ -0,0 +1,111 @@ +#ifndef QURT_H +#define QURT_H + +/** + @file qurt.h + @brief Contains kernel header files that provide kernel OS API functions, constants, and + definitions + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013,2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ +/*====================================================================== + * + * EDIT HISTORY FOR FILE + * + * This section contains comments describing changes made to the + * module. Notice that changes are listed in reverse chronological + * order. + * + * + * + * + * when who what, where, why + * ---------- --- ------------------------------------------------ + * 2011-02-25 op Add Header file + 2012-12-16 cm (Tech Pubs) Edited/added Doxygen comments and markup. + ======================================================================*/ + + +#ifdef __cplusplus +extern "C" { +#endif + +#include "qurt_consts.h" +#include "qurt_api_version.h" +#include "qurt_alloc.h" +#include "qurt_futex.h" +#include "qurt_mutex.h" +#include "qurt_pipe.h" +#include "qurt_printf.h" +#include "qurt_assert.h" +#include "qurt_thread.h" +#include "qurt_trace.h" +#include "qurt_cycles.h" +#include "qurt_profile.h" +#include "qurt_sem.h" +#include "qurt_cond.h" +#include "qurt_barrier.h" +#include "qurt_fastint.h" +#include "qurt_allsignal.h" +#include "qurt_anysignal.h" +#include "qurt_signal.h" +#include "qurt_rmutex.h" +#include "qurt_pimutex.h" +#include "qurt_signal2.h" +#include "qurt_rmutex2.h" +#include "qurt_pimutex2.h" +#include "qurt_int.h" +#include "qurt_lifo.h" +#include "qurt_power.h" +#include "qurt_event.h" +#include "qurt_pmu.h" +#include "qurt_stid.h" +//#include "qurt_version.h" +#include "qurt_tlb.h" +#include "qurt_vtlb.h" +#include "qurt_memory.h" +#include "qurt_qdi.h" +#include "qurt_sclk.h" +#include "qurt_space.h" +#include "qurt_process.h" +#include "qurt_timer.h" +#include "qurt_tls.h" +#include "qurt_thread_context.h" +#include "qurt_hvx.h" +#include "qurt_hmx.h" +#include "qurt_mailbox.h" +#include "qurt_island.h" +#include "qurt_qdi_proxy.h" +#include "qurt_l2cfg.h" +#include "qurt_mmap.h" +#include "qurt_isr.h" +#include "qurt_busywait.h" +#include "qurt_ecc.h" +#include "qurt_callback.h" +#include "qurt_error.h" +#include "qurt_except.h" +#include "qurt_mq.h" +#include "qurt_user_dma.h" +#include "qurt_fs_hub.h" +#include "qurt_os_services.h" + +#ifndef MAIN_ONLY +#define INCLUDE_ISLAND_CONTENTS +#endif +#ifndef ISLAND_ONLY +#define INCLUDE_MAIN_CONTENTS +#endif + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_alloc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_alloc.h new file mode 100755 index 0000000000000..da37a4c0a714e --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_alloc.h @@ -0,0 +1,145 @@ +#ifndef QURT_ALLOC_H +#define QURT_ALLOC_H + +/** + @file qurt_alloc.h + @brief Prototypes of kernel memory allocation API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +/*======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_malloc + Dynamically allocates the specified array on the QuRT system heap. + The return value is the address of the allocated memory area. 
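+
+   A minimal usage sketch (the size is an illustrative assumption):
+   @code
+   unsigned int *buf = (unsigned int *)qurt_malloc(64U * sizeof(unsigned int));
+   if (buf != 0) {
+       buf[0] = 1U;        /* area arrives zero-initialized; see note below */
+       qurt_free(buf);     /* return the area to the QuRT system heap */
+   }
+   @endcode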
+ + @note1hang The allocated memory area is automatically initialized to zero. + + @param[in] size Size (in bytes) of the memory area. + + @return + Nonzero -- Pointer to the allocated memory area. \n + 0 -- Not enough memory in heap to allocate memory area. + + @dependencies + None. + + */ +/* ======================================================================*/ +void *qurt_malloc( unsigned int size); + +/*======================================================================*/ +/**@ingroup func_qurt_calloc + Dynamically allocates the specified array on the QuRT system heap. + The return value is the address of the allocated array. + + @note1hang The allocated memory area is automatically initialized to zero. + + @param[in] elsize Size (in bytes) of each array element. + @param[in] num Number of array elements. + + @return + Nonzero -- Pointer to allocated array.\n + Zero -- Not enough memory in heap to allocate array. + + @dependencies + None. + + */ + /* ======================================================================*/ +void *qurt_calloc(unsigned int elsize, unsigned int num); + +/*======================================================================*/ +/**@ingroup func_qurt_realloc + Reallocates memory on the heap. \n + Changes the size of a memory area that is already allocated on the QuRT system heap. + The reallocate memory operation is functionally similar to realloc. It accepts a pointer + to an existing memory area on the heap, and resizes the memory area to the specified size + while preserving the original contents of the memory area. + + @note1hang This function might change the address of the memory area. + If the value of ptr is NULL, this function is equivalent to + qurt_malloc(). + If the value of new_size is 0, it is equivalent to qurt_free(). + If the memory area is expanded, the added memory is not initialized. + + @param[in] *ptr Pointer to the address of the memory area. + @param[in] newsize Size (in bytes) of the reallocated memory area. + + @return + Nonzero -- Pointer to reallocated memory area. \n + 0 -- Not enough memory in heap to reallocate the memory area. + + @dependencies + None. + + */ + /* ======================================================================*/ +void *qurt_realloc(void *ptr, int newsize); + +/*======================================================================*/ +/**@ingroup func_qurt_free + Frees allocated memory from the heap.\n + Deallocates the specified memory from the QuRT system heap. + + @param[in] *ptr Pointer to the address of the memory to deallocate. + + @return + None. + + @dependencies + The memory item that the ptr value specifies must have been previously + allocated using one of the qurt_calloc(), + qurt_malloc(), or qurt_realloc() memory allocation functions. + Otherwise the behavior of QuRT is undefined. + + */ + /* ======================================================================*/ +void qurt_free( void *ptr); + + +void *qurt_memalign(unsigned int alignment, unsigned int size); + +/* +|| Macro to define a static heap for a QuRT program. +|| +|| Usage: +|| Declare at the top-level of any C source file that +|| is part of the build (and is guaranteed +|| to actually be pulled into the build). Place +|| it in the same function with main(): +|| +|| QURT_DECLARE_STATIC_HEAP(512000); +|| +|| The only argument is the size in bytes, and it is +|| rounded up to the nearest 64 bytes (size of an +|| L2 cache block). 
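+||
+|| As a sketch of the mechanism (inferred from the definition below, not
+|| separately documented here): the macro reserves a 64-byte-aligned
+|| static array and publishes its bounds through the override_heap_Base
+|| and override_heap_Limit symbols, which the QuRT runtime is presumably
+|| set up to consume in place of its default heap.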
+|| +*/ + +#define QURT_DECLARE_STATIC_HEAP(sz) \ + static struct qurt_static_heap { \ + char space[(sz)] __attribute__((aligned(64))); \ + } static_heap[1]; \ + void * const override_heap_Base = &static_heap[0]; \ + void * const override_heap_Limit = &static_heap[1] + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ALLOC_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_allsignal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_allsignal.h new file mode 100755 index 0000000000000..5dc89e495130d --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_allsignal.h @@ -0,0 +1,176 @@ + +#ifndef QURT_ALLSIGNAL_H +#define QURT_ALLSIGNAL_H + +/** + @file qurt_allsignal.h + @brief Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup all_signal_types +@{ */ +/*===================================================================== + Typedefs + ======================================================================*/ + +/** +qurt_signal_t supersedes qurt_allsignal_t. This type definition was added for backwards compatibility. */ +typedef union { + /** @cond */ + unsigned long long int raw; + struct { + unsigned int waiting; /**< */ + unsigned int signals_in; /**< */ + unsigned int queue; /**< */ + unsigned int reserved; /**< */ + }X; + /** @endcond */ +} qurt_allsignal_t; +/** @} */ /* end_addtogroup all_signal_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_init + Initializes an all-signal object.\n + The all-signal object is initially cleared. + + @datatypes + #qurt_allsignal_t + + @param[out] signal Pointer to the all-signal object to initialize. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_allsignal_init(qurt_allsignal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_destroy + Destroys the specified all-signal object.\n + @note1hang All-signal objects must be destroyed when they are no longer in use. + Failure to do this causes resource leaks in the QuRT kernel. \n + @note1cont All-signal objects must not be destroyed while they are still in use. + If this occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_allsignal_destroy(qurt_allsignal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_get + Gets signal values from the all-signal object. + + Returns the current signal values of the specified all-signal object. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to access. 
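+
+ For context, a hedged sketch of the surrounding all-signal pattern (the
+ mask value 0x3 is an illustrative assumption):
+ @code
+ qurt_allsignal_t done;
+ qurt_allsignal_init(&done);
+ unsigned int cur = qurt_allsignal_get(&done);  // snapshot; 0 right after init
+ qurt_allsignal_wait(&done, 0x3U);  // blocks until another thread sets bits 0 and 1
+ @endcode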
+ + @return + Bitmask with current signal values. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_allsignal_get(qurt_allsignal_t *signal) +{ return signal->X.signals_in; } + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_wait + Waits on the all-signal object.\n + Suspends the current thread until all of the specified signals are set. + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 that it is not to be waited on. + + If a signal is set in an all-signal object, and a thread is waiting on the all-signal object for + that signal, the thread is awakened. If the awakened thread has higher priority than + the current thread, a context switch can occur. + + Unlike any-signals, all-signals do not need to explicitly clear any set signals in an all-signal + object before waiting on them again -- clearing is done automatically by the wait + operation. + + @note1hang At most, one thread can wait on an all-signal object at any given time. + Because signal clearing is done by the wait operation, no clear operation is + defined for all-signals. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to wait on. + @param[in] mask Signal mask value, which identifies the individual signals in the all-signal object + to wait on. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_allsignal_wait(qurt_allsignal_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_set + Set signals in the specified all-signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit + value of 1 indicates that a signal must be set, and 0 indicates not to set the signal. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to modify. + @param[in] mask Signal mask value identifying the individual signals to + set in the all-signal object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_allsignal_set(qurt_allsignal_t *signal, unsigned int mask); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ALLSIGNAL_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_anysignal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_anysignal.h new file mode 100755 index 0000000000000..9619e2de562b4 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_anysignal.h @@ -0,0 +1,225 @@ +#ifndef QURT_ANYSIGNAL_H +#define QURT_ANYSIGNAL_H +/** + @file qurt_anysignal.h + Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + +Copyright (c) 2021 Qualcomm Technologies, Inc. +All rights reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ ======================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*===================================================================== +Typedefs +======================================================================*/ + +/**@ingroup anysignals_types + qurt_signal_t supersedes qurt_anysignal_t. This type definition was added for backwards compatibility. */ +typedef qurt_signal_t qurt_anysignal_t; + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_init + Initializes an any-signal object.\n + The any-signal object is initially cleared. + + @datatypes + #qurt_anysignal_t + + @param[out] signal Pointer to the initialized any-signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline void qurt_anysignal_init(qurt_anysignal_t *signal) +{ + qurt_signal_init(signal); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_destroy + Destroys the specified any-signal object. + + @note1hang Any-signal objects must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Any-signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_anysignal_t + + @param[in] signal Pointer to the any-signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline void qurt_anysignal_destroy(qurt_anysignal_t *signal) +{ + qurt_signal_destroy(signal); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_wait + Wait on the any-signal object. \n + Suspends the current thread until any one of the specified signals is set. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 indicates not to wait on the signal. + If a signal is set in an any-signal object, and a thread is waiting on the any-signal object for + that signal, the thread is awakened. If the awakened thread has higher priority than + the current thread, a context switch can occur. + + @note1hang At most, one thread can wait on an any-signal object at any given time. + + @datatypes + #qurt_anysignal_t + + @param[in] signal Pointer to the any-signal object to wait on. + @param[in] mask Signal mask value, which specifies the individual signals in the any-signal + object to wait on. + + @return + Bitmask of current signal values. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline unsigned int qurt_anysignal_wait(qurt_anysignal_t *signal, unsigned int mask) +{ + return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_set + Sets signals in the specified any-signal object. \n + Signals are represented as bits 0 through 31 in the 32-bit mask value. 
A mask bit value of 1
+ indicates that a signal must be set, and 0 indicates not to set the signal.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to modify.
+ @param[in] mask Signal mask value identifying the individual signals to
+ set in the any-signal object.
+
+ @return
+ Bitmask of old signal values (before set).
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+unsigned int qurt_anysignal_set(qurt_anysignal_t *signal, unsigned int mask);
+
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_get
+ Gets signal values from the any-signal object.\n
+ Returns the current signal values of the specified any-signal object.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to access.
+
+ @return
+ A bitmask with the current signal values of the specified any-signal object.
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+static inline unsigned int qurt_anysignal_get(qurt_anysignal_t *signal)
+{
+    return qurt_signal_get(signal);
+}
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_clear
+ @xreflabel{sec:anysignal_clear}
+ Clears signals in the specified any-signal object.\n
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+ indicates that a signal must be cleared, and 0 indicates not to clear the signal.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object, which specifies the any-signal object to modify.
+ @param[in] mask Signal mask value identifying the individual signals to
+ clear in the any-signal object.
+
+ @return
+ Bitmask -- Old signal values (before clear).
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+unsigned int qurt_anysignal_clear(qurt_anysignal_t *signal, unsigned int mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_wait_timed
+ Waits on the any-signal object. \n
+ Suspends the current thread until any of the specified signals is set or the timeout expires.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+ indicates that a signal must be waited on, and 0 indicates not to wait on the signal.
+ If a signal is set in an any-signal object, and a thread was waiting on the any-signal object for
+ that signal, the thread is awakened. If the awakened thread has higher priority than
+ the current thread, a context switch can occur.
+
+ @note1hang At most, one thread can wait on an any-signal object at any given time.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to wait on.
+ @param[in] mask Signal mask value, which specifies the individual signals in the any-signal
+ object to wait on.
+ @param[out] signals Bitmask of current signal values.
+ @param[in] duration Interval (in microseconds); the duration must be between #QURT_TIMER_MIN_DURATION and
+ #QURT_TIMER_MAX_DURATION.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_ETIMEDOUT -- Timeout \n
+ #QURT_EINVALID -- Duration out of range
+
+ @dependencies
+ None.
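+
+ A hedged usage sketch (the mask 0x5 and the one-millisecond duration are
+ illustrative assumptions):
+ @code
+ qurt_anysignal_t sig;
+ unsigned int got = 0U;
+ qurt_anysignal_init(&sig);
+ if (qurt_anysignal_wait_timed(&sig, 0x5U, &got, 1000ULL) == QURT_EOK) {
+     // at least one of bits 0 or 2 was set; 'got' holds the snapshot
+ }
+ @endcode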
+ */ +/* ======================================================================*/ + +int qurt_anysignal_wait_timed(qurt_anysignal_t *signal, unsigned int mask, unsigned int *signals, unsigned long long int duration); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ANYSIGNAL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_api_version.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_api_version.h new file mode 100755 index 0000000000000..dfe53ae755054 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_api_version.h @@ -0,0 +1,77 @@ +#ifndef QURT_API_VERSION_H +#define QURT_API_VERSION_H +/*============================================================================== + +qurt_api_version.h + +GENERAL DESCRIPTION + API version file + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ + +/*============================================================================== + CONSTANTS AND DEFINITIONS +==============================================================================*/ +/** + * Each field of the QURT_API_VERSION definitions is an 8-bit unsigned integer. + * Main release has first 3 fields updated - Major, Minor and Release. + * - QURT_API_VERSION = Major, Minor, Release. + * Patch releases are supported by adding the extra field. + * - QURT_API_VERSION = Major, Minor, Release, Patch. + */ +// Major version is incremented for incompatible API changes. +#define QURT_API_VER_MAJOR 1 + +// Minor version is incremented for backward-compatible enhancements in the API +// set. +#define QURT_API_VER_MINOR 4 + +// RELEASE version is incremented for each release within a `MAJOR.MINOR` +// release. +#define QURT_API_VER_RELEASE 1 + +// Patch version is incremented when new API content is introduced on older LTS +// release. +#define QURT_API_VER_PATCH 0 + +/* Update the QURT_API_VERSION function macro. */ +#define QURT_API_VERSION_ENCODE(major, minor, release, patch) \ + ((((major) & 0xFF) << 24) | (((minor) & 0xFF) << 16) | \ + (((release) & 0xFF) << 8) | ((patch) & 0xFF)) + +/* Update the QURT_API_VERSION Macro. */ +#define QURT_API_VERSION \ + QURT_API_VERSION_ENCODE(QURT_API_VER_MAJOR, QURT_API_VER_MINOR, \ + QURT_API_VER_RELEASE, QURT_API_VER_PATCH) + +/** Usage: + * + * #if QURT_API_VERSION >= QURT_API_VERSION_ENCODE(1,4,0,0) + * qurt_func_2(a,b,c); + * #else + * qurt_func(a); + * #endif + * + */ +/* + Gets the QuRT API version. + + @return + QuRT API version. + + @dependencies + None. + */ +unsigned int qurt_api_version(void); + +#endif /* QURT_API_VERSION_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_assert.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_assert.h new file mode 100755 index 0000000000000..13cc2afd2e973 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_assert.h @@ -0,0 +1,51 @@ +#ifndef QURT_ASSERT_H +#define QURT_ASSERT_H +/** + @file qurt_assert.h + @brief Prototypes of qurt_assert API + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/**@ingroup func_qurt_assert_error + Writes diagnostic information to the debug buffer, and raises an error to the QuRT kernel. + + @datatypes + None. + + @param[in] filename Pointer to the file name string. + @param[in] lineno Line number. + + @return + None. + + @dependencies + None. + */ +void qurt_assert_error(const char *filename, int lineno) __attribute__((noreturn)); + +#define qurt_assert(cond) ((cond)?(void)0:qurt_assert_error(__QURTFILENAME__,__LINE__)) + +/** @} */ /* end_ingroup func_qurt_assert */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ASSERT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_atomic_ops.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_atomic_ops.h new file mode 100755 index 0000000000000..d9b2cff7d737c --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_atomic_ops.h @@ -0,0 +1,1298 @@ +#ifndef QURT_ATOMIC_OPS_H +#define QURT_ATOMIC_OPS_H +/** + @file qurt_atomic_ops.h + @brief Prototypes of kernel atomic operations API. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021, 2022 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +/* + * Australian Public Licence B (OZPLB) + * + * Version 1-0 + * + * Copyright (c) 2007, Open Kernel Labs, Inc. + * + * All rights reserved. + * + * Developed by: Embedded, Real-time and Operating Systems Program (ERTOS) + * National ICT Australia + * http://www.ertos.nicta.com.au + * + * Permission is granted by National ICT Australia, free of charge, to + * any person obtaining a copy of this software and any associated + * documentation files (the "Software") to deal with the Software without + * restriction, including (without limitation) the rights to use, copy, + * modify, adapt, merge, publish, distribute, communicate to the public, + * sublicense, and/or sell, lend or rent out copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject + * to the following conditions: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimers in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of National ICT Australia, nor the names of its + * contributors, may be used to endorse or promote products derived + * from this Software without specific prior written permission. 
+ * + * EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT + * PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS", AND + * NATIONAL ICT AUSTRALIA AND ITS CONTRIBUTORS MAKE NO REPRESENTATIONS, + * WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS + * REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, + * THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF + * ERRORS, WHETHER OR NOT DISCOVERABLE. + * + * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL + * NATIONAL ICT AUSTRALIA OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL + * THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION OF CONTRACT, + * NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER + * LIABILITY, INCLUDING (WITHOUT LIMITATION) LOSS OF PRODUCTION OR + * OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS + * OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR + * OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT, + * CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN + * CONNECTION WITH THIS LICENCE, THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS WITH THE SOFTWARE, EVEN IF NATIONAL ICT AUSTRALIA OR ITS + * CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS, + * DAMAGES OR OTHER LIABILITY. + * + * If applicable legislation implies representations, warranties, or + * conditions, or imposes obligations or liability on National ICT + * Australia or one of its contributors in respect of the Software that + * cannot be wholly or partly excluded, restricted or modified, the + * liability of National ICT Australia or the contributor is limited, to + * the full extent permitted by the applicable legislation, at its + * option, to: + * a. in the case of goods, any one or more of the following: + * i. the replacement of the goods or the supply of equivalent goods; + * ii. the repair of the goods; + * iii. the payment of the cost of replacing the goods or of acquiring + * equivalent goods; + * iv. the payment of the cost of having the goods repaired; or + * b. in the case of services: + * i. the supplying of the services again; or + * ii. the payment of the cost of having the services supplied again. + * + * The construction, validity and performance of this licence is governed + * by the laws in force in New South Wales, Australia. + */ + +/* + * Author: Malcolm Purvis + * + * This file is only included by the main atomic_ops.h, so all of that + * file's definitions are available. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + +///* Sanity check to ensure the smp flag is set in machines.py */ +//#if defined(__ATOMIC_OPS_IN_KERNEL__) && !defined(MACHINE_SMP) && CONFIG_NUM_UNITS > 1 +//#error CONFIG_NUM_UNITS > 1 but smp not defined in machines.py. +//#endif +#define QURT_INLINE __attribute__((always_inline)) + +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_atomic_set + Sets the atomic variable with the specified value. 
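+
+ A hedged usage sketch (variable and value are illustrative):
+ @code
+ unsigned int word = 0U;
+ (void)qurt_atomic_set(&word, 0x80000000U);   // returns the value written
+ @endcode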
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] value Value to set.
+
+ @return
+ Value successfully set.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_set(unsigned int* target, unsigned int value)
+{
+   unsigned long tmp;
+
+   __asm__ __volatile__(
+       "1:     %0 = memw_locked(%2)\n"
+       "       memw_locked(%2, p0) = %3\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (tmp),"+m" (*target)
+       : "r" (target), "r" (value)
+       : "p0");
+   return value;
+}
+
+/**@ingroup func_qurt_atomic_and
+ Bitwise AND operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise AND.
+
+ @return
+ None
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_and(unsigned int* target, unsigned int mask)
+{
+   unsigned int result;
+
+   __asm__ __volatile__(
+       "1:     %0 = memw_locked(%2)\n"
+       "       %0 = and(%0, %3)\n"
+       "       memw_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target),"r" (mask)
+       : "p0");
+}
+
+/**@ingroup func_qurt_atomic_and_return
+ Bitwise AND operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise AND.
+
+ @return
+ AND result of the atomic variable with mask.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_and_return(unsigned int* target, unsigned int mask)
+{
+   unsigned int result;
+
+   __asm__ __volatile__(
+       "1:     %0 = memw_locked(%2)\n"
+       "       %0 = and(%0, %3)\n"
+       "       memw_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target), "r" (mask)
+       : "p0");
+
+   return result;
+}
+
+/**@ingroup func_qurt_atomic_or
+ Bitwise OR operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise OR.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_or(unsigned int* target, unsigned int mask)
+{
+   unsigned int result;
+
+   __asm__ __volatile__(
+       "1:     %0 = memw_locked(%2)\n"
+       "       %0 = or(%0, %3)\n"
+       "       memw_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target), "r" (mask)
+       : "p0");
+}
+
+/**@ingroup func_qurt_atomic_or_return
+ Bitwise OR operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise OR.
+
+ @return
+ Returns the OR result of the atomic variable with mask.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_or_return(unsigned int* target, unsigned int mask)
+{
+   unsigned int result;
+
+   __asm__ __volatile__(
+       "1:     %0 = memw_locked(%2)\n"
+       "       %0 = or(%0, %3)\n"
+       "       memw_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target), "r" (mask)
+       : "p0");
+
+   return result;
+}
+
+/**@ingroup func_qurt_atomic_xor
+ Bitwise XOR operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
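+
+ For example (an illustrative sketch), XOR can flip a group of flag bits
+ in place:
+ @code
+ unsigned int mode = 0x5U;
+ qurt_atomic_xor(&mode, 0x3U);   // mode is now 0x6
+ @endcode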
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise XOR.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_xor(unsigned int* target, unsigned int mask)
+{
+   unsigned int result;
+
+   __asm__ __volatile__(
+       "1:     %0 = memw_locked(%2)\n"
+       "       %0 = xor(%0, %3)\n"
+       "       memw_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target), "r" (mask)
+       : "p0");
+}
+
+/**@ingroup func_qurt_atomic_xor_return
+ Bitwise XOR operation of the atomic variable with mask.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] mask Mask for bitwise XOR.
+
+ @return
+ XOR result of the atomic variable with mask.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_xor_return(unsigned int* target, unsigned int mask)
+{
+   unsigned int result;
+
+   __asm__ __volatile__(
+       "1:     %0 = memw_locked(%2)\n"
+       "       %0 = xor(%0, %3)\n"
+       "       memw_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*target)
+       : "r" (target), "r" (mask)
+       : "p0");
+
+   return result;
+}
+
+/**@ingroup func_qurt_atomic_set_bit
+ Sets a bit in the atomic variable at a specified position.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] bit Bit position to set.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_set_bit(unsigned int *target, unsigned int bit)
+{
+   unsigned int result;
+   unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+   unsigned int sbit = bit % ((unsigned int)sizeof(unsigned int) * 8U);
+   unsigned int *wtarget= (unsigned int *)&target[aword];
+
+   __asm__ __volatile__(
+       "1:     %0 = memw_locked(%2)\n"
+       "       %0 = setbit(%0, %3)\n"
+       "       memw_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*wtarget)
+       : "r" (wtarget), "r" (sbit)
+       : "p0");
+}
+
+/**@ingroup func_qurt_atomic_clear_bit
+ Clears a bit in the atomic variable at a specified position.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] bit Bit position to clear.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_clear_bit(unsigned int *target, unsigned int bit)
+{
+   unsigned int result;
+   unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+   unsigned int sbit = bit % ((unsigned int)sizeof(unsigned int) * 8U);
+   unsigned int *wtarget= (unsigned int *)&target[aword];
+
+   __asm__ __volatile__(
+       "1:     %0 = memw_locked(%2)\n"
+       "       %0 = clrbit(%0, %3)\n"
+       "       memw_locked(%2, p0) = %0\n"
+       "       if !p0 jump 1b\n"
+       : "=&r" (result),"+m" (*wtarget)
+       : "r" (wtarget), "r" (sbit)
+       : "p0");
+}
+
+/**@ingroup func_qurt_atomic_change_bit
+ Toggles a bit in an atomic variable at a specified position.
+
+ @note1hang The function retries until load lock and store conditional
+ is successful.
+
+ @param[in,out] target Pointer to the atomic variable.
+ @param[in] bit Bit position to toggle.
+
+ @return
+ None.
+
+ @dependencies
+ None.
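+
+ A short sketch combining the bit helpers above (bit index 3 is an
+ illustrative assumption):
+ @code
+ unsigned int ready_mask = 0U;
+ qurt_atomic_set_bit(&ready_mask, 3U);      // mark unit 3 ready
+ qurt_atomic_change_bit(&ready_mask, 3U);   // toggle it back off
+ qurt_atomic_clear_bit(&ready_mask, 3U);    // clearing again is harmless
+ @endcode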
+*/
+static inline QURT_INLINE void
+qurt_atomic_change_bit(unsigned int *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit & 0x1fU;
+    unsigned int *wtarget = (unsigned int *)&target[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = togglebit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget),"r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_add
+   Adds an integer to an atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v Integer value to add.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_add(unsigned int *target, unsigned int v)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (v)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_add_return
+   Adds an integer to an atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v Integer value to add.
+
+   @return
+   Result of arithmetic sum.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_add_return(unsigned int *target, unsigned int v)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (v)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_add_unless
+   Adds the delta value to an atomic variable unless the current value in the target
+   matches the unless variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] delta Value to add to the current value.
+   @param[in] unless Perform the addition only when the current value is not
+                     equal to this unless value.
+   @return
+   TRUE -- 1 - Addition was performed. \n
+   FALSE -- 0 - Addition was not performed.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_add_unless(unsigned int* target,
+                       unsigned int delta,
+                       unsigned int unless)
+{
+    unsigned int current_val;
+    unsigned int new_val;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%3)\n"
+        "       p0 = cmp.eq(%0, %5)\n"
+        "       if p0 jump 2f\n"
+        "       %1 = add(%0, %4)\n"
+        "       memw_locked(%3, p0) = %1\n"
+        "       if !p0 jump 1b\n"
+        "2:\n"
+        : "=&r" (current_val),"=&r" (new_val),"+m" (*target)
+        : "r" (target), "r" (delta), "r" (unless)
+        : "p0");
+
+    return (unsigned int)(current_val != unless);
+}
+
+/**@ingroup func_qurt_atomic_sub
+   Subtracts an integer from an atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v Integer value to subtract.
+
+   @return
+   None.
+
+   @dependencies
+   None.
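+
+   @par Example
+   A minimal usage sketch (illustrative only, not from the original SDK
+   documentation); the shared counter name is hypothetical:
+   @code
+   static unsigned int credits = 100U;    // shared budget
+
+   void consume_credits(unsigned int n)
+   {
+       // Atomically deduct n credits; when the updated value is needed,
+       // qurt_atomic_sub_return() would be used instead.
+       qurt_atomic_sub(&credits, n);
+   }
+   @endcode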
+*/ +static inline QURT_INLINE void +qurt_atomic_sub(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic_sub_return + Subtracts an integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to subtract. + + @return + Result of arithmetic subtraction. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_sub_return(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_inc + Increments an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_inc(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); +} + +/**@ingroup func_qurt_atomic_inc_return + Increments an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Incremented value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_inc_return(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_dec + Decrements an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_dec(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #-1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); +} + +/**@ingroup func_qurt_atomic_dec_return + Decrements an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Decremented value. + + @dependencies + None. 
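+
+   @par Example
+   A minimal reference-count sketch (illustrative only, not from the original
+   SDK documentation); object_t and object_destroy() are hypothetical:
+   @code
+   typedef struct { unsigned int refs; /* ... payload ... */ } object_t;
+   extern void object_destroy(object_t *obj);   // hypothetical cleanup routine
+
+   void object_release(object_t *obj)
+   {
+       // The decremented value is returned atomically, so exactly one
+       // thread observes the count reaching zero and frees the object.
+       if (qurt_atomic_dec_return(&obj->refs) == 0U) {
+           object_destroy(obj);
+       }
+   }
+   @endcode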
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_dec_return(unsigned int *target)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = add(%0, #-1)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_compare_and_set
+   Compares the current value of the atomic variable with the
+   specified value and sets it to a new value when the comparison is successful.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] old_val Old value to compare.
+   @param[in] new_val New value to set.
+
+   @return
+   FALSE -- Specified value is not equal to the current value. \n
+   TRUE -- Specified value is equal to the current value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_compare_and_set(unsigned int* target,
+                            unsigned int old_val,
+                            unsigned int new_val)
+{
+    unsigned int current_val;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       p0 = cmp.eq(%0, %3)\n"
+        "       if !p0 jump 2f\n"
+        "       memw_locked(%2, p0) = %4\n"
+        "       if !p0 jump 1b\n"
+        "2:\n"
+        : "=&r" (current_val),"+m" (*target)
+        : "r" (target), "r" (old_val), "r" (new_val)
+        : "p0");
+
+    return (unsigned int)(current_val == old_val);
+}
+
+/**@ingroup func_qurt_atomic_barrier
+   Allows the compiler to enforce an ordering constraint on memory operations issued
+   before and after the function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_barrier(void)
+{
+    __asm__ __volatile__ (
+        ""
+        :
+        :
+        :
+        "memory");
+}
+
+
+/**@ingroup func_qurt_atomic64_set
+   Sets the 64-bit atomic variable with the specified value.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] value 64-bit value to set.
+
+   @return
+   Successfully set value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_set(unsigned long long* target, unsigned long long value)
+{
+    unsigned long long tmp;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       memd_locked(%2, p0) = %3\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (tmp),"+m" (*target)
+        : "r" (target), "r" (value)
+        : "p0");
+    return value;
+}
+
+/**@ingroup func_qurt_atomic64_and_return
+   Bitwise AND operation of a 64-bit atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask 64-bit mask for bitwise AND.
+
+   @return
+   AND result of 64-bit atomic variable with mask.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_and_return(unsigned long long* target, unsigned long long mask)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = and(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_or
+   Bitwise OR operation of a 64-bit atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask 64-bit mask for bitwise OR.
+
+   @return
+   None.
+
+   @dependencies
+   None.
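+
+   @par Example
+   A minimal usage sketch (illustrative only, not from the original SDK
+   documentation); the event mask layout is hypothetical:
+   @code
+   static unsigned long long event_mask;  // one bit per event source
+
+   void post_event(unsigned int event_id)
+   {
+       // Atomically set the bit for this event; concurrent posters
+       // cannot lose each other's updates.
+       qurt_atomic64_or(&event_mask, 1ULL << event_id);
+   }
+   @endcode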
+*/ +static inline QURT_INLINE void +qurt_atomic64_or(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_or_return + Bitwise OR operation of a 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise OR. + + @return + OR result of the atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_or_return(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = or(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_xor_return + Bitwise XOR operation of 64-bit atomic variable with mask. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] mask 64-bit mask for bitwise XOR. + + @return + XOR result of atomic variable with mask. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned long long +qurt_atomic64_xor_return(unsigned long long* target, unsigned long long mask) +{ + unsigned long long result; + + __asm__ __volatile__( + "1: %0 = memd_locked(%2)\n" + " %0 = xor(%0, %3)\n" + " memd_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (mask) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic64_set_bit + Sets a bit in a 64-bit atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to set. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic64_set_bit(unsigned long long *target, unsigned int bit) +{ + unsigned int result; + unsigned int *wtarget; + unsigned int *pwtarget = (unsigned int *)target; + unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U); + unsigned int sbit = bit & 0x1FU; + wtarget = (unsigned int *)&pwtarget[aword]; + + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = setbit(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*wtarget) + : "r" (wtarget), "r" (sbit) + : "p0"); +} + +/**@ingroup func_qurt_atomic64_clear_bit + Clears a bit in a 64-bit atomic variable at a specified position. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] bit Bit position to clear. + + @return + None. + + @dependencies + None. 
+*/
+static inline QURT_INLINE void
+qurt_atomic64_clear_bit(unsigned long long *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int *wtarget;
+    unsigned int *pwtarget = (unsigned int *)target;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit & 0x1FU;
+    wtarget = (unsigned int *)&pwtarget[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = clrbit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget), "r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_change_bit
+   Toggles a bit in a 64-bit atomic variable at a specified position.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] bit Bit position to toggle.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_change_bit(unsigned long long *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int *wtarget;
+    unsigned int *pwtarget = (unsigned int *)target;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit & 0x1FU;
+    wtarget = (unsigned int *)&pwtarget[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = togglebit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget),"r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_add
+   Adds a 64-bit integer to a 64-bit atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v 64-bit integer value to add.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_add(unsigned long long *target, unsigned long long v)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (v)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_add_return
+   Adds a 64-bit integer to a 64-bit atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v 64-bit integer value to add.
+
+   @return
+   Result of arithmetic sum.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_add_return(unsigned long long *target, unsigned long long v)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (v)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_sub_return
+   Subtracts a 64-bit integer from an atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v 64-bit integer value to subtract.
+
+   @return
+   Result of arithmetic subtraction.
+
+   @dependencies
+   None.
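+
+   @par Example
+   A minimal usage sketch (illustrative only, not from the original SDK
+   documentation); the byte counter is hypothetical:
+   @code
+   static unsigned long long bytes_in_flight;
+
+   unsigned long long complete_transfer(unsigned long long len)
+   {
+       // Subtract and observe the remaining total in one atomic step;
+       // a separate load after qurt_atomic64_sub() could race.
+       return qurt_atomic64_sub_return(&bytes_in_flight, len);
+   }
+   @endcode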
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_sub_return(unsigned long long *target, unsigned long long v)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = sub(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (v)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_inc
+   Increments a 64-bit atomic variable by one.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_inc(unsigned long long *target)
+{
+    unsigned long long result;
+    unsigned long long inc = 1;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target),"r" (inc)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_inc_return
+   Increments a 64-bit atomic variable by one.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+
+   @return
+   Incremented value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_inc_return(unsigned long long *target)
+{
+    unsigned long long result;
+    unsigned long long inc = 1;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target),"r" (inc)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_dec_return
+   Decrements a 64-bit atomic variable by one.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+
+   @return
+   Decremented value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_dec_return(unsigned long long *target)
+{
+    unsigned long long result;
+    long long minus1 = 0xFFFFFFFFFFFFFFFFLL;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target),"r" (minus1)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_compare_and_set
+   Compares the current value of a 64-bit atomic variable with
+   the specified value and sets it to a new value when the comparison is successful.
+
+   @note1hang The function keeps retrying until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] old_val 64-bit old value to compare.
+   @param[in] new_val 64-bit new value to set.
+
+   @return
+   FALSE -- Specified value is not equal to the current value. \n
+   TRUE -- Specified value is equal to the current value.
+
+   @dependencies
+   None.
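+
+   @par Example
+   A minimal compare-and-set retry loop (illustrative only, not from the
+   original SDK documentation) that maintains a running 64-bit maximum:
+   @code
+   static unsigned long long high_water_mark;
+
+   void update_high_water_mark(unsigned long long sample)
+   {
+       unsigned long long seen = high_water_mark;
+       // Retry until either the stored value is already >= sample or the
+       // compare-and-set installs sample atomically.
+       while (seen < sample) {
+           if (qurt_atomic64_compare_and_set(&high_water_mark, seen, sample)) {
+               break;
+           }
+           seen = high_water_mark;   // lost the race; reload and retry
+       }
+   }
+   @endcode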
+*/
+static inline QURT_INLINE int
+qurt_atomic64_compare_and_set(unsigned long long *target,
+                              unsigned long long old_val,
+                              unsigned long long new_val)
+{
+    unsigned long long current_val;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       p0 = cmp.eq(%0, %3)\n"
+        "       if !p0 jump 2f\n"
+        "       memd_locked(%2, p0) = %4\n"
+        "       if !p0 jump 1b\n"
+        "2:\n"
+        : "=&r" (current_val),"+m" (*target)
+        : "r" (target), "r" (old_val), "r" (new_val)
+        : "p0");
+
+    return (int)(current_val == old_val);
+}
+
+/**@ingroup func_qurt_atomic64_barrier
+   Allows the compiler to enforce an ordering constraint on memory operations issued
+   before and after the function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_barrier(void)
+{
+    /** @cond */
+    __asm__ __volatile__ (
+        ""
+        :
+        :
+        :
+        "memory");
+    /** @endcond */
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ATOMIC_OPS_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_barrier.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_barrier.h
new file mode 100755
index 0000000000000..7c6f787d43bc2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_barrier.h
@@ -0,0 +1,140 @@
+#ifndef QURT_BARRIER_H
+#define QURT_BARRIER_H
+
+/**
+  @file qurt_barrier.h
+  @brief Prototypes of kernel barrier API functions.
+
+  EXTERNALIZED FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021 Qualcomm Technologies, Inc. All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup barrier_types
+@{ */
+/*=====================================================================
+ Constants and macros
+======================================================================*/
+#define QURT_BARRIER_SERIAL_THREAD 1 /**< Serial thread. */
+#define QURT_BARRIER_OTHER 0 /**< Other. */
+
+#ifndef ASM
+#include
+
+/*=====================================================================
+Typedefs
+======================================================================*/
+
+/** QuRT barrier type.
+ */
+typedef union {
+    /** @cond */
+    struct {
+        unsigned short threads_left;
+        unsigned short count;
+        unsigned int threads_total;
+        unsigned int queue;
+        unsigned int reserved;
+    };
+    unsigned long long int raw;
+    /** @endcond */
+} qurt_barrier_t;
+
+/** @} */ /* end_addtogroup barrier_types */
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/*======================================================================*/
+/**@ingroup func_qurt_barrier_init
+   Initializes a barrier object.
+
+   @datatypes
+   #qurt_barrier_t
+
+   @param[out] barrier Pointer to the barrier object to initialize.
+   @param[in] threads_total Total number of threads to synchronize on the barrier.
+
+
+   @return
+   Unused integer value.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_barrier_init(qurt_barrier_t *barrier, unsigned int threads_total);
+
+/*======================================================================*/
+/**@ingroup func_qurt_barrier_destroy
+   Destroys the specified barrier.
+
+   @note1hang Barriers must be destroyed when they are no longer in use. 
Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Barriers must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_barrier_t + + @param[in] barrier Pointer to the barrier object to destroy. + + @return + Unused integer value. + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_barrier_destroy(qurt_barrier_t *barrier); + +/*======================================================================*/ +/**@ingroup func_qurt_barrier_wait + Waits on the barrier.\n + Suspends the current thread on the specified barrier. \n + The function return value indicates whether the thread was the last one to + synchronize on the barrier. + When a thread waits on a barrier, it is suspended on the barrier: \n + - If the total number of threads waiting on the barrier is less than the assigned value + of the barrier, no other action occurs. \n + - If the total number of threads waiting on the barrier equals the assigned value of the + barrier, all threads currently waiting on the barrier are awakened, allowing them to + execute past the barrier. + + @note1hang After its waiting threads are awakened, a barrier is automatically reset + and can be used again in the program without the need for re-initialization. + + @datatypes + #qurt_barrier_t + + @param[in] barrier Pointer to the barrier object to wait on. + + @return + #QURT_BARRIER_OTHER -- Current thread awakened from barrier. \n + #QURT_BARRIER_SERIAL_THREAD -- Current thread is last caller of barrier. + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_barrier_wait(qurt_barrier_t *barrier); + + +#endif + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_BARRIER_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_busywait.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_busywait.h new file mode 100755 index 0000000000000..a4dab80a2520a --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_busywait.h @@ -0,0 +1,62 @@ +#ifndef QURT_BUSYWAIT_H +#define QURT_BUSYWAIT_H + +/** + @file qurt_busywait.h + @brief Implementation of the busywait() function for + hardware based blocking waits that use the QTIMER as a reference. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ============================================================================*/ +/*============================================================================= + * + * EDIT HISTORY FOR FILE + * + * This section contains comments describing changes made to the + * module. Changes are listed in reverse chronological + * order. 
+ *
+ *
+ * when       who     what, where, why
+ * ---------- ---     -------------------------------------------------------
+ * 2018-03-20 pg      Add Header file
+ ============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_busywait
+   Pauses the execution of a thread for a specified time.\n
+   Use for small microsecond delays.
+
+   @note1hang The function does not return to the caller until
+              the time duration has expired.
+
+   @param[in] pause_time_us Time to pause in microseconds.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_busywait (unsigned int pause_time_us);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_BUSYWAIT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_callback.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_callback.h
new file mode 100755
index 0000000000000..dc9b896c63454
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_callback.h
@@ -0,0 +1,235 @@
+#ifndef QURT_CALLBACK_H
+#define QURT_CALLBACK_H
+
+/**
+  @file qurt_callback.h
+  Definitions, macros, and prototypes for the QuRT callback framework.
+
+  The QDI framework allows the development of root process drivers and services that
+  a user process client can interact with in a secure manner. The QDI framework does
+  this by elevating the privilege of the user process thread, temporarily allowing
+  the thread to execute in root context and letting it fall back to user context once
+  the QDI invocation is finished.
+
+  The QuRT callback framework provides a safe mechanism for root process drivers
+  to execute callback functions in a user process. The framework hosts
+  dedicated worker threads in corresponding processes that handle the execution
+  of the callback function. This ensures that the callbacks occur in context of
+  the appropriate process thread, as a result maintaining privilege boundaries.
+
+  Prerequisites for use of this framework are:
+  1. Driver is a QDI driver and client communicates with drivers using QDI
+     invocations.
+  2. Appropriate callback configuration is specified in cust_config.xml for
+     the user process that intends to use this framework.
+
+  qurt_cb_data_t is the public data structure that allows a client to store all
+  the required information about the callback, including the callback function
+  and the arguments to pass to this function when it executes.
+  The client uses the QDI interface to register this structure with the root driver.
+
+  The callback framework provides the following APIs that a root driver can use to invoke a callback.
+  These functions are described in the qurt_qdi_driver.h header file.
+
+  qurt_qdi_cb_invoke_async() triggers an asynchronous callback wherein the
+  invoking thread does not wait for the callback to finish executing.
+
+  qurt_qdi_cb_invoke_sync() triggers a synchronous callback. Upon invocation
+  the invoking thread gets suspended till the callback function finishes execution.
+
+  qurt_qdi_cb_invoke_sync_with_data() invokes a synchronous callback similar to
+  qurt_qdi_cb_invoke_sync(). It allows the user to pass large data along with
+  the callback invocation to be utilized during the callback execution.
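+
+  As an illustrative sketch only (not part of the original header text), a
+  user-process client might fill in the registration structure like this
+  before handing it to its driver through a QDI invocation; the callback
+  function my_event_handler and the argument MY_EVENT_ID are hypothetical:
+
+  @code
+  qurt_cb_data_t cb_data;
+  qurt_cb_data_init(&cb_data);
+  qurt_cb_data_set_cbfunc(&cb_data, (void *)my_event_handler);
+  qurt_cb_data_set_cbarg(&cb_data, MY_EVENT_ID);
+  // cb_data is then passed to the driver via its QDI interface.
+  @endcode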
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_qdi.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int qurt_cb_result_t;
+
+/* Callback framework error codes.
+   The callback framework returns a nonzero value if callback invocation is unsuccessful.
+   The following macros highlight the cause of failure in more detail.
+*/
+#define QURT_CB_ERROR -1 /* Callback registration failed.\n*/
+#define QURT_CB_OK 0 /* Success.\n*/
+#define QURT_CB_MALLOC_FAILED -2 /* QuRTOS malloc failure.\n*/
+#define QURT_CB_WAIT_CANCEL -3 /* Process exit cancelled wait operation.\n*/
+#define QURT_CB_CONFIG_NOT_FOUND -4 /* Callback configuration for process was not found.\n*/
+#define QURT_CB_QUEUE_FULL -5 /* Callback queue is serving at maximum capacity.*/
+/** @addtogroup cb_types
+@{ */
+/** Callback registration data structure.
+    This data structure is used by a client attempting to register a callback with a QDI driver.
+    It holds the address of the callback function and the argument supplied to the callback
+    function when it executes.
+*/
+typedef struct {
+    /** @cond */
+    void* cb_func;   /*< Pointer to the callback function. */
+    unsigned cb_arg; /*< Not interpreted by the framework.*/
+    /** @endcond */
+} qurt_cb_data_t;
+
+/** @cond */
+/* Defines used as default if cust_config does not specify them. */
+#define CALLBACK_WORKER_STACK_SIZE 0x2000
+/** @endcond */
+/** @} */ /* end_addtogroup cb_types */
+/**@ingroup func_qurt_cb_data_init
+   Initializes the callback data structure.
+   An entity registering a callback with the root process driver must call this function
+   to initialize the callback registration data structure to its default value.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data Pointer to the callback data structure.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_init (qurt_cb_data_t* cb_data){
+    cb_data->cb_func = NULL;
+    cb_data->cb_arg = 0;
+}
+
+/**@ingroup func_qurt_cb_data_set_cbfunc
+   Sets up the callback function in the callback registration data structure.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data Pointer to the callback data structure.
+   @param[in] cb_func Pointer to the callback function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_set_cbfunc (qurt_cb_data_t* cb_data, void* cb_func){
+    cb_data->cb_func = cb_func;
+}
+
+/**@ingroup func_qurt_cb_data_set_cbarg
+   Sets up the callback argument.
+   This function sets up the argument passed to the callback function when it executes.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data Pointer to the callback data structure.
+   @param[in] cb_arg Argument for the callback function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_set_cbarg (qurt_cb_data_t* cb_data, unsigned cb_arg){
+    cb_data->cb_arg = cb_arg;
+}
+
+/** @cond */
+/**@ingroup driver_support_functions
+   Invokes an asynchronous callback for a specified process.
+   A driver that resides in the root process calls this API to launch a callback in
+   a process described by the client_handle.
+   After the callback is invoked, the framework queues the callback as per its
+   priority and subsequently executes it.
+   The caller of this function is not suspended during the callback execution period.
+
+   The API returns immediately with a success/failure error code.
+
+   @note1hang This function is only accessible to drivers in the root process.
+              User process invocations shall fail with a negative error code return value.
+
+   @param client_handle Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio Priority at which the callback should execute.
+               This parameter is optional. If -1 is passed, the callback framework
+               executes the callback at the priority of the API caller.
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_async(int client_handle,
+                                          qurt_cb_data_t* cb_data,
+                                          int prio);
+
+
+/**@ingroup driver_support_functions
+   Invokes a synchronous callback for a specified process.
+   A driver that resides in a root process calls this API to launch a sync callback in
+   a process described by the client_handle.
+   After the callback is invoked, the framework queues the callback as per its
+   priority and subsequently executes it.
+   The caller of this function is suspended during the callback execution period.
+   If the process in which to execute the callback exits or terminates, the caller is
+   woken up with error code #QURT_CB_WAIT_CANCEL (refer to qurt_callback.h).
+
+   @note1hang This function is only accessible to drivers in the root process.
+              User process invocations shall fail with a negative error code return value.
+
+   @param client_handle Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio Priority at which the callback should execute.
+               This parameter is optional. If -1 is passed, the callback framework
+               executes the callback at the priority of the API caller.
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_sync(int client_handle,
+                                         qurt_cb_data_t* cb_data,
+                                         int prio);
+
+/**@ingroup driver_support_functions
+   Invokes a synchronous callback for a specified process, passing driver data to the user PD.
+   This function is similar to qurt_qdi_cb_invoke_sync() and allows the driver to pass arbitrary data to
+   the user process as part of the callback invocation.
+
+   @param client_handle Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio Priority at which the callback should execute.
+               This parameter is optional. If -1 is passed, the callback framework
+               executes the callback at the priority of the API caller.
+   @param data Driver arbitrary data to pass to the user process. Memory pointed to by data
+               must be accessible to the user PD. The root driver can allocate such memory by
+               using qurt_mem_mmap().
+   @param data_len Driver arbitrary data length.
+
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
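+
+   @par Example
+   An illustrative root-driver sketch (not from the original SDK
+   documentation); client_handle, the shared buffer, and its length are
+   assumed to come from the surrounding QDI invocation:
+   @code
+   qurt_cb_result_t notify_client(int client_handle, qurt_cb_data_t *cb_data,
+                                  void *shared_buf, unsigned buf_len)
+   {
+       // Suspend until the user-process callback has consumed shared_buf;
+       // -1 runs the callback at the caller's priority.
+       return qurt_qdi_cb_invoke_sync_with_data(client_handle, cb_data, -1,
+                                                shared_buf, buf_len);
+   }
+   @endcode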
+ */ +qurt_cb_result_t qurt_qdi_cb_invoke_sync_with_data( int client_handle, + qurt_cb_data_t* cb_data, + int prio, + void *data, + unsigned data_len + ); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_clade.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_clade.h new file mode 100755 index 0000000000000..d7442cf98dd94 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_clade.h @@ -0,0 +1,62 @@ +#ifndef QURT_CLADE_H +#define QURT_CLADE_H +/** + @file qurt_clade.h + @brief Prototypes of Cache Line Accelerated Decompression Engine (CLADE) API. + CLADE is a cache line level memory compression system that is used to + decrease DRAM usage. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_clade2_get + Reads the value of the clade2 register. + + @param[in] offset Offset from the clade2 cfg base. + @param[out] *value Pointer to the register value read from the offset. + + @return + #QURT_EOK - Successfully read the value from the register at offset \n + #QURT_EINVALID - Offset passed is incorrect + + @dependencies + None. + */ +int qurt_clade2_get(unsigned short offset, unsigned int *value); + +/**@ingroup func_qurt_clade2_set + Sets the PMU register; only PMU_SEL register can be set. + + @param[in] offset Offset from the QURTK_clade2_cfg_base. + @param[in] value Value to set at offset. + + @return + #QURT_EOK -- Successfully set the value at offset. \n + #QURT_ENOTALLOWED -- Set operation performed at an offset other than CLADE2_PMU_SELECTION_REG. + + @dependencies + None. + */ +int qurt_clade2_set(unsigned short offset, unsigned int value); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_CLADE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_cond.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_cond.h new file mode 100755 index 0000000000000..6e65ed82a8393 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_cond.h @@ -0,0 +1,219 @@ +#ifndef QURT_COND_H +#define QURT_COND_H +/** + @file qurt_cond.h + @brief Prototypes of kernel condition variable object API functions. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 Qualcomm Technologies, Inc. + All rights reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup condition_variables_types +@{ */ +/*===================================================================== + Typedefs + ======================================================================*/ + +/** QuRT condition variable type. 
*/ +typedef union { + /** @cond */ + unsigned long long raw; + struct { + unsigned int count; + unsigned int n_waiting; + unsigned int queue; + unsigned int reserved; + }X; + /** @endcond */ +} qurt_cond_t; + +/** @} */ /* end_addtogroup condition_variables_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_cond_init + Initializes a conditional variable object. + + @datatypes + #qurt_cond_t + + @param[out] cond Pointer to the initialized condition variable object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_cond_init(qurt_cond_t *cond); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_destroy + Destroys the specified condition variable. + + @note1hang Conditions must be destroyed when they are no longer in use. Failure to do + this causes resource leaks in the QuRT kernel.\n + @note1cont Conditions must not be destroyed while they are still in use. If this occurs, + the behavior of QuRT is undefined. + + @datatypes + #qurt_cond_t + + @param[in] cond Pointer to the condition variable object to destroy. + + @return + None. + + */ +/* ======================================================================*/ +void qurt_cond_destroy(qurt_cond_t *cond); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_signal + Signals a waiting thread that the specified condition is true. \n + + When a thread wishes to signal that a condition is true on a shared data item, it must + perform the following procedure: \n + -# Lock the mutex that controls access to the data item. \n + -# Perform the signal condition operation. \n + -# Unlock the mutex. + + @note1hang Failure to properly lock and unlock a mutex of a condition variable can cause + the threads to never be suspended (or suspended but never awakened). + + @note1cont Use condition variables only with regular mutexes -- attempting to use + recursive mutexes or priority inheritance mutexes results in undefined behavior. + + @datatypes + #qurt_cond_t + + @param[in] cond Pointer to the condition variable object to signal. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_cond_signal(qurt_cond_t *cond); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_broadcast + Signals multiple waiting threads that the specified condition is true.\n + When a thread wishes to broadcast that a condition is true on a shared data item, it must + perform the following procedure: \n + -# Lock the mutex that controls access to the data item. \n + -# Perform the broadcast condition operation. \n + -# Unlock the mutex.\n + + @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause + the threads to never be suspended (or suspended but never awakened). + + @note1cont Use condition variables only with regular mutexes -- attempting to use + recursive mutexes or priority inheritance mutexes results in undefined behavior. + + @datatypes + #qurt_cond_t + + @param[in] cond Pointer to the condition variable object to signal. + + @return + None. + + @dependencies + None. 
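+
+   @par Example
+   A minimal signaling sketch (illustrative only, not from the original SDK
+   documentation) following the lock/update/broadcast procedure above;
+   qurt_mutex_lock()/qurt_mutex_unlock() from qurt_mutex.h are assumed, and
+   the objects are assumed to be initialized elsewhere:
+   @code
+   static qurt_mutex_t lock;
+   static qurt_cond_t  ready;
+   static int          data_ready;
+
+   void publish(void)
+   {
+       qurt_mutex_lock(&lock);
+       data_ready = 1;                 // update the shared data item
+       qurt_cond_broadcast(&ready);    // wake all waiters
+       qurt_mutex_unlock(&lock);
+   }
+
+   void consume(void)
+   {
+       qurt_mutex_lock(&lock);
+       while (data_ready == 0) {
+           qurt_cond_wait(&ready, &lock);  // atomically unlocks and waits
+       }
+       qurt_mutex_unlock(&lock);
+   }
+   @endcode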
+ */ +/* ======================================================================*/ +void qurt_cond_broadcast(qurt_cond_t *cond); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_wait + Suspends the current thread until the specified condition is true. + When a thread wishes to wait for a specific condition on a shared data item, it must + perform the following procedure: \n + -# Lock the mutex that controls access to the data item. \n + -# If the condition is not satisfied, perform the wait condition operation on the + condition variable (suspends the thread and unlocks the mutex). + + @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause + the threads to never be suspended (or suspended but never awakened). + + @note1cont Use condition variables only with regular mutexes -- attempting to use + recursive mutexes or priority inheritance mutexes results in undefined behavior. + + @datatypes + #qurt_cond_t \n + #qurt_mutex_t + + @param[in] cond Pointer to the condition variable object to wait on. + @param[in] mutex Pointer to the mutex associated with condition variable to wait on. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_cond_wait(qurt_cond_t *cond, qurt_mutex_t *mutex); + +/*======================================================================*/ +/**@ingroup func_qurt_cond_wait2 + Suspends the current thread until the specified condition is true. + When a thread wishes to wait for a specific condition on a shared data item, it must + perform the following procedure: \n + -# Lock the mutex that controls access to the data item. \n + -# If the condition is not satisfied, perform the wait condition operation on the + condition variable, which suspends the thread and unlocks the mutex. + + @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause + the threads to never be suspended (or suspended but never awakened). + + @note1cont Use condition variables only with regular mutexes -- attempting to use + recursive mutexes or priority inheritance mutexes results in undefined behavior. + + @note1cont This is the same API as qurt_cond_wait(), use this version + when using mutexes of type #qurt_rmutex2_t. + + @datatypes + #qurt_cond_t \n + #qurt_rmutex2_t + + @param[in] cond Pointer to the condition variable object to wait on. + @param[in] mutex Pointer to the mutex associated with the condition variable to wait on. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_cond_wait2(qurt_cond_t *cond, qurt_rmutex2_t *mutex); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_COND_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_consts.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_consts.h new file mode 100755 index 0000000000000..b1e35998e73b6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_consts.h @@ -0,0 +1,315 @@ +#ifndef QURT_CONSTS_H +#define QURT_CONSTS_H + +/** + @file qurt_consts.h + @brief QuRT constants and definitions + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2013-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Constants and macros
+ ======================================================================*/
+
+/* Definitions of system events. System events suspend
+   a thread and put it into suspending_list.
+   The system event number is saved in the CONTEXT::error::cause field
+   of the suspended thread. An event handler thread such as the
+   page fault handler or system error handler can wake up the suspended
+   thread.
+ */
+#define QURT_EVENT_PAGEFAULT 0x1 /* Page fault event. */
+#define QURT_EVENT_SYSTEM_ERR 0x2 /* System error event. */
+#define QURT_EVENT_SUSPEND 0x3
+#define QURT_EVENT_PROCESS_EXIT 0x4 /* Process termination event.*/
+
+#define QURT_SYSENV_MAX_THREADS_TYPE 1 /* Maximum threads object. */
+#define QURT_SYSENV_PROCNAME_TYPE 2 /* Process name object. */
+#define QURT_SYSENV_MAX_PI_PRIO_TYPE 3 /* Maximum PI priority object. */
+#define QURT_SYSENV_ARCH_REV_TYPE 4 /* Architecture version object. */
+#define QURT_SYSENV_APP_HEAP_TYPE 5 /* Application heap object. */
+#define QURT_SYSENV_REGION_ATTR_DEFAULT 7 /* Default region attributes. */
+#define QURT_SYSENV_STACK_PROFILE_COUNT_TYPE 8 /* Stack profile count type. */
+#define QURT_SYSENV_ISLAND_CONFIG_TYPE 9 /* Island configuration check. */
+#define QURT_SYSENV_HTHREADS_TYPE 10 /* Active threads object. */
+#define QURT_SYSENV_CONFIG_IMAGE_START_LO 11 /* Config image start address for DTB parsing. */
+#define QURT_SYSENV_CONFIG_IMAGE_START_HI 12 /* Config image start address for DTB parsing. */
+#define QURT_SYSENV_CHIPPARAMS_LO 13 /* ChipParams for DTB parsing. */
+#define QURT_SYSENV_CHIPPARAMS_HI 14 /* ChipParams for DTB parsing. */
+#define QURT_SYSENV_PLATPARAMS 15 /* PlatformParams for DTB parsing. */
+#define QURT_SYSENV_CONFIG_IMAGE_SIZE 16 /* Config image size for DTB parsing. */
+#define QURT_SYSENV_L2_CACHE_LINE_SIZE 17 /* L2 cache line size. */
+
+/* Get Q6 registers. */
+#define QURT_GET_SSR 1
+#define QURT_GET_CCR 2
+#define QURT_GET_CFGBASE 3
+#define QURT_GET_SYSCFG 4
+#define QURT_GET_REV 5
+
+
+/** @cond rest_reg_dist */
+/** @addtogroup performance_monitor_macros
+@{ */
+
+/* PMU */
+#define QURT_PMUCNT0 0 /**< */
+#define QURT_PMUCNT1 1 /**< */
+#define QURT_PMUCNT2 2 /**< */
+#define QURT_PMUCNT3 3 /**< */
+#define QURT_PMUCFG 4 /**< */
+#define QURT_PMUEVTCFG 5 /**< */
+
+/* New since V55. */
+#define QURT_PMUCNT4 6 /**< */
+#define QURT_PMUCNT5 7 /**< */
+#define QURT_PMUCNT6 8 /**< */
+#define QURT_PMUCNT7 9 /**< */
+#define QURT_PMUEVTCFG1 10 /**< */
+
+/* New since V61. */
+#define QURT_PMUSTID0 11 /**< */
+#define QURT_PMUSTID1 12 /**< */
+
+#define QURT_PMUCNTSTID0 13 /**< */
+#define QURT_PMUCNTSTID1 14 /**< */
+#define QURT_PMUCNTSTID2 15 /**< */
+#define QURT_PMUCNTSTID3 16 /**< */
+#define QURT_PMUCNTSTID4 17 /**< */
+#define QURT_PMUCNTSTID5 18 /**< */
+#define QURT_PMUCNTSTID6 19 /**< */
+#define QURT_PMUCNTSTID7 20 /**< */
+
+/** @} */ /* end_addtogroup performance_monitor_macros */
+/** @endcond */
+
+/*
+   Power collapse operation
+*/
+#define QURT_POWER_SHUTDOWN 0 /**< */
+#define QURT_TCXO_SHUTDOWN 1 /**< */
+#define QURT_POWER_CMD_PREPARE 0 /**< */
+#define QURT_POWER_CMD_PERFORM 1 /**< */
+#define QURT_POWER_CMD_EXIT 2 /**< */
+#define QURT_POWER_CMD_FAIL_EXIT 3 /**< */
+#define QURT_POWER_CMD_PERFORM_L2_RETENTION 4 /**< */
+#define QURT_POWER_CMD_PERFORM_SAVE_TCM 5 /**< */
+#define QURT_POWER_CMD_DEEP_SLEEP 6 /**< */
+
+
+/** 
@addtogroup thread_macros +@{ */ +#define QURT_MAX_HTHREAD_LIMIT 8U /**< Limit on the maximum number of hardware threads supported by QuRT for any + Hexagon version. Use this definition to define arrays, and so on, in + target independent code. */ +/** @} */ /* end_addtogroup thread_macros */ + +/** @cond internal_only */ +/** @addtogroup power_management_macros +@{ */ +/** + L2 cache retention mode +*/ +#define QURT_POWER_SHUTDOWN_TYPE_L2NORET QURT_POWER_CMD_PERFORM /**< */ +#define QURT_POWER_SHUTDOWN_TYPE_L2RET QURT_POWER_CMD_PERFORM_L2_RETENTION /**< */ +#define QURT_POWER_SHUTDOWN_TYPE_SAVETCM QURT_POWER_CMD_PERFORM_SAVE_TCM /**< */ +/** @} */ /* end_addtogroup power_management_macros */ +/** @endcond */ + +/* + QURT_system_state + Use for debugging the shutdown/startup process. + + State transition for cold boot: + QURT_BOOT_SETUP_ISDB --> QURT_CBOOT_BSP_INIT --> + QURT_CBOOT_END_CLEAN_INIT --> QURT_CBOOT_END_OS_INIT --> + QURT_CBOOT_KERNEL_INIT_DONE --> QURT_CBOOT_PLAT_CONFIG_DONE --> + QURT_CBOOT_ROOT_TASK_STARTED + + State transition for power collapse: + QURT_PREPARE_SINGLE_MODE --> QURT_PERFORM_IPEND --> + QURT_PERFORM_SAVE_TLB --> QURT_PERFORM_SWITCH_PC --> + cache flush states (dependent on L2 retention config) + + State transition for warm boot: + QURT_BOOT_SETUP_ISDB --> QURT_WBOOT_INIT_TLB --> + QURT_WBOOT_SET_1TO1_MAP --> QURT_WBOOT_REMOVE_1TO1_MAP --> + QURT_CBOOT_END_CLEAN_INIT --> QURT_CBOOT_END_OS_INIT +*/ +#define QURT_PREPARE_SINGLE_MODE 1 /**< */ +#define QURT_PREPARE_END 2 /**< */ +#define QURT_PERFORM_IPEND 3 /**< */ +#define QURT_PERFORM_SAVE_ISDP 4 /**< */ +#define QURT_PERFORM_SAVE_PMU 5 /**< */ +#define QURT_PERFORM_SAVE_TLB 6 /**< */ +#define QURT_PERFORM_SWITCH_PC 7 /**< */ +#define QURT_PERFORM_EXIT 8 /**< */ +#define QURT_FLUSH_L1CACHE 9 /**< */ +#define QURT_FLUSH_L2CACHE 0xA /**< */ +#define QURT_FLUSH_CACHE_DONE 0xB /**< */ +#define QURT_SWITCH_PC_DONE 0xC /**< */ +#define QURT_BOOT_SETUP_ISDB 0xD /**< */ +#define QURT_WBOOT_INIT_TLB 0xE /**< */ +#define QURT_WBOOT_SET_1TO1_MAP 0xF /**< */ +#define QURT_WBOOT_CFG_ADV_SYSCFG 0x10 /**< */ +#define QURT_WBOOT_REMOVE_1TO1_MAP 0x11 /**< */ +#define QURT_CBOOT_BSP_INIT 0x12 /**< */ +#define QURT_CBOOT_END_CLEAN_L1CACHE 0x13 /**< */ +#define QURT_CBOOT_END_CLEAN_INIT 0x14 /**< */ +#define QURT_CBOOT_END_OS_INIT 0x15 /**< */ +#define QURT_CBOOT_TLB_DUMP_LOAD 0x16 /**< */ +#define QURT_CBOOT_TLB_STATIC_LOAD 0x17 /**< */ +#define QURT_CBOOT_KERNEL_INIT_DONE 0x18 /**< */ +#define QURT_CBOOT_PLAT_CONFIG_DONE 0x19 /**< */ +#define QURT_CBOOT_ROOT_TASK_STARTED 0x1A /**< */ +#define QURT_IMPRECISE_EXCEPTION 0x1B /**< */ +#define QURT_WBOOT_DEBUG_L2_START 0x1C /**< */ +#define QURT_WBOOT_DEBUG_L2_END 0x1D /**< */ +#define QURT_NMI_SAVE_L2VIC_COMPLETE 0x1E /**< */ +#define QURT_NMI_HANDLER_COMPLETE 0x1F /**< */ +#define QURT_NMI_AFTER_SAVE_GLOBAL 0x20 /**< */ +#define QURT_WBOOT_START 0x21 /**< */ +#define QURT_ENTER_ISLAND 0x22 /**< */ +#define QURT_EXIT_ISLAND 0x23 /**< */ +#define QURT_LOAD_NOTIFIER_TCB 0x24 /**< */ +#define QURT_ABNORMAL_RESET 0x25 /**< */ +/* + Thread attributes +*/ + +#define QURT_THREAD_ATTR_GP 0x00000002 /*< */ +#define QURT_THREAD_ATTR_UGP 0x00000003 /*< User general pointer (UGP)*/ +#define QURT_THREAD_ATTR_PREFETCH 0x00000004 /*< */ +#define QURT_THREAD_ATTR_TID 0x00000005 /*< */ +#define QURT_THREAD_ATTR_CACHE_PART 0x00000007 /*< */ +#define QURT_THREAD_ATTR_COPROCESSOR 0x00000008 /*< */ +#define QURT_THREAD_ATTR_GET_L2CACHE_PART 0x00000009 /*< */ +#define QURT_THREAD_ATTR_SET_FRML 
0x0000000A /*< */ +#define QURT_THREAD_ATTR_STID_GET 0x0000000B /*< */ +#define QURT_THREAD_ATTR_STID_SET 0x0000000C /*< */ +#define QURT_THREAD_ATTR_AUTOSTACK 0x0000000D /*< */ +#define QURT_THREAD_ATTR_SYSTEM_THREAD 0x0000000E /*< */ +#define QURT_THREAD_ATTR_STID_SET2 0x0000000F /*< */ +#define QURT_THREAD_ATTR_STID_SET2_ACKNOWLEDGE 0x00000010 /*< */ +#define QURT_THREAD_ATTR_STID_GET2 0x00000011 /*< */ + +/** Cache operations*/ +#define QURT_DCCLEAN 0U /* Clean Dcache. */ +#define QURT_DCINV 1U /* Invalidate Dcache. */ +#define QURT_DCCLEANINV 2U /* Clean and invalidate Dcache. */ +#define QURT_ICINV 3U /* Invalidate Icache. */ +#define QURT_DUMP_DCTAGS 4U /* For testing purpose. */ +#define QURT_FLUSH_ALL 5U /* Flush entire L1 and L2 cache. */ +#define QURT_TABLE_FLUSH 6U /* Flush based on table of physical pages */ +#define QURT_CLEAN_INVALIDATE_ALL 7U /* Flush and invalidate entire L1 and L2 cache. */ +#define QURT_L2CACHE_LOCK_LINES 8U /* l2 cache lock lines */ +#define QURT_L2CACHE_UNLOCK_LINES 9U /* l2 cache unlock lines */ +#define QURT_CLEAN 10U /* Flush L1 and L2 cache */ +#define QURT_CLEAN_INVALIDATE 11U /* Flush and invalidate L1 and L2 cache. */ +#define QURT_CLEAN_INVALIDATE_L2 12U /* Flush and invalidate entire L2 cache. */ + +/**@ingroup chapter_prefined_symbols */ +/**@xreflabel{hdr:QURT_API_VERSION}*/ + + +/* Process state. */ +#define QURT_UPDATE_PROCESS_STATE 0 /**< */ +#define QURT_MP_INIT 1 /*< */ +#define QURT_MP_RUNNING 2 /*< */ +#define QURT_MP_STOPPED 3 /*< */ + +/* QuRT reset reason. */ +#define QURT_NORMAL_BOOT 0 /* Normal boot. */ +#define QURT_WARM_BOOT 1 /* Power collapse warm boot. */ +#define QURT_WARM_BOOT_L2_RETENTION 2 /* Power collapse with L2 retention warm boot. */ +#define QURT_WARM_BOOT_SAVE_TCM 3 /* Power collapse with saving TCM. */ +#define QURT_QUICK_BOOT 4 /* Deep sleep. */ + +/* QuRT Wait for Idle command */ +#define QURT_WAIT_FOR_IDLE_DISABLE 0 /*< */ +#define QURT_WAIT_FOR_IDLE_ENABLE 1 /*< */ +#define QURT_WAIT_FOR_IDLE 2 /*< */ +#define QURT_WAIT_FOR_IDLE_CANCEL 3 /*< */ + +/*QuRT island exit stages */ +#define QURT_ISLAND_EXIT_STAGE1 1 /*< */ +#define QURT_ISLAND_EXIT_STAGE2 2 /*< */ + +#define QURT_MAX_NAME_LEN 64 /*< */ + +#define MAX_POOL_RANGES 16 /*< */ + +/* key definitions for debug thread info */ +//#define MAX_TCB_KEY 40 //whatever is a good number or makes debug thread structure be 1K +#define KEY_SCHDULER_STATE 1 /*< */ +#define KEY_PRIORITY 2 /*< */ +#define KEY_PRIORITY_ORIG 3 /*< */ +#define KEY_STACK_BOTTOM 4 // Currently not populated +#define KEY_STACK_TOP 5 // Currently not populated +#define KEY_HVX_STATE 6 /*< */ +#define KEY_FUTEX_OBJECT 7 /*< */ +#define KEY_THREAD_ID 8 /*< */ +#define KEY_PROFILE_CYCLE_LO 9 // Currently not populated +#define KEY_PROFILE_CYCLE_HI 10 // Currently not populated +#define KEY_ERROR_ADDRESS 11 // This holds the BADVA +#define KEY_ERROR_CAUSE 12 // This is the same as QURT_error_info.cause +#define KEY_ERROR_CAUSE2 13 // This is the same as QURT_error_info.cause2 +#define KEY_ERROR_SSR 14 /*< Holds the SSR value */ +#define QURT_RESERVED -1 + +/* VTLB method IDs. 
*/ +#define QURT_VTLB_ENTRY_CREATE 0U +#define QURT_VTLB_ENTRY_DELETE 1U +#define QURT_VTLB_ENTRY_READ 2U +#define QURT_VTLB_ENTRY_WRITE 3U +#define QURT_VTLB_ENTRY_PROBE 4U +#define QURT_VTLB_ENTRY_SPLIT 5U +#define QURT_VTLB_ENTRY_MERGE 6U +#define QURT_VTLB_ENTRY_STATISTICS 7U +#define QURT_VTLB_ENTRY_SET_SPECIAL 8U +#define QURT_VTLB_QUEUE_PPAGE 9U +#define QURT_VTLB_RECLAIM_STACK_PAGES 10U +#define QURT_VTLB_ASID_SET_STATE_FAST 11U +#define QURT_VTLB_ASID_SET_STATE 12U +#define QURT_VTLB_ENTRY_SET_EXTENSION 13U +#define QURT_VTLB_ENTRY_CLEAR_EXTENSION 14U + +/* VTCM window access control HWIO programming. */ +#define QURT_VTCM_WINDOW_ENABLE 1U +#define QURT_VTCM_WINDOW_DISABLE 0U +#define QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT 0xFFFU +#define QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT 0U + +/** @cond */ +/* ETM source - PC or data access */ +#define QURT_ETM_SOURCE_PC 0U /**< Memory source of SAC* is PC. */ +#define QURT_ETM_SOURCE_DATA 1U /**< Memory source of SAC* is data. */ + +/* ETM PID status flags */ +#define QURT_ETM_NO_PID 0xFFFFFFFF /**< No PID is selected. */ +/** @endcond */ + +/* execution context */ +#define QURT_CTX_USER 1 +#define QURT_CTX_GUEST 2 + +/* Profiling STID */ +#define QURT_STID_DEFAULT 0U + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_CONSTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_cycles.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_cycles.h new file mode 100755 index 0000000000000..b599493f5d563 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_cycles.h @@ -0,0 +1,301 @@ + +#ifndef QURT_CYCLES_H +#define QURT_CYCLES_H 1 +/** + @file qurt_cycles.h + Prototypes of kernel pcycle API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + /*===================================================================== + Functions + ======================================================================*/ + +/*======================================================================*/ + +/**@ingroup func_qurt_profile_reset_idle_pcycles + @xreflabel{hdr:qurt_profile_reset_idle_pcycles} + Sets the per-hardware-thread idle cycle counts to zero. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_profile_reset_idle_pcycles (void); + +/*======================================================================*/ +/**@ingroup func_qurt_profile_get_thread_pcycles + @xreflabel{hdr:qurt_profile_get_thread_pcycles} + Gets the count of the running processor cycles for the current thread.\n + Returns the current running processor cycle count for the current QuRT thread. + + @note1hang Profiling shall be enabled first to start the cycle counting. + The cycles are accumulated once the profiling is enabled and + resets on #qurt_profile_reset_threadid_pcycles + + @return + Integer -- Running processor cycle count for current thread. + + @dependencies + None. 
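+
+   A minimal usage sketch (illustrative only; do_work() is a placeholder for the
+   code region being measured):
+   @code
+   qurt_profile_enable(1);                          // start cycle counting
+   unsigned long long start = qurt_profile_get_thread_pcycles();
+   do_work();                                       // region being measured
+   unsigned long long used = qurt_profile_get_thread_pcycles() - start;
+   qurt_profile_enable(0);                          // stop cycle counting
+   @endcode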
+*/
+/* ======================================================================*/
+unsigned long long int qurt_profile_get_thread_pcycles(void);
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_core_pcycles
+   @xreflabel{hdr:qurt_get_core_pcycles}
+   Gets the count of core processor cycles executed.\n
+   Returns the current number of running processor cycles executed since the Hexagon
+   processor was last reset.
+
+   This value is based on the hardware core clock, which varies in speed according to the
+   processor clock frequency.
+
+   @note1hang Because the hardware core clock stops running when the processor shuts
+              down (due to all of the hardware threads being idle), treat the cycle values returned
+              by this operation as relative rather than absolute.
+
+   @note1cont Thread cycle counts are valid only in the V4 Hexagon processor version.
+
+   @return
+   Integer -- Current count of core processor cycles.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+unsigned long long int qurt_get_core_pcycles(void);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_idle_pcycles
+
+   @deprecated Use #qurt_profile_get_idle_pcycles2 instead.
+
+   Gets the current idle processor cycle counts for a maximum of 6 hardware threads. Use
+   #qurt_profile_get_idle_pcycles2 to read pcycles without a limit on the maximum number of
+   hardware threads.
+
+   This operation accepts a pointer to a user-defined array, and writes to the array the current
+   idle cycle count for each hardware thread.
+
+   Each count value represents the number of processor cycles that have elapsed on the
+   corresponding hardware thread while that thread has been in Wait mode.\n
+
+   @note1hang This operation does not return the idle cycles that occur when the Hexagon
+              processor shuts down (due to all of the hardware threads being idle).
+              Idle cycle counts accumulate regardless of whether profiling is enabled,
+              and are reset by #qurt_profile_reset_idle_pcycles.
+
+   @param[out] pcycles User array where the function stores the current idle cycle count values.
+                       The array size should be at least the number of hardware threads of interest.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_get_idle_pcycles (unsigned long long *pcycles);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_idle_pcycles2
+   Gets the current idle processor cycle counts for the maximum available hardware threads.
+
+   This operation accepts a pointer to a user-defined array with a length in bytes, and writes
+   to the array the current idle cycle count for each hardware thread.
+
+   Each count value represents the number of processor cycles that have elapsed on the
+   corresponding hardware thread while that thread has been in Wait mode.\n
+
+   @note1hang This operation does not return the idle cycles that occur when the Hexagon
+              processor shuts down (due to all of the hardware threads being idle).
+              Idle cycle counts accumulate regardless of whether profiling is enabled,
+              and are reset by #qurt_profile_reset_idle_pcycles.
+
+   @param[out] pcycles User array where the function stores the current idle cycle count values.
+                       The array size should equal the number of hardware threads of interest.
+                       Call #qurt_sysenv_get_max_hw_threads to determine the array size required.
+
+   @param[in] length_in_bytes Length of the pcycles array in bytes. If the array is smaller
+                              than required for the maximum available hardware threads,
+                              an error code is returned.
+
+   @return
+   #QURT_EOK -- Successful operation; all data was stored to the destination array. \n
+   #QURT_EFAILED -- Operation failed because the #pcycles array is too small.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_profile_get_idle_pcycles2 (unsigned long long *pcycles, unsigned int length_in_bytes);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_threadid_pcycles
+
+   @deprecated Use #qurt_profile_get_threadid_pcycles2 instead.
+
+   Gets the current per-hardware-thread running cycle counts for the specified QuRT
+   thread for a maximum of 6 hardware threads.
+
+   Each count value represents the number of processor cycles that have elapsed on the
+   corresponding hardware thread while that thread has been scheduled for the specified
+   QuRT thread.
+
+   @note1hang Profiling must be enabled first to start the cycle counting.
+              Cycles accumulate while profiling is enabled, and are reset by
+              #qurt_profile_reset_threadid_pcycles.
+
+   @param[in] thread_id Valid thread identifier.
+   @param[out] pcycles Pointer to a user array where the function stores the current running
+                       cycle count values. The array size should be at least the number of
+                       hardware threads of interest.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_get_threadid_pcycles (int thread_id, unsigned long long *pcycles);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_threadid_pcycles2
+
+   Gets the current per-hardware-thread running cycle counts for the specified QuRT
+   thread for the maximum available hardware threads.
+
+   Each count value represents the number of processor cycles that have elapsed on the
+   corresponding hardware thread while that thread has been scheduled for the specified
+   QuRT thread.
+
+   @note1hang Profiling must be enabled first to start the cycle counting.
+              Cycles accumulate while profiling is enabled, and are reset by
+              #qurt_profile_reset_threadid_pcycles.
+
+   @param[in] thread_id Thread identifier.
+   @param[out] pcycles Pointer to a user array where the function stores the current running
+                       cycle count values. The array size should equal the number of
+                       hardware threads of interest.
+                       Call #qurt_sysenv_get_max_hw_threads to determine the array size required.
+   @param[in] length_in_bytes Length of the pcycles array in bytes. If the array is smaller
+                              than required for the maximum available hardware threads,
+                              an error code is returned.
+
+   @return
+   #QURT_EOK -- Successful operation; all data was stored to the destination array. \n
+   #QURT_EFAILED -- Operation failed because the #pcycles array is too small. \n
+   #QURT_ENOTHREAD -- Operation failed because #thread_id is invalid.
+
+   @dependencies
+   None.
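+
+   A minimal usage sketch (illustrative only; thread_id is assumed to be a valid
+   QuRT thread identifier, and QURT_MAX_HTHREAD_LIMIT from qurt_consts.h is used
+   as a safe upper bound on the hardware thread count):
+   @code
+   unsigned long long pcycles[QURT_MAX_HTHREAD_LIMIT];
+   int rc = qurt_profile_get_threadid_pcycles2(thread_id, pcycles, sizeof(pcycles));
+   if (rc == QURT_EOK) {
+       // pcycles[i] holds the running cycle count on hardware thread i.
+   }
+   @endcode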
+*/
+/* ======================================================================*/
+int qurt_profile_get_threadid_pcycles2 (int thread_id, unsigned long long *pcycles, unsigned int length_in_bytes);
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_reset_threadid_pcycles
+   @xreflabel{hdr:qurt_profile_reset_threadid_pcycles}
+   Sets the per-hardware-thread running cycle counts to zero for the specified QuRT thread.
+
+   @param[in] thread_id Thread identifier.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_reset_threadid_pcycles (int thread_id);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_enable
+   @xreflabel{hdr:qurt_profile_enable}
+   Enables or disables profiling.\n
+   Enables or disables cycle counting of the running and idle processor cycles.
+   Profiling is disabled by default. \n
+
+   @note1hang Enabling profiling does not automatically reset the cycle counts -- this must be
+              done explicitly by calling the reset operations before starting cycle counting.
+              Cycle counting starts the instant profiling is enabled with this API, and
+              halts when profiling is disabled.
+
+   @param[in] enable Profiling. Values: \n
+                     - 0 -- Disable profiling \n
+                     - 1 -- Enable profiling @tablebulletend
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+void qurt_profile_enable (int enable);
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_hthread_pcycles
+   @xreflabel{hdr:qurt_get_hthread_pcycles}
+   Reads the GCYCLE_nT register to allow performance measurement when N threads are in run mode.\n
+
+   @note1hang Returns 0 when the architecture is earlier than v67 or for an invalid HW thread ID.
+
+   @param[in] n Threads in run mode. Valid values are 1 through .
+
+   @return
+   Value read from the GCYCLE_nT register. This value indicates the total number of pcycles
+   executed, from reset to the current point of execution, while n threads are in run mode.
+
+   @dependencies
+   PMU must be enabled.
+*/
+/* ======================================================================*/
+unsigned int qurt_get_hthread_pcycles(int n);
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_hthread_commits
+   @xreflabel{hdr:qurt_get_hthread_commits}
+   Reads the GCOMMIT_nT register to allow performance measurement when N threads are in run mode.\n
+
+   @note1hang Returns 0 when the architecture is earlier than v67 or for an invalid HW thread ID.
+
+   @param[in] n Threads in run mode. Valid values: 1 through .
+
+   @return
+   Value read from the GCOMMIT_nT register. This value indicates the total number of packets
+   committed, from reset to the current point of execution, while n threads are in run mode.
+
+   @dependencies
+   PMU must be enabled.
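+
+   A minimal usage sketch (illustrative only) that estimates packets per cycle
+   while two threads are in run mode:
+   @code
+   unsigned int cycles  = qurt_get_hthread_pcycles(2);
+   unsigned int commits = qurt_get_hthread_commits(2);
+   // Both calls return 0 on pre-v67 targets or for an invalid thread count.
+   double ppc = (cycles != 0U) ? (double)commits / (double)cycles : 0.0;
+   @endcode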
+*/
+/* ======================================================================*/
+unsigned int qurt_get_hthread_commits(int n);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_devtree.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_devtree.h
new file mode 100755
index 0000000000000..4adee45bb44a2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_devtree.h
@@ -0,0 +1,161 @@
+#ifndef QURT_DEVTREE_H
+#define QURT_DEVTREE_H
+/**
+  @file qurt_devtree.h
+  @brief Prototypes and structures for device tree aware QuRT library functions.
+
+Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+*/
+/* qurt_callback is included by qurt_qdi_driver.h and depends on NULL being defined.
+   The callback is not used here, so define NULL here to avoid including the world. */
+#ifndef NULL
+#define NULL ((void *) 0)
+#endif
+
+#include "libfdt.h"
+#include "DTBExtnLib.h"
+#include "qurt_qdi_ext.h"
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define INVALID_BLOB_ID (-1)
+#define DEFAULT_BLOB_ID 0
+
+/** QuRT device tree mapping macros. */
+#define QURT_DT_MAPPING_FAILED (-1)
+#define QURT_DT_FLAG_ISLAND 0x1
+#define QURT_DT_FLAG_PHYSADDR 0x2
+
+/** Device tree type for the root PD device tree.
+    The root PD device tree typically describes the hardware in the subsystem.
+    This is the /soc portion of the device tree. */
+#define QURT_DT_BLOB_TYPE_ROOT 0
+
+/** Device tree type for the local device tree.
+    The local device tree typically contains the software settings.
+    This is the /sw portion of the device tree. */
+#define QURT_DT_BLOB_TYPE_LOCAL 1
+
+int qurt_devtree_init(void);
+
+/**@ingroup func_qurt_dt_mapping_create
+   Creates a memory mapping from the specified property of the specified device
+   tree node. Returns virtual addresses and sizes.
+
+   @param[in] devtreeNode Device tree node.
+   @param[in] flags Flags to configure memory. Overloaded as the property
+                    index if regionName is NULL.
+   @param[in] regionName Identifies the property to use for the mapping; should
+                         resemble a region.
+   @param[in] regionIdx Index of the range to use within the property.
+   @param[out] vaddr Return pointer for the virtual region address.
+   @param[out] size Return pointer for the virtual region size.
+
+   @return
+   Result code indicating success or failure. \n
+*/
+int qurt_dt_mapping_create(fdt_node_handle *devtreeNode, int flags, char *regionName, int regionIdx,
+                           unsigned long long *vaddr, unsigned long long *size);
+
+/**@ingroup func_qurt_dt_mapping_create2
+
+   Creates a memory mapping from the specified property of the specified device
+   tree node.
+
+   Returns virtual addresses and sizes according to the architecture (that is, either 32-bit or 64-bit).
+
+   @param[in] devtreeNode Device tree node.
+
+   @param[in] dt_map_flags Flags to configure the memory mapping; reserved for future use.
+                           (0) - Default value; assumes the details from the DT node are a physical address and size.
+                           QURT_DT_FLAG_ISLAND
+
+                           NOTE: The PA must be added to the corresponding island spec to create an island mapping.
+
+   @param[in] regionName NULL, or the name of the index in the range to return; should
+                         resemble a region. For example, reg-names = "base", "rx", "tx";
+
+   @param[in] regionIdx Index of the range to return. For example, reg = <0x1000 0x20>, <0x10000 0x100>, <0x18000 0x100 >;
+
+                        NOTE: If the client specifies both regionName and regionIdx, the
+                        region name takes precedence and the region index is ignored.
+
+   @param[in] dt_map_perm Mapping access permissions (R/W):
+                          QURT_PERM_READ
+                          QURT_PERM_WRITE
+
+   @param[in] cache_attr QuRT cache mode types:
+                         QURT_MEM_CACHE_DEVICE
+                         QURT_MEM_CACHE_WRITEBACK
+                         Other required cache type enums in qurt_types.h can also be passed.
+
+                         NOTE: There is no default value for the cache and permission attributes.
+                         The client must always pass one of the defined flags.
+
+   @param[out] vaddr Return pointer to the variable that holds the virtual address.
+   @param[out] size Return pointer for the virtual region size.
+
+   @return
+   #QURT_EOK Success; the mapping was created properly.
+   #QURT_DT_MAPPING_FAILED Failed to create the mapping.
+   #QURT_EINVALID Mismatch in the architecture.
+
+   else an FdtLib or third-party error code.
+
+*/
+int qurt_dt_mapping_create2(fdt_node_handle *devtreeNode, unsigned int dt_map_flags,
+                            char *regionName, int regionIdx, unsigned int dt_map_perm, int cache_attr, void **vaddr, size_t *size);
+
+/**@ingroup func_qurt_dt_isr_register
+   Device tree aware registration of an interrupt service routine (ISR) to an ISR thread.
+   The interrupt defined in the specified device tree node is enabled when this function returns success.
+
+   @datatypes
+   #qurt_thread_t \n
+   #fdt_node_handle
+
+   @param[in] dt_node Device tree node that specifies the interrupt property.
+   @param[in] dt_int_index Index of the specific interrupt to use within the device tree node structure.
+                           Specify either this or dt_int_name; use -1 if the name string is used.
+   @param[in] dt_int_name Name of the specific interrupt to use within the device tree node structure.
+                          Specify either this or dt_int_index; use NULL if the index is used.
+   @param[in] isr_thread_id ISR thread ID, returned from qurt_isr_create(), defined by qurt_isr_register2().
+   @param[in] prio Priority of the ISR, defined by qurt_isr_register2().
+   @param[in] flags Defines the ACK type. Values: \n
+                    #QURT_INT_NON_DELAYED_ACK - The ISR is acknowledged by the interrupt handling routine
+                    in the kernel.
+                    #QURT_INT_DELAYED_ACK - The client chooses to acknowledge.
+                    Defined by qurt_isr_register2().
+   @param[in] isr ISR with prototype void isr (void *arg, int int_num), defined by qurt_isr_register2().
+   @param[in] arg First argument of the ISR when it is called to service the interrupt, defined by qurt_isr_register2().
+
+   @return
+   #QURT_EOK -- Successfully registered the ISR for the interrupt \n
+   #QURT_EINT -- Interrupt not configured \n
+   #QURT_EINVALID -- Invalid thread ID \n
+   #QURT_EDISABLED -- The feature is disabled \n
+   #QURT_EDUPLICATE -- Interrupt is already registered
+
+   @dependencies
+   Create the thread ID with qurt_isr_create().
+   Complete ISR registration with qurt_isr_register2().
+ */
+int qurt_dt_isr_register(fdt_node_handle *dt_node, int dt_int_index, char * dt_int_name, qurt_thread_t isr_thread_id,
+                         unsigned short prio, unsigned short flags, void (*isr) (void *, int), void *arg);
+
+/**@ingroup func_qurt_dt_blob_id_get
+   Returns the blob ID for the blob type passed.
+   The value returned from this API can be passed as the blob ID parameter to DTBExtnLib APIs.
+
+   @param[in] blob_type Blob type to look up.
+   @return Blob ID for the passed blob type.
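+
+   A minimal usage sketch (illustrative only; treating INVALID_BLOB_ID as a
+   failure sentinel is an assumption, not documented behavior):
+   @code
+   int blob_id = qurt_dt_blob_id_get(QURT_DT_BLOB_TYPE_LOCAL);
+   if (blob_id != INVALID_BLOB_ID) {
+       // Pass blob_id to the DTBExtnLib APIs that take a blob ID parameter.
+   }
+   @endcode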
+*/ +int qurt_dt_blob_id_get(unsigned int blob_type); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_ecc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_ecc.h new file mode 100755 index 0000000000000..09312684e99af --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_ecc.h @@ -0,0 +1,168 @@ +#ifndef QURT_ECC_H +#define QURT_ECC_H + + +/*===================================================================== + + @file qurt_ecc.h + @brief Prototypes of QuRT memory ECC API functions + + Copyright (c) 2018, 2020-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup exception_handling_types +@{ */ +// ECC memory definition +typedef enum { + QURT_ECC_MEM_L1_ICACHE = 0, /**< ECC memory L1 ICache. */ + QURT_ECC_MEM_L1_DCACHE = 1, /**< ECC memory L1 DCache.*/ + QURT_ECC_MEM_L2_CACHE = 2, /**< ECC memory L2 Cache.*/ + QURT_ECC_MEM_VTCM = 3 /**< ECC memory VTCM.*/ +} qurt_ecc_memory_t; +/** @} */ /* end_addtogroup exception_handling_types */ + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/** @addtogroup exception_handling_macros +@{ */ + +#define QURT_ECC_ERR_DETECTED_STATUS 0 /**< ECC error detected. */ +#define QURT_ECC_ERR_TYPE 1 /**< ECC error type.*/ +// ECC status type + +#define QURT_ECC_CORRECTABLE_COUNT (1<<0) /**< ECC correctable count.*/ +#define QURT_ECC_UNCORRECTABLE_COUNT (1<<1) /**< ECC uncorrectable count.*/ +#define QURT_ECC_REGION_LOGGING (1<<2) /**< ECC region logging.*/ +// ECC enable/disable definition + +#define QURT_ECC_PROTECTION_DISABLE (0<<0) /**< Bit 0. */ +#define QURT_ECC_PROTECTION_ENABLE (1<<0) /**< Bit 0. */ +/** @} */ /* end_addtogroup exception_handling_macros */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + + +/**@ingroup func_qurt_ecc_enable + Enables or disables ECC protection on a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values: + - #QURT_ECC_MEM_L1_ICACHE + - #QURT_ECC_MEM_L1_DCACHE + - #QURT_ECC_MEM_L2_CACHE + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] enable Set to one of the following values: + - #QURT_ECC_PROTECTION_ENABLE + - #QURT_ECC_PROTECTION_DISABLE @tablebulletend + + @return + - #QURT_EOK -- ECC enabling or disabling setup is performed successfully + - Others -- Failure + + @dependencies + None. + */ +int qurt_ecc_enable( qurt_ecc_memory_t memory, unsigned int enable ); + + +/**@ingroup func_qurt_ecc_get_error_status + Gets ECC error status for a specified memory. 
+ + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following: + - #QURT_ECC_MEM_L1_ICACHE + - #QURT_ECC_MEM_L1_DCACHE + - #QURT_ECC_MEM_L2_CACHE + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] type Set to one of the following: + - #QURT_ECC_ERR_DETECTED_STATUS + - #QURT_ECC_ERR_TYPE @tablebulletend + + @return + Returns the following when the type is #QURT_ECC_ERR_DETECTED_STATUS: + - 0 -- No error detected \n + - 1 -- At least one error detected \n + Returns the following when the type is #QURT_ECC_ERR_TYPE: \n + - 0 through 1 -- Correctable error \n + - 2 -- Uncorrectable error + + @dependencies + None. + */ +int qurt_ecc_get_error_status( qurt_ecc_memory_t memory, unsigned int type ); + + +/**@ingroup func_qurt_ecc_get_error_count + Gets the ECC error count for a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values:\n + - #QURT_ECC_MEM_L1_ICACHE \n + - #QURT_ECC_MEM_L1_DCACHE \n + - #QURT_ECC_MEM_L2_CACHE \n + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] type Set to one of the following values: \n + - #QURT_ECC_CORRECTABLE_COUNT \n + - #QURT_ECC_UNCORRECTABLE_COUNT @tablebulletend + + @return + Error count for the specified error type. + + @dependencies + None. + */ +int qurt_ecc_get_error_count( qurt_ecc_memory_t memory, unsigned int type ); + + +/**@ingroup func_qurt_ecc_clear_error_count + Clears ECC error count or region logging for a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values: \n + - #QURT_ECC_MEM_L1_ICACHE \n + - #QURT_ECC_MEM_L1_DCACHE \n + - #QURT_ECC_MEM_L2_CACHE \n + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] type Set to one or multiple OR'ed of the following values: \n + - #QURT_ECC_CORRECTABLE_COUNT \n + - #QURT_ECC_UNCORRECTABLE_COUNT \n + - #QURT_ECC_REGION_LOGGING @tablebulletend + + @return + #QURT_EOK -- Error count successfully cleared \n + Others -- Failure at clearing the error count + + @dependencies + None. + */ +int qurt_ecc_clear_error_count( qurt_ecc_memory_t memory, unsigned int type ); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ECC_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_error.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_error.h new file mode 100755 index 0000000000000..f4666b396c378 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_error.h @@ -0,0 +1,149 @@ +#ifndef QURT_ERROR_H +#define QURT_ERROR_H + +/** + @file qurt_error.h + Error results- QURT defines a set of standard symbols for the error result values. This file lists the + symbols and their corresponding values. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021-2022 , 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc.. + ======================================================================*/ +#include "qurt_except.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup chapter_error +@{ */ + +/*===================================================================== +Constants and macros +======================================================================*/ +#define QURT_EOK 0 /**< Operation successfully performed. */ +#define QURT_EVAL 1 /**< Wrong values for the parameters. The specified page does not exist. 
*/ +#define QURT_EMEM 2 /**< Not enough memory to perform the operation.*/ + +#define QURT_EINVALID 4 /**< Invalid argument value; invalid key. */ +/** @cond */ +#define QURT_EUNKNOWN 6 /**< Defined but never used in QuRT. */ +#define QURT_ENOMSGS 7 /**< Message queue is empty. */ +#define QURT_EBADF 9 /**< Bad message queue descriptor. */ +/** @endcond */ +#define QURT_EFAILED 12 /**< Operation failed. */ + +#define QURT_ENOTALLOWED 13 /**< Operation not allowed. */ + +/** @cond */ +#define QURT_EDUPCLSID 14 /*< Duplicate class ID. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_ENOREGISTERED 20 /**< No registered interrupts.*/ +/** @endcond */ + + +/** @cond */ +#define QURT_EISDB 21 /*< Power collapse failed due to ISDB being enabled. */ +#define QURT_ESTM 22 /*< Power collapse failed in a Single-threaded mode check. */ +/** @endcond */ + + +/** @cond rest_reg_dist */ +#define QURT_ETLSAVAIL 23 /**< No free TLS key is available. */ +#define QURT_ETLSENTRY 24 /**< TLS key is not already free. */ +/** @endcond */ + +#define QURT_EINT 26 /**< Invalid interrupt number (not registered). */ +/** @cond rest_reg_dist */ +#define QURT_ESIG 27 /**< Invalid signal bitmask (cannot set more than one signal at a time). */ +/** @endcond */ + +/** @cond */ +#define QURT_EHEAP 28 /**< No heap space is available. */ +#define QURT_ENOSPC 28 /**< No space to create another queue in the system. */ +#define QURT_EMEMMAP 29 /**< Physical address layout is not supported by the kernel. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_ENOTHREAD 30 /**< Thread no longer exists. */ +/** @endcond */ +/** @cond */ +#define QURT_EL2CACHE 31 /**< L2cachable is not supported in kernel invalidate/cleaninv. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_EALIGN 32 /**< Not aligned. */ +#define QURT_EDEREGISTERED 33 /**< Interrupt is already deregistered.*/ +/** @endcond */ + +/** @cond internal_only */ + +#define QURT_ETLBCREATESIZE 34 /**< TLB create error -- Incorrect size.*/ +#define QURT_ETLBCREATEUNALIGNED 35 /**< TLB create error -- Unaligned address.*/ +/** @endcond */ +/** @cond rest_reg_dist*/ +#define QURT_EEXISTS 35 /**< File or message queue already exists. */ +#define QURT_ENAMETOOLONG 36 /**< Name too long for message queue creation. */ +#define QURT_EPRIVILEGE 36 /**< Caller does not have privilege for this operation.*/ + +#define QURT_ECANCEL 37 /**< A cancellable request was canceled because the associated process was asked to exit.*/ +/** @endcond */ + +/** @cond */ +#define QURT_EISLANDTRAP 38 /*< Unsupported TRAP is called in Island mode.*/ + +#define QURT_ERMUTEXUNLOCKNONHOLDER 39 /*< Rmutex unlock by a non-holder.*/ +#define QURT_ERMUTEXUNLOCKFATAL 40 /*< Rmutex unlock error, all except the non-holder error.*/ +#define QURT_EMUTEXUNLOCKNONHOLDER 41 /*< Mutex unlock by a non-holder.*/ +#define QURT_EMUTEXUNLOCKFATAL 42 /*< Mutex unlock error, all except the non-holder error.*/ +#define QURT_EINVALIDPOWERCOLLAPSE 43 /*< Invalid power collapse mode requested. */ +/** @endcond */ +#define QURT_EISLANDUSEREXIT 44 /**< User call has resulted in island exit.*/ +#define QURT_ENOISLANDENTRY 45 /**< Island mode had not yet been entered.*/ +#define QURT_EISLANDINVALIDINT 46 /**< Exited Island mode due to an invalid island interrupt.*/ +/** @cond rest_reg_dist */ +#define QURT_ETIMEDOUT 47 /**< Operation timed-out. */ +#define QURT_EALREADY 48 /**< Operation already in progress. */ +/** @endcond */ + +#define QURT_ERETRY 49 /*< Retry the operation. 
*/
+#define QURT_EDISABLED 50 /*< Resource disabled. */
+#define QURT_EDUPLICATE 51 /*< Duplicate resource. */
+#define QURT_EBADR 53 /*< Invalid request descriptor. */
+#define QURT_ETLB 54 /*< Exceeded maximum allowed TLBs. */
+#define QURT_ENOTSUPPORTED 55 /*< Operation not supported. */
+/** @cond rest_reg_dist */
+#define QURT_ENORESOURCE 56 /**< No resource. */
+/** @endcond */
+
+#define QURT_EDTINIT 57 /**< Problem with device tree initialization. */
+#define QURT_EBUFLOCK 58 /*< Buffer lock failed because it was already locked many times. */
+#define QURT_ELOCKED 59 /**< Current operation failed as the buffer is locked. */
+#define QURT_EMSGSIZE 90 /*< Message queue msg_len is greater than the mq_msgsize attribute of the message queue. */
+
+
+#define QURT_ENOTCONFIGURED 91 /*< Interrupt is NOT configured. */
+
+#define QURT_EBANDWIDTHLIMIT 92 /*< Message queue send exceeds the bandwidth limit. */
+
+#define QURT_ECFIVIOLATION 93 /*< CFI violation detected. */
+
+#define QURT_EDESTROY 94 /**< A destroy request was made to waiting threads.*/
+
+#define QURT_EHMXNOTAVAIL 95 /**< HMX is not available to target thread.*/
+#define QURT_EHMXNOTDETACHABLE 96 /**< HMX is not detachable from target thread.*/
+
+#define QURT_EFATAL -1 /**< Fatal error. */
+
+/** @} */ /* end_addtogroup chapter_error */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ERROR_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_event.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_event.h
new file mode 100755
index 0000000000000..987f0fe79f227
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_event.h
@@ -0,0 +1,452 @@
+#ifndef QURT_EVENT_H
+#define QURT_EVENT_H
+/**
+  @file qurt_event.h
+  @brief Prototypes of kernel event API functions.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2018-2021, 2023 Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#include "qurt_consts.h"
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * System environment object type.
+ */
+/**@addtogroup sys_env_types
+@{ */
+/** QuRT swap pool information type. */
+typedef struct qurt_sysenv_swap_pools {
+    /** @cond */
+    unsigned int spoolsize;  /* Swap pool size.*/
+    unsigned int spooladdr;  /* Swap pool start address.*/
+    /** @endcond */
+}qurt_sysenv_swap_pools_t;
+
+/** QuRT application heap information type. */
+typedef struct qurt_sysenv_app_heap {
+    /** @cond */
+    unsigned int heap_base;  /* Heap base address.*/
+    unsigned int heap_limit; /* Heap end address.*/
+    /** @endcond */
+} qurt_sysenv_app_heap_t ;
+
+/** QuRT architecture version information type. */
+typedef struct qurt_sysenv_arch_version {
+    /** @cond */
+    unsigned int arch_version; /* Architecture version.*/
+    /** @endcond */
+}qurt_arch_version_t;
+
+/** QuRT maximum hardware threads information type. */
+typedef struct qurt_sysenv_max_hthreads {
+    /** @cond */
+    unsigned int max_hthreads; /* Maximum number of hardware threads.*/
+    /** @endcond */
+}qurt_sysenv_max_hthreads_t;
+
+/** QuRT active hardware threads information type. */
+typedef struct qurt_sysenv_hthreads {
+    /** @cond */
+    unsigned int hthreads; /* Number of hardware threads initialized by QuRT.*/
+    /** @endcond */
+}qurt_sysenv_hthreads_t;
+
+/** QuRT maximum pi priority information type. */
+typedef struct qurt_sysenv_max_pi_prio {
+    /** @cond */
+    unsigned int max_pi_prio; /* Maximum pi priority.*/
+    /** @endcond */
+}qurt_sysenv_max_pi_prio_t;
+
+/** QuRT process name information type. */
+typedef struct qurt_sysenv_procname {
+    /** @cond */
+    union {
+        unsigned int asid; /* Address space ID.*/
+        unsigned int pid;  /* Process ID.*/
+    };
+    char name[QURT_MAX_NAME_LEN]; /* Process name.*/
+    /** @endcond */
+}qurt_sysenv_procname_t;
+
+/** QuRT stack profile count information type. */
+typedef struct qurt_sysenv_stack_profile_count {
+    /** @cond */
+    unsigned int count;           /* Stack profile count for usage.*/
+    unsigned int count_watermark; /* Stack profile count for watermark.*/
+    /** @endcond */
+}qurt_sysenv_stack_profile_count_t;
+
+/**
+  QuRT system error event type.
+ */
+typedef struct _qurt_sysevent_error_t
+{
+    unsigned int thread_id; /**< Thread ID. */
+    unsigned int fault_pc;  /**< Fault PC. */
+    unsigned int sp;        /**< Stack pointer. */
+    unsigned int badva;     /**< Virtual data address where the exception occurred. */
+    unsigned int cause;     /**< QuRT error result. */
+    unsigned int ssr;       /**< Supervisor status register. */
+    unsigned int fp;        /**< Frame pointer. */
+    unsigned int lr;        /**< Link register. */
+    unsigned int pid;       /**< PID of the process to which this thread belongs.*/
+ } qurt_sysevent_error_t ;
+
+/** QuRT system error event type, extended with the framekey and reserved fields. */
+typedef struct _qurt_sysevent_error_1_t
+{
+    unsigned int thread_id; /**< Thread ID. */
+    unsigned int fault_pc;  /**< Fault PC. */
+    unsigned int sp;        /**< Stack pointer. */
+    unsigned int badva;     /**< Virtual data address where the exception occurred. */
+    unsigned int cause;     /**< QuRT error result. */
+    unsigned int ssr;       /**< Supervisor status register. */
+    unsigned int fp;        /**< Frame pointer. */
+    unsigned int lr;        /**< Link register. */
+    unsigned int pid;       /**< PID of the process to which this thread belongs.*/
+    unsigned int fkey;      /**< Framekey.*/
+    unsigned int reserved1; /**< Reserved.*/
+    unsigned int reserved2; /**< Reserved.*/
+    unsigned int reserved3; /**< Reserved.*/
+ } qurt_sysevent_error_1_t ;
+
+/** QuRT page fault error event information type. */
+typedef struct qurt_sysevent_pagefault {
+    qurt_thread_t thread_id; /**< Thread ID of the page fault thread. */
+    unsigned int fault_addr; /**< Accessed address that caused the page fault. */
+    unsigned int ssr_cause;  /**< SSR cause code for the page fault. */
+} qurt_sysevent_pagefault_t ;
+/** @} */ /* @endaddtogroup sys_env_types */
+/*=============================================================================
+                                                                 FUNCTIONS
+=============================================================================*/
+
+/*======================================================================*/
+/**
+   Gets the environment swap pool 0 information from the kernel.
+
+   @datatypes
+   #qurt_sysenv_swap_pools_t
+
+   @param[out] pools Pointer to the pools information.
+
+   @return
+   #QURT_EOK -- Success.
+
+   @dependencies
+   None.
+*/
+int qurt_sysenv_get_swap_spool0 (qurt_sysenv_swap_pools_t *pools );
+
+/*
+   Gets the environment swap pool 1 information from the kernel.
+
+   @datatypes
+   #qurt_sysenv_swap_pools_t
+
+   @param[out] pools Pointer to the pools information.
+
+   @return
+   #QURT_EOK -- Success.
+
+   @dependencies
+   None.
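+
+   A minimal usage sketch (illustrative only):
+   @code
+   qurt_sysenv_swap_pools_t pool;
+   if (qurt_sysenv_get_swap_spool1(&pool) == QURT_EOK) {
+       // pool.spooladdr and pool.spoolsize describe swap pool 1.
+   }
+   @endcode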
+*/ +int qurt_sysenv_get_swap_spool1(qurt_sysenv_swap_pools_t *pools ); + +/**@ingroup func_qurt_sysenv_get_app_heap + Gets information on the program heap from the kernel. + + @datatypes + #qurt_sysenv_app_heap_t + + @param[out] aheap Pointer to information on the program heap. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_app_heap(qurt_sysenv_app_heap_t *aheap ); + +/**@ingroup func_qurt_sysenv_get_arch_version + Gets the Hexagon processor architecture version from the kernel. + + @datatypes + #qurt_arch_version_t + + @param[out] vers Pointer to the Hexagon processor architecture version. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter + + @dependencies + None. +*/ +int qurt_sysenv_get_arch_version(qurt_arch_version_t *vers); + +/**@ingroup func_qurt_sysenv_get_max_hw_threads + Gets the maximum number of hardware threads supported in the Hexagon processor. + The API includes the disabled hardware threads to reflect the maximum + hardware thread count. + For example, if the image is configured for four hardware threads and hthread_mask is set to 0x5 in + cust_config.xml, only HW0 and HW2 are initialized by QuRT. + HW1 and HW3 are not used at all. Under such a scenario, + qurt_sysenv_get_max_hw_threads() still returns four. + + @datatypes + #qurt_sysenv_max_hthreads_t + + @param[out] mhwt Pointer to the maximum number of hardware threads supported in the Hexagon processor. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_max_hw_threads(qurt_sysenv_max_hthreads_t *mhwt ); + +/**@ingroup func_qurt_sysenv_get_hw_threads + Gets the number of hardware threads initialized by QuRT in Hexagon processor. + For example, if the image is configured for four hardware threads and hthread_mask is set to 0x5 in + cust_config.xml, QuRT only initializes HW0 and HW2. + HW1 and HW3 are not used. In this scenario, qurt_sysenv_get_hw_threads() returns 2. + + @datatypes + #qurt_sysenv_hthreads_t + + @param[out] mhwt Pointer to the number of hardware threads active in the Hexagon processor. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_hw_threads(qurt_sysenv_hthreads_t *mhwt ); + +/**@ingroup func_qurt_sysenv_get_max_pi_prio + Gets the maximum priority inheritance mutex priority from the kernel. + + @datatypes + #qurt_sysenv_max_pi_prio_t + + @param[out] mpip Pointer to the maximum priority inheritance mutex priority. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_max_pi_prio(qurt_sysenv_max_pi_prio_t *mpip ); + +/**@ingroup func_qurt_sysenv_get_process_name2 + Gets information on the system environment process names based on the client_handle argument. + + @datatypes + #qurt_sysenv_procname_t + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[out] pname Pointer to information on the process names in the system. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_process_name2(int client_handle, qurt_sysenv_procname_t *pname ); + +/**@ingroup func_qurt_sysenv_get_process_name + Gets information on the system environment process names from the kernel. 
+ + @datatypes + #qurt_sysenv_procname_t + + @param[out] pname Pointer to information on the process names in the system. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_process_name(qurt_sysenv_procname_t *pname ); + +/**@ingroup func_qurt_sysenv_get_stack_profile_count + Gets information on the stack profile count from the kernel. + + @datatypes + #qurt_sysenv_stack_profile_count_t + + @param[out] count Pointer to information on the stack profile count. + + @return + #QURT_EOK -- Success. + + @dependencies + None. +*/ +int qurt_sysenv_get_stack_profile_count(qurt_sysenv_stack_profile_count_t *count ); + +/**@ingroup func_qurt_exception_wait + Registers the program exception handler. + This function assigns the current thread as the QuRT program exception handler and suspends the + thread until a program exception occurs. + + When a program exception occurs, the thread is awakened with error information + assigned to the parameters of this operation. + + @note1hang If no program exception handler is registered, or if the registered handler + calls exit, QuRT raises a kernel exception. + If a thread runs in Supervisor mode, any errors are treated as kernel + exceptions. + + @param[out] ip Pointer to the instruction memory address where the exception occurred. + @param[out] sp Stack pointer. + @param[out] badva Pointer to the virtual data address where the exception occurred. + @param[out] cause Pointer to the QuRT error result code. + + @return + Registry status: \n + Thread identifier -- Handler successfully registered. \n + #QURT_EFATAL -- Registration failed. + + @dependencies + None. +*/ +unsigned int qurt_exception_wait (unsigned int *ip, unsigned int *sp, + unsigned int *badva, unsigned int *cause); + +unsigned int qurt_exception_wait_ext (qurt_sysevent_error_t * sys_err); + +/**@ingroup func_qurt_exception_wait3 + Registers the current thread as the QuRT program exception handler, and suspends the thread until a + program exception occurs. + When a program exception occurs, the thread is awakened with error information assigned to the specified + error event record. + If a program exception is raised when no handler is registered (or when a handler is registered, but it calls + exit), the exception is treated as fatal.\n + @note1hang If a thread runs in Monitor mode, all exceptions are treated as kernel exceptions.\n + @note1cont This function differs from qurt_exception_wait() by returning the error information in a data + structure rather than as individual variables. It also returns additional information (for example, SSR, FP, and LR). + + @param[out] sys_err Pointer to the qurt_sysevent_error_1_t type structure. + @param[in] sys_err_size Size of the qurt_sysevent_error_1_t structure. + + @return + Registry status: \n + - #QURT_EFATAL -- Failure. \n + - Thread ID -- Success. + + @dependencies + None. +*/ + +unsigned int qurt_exception_wait3(void * sys_err, unsigned int sys_err_size); + +/**@ingroup func_qurt_exception_raise_nonfatal + Raises a nonfatal program exception in the QuRT program system. + + For more information on program exceptions, see Section @xref{dox:exception_handling}. + + This operation never returns -- the program exception handler is assumed to perform all + exception handling before terminating or reloading the QuRT program system. + + @note1hang The C library function abort() calls this operation to indicate software + errors. 
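+
+   A minimal usage sketch (illustrative only; QURT_EFAILED is just an example
+   error result code from qurt_error.h):
+   @code
+   // Hand a software-detected failure to the program exception handler.
+   qurt_exception_raise_nonfatal(QURT_EFAILED);   // never returns
+   @endcode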
+
+   @param[in] error QuRT error result code (Section @xref{dox:error_results}).
+
+   @return
+   Integer -- Unused.
+
+   @dependencies
+   None.
+*/
+int qurt_exception_raise_nonfatal (int error) __attribute__((noreturn));
+
+
+/**@ingroup func_qurt_exception_raise_fatal
+   Raises a fatal program exception in the QuRT system.
+
+   Fatal program exceptions terminate the execution of the QuRT system without invoking
+   the program exception handler.
+
+   For more information on fatal program exceptions, see Section @xref{dox:exception_handling}.
+
+   This operation always returns, so the calling program can perform the necessary shutdown
+   operations (data logging, and so on).
+
+   @note1hang Context switches do not work after this operation has been called.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+void qurt_exception_raise_fatal (void);
+
+unsigned int qurt_enable_floating_point_exception(unsigned int mask);
+
+/**@ingroup func_qurt_exception_enable_fp_exceptions
+   Enables the specified floating point exceptions as QuRT program exceptions.
+
+   The exceptions are enabled by setting the corresponding bits in the Hexagon
+   control user status register (USR).
+
+   The mask argument specifies a mask value identifying the individual floating
+   point exceptions to set. The exceptions are represented as defined symbols
+   that map into bits 0 through 31 of the 32-bit flag value.
+   Multiple floating point exceptions are specified by OR'ing together the individual
+   exception symbols.\n
+   @note1hang This function must be called before performing any floating point operations.
+
+   @param[in] mask Floating point exception types. Values: \n
+                   - #QURT_FP_EXCEPTION_ALL \n
+                   - #QURT_FP_EXCEPTION_INEXACT \n
+                   - #QURT_FP_EXCEPTION_UNDERFLOW \n
+                   - #QURT_FP_EXCEPTION_OVERFLOW \n
+                   - #QURT_FP_EXCEPTION_DIVIDE0 \n
+                   - #QURT_FP_EXCEPTION_INVALID @tablebulletend
+
+   @return
+   Updated contents of the USR.
+
+   @dependencies
+   None.
+*/
+
+static inline unsigned int qurt_exception_enable_fp_exceptions(unsigned int mask)
+{
+    return qurt_enable_floating_point_exception(mask);
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_EVENT_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_except.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_except.h
new file mode 100755
index 0000000000000..e1684c80e3d50
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_except.h
@@ -0,0 +1,185 @@
+#ifndef QURT_EXCEPT_H
+#define QURT_EXCEPT_H
+
+/**
+  @file qurt_except.h
+  @brief Defines Cause and Cause2 codes for error-handling.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+  Copyright (c) 2021-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+  QuRT supports error handling for CPU-detected exceptions and software errors.
+  QuRT treats all errors as either fatal errors or nonfatal errors.
+
+  @section sec1 Fatal errors
+  All supervisor mode exceptions are treated as fatal errors.
+  If a registered exception handler calls qurt_exit(), it is treated as a fatal error.
+  Fatal errors result in saving the context of the primary hardware thread to QURT_error_info and the rest of the thread contexts to the corresponding TCBs.
+  All hardware threads are eventually stopped and the cache is flushed.
+  The NMI exception is treated a little differently from other fatal errors. QuRT saves the contexts of all the hardware threads into QURT_error_info.\n
+
+  @subsection subsection1 Debugging fatal errors
+  - QURT_error_info.status.status -- Indicates that an error occurred.
+  - QURT_error_info.status.cause -- Cause code for the fatal error; Cause and Cause2 details are listed below.
+  - QURT_error_info.status.cause2 -- Cause2 code for the fatal error; Cause and Cause2 details are listed below.
+  - QURT_error_info.status.fatal -- Indicates whether a fatal error occurred. A user error can result in a fatal error if the exception handler is not registered.
+  - QURT_error_info.status.hw_tnum -- Indicates the index of QURT_error_info.locregs[], where the context is saved when the error is a fatal error.
+  - QURT_error_info.global_regs -- Contains the values of the global registers of the Q6.
+  - QURT_error_info.local_regs[QURT_error_info.status.hw_tnum] -- Provides the CPU context when the error is a supervisor error.
+
+  @subsection subsection2 Debugging nonfatal errors
+  - QURT_error_info.user_errors -- All user errors are logged here.
+  - QURT_error_info.user_errors.counter -- Index to the last logged error.
+  - QURT_error_info.user_errors.entry[0...counter] -- Structure for the logged error.
+  - QURT_error_info.user_errors.entry[0...counter].error_tcb -- TCB for the user error.
+  - QURT_error_info.user_errors.entry[0...counter].error_tcb.error -- Information about the error; Cause, Cause2, Badva, and hardware thread ID.
+  - QURT_error_info.user_errors.entry[0...counter].error_code -- ((cause2 << 8) 'Logical OR' (cause)); Cause and Cause2 details are listed below.
+  - QURT_error_info.user_errors.entry[0...counter].hw_thread -- Hardware thread ID for the error.
+  - QURT_error_info.user_errors.entry[0...counter].pcycle -- Pcycle for the error.
+
+@note
+  Important usage note:
+  Cause and Cause2 are error codes used to distinguish multiple errors.
+  SSR and BADVA are inconclusive without the vector number.
+  Both cause and cause2 can range from 1 to 255, and every cause can have 1 to 255 error codes.
+  Hence the system can have up to 255 * 255 unique error codes.
+  The combinations are represented as ((cause2 << 8) 'Logical OR' (cause)).
+  Some Cause2 codes are statically defined, whereas some are obtained from the SSR[7:0] cause codes; it depends on the cause code.
+  SSR cause codes are defined in the Hexagon reference manual.
+  All possible combinations are listed below.
+*/
+/** @addtogroup chapter_error
+@{ */
+/* cause - error type - 8-bits*/
+#define QURT_EXCEPT_PRECISE 0x01U /**< Precise exception occurred. For this cause code, Cause2 is SSR[7:0].*/
+#define QURT_EXCEPT_NMI 0x02U /**< NMI occurred; Cause2 is not defined. */
+#define QURT_EXCEPT_TLBMISS 0x03U /**< TLBMISS RW occurred; for this cause code, Cause2 is SSR[7:0]. */
+#define QURT_EXCEPT_RSVD_VECTOR 0x04U /**< Interrupt raised on a reserved vector, which must never occur. Cause2 is not defined. */
+#define QURT_EXCEPT_ASSERT 0x05U /**< Kernel assert. The Cause2 values QURT_ABORT_* are listed below. */
+#define QURT_EXCEPT_BADTRAP 0x06U /**< trap0(num) called with unsupported num. Cause2 is 0. */
+#define QURT_EXCEPT_UNDEF_TRAP1 0x07U /**< Trap1 is not supported. Using Trap1 causes this error. Cause2 is not defined. */
+#define QURT_EXCEPT_EXIT 0x08U /**< Application called qurt_exit() or qurt_exception_raise_nonfatal(). Can be called from the C library.
Cause2 is "[Argument passed to qurt_exception_raise_nonfatal() & 0xFF]". */ +#define QURT_EXCEPT_TLBMISS_X 0x0AU /**< TLBMISS X (execution) occurred. Cause2 is not defined. */ +#define QURT_EXCEPT_STOPPED 0x0BU /**< Running thread stopped due to fatal error on other hardware thread. Cause2 is not defined. */ +#define QURT_EXCEPT_FATAL_EXIT 0x0CU /**< Application called qurt_fatal_exit(). Cause2 is not defined. */ +#define QURT_EXCEPT_INVALID_INT 0x0DU /**< Kernel received an invalid L1 interrupt. Cause2 is not defined. */ +#define QURT_EXCEPT_FLOATING_POINT 0x0EU /**< Kernel received an floating point error. Cause2 is not defined. */ +#define QURT_EXCEPT_DBG_SINGLE_STEP 0x0FU /**< Cause2 is not defined. */ +#define QURT_EXCEPT_TLBMISS_RW_ISLAND 0x10U /**< Read write miss in Island mode. Cause2 QURT_TLB_MISS_RW_MEM* are listed below. */ +#define QURT_EXCEPT_TLBMISS_X_ISLAND 0x11U /**< Execute miss in Island mode. For this cause code, Cause2 is SSR[7:0]. */ +#define QURT_EXCEPT_SYNTHETIC_FAULT 0x12U /**< Synthetic fault with user request that kernel detected. Cause2 QURT_SYNTH_* are listed below. */ +#define QURT_EXCEPT_INVALID_ISLAND_TRAP 0x13U /**< Invalid trap in Island mode. Cause2 is trap number. */ +#define QURT_EXCEPT_UNDEF_TRAP0 0x14U /**< trap0(num) was called with unsupported num. Cause2 is trap number. */ +#define QURT_EXCEPT_PRECISE_DMA_ERROR 0x28U /**< Precise DMA error. Cause2 is DM4[15:8]. Badva is DM5 register. */ + +#define QURT_ECODE_UPPER_LIBC (0U << 16) /**< Upper 16 bits is 0 for libc. */ +#define QURT_ECODE_UPPER_QURT (0U << 16) /**< Upper 16 bits is 0 for QuRT. */ +#define QURT_ECODE_UPPER_ERR_SERVICES (2U << 16) /**< Upper 16 bits is 2 for error service. */ +/** @cond */ +#define QURT_ECODE_ISLAND_INVALID_QDI 3U /**< Passing invalid QDI method in island. */ +/** @endcond */ + +/* Cause2 for QURT_EXCEPT_SYNTHETIC_FAULT cause- 8bits */ +#define QURT_SYNTH_ERR 0x01U /**< */ +#define QURT_SYNTH_INVALID_OP 0x02U /**< */ +#define QURT_SYNTH_DATA_ALIGNMENT_FAULT 0x03U /**< */ +#define QURT_SYNTH_FUTEX_INUSE 0x04U /**< */ +#define QURT_SYNTH_FUTEX_BOGUS 0x05U /**< */ +#define QURT_SYNTH_FUTEX_ISLAND 0x06U /**< */ +#define QURT_SYNTH_FUTEX_DESTROYED 0x07U /**< */ +#define QURT_SYNTH_PRIVILEGE_ERR 0x08U /**< */ + +/* Cause2 - Abort cause reason - 8 bits */ +/* ERR_ASSERT cause */ +#define QURT_ABORT_FUTEX_WAKE_MULTIPLE 0x01U /**< Abort cause - futex wake multiple. */ +#define QURT_ABORT_WAIT_WAKEUP_SINGLE_MODE 0x02U /**< Abort cause - thread waiting to wake up in Single Threaded mode. */ +#define QURT_ABORT_TCXO_SHUTDOWN_NOEXIT 0x03U /**< Abort cause - call TCXO shutdown without exit. */ +#define QURT_ABORT_FUTEX_ALLOC_QUEUE_FAIL 0x04U /**< Abort cause - futex allocation queue failure - QURTK_futexhash_lifo empty. */ +#define QURT_ABORT_INVALID_CALL_QURTK_WARM_INIT 0x05U /**< Abort cause - invalid call QURTK_warm_init() in NONE CONFIG_POWER_MGMT mode. */ +#define QURT_ABORT_THREAD_SCHEDULE_SANITY 0x06U /**< Abort cause - sanity schedule thread is not supposed to run on the current hardware thread. */ +#define QURT_ABORT_REMAP 0x07U /**< Remap in the page table; the correct behavior must remove mapping if necessary. */ +#define QURT_ABORT_NOMAP 0x08U /**< No mapping in page table when removing a user mapping. */ +#define QURT_ABORT_OUT_OF_SPACES 0x09U +#define QURT_ABORT_INVALID_MEM_MAPPING_TYPE 0x0AU /**< Invalid memory mapping type when creating qmemory. */ +#define QURT_ABORT_NOPOOL 0x0BU /**< No pool available to attach. 
*/ +#define QURT_ABORT_LIFO_REMOVE_NON_EXIST_ITEM 0x0CU /**< Cannot allocate more futex waiting queue. */ +#define QURT_ABORT_ARG_ERROR 0x0DU +#define QURT_ABORT_ASSERT 0x0EU /**< Assert abort. */ +#define QURT_ABORT_FATAL 0x0FU /**< Fatal error; must never occur. */ +#define QURT_ABORT_FUTEX_RESUME_INVALID_QUEUE 0x10U /**< Abort cause - invalid queue ID in futex resume. */ +#define QURT_ABORT_FUTEX_WAIT_INVALID_QUEUE 0x11U /**< Abort cause - invalid queue ID in futex wait. */ +#define QURT_ABORT_FUTEX_RESUME_INVALID_FUTEX 0x12U /**< Abort cause - invalid futex object in hashtable. */ +#define QURT_ABORT_NO_ERHNDLR 0x13U /**< No registered error handler. */ +#define QURT_ABORT_ERR_REAPER 0x14U /**< Exception in the reaper thread. */ +#define QURT_ABORT_FREEZE_UNKNOWN_CAUSE 0x15U /**< Abort in thread freeze operation. */ +#define QURT_ABORT_FUTEX_WAIT_WRITE_FAILURE 0x16U /**< During futex wait processing, could not perform a necessary write operation to userland data; most likely due to a DLPager eviction. */ +#define QURT_ABORT_ERR_ISLAND_EXP_HANDLER 0x17U /**< Exception in Island exception handler task. */ +#define QURT_ABORT_L2_TAG_DATA_CHECK_FAIL 0x18U /**< Detected error in L2 tag/data during warm boot. The L2 tag/data check is done when CONFIG_DEBUG_L2_POWER_COLLAPSE is enabled. */ +#define QURT_ABORT_ERR_SECURE_PROCESS 0x19U /**< Abort error in secure process. */ +#define QURT_ABORT_ERR_EXP_HANDLER 0x20U /**< No exception handler, or the handler caused an exception. */ +#define QURT_ABORT_ERR_NO_PCB 0x21U /**< PCB of the thread context failed initialization, PCB was NULL. */ +#define QURT_ABORT_NO_PHYS_ADDR 0x22U /**< Unable to find the physical address for the virtual address. */ +#define QURT_ABORT_OUT_OF_FASTINT_CONTEXTS 0x23U /**< Fast interrupt contexts exhausted. */ +#define QURT_ABORT_CLADE_ERR 0x24U /**< Fatal error seen with CLADE interrupt. */ +#define QURT_ABORT_ETM_ERR 0x25U /**< Fatal error seen with ETM interrupt. */ +#define QURT_ABORT_ECC_DED_ASSERT 0x26U /**< ECC two-bit DED error. */ +#define QURT_ABORT_VTLB_ERR 0x27U /**< Fatal error in the VTLB layer. */ +#define QURT_ABORT_TLB_ENCODE_DECODE_FAILURE 0x28U /**< Failure during the TLB encode or decode operation. */ +#define QURT_ABORT_VTLB_WALKOBJS_BOUND_FAILURE 0x29U /**< Failure to lookup entry in the page table. */ +#define QURT_ABORT_PHY_MEMORY_OWNERSHIP_FAILURE 0x30U /**< Failure to claim phy memory ownership. */ +#define QURT_ABORT_JTLB_SIZE_CHECK_FAIL 0x31U /**< JTLB size configured is more than actual size in hardware */ +#define QURT_ABORT_AUTOSTACK_ASSERT 0x32U /**< Error while handling stack flimit exception. */ + +/* Cause2 - TLB-miss_X - 8bits */ +#define QURT_TLB_MISS_X_FETCH_PC_PAGE 0x60U /**< */ +#define QURT_TLB_MISS_X_2ND_PAGE 0x61U /**< */ +#define QURT_TLB_MISS_X_ICINVA 0x62U /**< */ + +/* Cause2 - TLB-miss_RW - 8bits */ +#define QURT_TLB_MISS_RW_MEM_READ 0x70U /**< */ +#define QURT_TLB_MISS_RW_MEM_WRITE 0x71U /**< */ + +/** @cond rest_reg_dist */ +/* Cause2 - Floating point exception - 8 bits */ +#define QURT_FLOATING_POINT_EXEC_ERR 0xBFU /**< Execute floating-point. 
*/ +/** @endcond */ + +/** Cause2 - autostackv2 - 8 bits */ +#define QURT_AUTOSTACKV2_CANARY_NOT_MATCH 0xC1U +#define QURT_AUTOSTACKV2_POOL_IDX_OFF_RANGE 0xC2U + +/** Cause2 - CFI violation - 8 bits */ +#define QURT_CFI_VIOLATION 0xC3U + +/** @cond rest_reg_dist*/ +/* Enable floating point exceptions */ +#define QURT_FP_EXCEPTION_ALL 0x1FU << 25 /**< */ +#define QURT_FP_EXCEPTION_INEXACT 0x1U << 29 /**< */ +#define QURT_FP_EXCEPTION_UNDERFLOW 0x1U << 28 /**< */ +#define QURT_FP_EXCEPTION_OVERFLOW 0x1U << 27 /**< */ +#define QURT_FP_EXCEPTION_DIVIDE0 0x1U << 26 /**< */ +#define QURT_FP_EXCEPTION_INVALID 0x1U << 25 /**< */ + +/** @endcond */ +/** @} */ /* end_addtogroup chapter_error */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_EXCEPT_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_fastint.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_fastint.h new file mode 100755 index 0000000000000..ea65dc0917fc0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_fastint.h @@ -0,0 +1,71 @@ +#ifndef QURT_FASTINT_H +#define QURT_FASTINT_H + +/** + @file qurt_fastint.h + @brief QuRT fast interrupt functions + + Copyright (c) 2013-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + + ======================================================================*/ + +/*======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_fastint_register + Register fast interrupt callback function + + Fast interrupt callback should be designed to perform the minimal necessary + actions for the interrupt, and/or perform some operations, such as signaling + another regular software thread to start any additional processing. + The callback should be a fast and short function. When a fast interrupt callback + is running, the corresponding interrupt cannot be re-enabled until the callback + returns. + + The fast interrupt callback must not use any system blocking calls, such as + mutex lock or signal wait. Otherwise, it results in errors. + + The fast interrupt callback function has a single integer argument and the + function ends with no return. The argument value passed in is the interrupt + number, and therefore a single callback function can handle + multiple fast interrupts. + + @param[in] intno Interrupt number to register. + @param[in] fn Interrupt callback function. + + @return + #QURT_EOK -- Fast interrupt registration is successful. \n + #QURT_EINVALID -- Interrupt is already registered. \n + #QURT_EINT -- Invalid interrupt number. +*/ +/* ======================================================================*/ +unsigned int qurt_fastint_register(int intno, void (*fn)(int)); + + +/*======================================================================*/ +/**@ingroup func_qurt_fastint_deregister + Deregisters the fast interrupt callback function. + + @param[in] intno Level-one interrupt number to deregister. Valid range is 1 and 10 through 31 + (simulator only). + + @return + #QURT_EOK -- Interrupt deregistration is successful. \n + #QURT_EINT -- Invalid interrupt number (not registered). \n + #QURT_EINVALID -- Invalid interrupt number (already deregistered). + + @dependencies + None. 
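+
+   A minimal usage sketch pairing qurt_fastint_register() with this
+   function (illustrative only; the interrupt number and handler name
+   are assumptions, not values defined by this header):
+
+   @code
+   static void my_fastint_handler(int intno)
+   {
+       (void)intno; /* keep this short; e.g., signal a regular worker thread */
+   }
+
+   void fastint_example(void)
+   {
+       if (qurt_fastint_register(23, my_fastint_handler) == QURT_EOK) {
+           /* ... interrupt 23 is now serviced by the callback ... */
+           (void)qurt_fastint_deregister(23);
+       }
+   }
+   @endcode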
+*/
+/* ======================================================================*/
+unsigned int qurt_fastint_deregister(int intno);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FASTINT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_fs_hub.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_fs_hub.h
new file mode 100755
index 0000000000000..aaa050a6c838b
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_fs_hub.h
@@ -0,0 +1,58 @@
+#ifndef QURT_FS_HUB_H
+#define QURT_FS_HUB_H
+
+/**
+  @file qurt_fs_hub.h
+  @brief Definitions, macros, and prototypes used when writing a
+  QDI driver that provides file-system functionality.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+  ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+  This structure tracks a file designator for an FS-hub QDI driver.
+  A file system's QDI interface should use this object to encapsulate
+  the true file descriptor and return a QDI handle. The file-system
+  hub then uses this QDI handle as the file descriptor.
+ */
+
+typedef struct qurt_qdi_fs_obj
+{
+ qurt_qdi_obj_t qdi_obj;
+ int client_handle;
+ int fd;
+} qurt_qdi_fs_obj_t;
+
+
+/**@ingroup fs_hub_support_functions
+  This function allows a file system to register its QDI interface with the
+  file-system hub. Once registered, all file open operations for any filename
+  containing the mount point are forwarded to the QDI interface.
+
+  The mount-point string must be enclosed in forward slashes, e.g., "/mountpoint/".
+
+  @param mtpoint Mount point for the file system being registered.
+  @param opener  Opener structure for the QDI driver interface.
+
+  @return
+  QURT_EOK -- Successfully registered the QDI driver with the file-system hub.
+  Negative error code -- Failed to register with the file-system hub.
+ */
+int qurt_fs_hub_mtpoint_register(const char *mtpoint, qurt_qdi_obj_t *opener);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_futex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_futex.h
new file mode 100755
index 0000000000000..1fdcc79a43f01
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_futex.h
@@ -0,0 +1,82 @@
+#ifndef QURT_FUTEX_H
+#define QURT_FUTEX_H
+/**
+  @file qurt_futex.h
+
+  @brief Prototypes of QuRT futex API functions
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2020-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+  ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+
+/**@ingroup func_qurt_futex_wait
+  Moves the caller thread into the waiting state when a memory object address
+  contains a value that is the same as a specified value.
+
+  @param[in] lock Pointer to the object memory.
+  @param[in] val  Value to check against the object content.
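+
+  A usage sketch pairing this call with qurt_futex_wake() (illustrative
+  only; the shared word, the function names, and the GCC/Clang __atomic
+  builtins are assumptions, not part of this API):
+
+  @code
+  static int flag = 0; /* hypothetical shared word */
+
+  void waiter(void)
+  {
+      /* Sleeps only while flag still holds 0. */
+      while (__atomic_load_n(&flag, __ATOMIC_ACQUIRE) == 0) {
+          (void)qurt_futex_wait(&flag, 0);
+      }
+  }
+
+  void poster(void)
+  {
+      __atomic_store_n(&flag, 1, __ATOMIC_RELEASE);
+      (void)qurt_futex_wake(&flag, 1); /* wake at most one waiter */
+  }
+  @endcode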
+
+  @return
+  #QURT_EOK -- Success \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wait(void *lock, int val);
+
+
+/**@ingroup func_qurt_futex_wait_cancellable
+  If a memory object address contains a value that is the same as a specified
+  value, moves the caller thread into the waiting state.
+  The kernel can cancel the waiting state when there is a special need.
+
+  @param[in] lock Pointer to the object memory.
+  @param[in] val  Value to check against the object content.
+
+  @return
+  #QURT_EOK -- Success \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wait_cancellable(void *lock, int val);
+
+
+/**@ingroup func_qurt_futex_wake
+  Wakes up a specified number of threads that have been waiting
+  for the object to change with qurt_futex_wait().
+
+  @param[in] lock      Pointer to the object memory.
+  @param[in] n_to_wake Maximum number of threads to wake up.
+
+  @return
+  Number of threads woken up by this function.
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wake(void *lock, int n_to_wake);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FUTEX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_hmx.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_hmx.h
new file mode 100755
index 0000000000000..e4037dbeae514
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_hmx.h
@@ -0,0 +1,226 @@
+#ifndef QURT_HMX_H
+#define QURT_HMX_H
+/**
+  @file qurt_hmx.h
+  @brief Prototypes of the QuRT HMX API.
+
+Copyright (c) 2019-2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ TYPEDEFS
+=============================================================================*/
+
+
+/** @addtogroup hmx_types
+@{ */
+/* HMX locking type */
+#define QURT_HMX_NON_SHARED_LOCK 0U /**< HMX locking type.*/
+#define QURT_HMX_SHARED_LOCK 1U /**< HMX locking type.*/
+
+/* HMX unlocking type */
+#define QURT_HMX_NON_SHARED_UNLOCK 0U /**< HMX unlocking type.*/
+#define QURT_HMX_SHARED_UNLOCK 1U /**< HMX unlocking type.*/
+
+/* HMX hardware context */
+#define QURT_HMX_UNIT_0 0U /**< HMX hardware context #0 */
+#define QURT_HMX_UNIT_1 1U /**< HMX hardware context #1 */
+ /** @} */ /* end_addtogroup hmx_types */
+
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+
+/**@ingroup func_qurt_hmx_lock2
+  Locks a HMX unit with the specified locking type.
+
+  #QURT_HMX_NON_SHARED_LOCK:
+  - If a HMX unit is available, lock the unit and return success of #QURT_EOK.
+  - If the HMX unit is already locked by another thread, the caller thread is suspended
+    until the HMX is available and gets locked by this function.
+  - If there is no HMX hardware supported, returns #QURT_EVAL.
+
+  #QURT_HMX_SHARED_LOCK:
+  - If a HMX unit is available, enables HMX access for the caller thread, and returns
+    success of #QURT_EOK.
+  - If the HMX is already enabled on the caller thread, return #QURT_EFAILED.
- If the HMX is locked by another thread in the same user process of the caller
+    thread with locking type of #QURT_HMX_SHARED_LOCK, enable HMX access for the caller
+    thread, and return success of #QURT_EOK.
+  - If the HMX is locked by another thread in the same user process of the caller
+    thread with locking type of #QURT_HMX_NON_SHARED_LOCK, return #QURT_EFAILED.
+  - If the HMX is locked by a thread from another user process different from the
+    user process of the caller thread, return #QURT_EFAILED.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  @param[in] type Locking type.
+
+  @return
+  #QURT_EOK -- HMX lock successful.\n
+  #QURT_EFAILED -- Failure due to wrong locking condition.\n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+
+ */
+int qurt_hmx_lock2(unsigned int type);
+
+
+/**@ingroup func_qurt_hmx_unlock2
+  Unlocks a HMX unit with the unlocking type.
+
+  #QURT_HMX_NON_SHARED_UNLOCK:
+  - If there is a HMX unit locked by the caller thread, unlock the HMX unit and clear the
+    HMX accumulators (assuming a fixed-point type).
+  - If there is no HMX unit locked by the caller thread, return #QURT_EFAILED.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  #QURT_HMX_SHARED_UNLOCK:
+  - If the caller thread has locked HMX with type #QURT_HMX_SHARED_LOCK, disable the
+    HMX access on the caller thread, and return success of #QURT_EOK.
+    Note: If the caller thread is the last thread that unlocks for #QURT_HMX_SHARED_LOCK
+    in its user process, the unlock function clears the HMX accumulators.
+  - If the caller thread has locked HMX with type #QURT_HMX_NON_SHARED_LOCK, return
+    failure of #QURT_EFAILED.
+  - If the caller thread has not locked HMX, return failure of #QURT_EFAILED.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  @param[in] type Unlocking type.
+
+  @return
+  #QURT_EOK -- HMX unlocked successfully. \n
+  #QURT_EFAILED -- Failure due to wrong unlocking condition. \n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+
+ */
+int qurt_hmx_unlock2(unsigned int type);
+
+
+/**@ingroup func_qurt_hmx_lock
+  Locks a HMX unit.
+  If a HMX unit is available, this function locks the unit and returns right away.
+  If there is no HMX unit available, the caller is blocked until a HMX is available
+  and is locked by the function.
+
+  @return
+  #QURT_EOK -- HMX lock successful. \n
+  #QURT_EFAILED -- Failure due to wrong locking condition. \n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_lock(void);
+
+
+/**@ingroup func_qurt_hmx_unlock
+  Unlocks a HMX unit.
+  If a HMX unit is locked by the caller thread, unlock the HMX unit and clear its
+  accumulators (assuming a fixed-point type).
+  If there is no HMX unit locked by the caller thread, return failure.
+
+  @return
+  #QURT_EOK -- HMX unlock successful. \n
+  #QURT_EFAILED -- Failure due to wrong unlocking condition. \n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_unlock(void);
+
+
+/**@ingroup func_qurt_hmx_try_lock
+  Tries to lock a HMX unit.
+  If a HMX unit is available, this function locks the unit and returns right away;
+  if there is no HMX unit available, the function returns failure without blocking the caller.
+
+  @return
+  #QURT_EOK -- HMX lock successful. \n
+  #QURT_EFAILED -- Failure due to wrong locking condition.\n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
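+
+  A non-blocking usage sketch (illustrative only; the fixed-point workload
+  and fallback path are placeholders, not SDK functions):
+
+  @code
+  void hmx_example(void)
+  {
+      if (qurt_hmx_try_lock() == QURT_EOK) {
+          /* HMX is locked by this thread; run the fixed-point matrix work. */
+          (void)qurt_hmx_unlock(); /* also clears the HMX accumulators */
+      } else {
+          /* HMX busy or not present; use a non-HMX fallback. */
+      }
+  }
+  @endcode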
+ + @dependencies + None. + */ +int qurt_hmx_try_lock(void); + + +/**@ingroup func_qurt_hmx_assign + Assign a HMX unit to a target thread specified by its thread identifier. + The HMX unit (HMX hardware context) is specified by hmx_unit. + The caller of this function is limited to the SRM process. + If the requested hmx_unit is already assigned to another thread with QURT_HMX_NON_SHARED_LOCK, + kernel will detach it from the thread, and re-assign it to the target thread. + If the target thread has HVX enabled, it cannot have HMX enabled. + + Locking type + #QURT_HMX_NON_SHARED_LOCK: + - If the HMX unit is available, lock the HMX unit and return success of #QURT_EOK. + - If the HMX unit is already enabled on the target thread, return #QURT_EOK. + - If the HMX unit is already locked by another thread, detach the HMX from the thread. + Re-assign the HMX unit to the target thread, and return #QURT_EOK. + + @param[in] thread_id Thread identifier + @param[in] type Locking type + #QURT_HMX_NON_SHARED_LOCK -- non-shared lock + @param[in] hmx_unit HMX hardware context number + #QURT_HMX_UNIT_0 + #QURT_HMX_UNIT_1 + + @return + #QURT_EOK -- The HMX is assigned successfully. This includes the case that \n + the target thread already has HMX assigned. \n + #QURT_EFAILED -- Failure due to wrong assigning conditions. \n + #QURT_EINVALID -- Failure because no HMX hardware is supported. + + @dependencies + None. + */ +int qurt_hmx_assign ( unsigned int thread_id, unsigned int type, unsigned int hmx_unit ); + + +/**@ingroup func_qurt_hmx_release + Release a HMX unit from a target thread specified by its thread identifier. + The HMX unit (HMX hardware context) is specified by hmx_unit. + The caller of this function is limited to the SRM process. + + Qurt detaches the specified HMX unit from the target thread, and return success of + #QURT_EOK. If the HMX unit is already released from the target thread, return #QURT_EOK. + + @param[in] thread_id Thread identifier + @param[in] hmx_unit HMX hardware context number + #QURT_HMX_UNIT_0 + #QURT_HMX_UNIT_1 + + @return + #QURT_EOK -- The HMX is released successfully. This includes the case that \n + the target thread already has the HMX released. \n + #QURT_EFAILED -- Failure due to wrong assigning condition. \n + #QURT_EINVALID -- Failure because no HMX hardware is supported. + + @dependencies + None. + */ +int qurt_hmx_release ( unsigned int thread_id, unsigned int hmx_unit ); + + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_HMX_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_hvx.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_hvx.h new file mode 100755 index 0000000000000..13c213d49ac84 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_hvx.h @@ -0,0 +1,421 @@ +#ifndef QURT_HVX_H +#define QURT_HVX_H +/** + @file qurt_hvx.h + @brief Prototypes of QuRT HVX API. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021-2022 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ TYPEDEFS
+=============================================================================*/
+/** @cond */
+
+typedef enum {
+ QURT_HVX_MODE_64B = 0, /**< HVX mode of 64 bytes */
+ QURT_HVX_MODE_128B = 1 /**< HVX mode of 128 bytes */
+} qurt_hvx_mode_t;
+/** @endcond */
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+/** @cond internal_only*/
+/** @addtogroup hvx_macros
+@{ */
+#define QURT_HVX_HW_UNITS_2X128B_4X64B 0x00000204 /**< Bits 15 through 8 are for the number of 128B units. */
+ /**< Bits 7 through 0 are for the number of 64B units. */
+#define QURT_HVX_HW_UNITS_4X128B_0X64B 0x00000400
+#define QURT_HVX_HW_UNITS_6X128B_0X64B 0x00000600
+
+/* HVX locking status */
+
+#define QURT_HVX_UNLOCKED (0) /* Has not locked HVX unit */
+#define QURT_HVX_LOCKED (1) /* Has locked HVX unit */
+#define QURT_HVX_ERROR (-1) /* Error, no HVX support */
+
+/* Input value for HVX reservation */
+
+#define QURT_HVX_RESERVE_ALL (4) /* All the HVX units in terms of 64B_MODE are requested to be reserved */
+#define QURT_HVX_RESERVE_ALL_AVAILABLE (0xff) /* All remaining unlocked HVX units in terms of 64B_MODE are requested to be reserved */
+
+/* Return values for HVX reservation */
+
+#define QURT_HVX_RESERVE_NOT_SUPPORTED (-1) /* There is no HVX hardware, or fewer units in the hardware than requested */
+#define QURT_HVX_RESERVE_NOT_SUCCESSFUL (-2) /* Some HVX units are already locked/reserved by another PD; not enough units are left for the reservation. */
+#define QURT_HVX_RESERVE_ALREADY_MADE (-3) /* There is already a HVX reservation made. */
+#define QURT_HVX_RESERVE_CANCEL_ERR (-4) /* Canceling the reservation failed because this protection domain has no prior reservation. */
+
+// HVX set requests
+
+#define QURT_HVX_64B 0 /**< */
+#define QURT_HVX_128B 1 /**< */
+#define QURT_HVX_NO_USE 2 /**< */
+#define QURT_HVX_RELEASE_CONTEXT 3 /**< */
+#define QURT_HVX_IMMEDIATE_USE 4 /**< */
+
+// HVX set masks
+
+#define QURT_HVX_64B_PREFERRED (1<<(QURT_HVX_64B + 8))/**< */
+#define QURT_HVX_128B_PREFERRED (1<<(QURT_HVX_128B + 8))/**< */
+#define QURT_HVX_64B_ACCEPTABLE (1<<(QURT_HVX_64B + 12))/**< */
+#define QURT_HVX_128B_ACCEPTABLE (1<<(QURT_HVX_128B + 12))/**< */
+
+// HVX set return "result"
+
+#define QURT_EOK 0 /**< */
+#define QURT_HVX_SET_ERROR 0xFF /**< */
+
+// hvx_mode_assigned for QURT_HVX_IMMEDIATE_USE
+#define QURT_HVX_64B_ASSIGNED (1<<(QURT_HVX_64B + 8)) /**< */
+#define QURT_HVX_128B_ASSIGNED (1<<(QURT_HVX_128B + 8)) /**< */
+
+// Sizes of HVX dump buffer
+
+#define QURT_HVX_V65_64B_VSIZE 2084U /**< 64 x 32 + 8 x 4 + 4 (version). */
+#define QURT_HVX_V65_128B_VSIZE 4164U /**< 128 x 32 + 16 x 4 + 4 (version). */
+#define QURT_HVX_V66_128B_VSIZE 4420U /**< 128 x (32 + 2) + 16 x 4 + 4 (version). */
+#define QURT_HVX_V68_128B_VSIZE 4164U /**< 128 x 32 + 16 x 4 + 4 (version). */
+#define QURT_HVX_V79_128B_VSIZE 4740U /**< 128 x (32+4+1) + 4 (version).
*/
+#define QURT_HVX_VREG_BUF_SIZE QURT_HVX_V79_128B_VSIZE /**< */
+
+// HVX dump versions
+
+#define QURT_HVX_DUMP_V65_64B 1U /**< */
+#define QURT_HVX_DUMP_V65_128B 2U /**< */
+#define QURT_HVX_DUMP_V66_128B 3U /**< */
+#define QURT_HVX_DUMP_V68_128B 4U /**< */
+#define QURT_HVX_DUMP_V79_128B 5U /**< */
+/** @} */ /* end_addtogroup hvx_macros */
+/** @endcond */
+/** @cond */
+// QuRT data struct for hvx_set input
+typedef struct qurt_hvx_set_struct_ {
+ unsigned char set_req; // LSB
+ struct {
+ unsigned char preferred_mask:4;
+ unsigned char acceptable_mask:4;
+ };
+ unsigned short resvd; // MSB
+} qurt_hvx_set_struct_t; // 4 bytes
+
+
+// QuRT data struct for hvx_set return
+typedef struct qurt_hvx_set_return_str_ {
+ unsigned char result; // LSB
+ unsigned char hvx_mode_assigned;
+ unsigned short resvd; // MSB
+} qurt_hvx_set_return_struct_t; // 4 bytes
+/** @endcond */
+
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_hvx_lock
+  Locks one HVX unit specified by the HVX mode.
+
+  @note1hang Input variable can be 128B_MODE or 64B_MODE. If an HVX unit in this mode
+             is available, this function locks the unit and returns right away.
+             If the current HVX mode is different from the requested mode, the current
+             thread is blocked. When all HVX units become idle, QuRT changes
+             the mode, locks the HVX unit, and returns.
+
+             Starting from Q6v65 with HVX context switch support, qurt_hvx_lock() is
+             mapped as qurt_hvx_set(64_BYTE or 128_BYTE).
+
+  @datatypes
+  #qurt_hvx_mode_t
+
+  @param[in] lock_mode #QURT_HVX_MODE_64B or #QURT_HVX_MODE_128B.
+
+  @return
+  #QURT_EOK -- Success \n
+  Other value -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_lock(qurt_hvx_mode_t lock_mode);
+
+/**@ingroup func_qurt_hvx_unlock
+  Unlocks the HVX unit held by this software thread.
+
+  @note1hang Starting from Q6v65 with HVX context switch support, qurt_hvx_unlock()
+             maps as qurt_hvx_set(QURT_HVX_RELEASE_CONTEXT).
+
+  @return
+  #QURT_EOK -- Successful return \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_unlock(void);
+
+/**@ingroup func_qurt_hvx_try_lock
+  Tries to lock one HVX unit specified by the HVX mode.
+
+  @note1hang Input variable can be 128B_MODE or 64B_MODE. If an HVX unit in this mode
+             is available, this function locks the unit and returns #QURT_EOK; otherwise,
+             the function returns a failure, but does not block the current software
+             thread waiting for the HVX unit.
+             Starting from Q6v65 with HVX context switch support, qurt_hvx_try_lock()
+             maps to qurt_hvx_set(QURT_HVX_IMMEDIATE_USE | preferred_mask | acceptable_mask).
+
+  @datatypes
+  #qurt_hvx_mode_t
+
+  @param[in] lock_mode #QURT_HVX_MODE_64B or #QURT_HVX_MODE_128B.
+
+  @return
+  #QURT_EOK -- Successful return \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_try_lock(qurt_hvx_mode_t lock_mode);
+
+/**@ingroup func_qurt_hvx_get_mode
+  Gets the current HVX mode configured by QuRT.
+
+  @note1hang Returns #QURT_HVX_MODE_128B or #QURT_HVX_MODE_64B, based on
+             the current HVX configuration.
+
+  @param[out]
+  None.
+
+  @return
+  #QURT_HVX_MODE_128B \n
+  #QURT_HVX_MODE_64B \n
+  -1 -- Not available.
+
+  @dependencies
+  None.
+ */
+int qurt_hvx_get_mode(void);
+
+
+/**@ingroup func_qurt_hvx_get_units
+  Gets the HVX hardware configuration that the chipset supports.
+
+  @note1hang The function returns the HVX hardware configuration supported by the chipset.
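+
+  For example, the unit counts can be decoded from the returned bitmask
+  (a sketch based on the bit layout documented for
+  #QURT_HVX_HW_UNITS_2X128B_4X64B above):
+
+  @code
+  void hvx_units_example(void)
+  {
+      int units = qurt_hvx_get_units();
+      if (units > 0) {
+          unsigned num_128b = ((unsigned)units >> 8) & 0xFFU; /* bits 15..8 */
+          unsigned num_64b  = (unsigned)units & 0xFFU;        /* bits 7..0  */
+          /* QURT_HVX_HW_UNITS_2X128B_4X64B (0x204) yields 2 and 4. */
+          (void)num_128b; (void)num_64b;
+      }
+  }
+  @endcode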
+
+  @return
+  Bitmask of the units: 1X64, 2X64, 4X64, 1X128, 2X128, and so on.\n
+  - QURT_HVX_HW_UNITS_2X128B_4X64B -- V60, V62, or V65 HVX \n
+  - QURT_HVX_HW_UNITS_4X128B_0X64B -- V66 CDSP or newer \n
+  - 0 -- not available
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_get_units(void);
+
+
+/**@ingroup func_qurt_hvx_reserve
+  Reserves HVX units in terms of 64-byte mode for the protection domain (PD) of the caller.
+
+  @note1hang Only one HVX reservation in the system is supported.
+             If one HVX unit is already locked by the application in the same PD, the unit is
+             added to the returned count as one reserved unit for the PD.
+             Starting from Q6v65 with HVX context switch support, qurt_hvx_reserve()
+             only does basic sanity checks on HVX units.
+
+  @datatypes
+  None.
+
+  @param[in] num_units Number of HVX units in terms of 64B_MODE to reserve for the PD.
+                       QURT_HVX_RESERVE_ALL to reserve all the HVX units.
+                       QURT_HVX_RESERVE_ALL_AVAILABLE to reserve the remaining unlocked units.
+
+  @return
+  Number of units successfully reserved, including the units already locked in the same PD. \n
+  #QURT_HVX_RESERVE_NOT_SUPPORTED \n
+  #QURT_HVX_RESERVE_NOT_SUCCESSFUL \n
+  #QURT_HVX_RESERVE_ALREADY_MADE
+
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_reserve(int num_units);
+
+
+/**@ingroup func_qurt_hvx_cancel_reserve
+  Cancels the HVX reservation in the protection domain (PD) of the caller.
+
+  @note1hang Only one HVX reservation in the system is supported.
+
+  @return
+  0 -- Success \n
+  #QURT_HVX_RESERVE_CANCEL_ERR -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_cancel_reserve(void);
+
+
+/**@ingroup func_qurt_hvx_get_lock_val
+  Gets the HVX locking status value of the thread of the caller.
+
+  @note1hang Returns whether the caller thread has already locked a HVX unit.
+
+  @datatypes
+  None.
+
+  @return
+  #QURT_HVX_UNLOCKED \n
+  #QURT_HVX_LOCKED \n
+  #QURT_HVX_ERROR
+
+  @dependencies
+  None.
+ */
+int qurt_hvx_get_lock_val(void);
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_hvx_set
+  Sets the HVX configuration for the software thread of the caller.
+
+  @datatypes
+  None.
+
+  @param[in] input_arg Composed of set_request | hvx_preferred_mode_mask
+                       | hvx_acceptable_mode_mask where set_request can be set to: \n
+                       - #QURT_HVX_64B \n
+                       - #QURT_HVX_128B \n
+                       - #QURT_HVX_NO_USE \n
+                       - #QURT_HVX_RELEASE_CONTEXT \n
+                       - #QURT_HVX_IMMEDIATE_USE \n
+                       When set_request is QURT_HVX_IMMEDIATE_USE,
+                       hvx_preferred_mode_mask can be set to: \n
+                       - #QURT_HVX_64B_PREFERRED \n
+                       - #QURT_HVX_128B_PREFERRED
+                       When set_request is QURT_HVX_IMMEDIATE_USE,
+                       hvx_acceptable_mode_mask can be set to: \n
+                       - #QURT_HVX_64B_ACCEPTABLE \n
+                       - #QURT_HVX_128B_ACCEPTABLE @tablebulletend
+
+  @return
+  Result of the HVX setting in the least significant 8 bits of the returned data. \n
+  #QURT_EOK -- 0 \n
+  #QURT_HVX_SET_ERROR -- 0xFF \n
+  When #QURT_HVX_IMMEDIATE_USE has a result of #QURT_EOK,
+  bits 8 to 15 of the returned data contain hvx_mode_assigned:\n
+  - #QURT_HVX_64B_ASSIGNED \n
+  - #QURT_HVX_128B_ASSIGNED
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_hvx_set(unsigned int input_arg);
+
+
+/**@ingroup func_qurt_system_hvx_regs_get_maxsize
+  Returns the maximum buffer size for saving HVX registers.
+
+  @datatypes
+  None.
+
+  @return
+  0 -- No HVX supported in the target. \n
+  #QURT_HVX_VREG_BUF_SIZE -- Maximum buffer size for saving HVX registers.
+
+  @dependencies
+  None.
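+
+  A sizing and alignment sketch (it mirrors the scheme described for
+  qurt_system_hvx_regs_get() below; tid is a hypothetical thread ID):
+
+  @code
+  void hvx_dump_example(unsigned int tid)
+  {
+      unsigned char vbuf[QURT_HVX_VREG_BUF_SIZE + 256];
+      unsigned char *pBuf = vbuf;
+      /* Make the register area (fifth byte onward) 256-byte aligned. */
+      pBuf += (256 - 4 - ((unsigned)pBuf % 256));
+      (void)qurt_system_hvx_regs_get(tid, pBuf,
+                                     sizeof(vbuf) - (size_t)(pBuf - vbuf));
+  }
+  @endcode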
+ */
+unsigned int qurt_system_hvx_regs_get_maxsize(void);
+
+
+/**@ingroup func_qurt_system_hvx_regs_get_size
+  Returns the buffer size for saving HVX registers for a specified thread.
+
+  @param[in] thread_id Thread ID of the target thread.
+
+  @return
+  0 -- No HVX assigned to the thread. \n
+  size -- Size of the buffer in bytes for saving HVX registers for the specified thread: \n
+          - #QURT_HVX_V65_64B_VSIZE -- 64 x 32 + 8 x 4 + 4 (version) \n
+          - #QURT_HVX_V65_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+          - #QURT_HVX_V66_128B_VSIZE -- 128 x (32 + 2) + 16 x 4 + 4 (version) \n
+          - #QURT_HVX_V68_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+          - #QURT_HVX_V79_128B_VSIZE -- 128 x (32+4+1) + 4 (version)
+
+
+  @dependencies
+  None.
+
+ */
+unsigned int qurt_system_hvx_regs_get_size(unsigned int thread_id);
+
+
+
+/**@ingroup func_qurt_system_hvx_regs_get
+  Saves the HVX registers into the specified buffer.
+  Returns the size of the data saved into the buffer.
+  After calling this function for the first time on a specified thread_id, the QuRT kernel removes the internal HVX saving buffer
+  from the specified thread. When calling the function on the same thread_id for the second time, this function returns 0.
+
+  @param[in] thread_id Thread ID of the target thread.
+  @param[in] pBuf Pointer to the buffer for HVX register saving.
+                  The first four bytes of the buffer are for saving the HVX version. HVX registers are saved from
+                  the fifth byte of the buffer. The address of the fifth byte should be 256-byte aligned.
+                  For example, a buffer can be declared at first as: \n
+                  unsigned char vbuf[QURT_HVX_VREG_BUF_SIZE+256];\n
+                  unsigned char *pBuf; \n
+                  then align the buffer pointer to: \n
+                  pBuf = vbuf; \n
+                  pBuf += (256 - 4 - (unsigned)pBuf%256);
+  @param[in] size Size of the provided buffer, which pBuf points to. The buffer size should not be smaller than that
+                  returned from qurt_system_hvx_regs_get_size(), and pBuf should be aligned as described above.
+  @param[out] pBuf Buffer returned with the saved HVX registers (unsigned char hvx_regs[];), which are saved from the fifth
+                  byte of the buffer, and the HVX version (unsigned int hvx_version;), whose first four bytes
+                  contain one of the HVX dump versions:\n
+                  - #QURT_HVX_DUMP_V65_64B \n
+                  - #QURT_HVX_DUMP_V65_128B \n
+                  - #QURT_HVX_DUMP_V66_128B \n
+                  - #QURT_HVX_DUMP_V68_128B \n
+                  - #QURT_HVX_DUMP_V79_128B \n
+                  @tablebulletend
+
+  @return
+  Total bytes of the data saved in the provided buffer. \n
+  0 -- No HVX assigned to the thread \n
+  #QURT_HVX_V65_64B_VSIZE -- 64 x 32 + 8 x 4 + 4 (version) \n
+  #QURT_HVX_V65_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V66_128B_VSIZE -- 128 x (32 + 2) + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V68_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V79_128B_VSIZE -- 128 x (32+4+1) + 4 (version)
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_system_hvx_regs_get(unsigned int thread_id, void *pBuf, size_t size);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_HVX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_int.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_int.h
new file mode 100755
index 0000000000000..386aeda1051eb
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_int.h
@@ -0,0 +1,509 @@
+#ifndef QURT_INT_H
+#define QURT_INT_H
+/**
+  @file qurt_int.h
+  @brief QuRT interrupt functions.
+ + + + Copyright (c) 2013-2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + + +/** @cond rest_reg_dist */ +/** @addtogroup interrupts_constants +@{ */ +#define SIG_INT_ABORT 0x80000000 /**< */ +#define QURT_INT_NON_DELAYED_ACK 0 +#define QURT_INT_DELAYED_ACK 1 +#define QURT_INT_ACK_DEFAULT QURT_INT_NON_DELAYED_ACK +#define QURT_INT_DRV_DEFAULT 0 +#define QURT_INT_PRIORITY_DEFAULT 0xFF + +/** QuRT interrupt property. */ +#define QURT_INT_CONFIGID_POLARITY 0x1U /**< */ +#define QURT_INT_CONFIGID_LOCK 0x2U /**< */ + +/** QuRT interrupt lock.*/ +#define QURT_INT_LOCK_DEFAULT 0x0 /**< Default. */ +#define QURT_INT_LOCK_DISABLE 0x0 /**< Interrupt can be enabled or disabled or deregistered. */ +#define QURT_INT_LOCK_ENABLE 0x1 /**< Interrupt is locked and cannot be enabled, disabled, or deregistered.*/ +/** @} */ /* end_addtogroup interrupts_constants */ + +/** @addtogroup Qurt_interrupt_type +@{ */ +/** Trigger type bit fields for a PDC interrupt:\n + @verbatim + Polarity Edge Output\n + 0 00 Level sensitive active low + 0 01 Rising edge sensitive + 0 10 Falling edge sensitive + 0 11 Dual edge sensitive + 1 00 Level sensitive active high + 1 01 Falling edge sensitive + 1 10 Rising edge sensitive + 1 11 Dual edge sensitive + @endverbatim +*/ +#define QURT_INT_TRIGGER_TYPE_SET(pol, edge) ((((pol) & 0x01U) << 2) | ((edge) & 0x03U)) /**< */ + +#define QURT_INT_TRIGGER_LEVEL_LOW QURT_INT_TRIGGER_TYPE_SET(0U, 0x00U) /**< */ +#define QURT_INT_TRIGGER_LEVEL_HIGH QURT_INT_TRIGGER_TYPE_SET(1U, 0x00U) /**< */ +#define QURT_INT_TRIGGER_RISING_EDGE QURT_INT_TRIGGER_TYPE_SET(1U, 0x02U) /**< */ +#define QURT_INT_TRIGGER_FALLING_EDGE QURT_INT_TRIGGER_TYPE_SET(0U, 0x02U) /**< */ +#define QURT_INT_TRIGGER_DUAL_EDGE QURT_INT_TRIGGER_TYPE_SET(0U, 0x03U) /**< */ +#define QURT_INT_TRIGGER_USE_DEFAULT 0xffU /**< */ +/** @} */ /* end_addtogroup Qurt_interrupt_type */ + +/*===================================================================== + Functions +======================================================================*/ + +/**@ingroup func_qurt_interrupt_register + @xreflabel{sec:interrupt_register} + Registers the interrupt.\n + Enables the specified interrupt and associates it with the specified QuRT signal object and + signal mask. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 indicates not to wait. + + When the interrupt occurs, the signal specified in the signal mask is set in the signal + object. An IST conventionally waits on that signal to + handle the interrupt. The thread that registers the interrupt is set as the IST. + + Up to 31 separate interrupts can be registered to a single signal object, as determined by + the number of individual signals the object can store. QuRT reserves signal 31. Thus a + single IST can handle several different interrupts. + + QuRT reserves some interrupts for internal use -- the remainder are available for use by + applications, and thus are valid interrupt numbers. If the specified interrupt number is + outside the valid range, the register operation returns the status value QURT_EINT. 
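+
+  For example, a minimal IST loop registers an interrupt, waits on the
+  signal, and acknowledges it (a sketch only; the interrupt number and
+  signal bit are illustrative, and qurt_anysignal_init(),
+  qurt_anysignal_get(), and qurt_anysignal_clear() are assumed from the
+  any-signal API declared in qurt_anysignal.h):
+
+  @code
+  void ist_thread(void)
+  {
+      qurt_anysignal_t sig;
+      qurt_anysignal_init(&sig);
+
+      /* Deliver interrupt 23 on signal bit 0. */
+      if (qurt_interrupt_register(23, &sig, 0x1) != QURT_EOK) {
+          return;
+      }
+      for (;;) {
+          (void)qurt_anysignal_wait(&sig, 0x1 | SIG_INT_ABORT);
+          unsigned int got = qurt_anysignal_get(&sig);
+          if (got & SIG_INT_ABORT) {
+              break; /* kernel asked this IST to quit */
+          }
+          /* ... service the interrupt ... */
+          (void)qurt_anysignal_clear(&sig, 0x1); /* clear before re-enabling */
+          (void)qurt_interrupt_acknowledge(23);  /* re-enable interrupt 23 */
+      }
+      (void)qurt_interrupt_deregister(23);
+  }
+  @endcode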
+
+  Only one thread can be registered at a time to a specific interrupt. Attempting to register
+  an already-registered interrupt returns the status value QURT_EVAL.
+
+  Only one signal bit in a signal object can be registered at a time to a specific interrupt.
+  Attempting to register multiple signal bits to an interrupt returns the status value
+  QURT_ESIG.
+
+  When a signal object is registered for an interrupt, QuRT can only set its signal bits
+  when receiving the interrupt. The QuRT signal API called from another
+  software thread cannot set the signal, even for unused signal bits.
+
+  @note1hang The valid range for an interrupt number can differ on target execution
+             environments other than the simulator. For more information, see the
+             appropriate hardware document.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] int_num     L2VIC interrupt to register; valid range is 0 to 1023.
+  @param[in] int_signal  Any-signal object to wait on (Section @xref{dox:any_signals}).
+  @param[in] signal_mask Signal mask value indicating signal to receive the interrupt.
+
+  @return
+  #QURT_EOK -- Interrupt successfully registered.\n
+  #QURT_EINT -- Invalid interrupt number. \n
+  #QURT_ESIG -- Invalid signal bitmask (cannot set more than one
+                signal at a time). \n
+  #QURT_EVAL -- Interrupt already registered.
+
+  @dependencies
+  None.
+*/
+ unsigned int qurt_interrupt_register(int int_num, qurt_anysignal_t *int_signal, int signal_mask);
+
+/**@ingroup func_qurt_interrupt_register2
+  @xreflabel{sec:interrupt_register2}
+  Registers the interrupt.\n
+  Enables the specified interrupt, associates it with the specified QuRT signal object and
+  signal mask, and sets interrupt flags.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+  indicates that a signal must be waited on, and 0 indicates not to wait.
+
+  When the interrupt occurs, the signal specified in the signal mask is set in the signal
+  object. An IST conventionally waits on that signal to
+  handle the interrupt. The thread that registers the interrupt is set as the IST.
+
+  Up to 31 separate interrupts can be registered to a single signal object, as determined by
+  the number of individual signals that the object can store. QuRT reserves signal 31. Thus a
+  single IST can handle several different interrupts.
+
+  QuRT reserves some interrupts for internal use -- the remainder are available for use by
+  applications, and thus are valid interrupt numbers. If the specified interrupt number is
+  outside the valid range, the register operation returns the status value #QURT_EINT.
+
+  Only one thread can be registered at a time to a specific interrupt. Attempting to register
+  an already-registered interrupt returns the status value #QURT_EVAL.
+
+  Only one signal bit in a signal object can be registered at a time to a specific interrupt.
+  Attempting to register multiple signal bits to an interrupt returns the status value
+  #QURT_ESIG.
+
+  When a signal object is registered for an interrupt, QuRT can only set its signal bits
+  when receiving the interrupt. The QuRT signal API called from another
+  software thread cannot set the signal, even for unused signal bits.
+
+  @note1hang The valid range for an interrupt number can differ on target execution
+             environments other than the simulator. For more information, see the
+             appropriate hardware document.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] int_num     L2VIC interrupt to register; valid range is 0 to 1023.
+  @param[in] int_signal  Any-signal object to wait on (Section @xref{dox:any_signals}).
+  @param[in] signal_mask Signal mask value indicating signal to receive the interrupt.
+  @param[in] flags       Defines the interrupt property; the supported property is interrupt lock enable/disable.
+                         Possible values for flags: \n
+                         - #QURT_INT_LOCK_ENABLE
+                         - #QURT_INT_LOCK_DISABLE @tablebulletend
+
+  @return
+  #QURT_EOK -- Interrupt successfully registered.\n
+  #QURT_EINT -- Invalid interrupt number. \n
+  #QURT_ESIG -- Invalid signal bitmask (cannot set more than one
+                signal at a time). \n
+  #QURT_EVAL -- Interrupt already registered.
+
+  @dependencies
+  None.
+*/
+ unsigned int qurt_interrupt_register2(int int_num, qurt_anysignal_t *int_signal, int signal_mask, unsigned int flags);
+/*
+ * Waits for registered interrupt signal
+
+ * Suspends the current thread until one of its registered interrupts occurs. The second input,
+ * mask, contains the interrupt signals that the IST expects to receive. The interrupt signals are
+ * registered with interrupts via the qurt_interrupt_register() API.
+ *
+ * The signals returned in the signal variable indicate which interrupts occurred. Use the function
+ * qurt_anysignal_get() to read the signals. The IST must locally maintain a table that maps a signal to
+ * a specific interrupt. The IST also checks whether the signal #SIG_INT_ABORT is received; if so, the IST
+ * must quit its interrupt-receiving loop.
+ *
+ * For detailed information on this API, see the QuRT User Manual, Section 4.2.5.
+ *
+ * Prototype
+ *
+ * unsigned int qurt_anysignal_wait(qurt_anysignal_t *int_signal, unsigned int mask)
+ */
+
+/**@ingroup func_qurt_interrupt_acknowledge
+  Acknowledges an interrupt after it has been processed.\n
+  Re-enables an interrupt and clears its pending status. This is done after an interrupt is
+  processed by an IST.
+
+  Interrupts are automatically disabled after they occur. To re-enable an interrupt, an IST
+  performs the acknowledge operation after it has finished processing the interrupt and
+  just before suspending itself (such as by waiting on the interrupt signal).
+
+  @note1hang To prevent losing or reprocessing subsequent occurrences of the interrupt,
+             an IST must clear the interrupt signal (Section @xref{sec:anysignal_clear}) before
+             acknowledging the interrupt.
+
+  @param[in] int_num Interrupt that is being re-enabled.
+
+  @return
+  #QURT_EOK -- Interrupt acknowledge was successful. \n
+  #QURT_EDEREGISTERED -- Interrupt is already deregistered.
+
+  @dependencies
+  None.
+*/
+int qurt_interrupt_acknowledge(int int_num);
+
+/**@ingroup func_qurt_interrupt_deregister
+  Disables the specified interrupt and disassociates it from a QuRT signal object.
+  If the specified interrupt was never registered (Section @xref{sec:interrupt_register}), the deregister operation
+  returns the status value #QURT_EINT.
+
+  @note1hang If an interrupt is deregistered while an IST waits
+             to receive it, the IST might wait indefinitely for the interrupt to occur. To avoid
+             this problem, the QuRT kernel sends the signal #SIG_INT_ABORT to awaken an
+             IST after determining that it has no interrupts registered.
+
+  @param[in] int_num L2VIC interrupt to deregister; valid range is 0 to 1023.
+
+  @return
+  #QURT_EOK -- Success.\n
+  #QURT_EINT -- Invalid interrupt number (not registered).
+
+  @dependencies
+  None.
+
+*/
+unsigned int qurt_interrupt_deregister(int int_num);
+/** @endcond */
+
+/**@ingroup func_qurt_interrupt_disable
+  Disables an interrupt with its interrupt number.\n
+  The interrupt must be registered prior to calling this function.
+  After qurt_interrupt_disable() returns, the Hexagon subsystem
+  can no longer send the corresponding interrupt to the Hexagon
+  core, until qurt_interrupt_enable() is called
+  for the same interrupt.
+
+  Avoid calling qurt_interrupt_disable() and qurt_interrupt_enable() frequently within
+  a short period of time.\n
+  - A pending interrupt can already be in the Hexagon core when qurt_interrupt_disable()
+    is called. Therefore, some time later, the pending interrupt is received on a Hexagon
+    hardware thread.\n
+  - After the Hexagon subsystem sends an interrupt to the Hexagon core, the Hexagon
+    hardware automatically disables the interrupt until kernel software re-enables the interrupt
+    at the interrupt acknowledgement stage. If qurt_interrupt_enable() is called from a certain
+    thread at an earlier time, the interrupt is re-enabled earlier and can trigger
+    sending a new interrupt to the Hexagon core while kernel software is still processing
+    the previous interrupt.
+
+  @param[in] int_num Interrupt number.
+
+  @return
+  #QURT_EOK -- Interrupt successfully disabled.\n
+  #QURT_EINT -- Invalid interrupt number.\n
+  #QURT_ENOTALLOWED -- Interrupt is locked. \n
+  #QURT_EVAL -- Interrupt is not registered.
+
+  @dependencies
+  None.
+*/
+ unsigned int qurt_interrupt_disable(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_enable
+  Enables an interrupt with its interrupt number.\n
+  The interrupt must be registered prior to calling this function.
+
+  @param[in] int_num Interrupt number.
+
+  @return
+  #QURT_EOK -- Interrupt successfully enabled.\n
+  #QURT_EINT -- Invalid interrupt number.\n
+  #QURT_ENOTALLOWED -- Interrupt is locked. \n
+  #QURT_EVAL -- Interrupt is not registered.
+
+  @dependencies
+  None.
+
+*/
+ unsigned int qurt_interrupt_enable(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_status
+  Returns a value that indicates the pending status of the specified interrupt.
+
+  @param[in] int_num Interrupt number that is being checked.
+  @param[out] status Interrupt status; 1 indicates that an interrupt is
+                     pending, 0 indicates that an interrupt is not pending.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINT -- Failure; invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_status(int int_num, int *status);
+
+
+/**@ingroup func_qurt_interrupt_get_status
+  Gets the status of the specified interrupt in L2VIC.
+
+  @param[in] int_num     Interrupt number that is being checked.
+  @param[in] status_type 0 -- interrupt pending status \n
+                         1 -- interrupt enabling status
+  @param[out] status     0 -- OFF \n
+                         1 -- ON
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINT -- Failure; invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_get_status(int int_num, int status_type, int *status);
+
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_interrupt_clear
+  Clears the pending status of the specified interrupt.
+
+  @note1hang This operation is intended for system-level use, and must be used with care.
+
+  @param[in] int_num Interrupt whose pending status is being cleared.
+
+  @return
+  #QURT_EOK -- Success.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_clear(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_get_config
+  Gets the L2VIC interrupt configuration. \n
+  This function returns the type and polarity of the specified L2VIC interrupt.
+
+  @param[in] int_num    L2VIC interrupt that is being queried.
+  @param[out] int_type  Pointer to an interrupt type. \n
+                        0 -- Level-triggered interrupt \n
+                        1 -- Edge-triggered interrupt
+  @param[out] int_polarity Pointer to interrupt polarity.\n
+                        0 -- Active-high interrupt \n
+                        1 -- Active-low interrupt
+
+  @return
+  #QURT_EOK -- Configuration successfully returned.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_get_config(unsigned int int_num, unsigned int *int_type, unsigned int *int_polarity);
+
+/**@ingroup func_qurt_interrupt_set_config
+  Sets the type and polarity of the specified L2VIC interrupt.
+
+  @note1hang Deregister L2VIC interrupts before reconfiguring them.
+
+  @param[in] int_num      L2VIC interrupt to configure.
+  @param[in] int_type     Interrupt type. \n
+                          0 -- Level-triggered interrupt\n
+                          1 -- Edge-triggered interrupt
+  @param[in] int_polarity Interrupt polarity. \n
+                          0 -- Active-high interrupt \n
+                          1 -- Active-low interrupt
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Not allowed; the interrupt is currently registered.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_set_config(unsigned int int_num, unsigned int int_type, unsigned int int_polarity);
+
+/**@ingroup func_qurt_interrupt_set_config2
+  Sets the type and polarity of the specified L2VIC interrupt.
+
+  @note1hang L2VIC interrupts must be deregistered before they can be reconfigured.
+
+  @param[in] int_num  L2VIC interrupt to configure.
+  @param[in] int_type Notified to the hardware configuration callback function and used to
+                      modify the L2VIC type. Possible values: \n
+                      - #QURT_INT_TRIGGER_USE_DEFAULT \n
+                      - #QURT_INT_TRIGGER_LEVEL_HIGH \n
+                      - #QURT_INT_TRIGGER_LEVEL_LOW \n
+                      - #QURT_INT_TRIGGER_RISING_EDGE \n
+                      - #QURT_INT_TRIGGER_FALLING_EDGE \n
+                      - #QURT_INT_TRIGGER_DUAL_EDGE @tablebulletend
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Not allowed; the interrupt is currently registered.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_set_config2(unsigned int int_num, unsigned int int_type);
+
+/**@ingroup func_qurt_interrupt_set_config3
+  Sets the specified configuration value for the specified property of the specified L2VIC interrupt.
+
+  @note1hang L2VIC interrupts must be deregistered before they can be reconfigured for polarity.
+
+  @param[in] int_num    L2VIC interrupt to configure.
+  @param[in] config_id  Property to configure: \n
+                        - #QURT_INT_CONFIGID_POLARITY \n
+                        - #QURT_INT_CONFIGID_LOCK @tablebulletend
+  @param[in] config_val Dependent on the second argument config_id, specifies the value to set. \n
+                        Values for #QURT_INT_CONFIGID_POLARITY: \n
+                        - #QURT_INT_TRIGGER_USE_DEFAULT \n
+                        - #QURT_INT_TRIGGER_LEVEL_HIGH \n
+                        - #QURT_INT_TRIGGER_LEVEL_LOW \n
+                        - #QURT_INT_TRIGGER_RISING_EDGE \n
+                        - #QURT_INT_TRIGGER_FALLING_EDGE \n
+                        - #QURT_INT_TRIGGER_DUAL_EDGE \n
+
+                        Values for #QURT_INT_CONFIGID_LOCK: \n
+                        - #QURT_INT_LOCK_ENABLE\n
+                        - #QURT_INT_LOCK_DISABLE @tablebulletend
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Not allowed; the interrupt is currently registered or is locked for enable/disable.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_interrupt_set_config3(unsigned int int_num, unsigned int config_id, unsigned int config_val);
+
+
+/**@ingroup func_qurt_interrupt_raise
+  Raises the interrupt. \n
+  This function triggers a level-triggered L2VIC
+  interrupt, and accepts interrupt numbers in the range of 0 to 1023.
+
+  @param[in] interrupt_num Interrupt number.
+
+  @return
+  #QURT_EOK -- Success \n
+  -1 -- Failure; the interrupt is not supported.
+
+  @dependencies
+  None.
+ */
+int qurt_interrupt_raise(unsigned int interrupt_num);
+
+/**@ingroup func_qurt_interrupt_raise2
+  Raises the interrupt and returns the current pcycle value.
+
+  @param[in] interrupt_num Interrupt number.
+
+  @return
+  0xFFFFFFFFFFFFFFFF -- Failure; the interrupt is not supported.\n
+  Other value -- pcycle count at the time the interrupt is raised.
+
+  @dependencies
+  None.
+ */
+unsigned long long qurt_interrupt_raise2(unsigned int interrupt_num);
+/** @endcond */
+
+/** @cond internal_only */
+/**@ingroup func_qurt_isr_subcall
+  Indicates whether the current function is called from a callback procedure (either short or long).
+
+  @return
+  #QURT_EOK -- TRUE \n
+  #QURT_EVAL -- FALSE.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_subcall(void);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_INT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_island.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_island.h
new file mode 100755
index 0000000000000..f0c8ee27cf8b0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_island.h
@@ -0,0 +1,122 @@
+#ifndef QURT_ISLAND_H
+#define QURT_ISLAND_H
+
+/**
+  @file qurt_island.h
+  @brief Prototypes of the power API.
+  The APIs allow entering and exiting Island mode, where memory
+  accesses are limited to local memory.
+
+  EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018-2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+=============================================================================*/
+
+#include
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_island_get_status
+  Gets Island mode status.
+
+  Returns a value that indicates whether the QuRT system executes in Island mode.
+
+  @return
+  0 - Normal mode. \n
+  1 - Island mode.
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_island_get_status (void);
+
+/**@ingroup func_qurt_island_get_status2
+  Gets the Island mode status, differentiating between island partial exit and complete exit.
+
+  Returns a value that indicates the current state.
+
+  @note1hang The transition from NORMAL mode to ISLAND mode happens in single-threaded
+             mode, whereas transitions from ISLAND mode to other modes
+             happen in multi-threaded mode. Therefore, a thread that reads the island mode
+             status as NORMAL can assume the same status for as long as it continues to
+             run. A thread that reads the island mode status as ISLAND should
+             assume that the status may change to EXITING or NORMAL while it
+             runs. A thread that reads the island mode status as EXITING should
+             assume that the status may change to NORMAL while it runs. If
+             the thread goes into a wait state after reading the status, it should read
+             the island mode state again and not assume the previous state.
+  @note2hang This API returns more intrinsic states than qurt_island_get_status();
+             when qurt_island_get_status() returns 0, this API can return
+             QURT_ISLAND_MODE_EXITING or QURT_ISLAND_MODE_ISLAND.
+
+  @param[in/out] data Field reserved for future use. If a NULL pointer is passed,
+             the field is ignored. If a valid pointer is passed,
+             QuRT returns a bitmask that can be interpreted as follows:
+             data[31] - Valid bit.
Set to 1 to indicate data[30:0] are valid.
+             Otherwise set to 0.
+             data[30:0] -- Reserved for future definition.
+
+  @return
+  QURT_ISLAND_MODE_NORMAL - Normal mode \n
+  QURT_ISLAND_MODE_ISLAND - Island mode \n
+  QURT_ISLAND_MODE_EXITING - Exiting Island mode \n
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_island_get_status2 (unsigned int *data);
+
+
+
+/**@ingroup func_qurt_island_get_exit_status
+  Gets the reason for the last Island mode exit.
+
+  @param[out] cause_code Pointer that returns the cause code of the last
+              island exit reason. \n
+              - #QURT_EISLANDUSEREXIT -- Island exit due to user call for island exit.\n
+              - #QURT_ENOISLANDENTRY -- API called before exiting island. \n
+              - #QURT_EISLANDINVALIDINT -- Island exit due to an invalid interrupt in Island mode. @tablebulletend
+
+  @param[out] int_num Pointer that holds the invalid interrupt number that caused
+              island exit when the cause code is #QURT_EISLANDINVALIDINT.
+              For other cases, it is -1.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_island_get_exit_status(unsigned int *cause_code, int *int_num);
+
+/**@ingroup func_qurt_island_get_enter_timestamp
+  Gets the most recent timestamp recorded when the system exited STM during island entry.
+
+  @param[out] island_enter_timestamp Returns a pointer to the recent timestamp
+  recorded after the system exits STM during island entry. If the system never
+  attempted to enter island, the island_enter_timestamp pointer holds a value
+  of zero.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_island_get_enter_timestamp(unsigned long long *island_enter_timestamp);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ISLAND_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_isr.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_isr.h
new file mode 100755
index 0000000000000..db29ea2f265d7
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_isr.h
@@ -0,0 +1,177 @@
+#ifndef QURT_ISR_H
+#define QURT_ISR_H
+
+/*=====================================================================
+
+  @file qurt_isr.h
+
+  @brief Prototypes of QuRT ISR API functions
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2017, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+  ======================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ Functions
+=============================================================================*/
+
+
+/**@ingroup func_qurt_isr_set_hw_config_callback
+  Sets the callback function for the configuration related to interrupt hardware.
+  In a process, the callback function can be set only once.
+
+  @param[in] cb_addr Address of the callback function.
+
+  @return
+  #QURT_EOK -- The callback function is set successfully. \n
+  #QURT_EFAILED -- Failure. The callback function has been set before.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_set_hw_config_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_set_hw_enable_callback
+  Sets the callback function for enabling the configuration related to interrupt hardware.
+  In a process, the callback function can be set only once.
+
+  @param[in] cb_addr Address of the callback function.
+
+  @return
+  #QURT_EOK -- The callback function is set successfully. \n
+  #QURT_EFAILED -- Failure. The callback function has been set before.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_set_hw_enable_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_set_hw_disable_callback
+  Sets the callback function for disabling the configuration related to interrupt hardware.
+  In a process, the callback function can be set only once.
+
+  @param[in] cb_addr Address of the callback function.
+
+  @return
+  #QURT_EOK -- The callback function is set successfully. \n
+  #QURT_EFAILED -- Failure. The callback function has been set before.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_set_hw_disable_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_create
+  Creates an ISR thread with the specified attributes, and makes it executable.
+
+  @datatypes
+  #qurt_thread_t \n
+  #qurt_thread_attr_t
+
+  @param[out] thread_id Returns a pointer to the thread identifier if the thread was
+                        successfully created.
+  @param[in]  pAttr     Pointer to the initialized thread attribute structure that specifies
+                        the attributes of the created thread.
+
+  @return
+  #QURT_EVAL -- Invalid arguments. \n
+  #QURT_EOK -- Thread created. \n
+  #QURT_EFAILED -- Thread not created.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_create (qurt_thread_t *thread_id, qurt_thread_attr_t *pAttr);
+
+/**@ingroup func_qurt_isr_register2
+  Registers an interrupt service routine (ISR) callback, with the specified attributes, to an ISR thread.
+  The interrupt is enabled when this function returns success.
+
+  @datatypes
+  qurt_thread_t
+
+  @param[in] isr_thread_id ISR thread ID, returned from qurt_isr_create().
+  @param[in] int_num       The interrupt number.
+  @param[in] prio          Priority of the ISR.
+  @param[in] flags         Defines the ACK type. Values: \n
+                           QURT_INT_NON_DELAYED_ACK - ISR is acknowledged by the interrupt handling routine
+                           in the kernel.
+                           QURT_INT_DELAYED_ACK - The client chooses to acknowledge.
+  @param[in] int_type      Trigger type, notified to the registered function. Values: \n
+                           - QURT_INT_TRIGGER_USE_DEFAULT
+                           - QURT_INT_TRIGGER_LEVEL_HIGH
+                           - QURT_INT_TRIGGER_LEVEL_LOW
+                           - QURT_INT_TRIGGER_RISING_EDGE
+                           - QURT_INT_TRIGGER_FALLING_EDGE
+                           - QURT_INT_TRIGGER_DUAL_EDGE
+  @param[in] isr           Interrupt service routine with prototype void isr(void *arg, int int_num).
+  @param[in] arg           First argument passed to the ISR when it is called to service the interrupt.
+
+  @return
+  QURT_EOK -- Successfully registered the ISR for the interrupt. \n
+  QURT_EINT -- Interrupt not configured. \n
+  QURT_EINVALID -- Invalid thread ID. \n
+  QURT_EDISABLED -- The feature is disabled. \n
+  QURT_EDUPLICATE -- Interrupt is already registered.
+
+  @dependencies
+  The thread ID must be created using qurt_isr_create().
+ */
+int qurt_isr_register2 (qurt_thread_t isr_thread_id, int int_num, unsigned short prio, unsigned short flags, unsigned int int_type, void (*isr) (void *, int), void *arg);
+
+/**@ingroup func_qurt_isr_deregister2
+  Deregisters the ISR for the specified interrupt.
+  The interrupt is disabled when this function returns success.
+
+  @param[in] int_num The interrupt number.
+
+  @return
+  QURT_EOK -- ISR deregistered successfully. \n
+  QURT_ENOREGISTERED -- Interrupt with int_num is not registered.
+
+  @dependencies
+  None.
+ */
+int qurt_isr_deregister2 (int int_num);
+
+/**@ingroup func_qurt_isr_delete
+  Makes the ISR thread exit, and releases its kernel resources.
+
+  @note1hang The ISR thread must not be actively processing interrupts;
+             otherwise the call fails and returns an error.
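+
+  A typical lifecycle, sketched (the attributes, priority, and interrupt
+  number are illustrative; qurt_thread_attr_init() is assumed from
+  qurt_thread.h, and error handling is elided):
+
+  @code
+  static void my_isr(void *arg, int int_num)
+  {
+      (void)arg; (void)int_num; /* service the interrupt */
+  }
+
+  void isr_lifecycle_example(void)
+  {
+      qurt_thread_t tid;
+      qurt_thread_attr_t attr;
+      qurt_thread_attr_init(&attr);
+      /* ... set the stack and other attributes as required ... */
+
+      if (qurt_isr_create(&tid, &attr) == QURT_EOK) {
+          (void)qurt_isr_register2(tid, 23, 100, QURT_INT_ACK_DEFAULT,
+                                   QURT_INT_TRIGGER_USE_DEFAULT, my_isr, NULL);
+          /* ... interrupt 23 is serviced by my_isr() on the ISR thread ... */
+          (void)qurt_isr_deregister2(23);
+          (void)qurt_isr_delete(tid);
+      }
+  }
+  @endcode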
+
+  @param[in] isr_tid  Thread ID of the ISR thread to delete.
+
+  @return
+  QURT_ENOTALLOWED -- ISR thread is processing an interrupt. \n
+  QURT_EINVALID -- Invalid ISR thread ID. \n
+  QURT_EOK -- Success.
+
+  @dependencies
+  The thread ID must be created with qurt_isr_create().
+ */
+int qurt_isr_delete (qurt_thread_t isr_tid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ISR_H */
+
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_l2cfg.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_l2cfg.h
new file mode 100755
index 0000000000000..7e26b30a580d9
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_l2cfg.h
@@ -0,0 +1,98 @@
+#ifndef QURT_L2CFG_H
+#define QURT_L2CFG_H
+/**
+  @file qurt_l2cfg.h
+  @brief QuRT APIs for L2 configuration and system configuration
+
+EXTERNAL FUNCTIONS
+   qurt_l2cfg_set
+   qurt_l2cfg_get
+   qurt_system_config_get
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        CONSTANTS AND MACROS
+=============================================================================*/
+
+/* Definition for system configuration */
+/** @addtogroup l2cfg_macros
+@{ */
+#define QURT_CORE_CFG_HMX_INT8_SPATIAL  0x78   /**< HMX fixed-point spatial size */
+#define QURT_CORE_CFG_HMX_INT8_DEPTH    0x7C   /**< HMX fixed-point output depth */
+/** @} */ /* end_addtogroup l2cfg_macros */
+/*=============================================================================
+        FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_l2cfg_set
+  Sets the value of an L2 configuration register. A register can be set *IFF* its
+  initial value is configured.
+
+  @param[in] offset  Offset of the L2 configuration register; must be a multiple of 4.
+  @param[in] value   Value to set the register to.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EFAILED -- Internal mapping that covers the L2CFG register file is absent; likely
+                   a configuration problem. \n
+  #QURT_EINVALID -- Argument error. \n
+  #QURT_ENOTALLOWED -- Setting this register is prohibited.
+
+  @dependencies
+  None.
+ */
+int qurt_l2cfg_set (unsigned short offset, unsigned int value);
+
+/**@ingroup func_qurt_l2cfg_get
+  Gets the value of an L2 configuration register.
+
+  @param[in]  offset  Offset of the L2 configuration register; must be a multiple of 4.
+  @param[out] value   Pointer to the value of the register.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EFAILED -- Internal mapping that covers the L2CFG register file is absent;
+                   likely a configuration problem. \n
+  #QURT_EINVALID -- Argument error.
+
+  @dependencies
+  None.
+
+ */
+int qurt_l2cfg_get (unsigned short offset, unsigned int * value);
+
+
+/**@ingroup func_qurt_system_config_get
+  Gets the system configuration information.
+
+  @param[in]  index  Index to the system configuration. Values:\n
+          - #QURT_CORE_CFG_HMX_INT8_SPATIAL \n
+          - #QURT_CORE_CFG_HMX_INT8_DEPTH @tablebulletend
+
+  @param[out] data  Pointer to a word for the returned data.
+
+  @return
+  #QURT_EOK -- Configuration data retrieved successfully. \n
+  Other values -- Failure (no such configuration available).
+
+  @dependencies
+  None.
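+
+  For illustration, a caller might query the HMX spatial size as in the
+  following sketch (only APIs and macros declared in this header are used):
+
+  @code
+  unsigned int hmx_spatial = 0;
+
+  if (qurt_system_config_get(QURT_CORE_CFG_HMX_INT8_SPATIAL, &hmx_spatial) == QURT_EOK) {
+      // hmx_spatial now holds the HMX fixed-point spatial size.
+  }
+  @endcode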
+
+ */
+int qurt_system_config_get(int index, unsigned int *data);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_L2CFG_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_lifo.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_lifo.h
new file mode 100755
index 0000000000000..dc399fccc5f0f
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_lifo.h
@@ -0,0 +1,71 @@
+#ifndef QURT_LIFO_H
+#define QURT_LIFO_H
+/**
+  @file qurt_lifo.h
+
+  @brief
+  Provides a lock-free last-in first-out (LIFO) algorithm, which can be used in a
+  variety of situations to allocate and free fixed-size buffers.
+  This implementation touches the first word of a FREED buffer. Although the
+  first word can be used freely while the buffer is allocated, avoid placing a
+  MAGIC number in the first field, because it will not hold the magic value
+  while the buffer is freed.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /*=====================================================================
+  Functions
+ ======================================================================*/
+
+/*======================================================================*/
+/**
+  Pops an element off the LIFO.
+
+  @param[in] freelist  Pointer to the head of the list.
+
+  @return
+  Top object from the list.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void * qurt_lifo_pop(void *freelist);
+
+
+/*======================================================================*/
+/**
+  Pushes an element onto the LIFO.
+
+  @param[in] freelist  Pointer to the head of the list.
+  @param[in] buf       Pointer to the buffer to push onto the list.
+
+  @return
+  None.
+
+  @dependencies
+  None.
*/
+/* ======================================================================*/
+void qurt_lifo_push(void *freelist, void *buf);
+
+void qurt_lifo_remove(void *freelist, void *buf);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_LIFO_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mailbox.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mailbox.h
new file mode 100755
index 0000000000000..a6cd91c611782
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mailbox.h
@@ -0,0 +1,176 @@
+#ifndef QURT_MAILBOX_H
+#define QURT_MAILBOX_H
+
+/**
+  @file qurt_mailbox.h
+  @brief Definitions, macros, and prototypes used for QuRT mailbox
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2015, 2021-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        CONSTANTS AND MACROS
+=============================================================================*/
+/* Definitions of typedefs and return values */
+
+#define QURT_MAILBOX_ID_NULL          0
+#define QURT_MAILBOX_ERROR           -1
+#define QURT_MAILBOX_ID_ERROR        -2
+#define QURT_MAILBOX_NON_VALID_DATA  -3
+#define QURT_MAILBOX_FULL            -4
+#define QURT_MAILBOX_DELETED         -5
+#define QURT_MAILBOX_RECEIVE_HALTED  -6
+#define QURT_MAILBOX_BANDWIDTH_LIMIT -7
+
+
+/*=============================================================================
+        FORWARD DECLARATIONS & TYPEDEFS
+=============================================================================*/
+
+#define QURT_MAILBOX_AT_QURTOS   0U   // Receiver is QuRT OS
+#define QURT_MAILBOX_AT_ROOTPD   1U   // Receiver is Root PD (ASID=0)
+#define QURT_MAILBOX_AT_USERPD   2U   // Receiver is User PD (ASID!=0)
+#define QURT_MAILBOX_AT_SECUREPD 3U   // Receiver is Secure PD
+
+typedef unsigned char qurt_mailbox_receiver_cfg_t;
+
+#define QURT_MAILBOX_SEND_OVERWRITE     0U   // When there is already valid content, overwrite it
+#define QURT_MAILBOX_SEND_NON_OVERWRITE 1U   // When there is already valid content, return failure
+
+typedef unsigned char qurt_mailbox_send_option_t;
+
+
+#define QURT_MAILBOX_RECV_WAITING          0U   // When there is no valid content, wait for it
+#define QURT_MAILBOX_RECV_NON_WAITING      1U   // When there is no valid content, return failure immediately
+#define QURT_MAILBOX_RECV_PEEK_NON_WAITING 2U   // Read the content, but do not remove it from the mailbox. No waiting.
+
+typedef unsigned char qurt_mailbox_recv_option_t;
+
+
+/*=============================================================================
+        EXTERNS & FUNCTIONS
+=============================================================================*/
+/* Function prototype */
+
+/**@ingroup qurt_mailbox_create
+  Creates a QuRT mailbox.
+
+  @param name      Mailbox name, up to 8 characters.
+  @param recv_opt  Configuration of the receiver process.
+
+  @return
+  Mailbox ID -- Mailbox identifier \n
+  #QURT_MAILBOX_ID_NULL -- NULL, failure to create the mailbox
+
+  @dependencies
+  None.
+*/
+unsigned long long qurt_mailbox_create(char *name, qurt_mailbox_receiver_cfg_t recv_opt);
+
+
+/**@ingroup qurt_mailbox_get_id
+  Gets a QuRT mailbox identifier.
+
+  @param name  Mailbox name, up to 8 characters.
+
+  @return
+  Mailbox ID -- Mailbox identifier \n
+  #QURT_MAILBOX_ID_NULL -- NULL, failure to get the mailbox ID
+
+  @dependencies
+  None.
+*/
+unsigned long long qurt_mailbox_get_id(char *name);
+
+
+/**@ingroup qurt_mailbox_send
+  Sends data to a QuRT mailbox.
+
+  @param mailbox_id  Mailbox identifier.
+  @param send_opt    Option for mailbox send.
+  @param data        Data to send.
+
+
+  @return
+  #QURT_EOK                      Success \n
+  #QURT_MAILBOX_ID_ERROR         Mailbox ID error.\n
+  #QURT_MAILBOX_ERROR            Other errors.\n
+  #QURT_MAILBOX_FULL             Valid data already exists, non-overwriting.\n
+  #QURT_MAILBOX_BANDWIDTH_LIMIT  Reached the bandwidth limit.
+
+  @dependencies
+  None.
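+
+  For illustration only, a minimal round trip through a mailbox might look like
+  the following sketch; the mailbox name "my_mbox" and the receiver
+  configuration are placeholders:
+
+  @code
+  void mailbox_example(void)
+  {
+      unsigned long long id = qurt_mailbox_create("my_mbox", QURT_MAILBOX_AT_USERPD);
+      unsigned long long value = 0;
+
+      if (id == QURT_MAILBOX_ID_NULL) return;
+
+      (void)qurt_mailbox_send(id, QURT_MAILBOX_SEND_NON_OVERWRITE, 0x1234ULL);
+
+      if (qurt_mailbox_receive(id, QURT_MAILBOX_RECV_WAITING, &value) == QURT_EOK) {
+          // value now holds the 64-bit payload.
+      }
+      (void)qurt_mailbox_delete(id);
+  }
+  @endcode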
+*/
+int qurt_mailbox_send(unsigned long long mailbox_id, qurt_mailbox_send_option_t send_opt, unsigned long long data);
+
+
+/**@ingroup qurt_mailbox_receive
+  Receives data from a QuRT mailbox.
+
+  @param mailbox_id  Mailbox identifier.
+  @param recv_opt    Option for mailbox receive.
+  @param data        Pointer to the data buffer for receiving.
+
+  @return
+  #QURT_EOK                     Success \n
+  #QURT_MAILBOX_ID_ERROR        Mailbox ID error. \n
+  #QURT_MAILBOX_ERROR           Other errors. \n
+  #QURT_MAILBOX_NON_VALID_DATA  No current valid data; the previous content is placed in the buffer. \n
+  #QURT_MAILBOX_RECEIVE_HALTED  Receive halted, the waiting thread is woken up. \n
+  #QURT_MAILBOX_DELETED         Mailbox is deleted, and the waiting thread is woken up.
+
+  @dependencies
+  None.
+*/
+int qurt_mailbox_receive(unsigned long long mailbox_id, qurt_mailbox_recv_option_t recv_opt, unsigned long long *data);
+
+
+/**@ingroup qurt_mailbox_delete
+  Deletes a QuRT mailbox.
+
+  A mailbox can only be deleted from the process that created the mailbox.
+
+  @param mailbox_id  Mailbox identifier.
+
+  @return
+  #QURT_EOK               Success. \n
+  #QURT_MAILBOX_ID_ERROR  Mailbox ID error. \n
+  #QURT_MAILBOX_ERROR     Other errors.
+
+  @dependencies
+  None.
+*/
+int qurt_mailbox_delete(unsigned long long mailbox_id);
+
+
+/**@ingroup qurt_mailbox_receive_halt
+  Halts QuRT mailbox receiving and wakes up waiting threads.
+
+  @param mailbox_id  Mailbox identifier.
+
+  @return
+  #QURT_EOK               Success. \n
+  #QURT_MAILBOX_ID_ERROR  Mailbox ID error.\n
+  #QURT_MAILBOX_ERROR     Other errors.
+
+  @dependencies
+  None.
+*/
+int qurt_mailbox_receive_halt(unsigned long long mailbox_id);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif  // QURT_MAILBOX_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_memory.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_memory.h
new file mode 100755
index 0000000000000..90ce2586fec50
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_memory.h
@@ -0,0 +1,1487 @@
+#ifndef QURT_MEMORY_H
+#define QURT_MEMORY_H
+/**
+  @file qurt_memory.h
+  @brief Prototypes of kernel memory API functions.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) Qualcomm Technologies, Inc.
+  All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+
+#include
+#include
+//#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup memory_management_macros
+@{ */
+#define QURT_SYSTEM_ALLOC_VIRTUAL 1  /**< Allocates available virtual memory in the address space of all
+                                          processes.*/
+/** @} */ /* end_addtogroup memory_management_macros */
+/**@cond rest_reg_dist */
+/** @addtogroup memory_management_types
+@{ */
+/** @xreflabel{hdr:qurt_mem_default_pool} */
+extern qurt_mem_pool_t qurt_mem_default_pool __attribute__((section(".data")));  /**< Memory pool object.*/
+/** @} */ /* end_addtogroup memory_management_types */
+
+/** @cond rest_reg_dist */
+/** Mapping attribute information*/
+typedef struct{
+    qurt_paddr_64_t paddr;
+    qurt_size_t size;
+    qurt_mem_cache_mode_t cache_mode;
+    qurt_perm_t perms;
+}qurt_mapping_attr_t;
+/** @endcond */
+/** @} */ /* end_addtogroup mapping_attribute_types*/
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_mem_cache_clean
+  Performs a cache clean operation on the data stored in the specified memory area.
+  Performs a syncht on all the data cache operations when the Hexagon processor version is V60 or greater.
+
+  @note1hang Perform the flush all operation only on the data cache.
+
+  @note1cont This operation flushes and invalidates the contents of all cache lines from the start address
+             to the end address (start address + size). The contents of an adjoining buffer can be
+             flushed and invalidated if they fall in any of the cache lines.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_size_t \n
+  #qurt_mem_cache_op_t \n
+  #qurt_mem_cache_type_t
+
+  @param[in] addr    Address of data to flush.
+  @param[in] size    Size (in bytes) of data to flush.
+  @param[in] opcode  Type of cache clean operation. Values:
+                     - #QURT_MEM_CACHE_FLUSH
+                     - #QURT_MEM_CACHE_INVALIDATE
+                     - #QURT_MEM_CACHE_FLUSH_INVALIDATE
+                     - #QURT_MEM_CACHE_FLUSH_ALL\n
+                     @note1 #QURT_MEM_CACHE_FLUSH_ALL is valid only when the type is #QURT_MEM_DCACHE @tablebulletend
+  @param[in] type    Cache type. Values:
+                     - #QURT_MEM_ICACHE
+                     - #QURT_MEM_DCACHE @tablebulletend
+
+  @return
+  #QURT_EOK -- Cache operation performed successfully.\n
+  #QURT_EVAL -- Invalid cache type.\n
+
+  @dependencies
+  None.
+*/
+int qurt_mem_cache_clean(qurt_addr_t addr, qurt_size_t size, qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type);
+
+/**@ingroup func_qurt_mem_cache_clean2
+  Performs a data cache clean operation on the data stored in the specified memory area.
+
+  This API only performs the following data cache operations:\n
+  - #QURT_MEM_CACHE_FLUSH\n
+  - #QURT_MEM_CACHE_INVALIDATE\n
+  - #QURT_MEM_CACHE_FLUSH_INVALIDATE -- flushes/invalidates the contents of all cache lines from the start address
+    to the end address (start address + size). The contents of an adjoining buffer can be
+    flushed/invalidated if they fall in any of the cache lines.
+
+  @datatypes
+  #qurt_addr_t \n
+  #qurt_size_t \n
+  #qurt_mem_cache_op_t \n
+  #qurt_mem_cache_type_t
+
+  @param[in] addr    Address of data to flush.
+  @param[in] size    Size (in bytes) of data to flush.
+  @param[in] opcode  Type of cache clean operation. Values:\n #QURT_MEM_CACHE_FLUSH\n #QURT_MEM_CACHE_INVALIDATE\n
+                     #QURT_MEM_CACHE_FLUSH_INVALIDATE
+  @param[in] type    Cache type. Values: \n #QURT_MEM_DCACHE
+
+  @return
+  #QURT_EOK -- Cache operation performed successfully.\n
+  #QURT_EVAL -- Invalid cache type.
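+
+  For illustration, flushing a buffer before handing it to a peripheral might
+  look like the following sketch (the buffer name, size, and alignment are
+  placeholders):
+
+  @code
+  static unsigned char dma_buf[256] __attribute__((aligned(32)));
+
+  int rc = qurt_mem_cache_clean2((qurt_addr_t)dma_buf, sizeof(dma_buf),
+                                 QURT_MEM_CACHE_FLUSH, QURT_MEM_DCACHE);
+  // rc is QURT_EOK on success, QURT_EVAL for an invalid cache type.
+  @endcode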
+ + @dependencies + None. +*/ +int qurt_mem_cache_clean2(qurt_addr_t addr, qurt_size_t size, qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type); + +/**@ingroup func_qurt_mem_cache_phys_clean + Performs a cache clean operation on the data stored in the specified memory area based on address match and mask. + Operate on a cache line when (LINE.PhysicalPageNumber & mask) == addrmatch. + + @note1hang The addrmatch value should be the upper 24-bit physical address to match against. + + @datatypes + #qurt_mem_cache_op_t \n + + @param[in] mask 24-bit address mask. + @param[in] addrmatch Physical page number (24 bits) of memory to use as an address match. + @param[in] opcode Type of cache clean operation. Values: + - #QURT_MEM_CACHE_FLUSH + - #QURT_MEM_CACHE_INVALIDATE @tablebulletend + + @return + #QURT_EOK -- Cache operation performed successfully.\n + #QURT_EVAL -- Invalid operation + + @dependencies + None. +*/ + +int qurt_mem_cache_phys_clean(unsigned int mask, unsigned int addrmatch, qurt_mem_cache_op_t opcode); + +/**@ingroup func_qurt_mem_l2cache_line_lock + Performs an L2 cache line locking operation. This function locks selective lines in the L2 cache memory. + + @note1hang Perform the line lock operation only on the 32-byte aligned size and address. + + @datatypes + #qurt_addr_t \n + #qurt_size_t + + @param[in] addr Address of the L2 cache memory line to lock; the address must be 32-byte aligned. + @param[in] size Size (in bytes) of L2 cache memory to line lock; size must be a multiple of 32 bytes. + + @return + #QURT_EOK -- Success.\n + #QURT_EALIGN -- Data alignment or address failure. + #QURT_EINVALID -- Improper addr and size passed (e.g. integer overflow due to addr + size) + #QURT_EFAILED -- Failed to lock cache line as all the ways were locked for the corresponding set of an address + in the range of addr and addr+size or the address range is not L2 cacheable + @dependencies + None. +*/ +int qurt_mem_l2cache_line_lock(qurt_addr_t addr, qurt_size_t size); + +/**@ingroup func_qurt_mem_l2cache_line_unlock + Performs an L2 cache line unlocking operation. This function unlocks selective lines in the L2 cache memory. + + @note1hang Perform the line unlock operation only on a 32-byte aligned size and address. + + @datatypes + #qurt_addr_t \n + #qurt_size_t + + @param[in] addr Address of the L2 cache memory line to unlock; the address must be 32-byte aligned. + @param[in] size Size (in bytes) of the L2 cache memory line to unlock; size must be a multiple of 32 bytes. + + @return + #QURT_EOK -- Success. \n + #QURT_EALIGN -- Aligning data or address failure. \n + #QURT_EFAILED -- Operation failed, cannot find the matching tag. + + @dependencies + None. +*/ +int qurt_mem_l2cache_line_unlock(qurt_addr_t addr, qurt_size_t size); + +/**@ingroup func_qurt_mem_region_attr_init + @xreflabel{sec:qurt_mem_region_attr_init} + Initializes the specified memory region attribute structure with default attribute values: \n + - Mapping -- #QURT_MEM_MAPPING_VIRTUAL \n + - Cache mode -- #QURT_MEM_CACHE_WRITEBACK \n + - Physical address -- -1 \n + - Virtual address -- -1 \n + - Memory type -- #QURT_MEM_REGION_LOCAL \n + - Size -- -1 + + @note1hang The memory physical address attribute must be explicitly set by calling the + qurt_mem_region_attr_set_physaddr() function. The size and pool attributes are set directly + as parameters in the memory region create operation. + + @datatypes + #qurt_mem_region_attr_t + + @param[in,out] attr Pointer to the destination structure for the memory region attributes. 
+ + @return + None. + + @dependencies + None. + */ +void qurt_mem_region_attr_init(qurt_mem_region_attr_t *attr); + +/**@ingroup func_qurt_mem_pool_attach + Initializes a memory pool object to attach to a pool predefined in the system + configuration file. + + Memory pool objects assign memory regions to physical memory in different + Hexagon memory units. They are specified in memory region create operations + (Section @xref{sec:mem_region_create}). + + @note1hang QuRT predefines the memory pool object #qurt_mem_default_pool + (Section @xref{dox:mem_management}) for allocation memory regions in SMI memory. The pool attach + operation is necessary only when allocating memory regions in nonstandard + memory units such as TCM. + + @datatypes + #qurt_mem_pool_t + + @param[in] name Pointer to the memory pool name. + @param[out] pool Pointer to the memory pool object. + + @return + #QURT_EOK -- Attach operation successful. + + @dependencies + None. +*/ +int qurt_mem_pool_attach(char *name, qurt_mem_pool_t *pool); + +/**@ingroup func_qurt_mem_pool_attach2 + Gets the identifier that corresponds to a pool object created specifically for a client, for example, HLOS_PHYSPOOL. + The client_handle is used to look up the client specific pool. + + Memory pool objects assign memory regions to physical memory in different + Hexagon memory units. Memory pool objects are specified during mapping creation operations + (qurt_mem_mmap() and qurt_mem_region_create()). + + @note1hang QuRT predefines the memory pool object #qurt_mem_default_pool + (Section @xref{dox:mem_management}) for allocation memory regions in SMI memory. The pool_attach2 + operation is necessary only when allocating memory regions in memory units specific to the client. + + @datatypes + #qurt_mem_pool_t + + @param[in] client_handle Client identifier used by the OS to lookup the identifier + for client specific pool + @param[in] name Pointer to the memory pool name. + @param[out] pool Pointer to the memory pool object. + + @return + #QURT_EOK -- Attach operation successful. + + @dependencies + None. +*/ +int qurt_mem_pool_attach2(int client_handle, char *name, qurt_mem_pool_t *pool); + +/**@ingroup func_qurt_mem_pool_create + @xreflabel{hdr:qurt_mem_pool_create} + Dynamically creates a memory pool object from a physical address range. + + The pool is assigned a single memory region with the specified base address and size. + + The base address and size values passed to this function must be aligned to 4K byte + boundaries, and must be expressed as the actual base address and size values divided by 4K. + + For example, the function call: + @code + qurt_mem_pool_create ("TCM_PHYSPOOL", 0xd8020, 0x20, &pool) + @endcode + ... is equivalent to the following static pool definition in the QuRT system configuration file: + @code + + + + @endcode + + @cond rest_dist For more information on the system configuration file, see @xhyperref{80VB41979,80-VB419-79}. @endcond + + @note1hang Dynamically created pools are not identical to static pools. In particular, + qurt_mem_pool_attr_get() is not valid with dynamically created pools. + + @note1cont Dynamic pool creation permanently consumes system resources, and cannot be undone. + + @datatypes + #qurt_mem_pool_t + + @param[in] name Pointer to the memory pool name. + @param[in] base Base address of the memory region (divided by 4K). + @param[in] size Size (in bytes) of the memory region (divided by 4K). + @param[out] pool Pointer to the memory pool object. + + @return + #QURT_EOK -- Success. 
+ + @dependencies + None. +*/ +int qurt_mem_pool_create(char *name, unsigned base, unsigned size, qurt_mem_pool_t *pool); + +/**@ingroup func_qurt_mem_pool_add_pages + Adds a physical address range to the specified memory pool object.\n + + @note1hang Call this operation only with root privileges (guest OS mode). + + @datatypes + #qurt_mem_pool_t + + @param[in] pool Memory pool object. + @param[in] first_pageno First page number of the physical address range (equivalent to address >> 12) + @param[in] size_in_pages Number of pages in the physical address range (equivalent to size >> 12) + + @return + #QURT_EOK -- Pages successfully added. + + @dependencies + None. +*/ +int qurt_mem_pool_add_pages(qurt_mem_pool_t pool, + unsigned first_pageno, + unsigned size_in_pages); + +/**@ingroup func_qurt_mem_pool_remove_pages + Removes a physical address range from the specified memory pool object. + + If any part of the address range is in use, this operation returns an + error without changing the state. + + @note1hang Call this operation only with root privileges (guest-OS mode). + + @note1cont In the future, this operation will support (via the flags parameter) the + removal of a physical address range when part of the range is in use. + + @datatypes + #qurt_mem_pool_t + + @param[in] pool Memory pool object. + @param[in] first_pageno First page number of the physical address range (equivalent to address >> 12) + @param[in] size_in_pages Number of pages in the physical address range (equivalent to size >> 12) + @param[in] flags Remove options. Values: \n + - 0 -- Skip holes in the range that are not part of the pool (default) \n + - #QURT_POOL_REMOVE_ALL_OR_NONE -- Pages are removed only if the specified + physical address range is entirely contained (with no holes) in the + pool free space. @tablebulletend + @param[in] callback Callback procedure called when pages were successfully removed. + Not called if the operation failed. Passing 0 as the parameter + value causes the callback to not be called. + @param[in] arg Value passed as an argument to the callback procedure. + + @return + #QURT_EOK -- Pages successfully removed. + + @dependencies + None. +*/ +int qurt_mem_pool_remove_pages(qurt_mem_pool_t pool, + unsigned first_pageno, + unsigned size_in_pages, + unsigned flags, + void (*callback)(void *), + void *arg); +/**@ingroup memory_management_types*/ +#define QURT_POOL_REMOVE_ALL_OR_NONE 1 /**< */ + +/**@ingroup func_qurt_mem_pool_attr_get + Gets the memory pool attributes. \n + Retrieves pool configurations based on the pool handle, and fills in + the attribute structure with configuration values. + + @datatypes + #qurt_mem_pool_t \n + #qurt_mem_pool_attr_t + + @param[in] pool Pool handle obtained from qurt_mem_pool_attach(). + @param[out] attr Pointer to the memory region attribute structure. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Corrupt handle; pool handle is invalid. +*/ +int qurt_mem_pool_attr_get (qurt_mem_pool_t pool, qurt_mem_pool_attr_t *attr); + +/**@ingroup func_qurt_mem_pool_attr_get_size + Gets the size of the specified memory pool range. + + @datatypes + #qurt_mem_pool_attr_t \n + #qurt_size_t + + @param[in] attr Pointer to the memory pool attribute structure. + @param[in] range_id Memory pool range key. + @param[out] size Pointer to the destination variable for the range size. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Range is invalid. + + @dependencies + None. 
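+
+  For illustration, pool ranges might be enumerated as in the following sketch;
+  "ADSP_DDR_POOL" is a placeholder pool name from the system configuration:
+
+  @code
+  qurt_mem_pool_t pool;
+  qurt_mem_pool_attr_t pattr;
+  qurt_size_t range_size;
+
+  if (qurt_mem_pool_attach("ADSP_DDR_POOL", &pool) == QURT_EOK &&
+      qurt_mem_pool_attr_get(pool, &pattr) == 0) {
+      for (int i = 0; i < MAX_POOL_RANGES; i++) {
+          if (qurt_mem_pool_attr_get_size(&pattr, i, &range_size) == QURT_EOK &&
+              range_size != 0) {
+              // Range i spans range_size bytes.
+          }
+      }
+  }
+  @endcode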
+*/ +static inline int qurt_mem_pool_attr_get_size (qurt_mem_pool_attr_t *attr, int range_id, qurt_size_t *size){ + if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){ + (*size) = 0; + return QURT_EINVALID; + } + else { + (*size) = attr->ranges[range_id].size; + } + return QURT_EOK; +} + +/**@ingroup func_qurt_mem_pool_attr_get_addr + Gets the start address of the specified memory pool range. + + @datatypes + #qurt_mem_pool_attr_t \n + #qurt_addr_t + + @param[in] attr Pointer to the memory pool attribute structure. + @param[in] range_id Memory pool range key. + @param[out] addr Pointer to the destination variable for range start address. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Range is invalid. + + @dependencies + None. +*/ +static inline int qurt_mem_pool_attr_get_addr (qurt_mem_pool_attr_t *attr, int range_id, qurt_addr_t *addr){ + if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){ + (*addr) = 0; + return QURT_EINVALID; + } + else { + (*addr) = (attr->ranges[range_id].start)<<12; + } + return QURT_EOK; +} + +/**@ingroup func_qurt_mem_pool_attr_get_addr_64 + Gets the 64 bit start address of the specified memory pool range. + + @datatypes + #qurt_mem_pool_attr_t \n + #qurt_addr_64_t + + @param[in] attr Pointer to the memory pool attribute structure. + @param[in] range_id Memory pool range key. + @param[out] addr Pointer to the destination variable for range start address. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Range is invalid. + + @dependencies + None. +*/ +static inline int qurt_mem_pool_attr_get_addr_64 (qurt_mem_pool_attr_t *attr, int range_id, qurt_addr_64_t *addr){ +if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){ + (*addr) = 0; + return QURT_EINVALID; +} +else { + (*addr) = ((qurt_addr_64_t)attr->ranges[range_id].start)<<12; + } + return QURT_EOK; + } + + +/**@ingroup func_qurt_mem_pool_status_get + Gets the memory pool status. \n + Based on the pool handle, retrieves largest contiguous free memory, + total free memory, and total memory declared for the pool in bytes. Fills in + the memory status structure with the values. + + @datatypes + #qurt_mem_pool_t \n + #qurt_mem_pool_status_t + + @param[in] pool Pool handle. + @param[out] status Pointer to the memory pool status structure. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Corrupt handle; pool handle is invalid. +*/ +int qurt_mem_pool_status_get (qurt_mem_pool_t pool, qurt_mem_pool_status_t *status); + + +/**@ingroup func_qurt_mem_pool_is_available + Checks whether the number of pages that the page_count argument indicates + can be allocated from the specified pool. + + @datatypes + #qurt_mem_pool_attr_t \n + #qurt_mem_mapping_t \n + + @param[in] pool Pool handle obtained from qurt_mem_pool_attach(). + @param[in] page_count Number of 4K pages. + @param[in] mapping_type Variable of type qurt_mem_mapping_t. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Mapping_type is invalid. \n + #QURT_EMEM -- Specified pages cannot be allocated from the pool. + + @dependencies + None. +*/ +int qurt_mem_pool_is_available(qurt_mem_pool_t pool, int page_count, qurt_mem_mapping_t mapping_type); + + +/**@ingroup func_qurt_mem_region_create + @xreflabel{sec:mem_region_create} + Creates a memory region with the specified attributes. + + The application initializes the memory region attribute structure with + qurt_mem_region_attr_init() and qurt_mem_region_attr_set_bus_attr(). 
+
+  If the virtual address attribute is set to its default value
+  (Section @xref{sec:qurt_mem_region_attr_init}), the virtual address of the memory region is
+  automatically assigned any available virtual address value.
+
+  If the memory mapping attribute is set to virtual mapping, the physical address of the memory region
+  is also automatically assigned.\n
+
+  @note1hang The physical address attribute is explicitly set in the attribute structure only
+             for memory regions with physical-contiguous-mapped mapping.
+
+  Memory regions are always assigned to memory pools. The pool value specifies the memory pool
+  that the memory region is assigned to.
+
+  @note1hang If attr is specified as NULL, the memory region is created with default
+             attribute values (Section @xref{sec:qurt_mem_region_attr_init}).
+             QuRT predefines the memory pool object #qurt_mem_default_pool
+             (Section @xref{dox:mem_management}), which allocates memory regions in SMI memory.
+
+  @datatypes
+  #qurt_mem_region_t \n
+  #qurt_size_t \n
+  #qurt_mem_pool_t \n
+  #qurt_mem_region_attr_t
+
+  @param[out] region  Pointer to the memory region object.
+  @param[in]  size    Memory region size (in bytes). If size is not an integral multiple of 4K,
+                      it is rounded up to a 4K boundary.
+  @param[in]  pool    Memory pool of the region.
+  @param[in]  attr    Pointer to the memory region attribute structure.
+
+  @return
+  #QURT_EOK -- Memory region successfully created.\n
+  #QURT_EMEM -- Not enough memory to create the region. \n
+  #QURT_EINVALID -- Invalid cache attributes/permissions provided in the attribute.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_region_create(qurt_mem_region_t *region, qurt_size_t size, qurt_mem_pool_t pool, qurt_mem_region_attr_t *attr);
+
+/**@ingroup func_qurt_mem_region_delete
+  Deletes the specified memory region.
+
+  If the caller application created the memory region, it is removed and the system reclaims its
+  assigned memory.
+
+  If a different application created the memory region (and shared it with the caller
+  application), only the local memory mapping to the region is removed; the system does
+  not reclaim the memory.
+
+  @datatypes
+  #qurt_mem_region_t
+
+  @param[in] region  Memory region object.
+
+  @return
+  #QURT_EOK -- Region successfully deleted. \n
+  #QURT_ELOCKED -- Buffer is locked. Mapping delete failed.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_region_delete(qurt_mem_region_t region);
+
+
+/**@ingroup func_qurt_mem_region_attr_get
+  @xreflabel{sec:mem_region_attr_get}
+  Gets the memory attributes of the specified memory region.
+  After a memory region is created, its attributes cannot be changed.
+
+  @datatypes
+  #qurt_mem_region_t \n
+  #qurt_mem_region_attr_t
+
+  @param[in]  region  Memory region object.
+  @param[out] attr    Pointer to the destination structure for memory region attributes.
+
+  @return
+  #QURT_EOK -- Operation successfully performed. \n
+  Error code -- Failure.
+
+  @dependencies
+  None.
+*/
+int qurt_mem_region_attr_get(qurt_mem_region_t region, qurt_mem_region_attr_t *attr);
+
+
+/**@ingroup func_qurt_mem_region_attr_set_type
+  Sets the memory type in the specified memory region attribute structure.
+
+  The type indicates whether the memory region is local to an application or shared between
+  applications.
+  @cond rest_dist For more information, see @xhyperref{80VB41992,80-VB419-92}. @endcond
+
+  @datatypes
+  #qurt_mem_region_attr_t \n
+  #qurt_mem_region_type_t
+
+  @param[in,out] attr  Pointer to the memory region attribute structure.
+  @param[in]     type  Memory type.
Values: \n + - #QURT_MEM_REGION_LOCAL \n + - #QURT_MEM_REGION_SHARED @tablebulletend + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_type(qurt_mem_region_attr_t *attr, qurt_mem_region_type_t type){ + attr->type = type; +} + +/**@ingroup func_qurt_mem_region_attr_get_size + Gets the memory region size from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_size_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] size Pointer to the destination variable for memory region size. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_size(qurt_mem_region_attr_t *attr, qurt_size_t *size){ + (*size) = attr->size; +} + +/**@ingroup func_qurt_mem_region_attr_get_type + Gets the memory type from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_region_type_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] type Pointer to the destination variable for the memory type. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_type(qurt_mem_region_attr_t *attr, qurt_mem_region_type_t *type){ + (*type) = attr->type; +} + +/**@ingroup func_qurt_mem_region_attr_set_physaddr + Sets the memory region 32-bit physical address in the specified memory attribute structure. + + @note1hang The physical address attribute is explicitly set only for memory regions with + physical contiguous mapping. Otherwise QuRT automatically sets it + when the memory region is created. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_paddr_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr Memory region physical address. + + @return + None. + */ +static inline void qurt_mem_region_attr_set_physaddr(qurt_mem_region_attr_t *attr, qurt_paddr_t addr){ + attr->ppn = (unsigned)(((unsigned)(addr))>>12); +} + +/**@ingroup func_qurt_mem_region_attr_get_physaddr + Gets the memory region physical address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr Pointer to the destination variable for memory region physical address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_physaddr(qurt_mem_region_attr_t *attr, unsigned int *addr){ + (*addr) = (unsigned)(((unsigned) (attr->ppn))<<12); +} + +/**@ingroup func_qurt_mem_region_attr_set_virtaddr + Sets the memory region virtual address in the specified memory attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_addr_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr Memory region virtual address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_virtaddr(qurt_mem_region_attr_t *attr, qurt_addr_t addr){ + attr->virtaddr = addr; +} + +/**@ingroup func_qurt_mem_region_attr_get_virtaddr + Gets the memory region virtual address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr Pointer to the destination variable for the memory region virtual address. + + @return + None. + + @dependencies + None. 
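+
+  For illustration, a typical create-and-inspect flow using only APIs declared
+  in this header might look like the following sketch (the size and cache mode
+  are arbitrary):
+
+  @code
+  qurt_mem_region_t region;
+  qurt_mem_region_attr_t rattr;
+  unsigned int va = 0;
+
+  qurt_mem_region_attr_init(&rattr);
+  qurt_mem_region_attr_set_mapping(&rattr, QURT_MEM_MAPPING_VIRTUAL);
+  qurt_mem_region_attr_set_cache_mode(&rattr, QURT_MEM_CACHE_WRITEBACK);
+
+  if (qurt_mem_region_create(&region, 0x4000, qurt_mem_default_pool, &rattr) == QURT_EOK) {
+      if (qurt_mem_region_attr_get(region, &rattr) == QURT_EOK) {
+          qurt_mem_region_attr_get_virtaddr(&rattr, &va);
+          // va holds the automatically assigned virtual address.
+      }
+      (void)qurt_mem_region_delete(region);
+  }
+  @endcode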
+ */ +static inline void qurt_mem_region_attr_get_virtaddr(qurt_mem_region_attr_t *attr, unsigned int *addr){ + (*addr) = (unsigned int)(attr->virtaddr); +} + +/**@ingroup func_qurt_mem_region_attr_set_mapping + Sets the memory mapping in the specified memory region attribute structure. + + The mapping value indicates how the memory region is mapped in virtual memory. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_mapping_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] mapping Mapping. Values: + - #QURT_MEM_MAPPING_VIRTUAL + - #QURT_MEM_MAPPING_PHYS_CONTIGUOUS + - #QURT_MEM_MAPPING_IDEMPOTENT + - #QURT_MEM_MAPPING_VIRTUAL_FIXED + - #QURT_MEM_MAPPING_NONE + - #QURT_MEM_MAPPING_VIRTUAL_RANDOM + - #QURT_MEM_MAPPING_INVALID @tablebulletend + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_mapping(qurt_mem_region_attr_t *attr, qurt_mem_mapping_t mapping){ + attr->mapping_type = mapping; +} + +/**@ingroup func_qurt_mem_region_attr_get_mapping + Gets the memory mapping from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_mapping_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] mapping Pointer to the destination variable for memory mapping. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_mapping(qurt_mem_region_attr_t *attr, qurt_mem_mapping_t *mapping){ + (*mapping) = attr->mapping_type; +} + +/**@ingroup func_qurt_mem_region_attr_set_cache_mode + Sets the cache operation mode in the specified memory region attribute structure. + + @cond rest_dist For more information on the cache, see @xhyperref{80VB41992,80-VB419-92}.@endcond + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_cache_mode_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] mode Cache mode. Values: \n + - #QURT_MEM_CACHE_WRITEBACK \n + - #QURT_MEM_CACHE_WRITETHROUGH\n + - #QURT_MEM_CACHE_WRITEBACK_NONL2CACHEABLE\n + - #QURT_MEM_CACHE_WRITETHROUGH_NONL2CACHEABLE\n + - #QURT_MEM_CACHE_WRITEBACK_L2CACHEABLE\n + - #QURT_MEM_CACHE_WRITETHROUGH_L2CACHEABLE\n + - #QURT_MEM_CACHE_NONE @tablebulletend + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_cache_mode(qurt_mem_region_attr_t *attr, qurt_mem_cache_mode_t mode){ + QURT_PGATTR_C_SET(attr->pga, (unsigned)mode); +} + +/**@ingroup func_qurt_mem_region_attr_get_cache_mode + Gets the cache operation mode from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_cache_mode_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] mode Pointer to the destination variable for cache mode. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_cache_mode(qurt_mem_region_attr_t *attr, qurt_mem_cache_mode_t *mode){ + unsigned int mode_temp = QURT_PGATTR_C_GET(attr->pga); + (*mode) = (qurt_mem_cache_mode_t)mode_temp; +} + +/**@ingroup func_qurt_mem_region_attr_set_bus_attr + Sets the (A1, A0) bus attribute bits in the specified memory region attribute structure. + + @cond rest_dist For more information on the bus attribute bits, see the @xhyperref{80VB41992,80-VB419-92}. @endcond + + @datatypes + #qurt_mem_region_attr_t + + @param[in,out] attr Pointer to the memory region attribute structure. 
+ @param[in] abits The (A1, A0) bits to use with the memory region, expressed as a 2-bit binary number. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_bus_attr(qurt_mem_region_attr_t *attr, unsigned abits){ + QURT_PGATTR_A_SET(attr->pga, abits); +} + +/**@ingroup func_qurt_mem_region_attr_get_bus_attr + Gets the (A1, A0) bus attribute bits from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] pbits Pointer to an unsigned integer that is filled in with + the (A1, A0) bits from the memory region attribute structure, expressed as a 2-bit binary number. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_bus_attr(qurt_mem_region_attr_t *attr, unsigned *pbits){ + (*pbits) = QURT_PGATTR_A_GET(attr->pga); +} + +void qurt_mem_region_attr_set_owner(qurt_mem_region_attr_t *attr, int handle); +void qurt_mem_region_attr_get_owner(qurt_mem_region_attr_t *attr, int *p_handle); +void qurt_mem_region_attr_set_perms(qurt_mem_region_attr_t *attr, unsigned perms); +void qurt_mem_region_attr_get_perms(qurt_mem_region_attr_t *attr, unsigned *p_perms); + +/**@ingroup func_qurt_mem_map_static_query + Determines whether a memory page is statically mapped. + Pages are specified by the following attributes: physical address, page size, cache mode, + and memory permissions. \n + - If the specified page is statically mapped, vaddr returns the virtual + address of the page. \n + - If the page is not statically mapped (or if it does not exist as specified), vaddr + returns -1 as the virtual address value.\n + The system configuration file defines QuRT memory maps. + + @datatypes + #qurt_addr_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[out] vaddr Virtual address corresponding to paddr. + @param[in] paddr Physical address. + @param[in] page_size Size of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Specified page is statically mapped, vaddr returns the virtual address. \n + #QURT_EMEM -- Specified page is not statically mapped, vaddr returns -1. \n + #QURT_EVAL -- Specified page does not exist. + + @dependencies + None. + */ +int qurt_mem_map_static_query(qurt_addr_t *vaddr, qurt_addr_t paddr, unsigned int page_size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + + +/**@ingroup func_qurt_mem_region_query + Queries a memory region. \n + This function determines whether a dynamically-created memory region (Section @xref{sec:mem_region_create}) exists for the + specified virtual or physical address. + When a memory region has been determined to exist, its attributes are + accessible (Section @xref{sec:mem_region_attr_get}). + + @note1hang This function returns #QURT_EFATAL if #QURT_EINVALID is passed to both + vaddr and paddr (or to neither). + + @datatypes + #qurt_mem_region_t \n + #qurt_paddr_t + + @param[out] region_handle Pointer to the memory region object (if it exists). + @param[in] vaddr Virtual address to query; if vaddr is specified, paddr must be set to + the value #QURT_EINVALID. + @param[in] paddr Physical address to query; if paddr is specified, vaddr must be set to + the value #QURT_EINVALID. + + @return + #QURT_EOK -- Query successfully performed. \n + #QURT_EMEM -- Region not found for the specified address. \n + #QURT_EFATAL -- Invalid input parameters. 
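+
+  For illustration, querying by virtual address only might look like the
+  following sketch (vaddr is a placeholder supplied by the caller):
+
+  @code
+  void query_example(qurt_addr_t vaddr)
+  {
+      qurt_mem_region_t handle;
+
+      // paddr must be QURT_EINVALID when querying by virtual address.
+      if (qurt_mem_region_query(&handle, vaddr, (qurt_paddr_t)QURT_EINVALID) == QURT_EOK) {
+          // handle now refers to the region containing vaddr.
+      }
+  }
+  @endcode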
+ + @dependencies + None. + */ +int qurt_mem_region_query(qurt_mem_region_t *region_handle, qurt_addr_t vaddr, qurt_paddr_t paddr); + + +/**@ingroup func_qurt_mapping_create + @xreflabel{hdr:qurt_mapping_create} + Creates a memory mapping in the page table. + Not supported if called from a user process, always returns QURT_EMEM. + + @datatypes + #qurt_addr_t \n + #qurt_size_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[in] vaddr Virtual address. + @param[in] paddr Physical address. + @param[in] size Size (4K-aligned) of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Mapping created. \n + #QURT_EMEM -- Failed to create mapping. + #QURT_EINVALID -- Invalid cache attributes / permissions provided. + + @dependencies + None. +*/ +int qurt_mapping_create(qurt_addr_t vaddr, qurt_addr_t paddr, qurt_size_t size, + qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + +/**@ingroup func_qurt_mapping_remove + @xreflabel{hdr:qurt_mapping_remove} + Deletes the specified memory mapping from the page table. + + @datatypes + #qurt_addr_t \n + #qurt_size_t + + @param[in] vaddr Virtual address. + @param[in] paddr Physical address. + @param[in] size Size of the mapped memory page (4K-aligned). + + @return + #QURT_EOK -- Mapping created. + #QURT_ELOCKED -- Buffer is locked. Mapping delete failed. + + @dependencies + None. + + */ +int qurt_mapping_remove(qurt_addr_t vaddr, qurt_addr_t paddr, qurt_size_t size); + +/**@ingroup func_qurt_lookup_physaddr + Translates a virtual memory address to the physical memory address to which it maps. \n + The lookup occurs in the process of the caller. Use qurt_lookup_physaddr2() to lookup the + physical address of another process. + + + @datatypes + #qurt_addr_t \n + #qurt_paddr_t + + @param[in] vaddr Virtual address. + + @return + Nonzero -- Physical address to which the virtual address is mapped.\n + 0 -- Virtual address not mapped. + + @dependencies + None. +*/ +qurt_paddr_t qurt_lookup_physaddr (qurt_addr_t vaddr); + +/**@ingroup func_qurt_mem_region_attr_set_physaddr_64 + Sets the memory region 64-bit physical address in the specified memory attribute structure. + + @note1hang The physical address attribute is explicitly set only for memory regions with + physical contiguous mapping. Otherwise it is automatically set by + QuRT when the memory region is created. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_paddr_64_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr_64 Memory region 64-bit physical address. + + @return + None. + */ +static inline void qurt_mem_region_attr_set_physaddr_64(qurt_mem_region_attr_t *attr, qurt_paddr_64_t addr_64){ + attr->ppn = (unsigned)(((unsigned long long)(addr_64))>>12); +} + +/**@ingroup func_qurt_mem_region_attr_get_physaddr_64 + Gets the memory region 64-bit physical address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_paddr_64_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr_64 Pointer to the destination variable for the memory region 64-bit physical address. + + @return + None. + + @dependencies + None. 
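+
+  For illustration, pairing the 64-bit setter and getter might look like the
+  following sketch; the physical address is a placeholder and must be
+  4K-aligned, since the value is stored internally as a 4K page number:
+
+  @code
+  qurt_mem_region_attr_t rattr;
+  qurt_paddr_64_t pa = 0;
+
+  qurt_mem_region_attr_init(&rattr);
+  qurt_mem_region_attr_set_physaddr_64(&rattr, 0x1C0000000ULL);
+  qurt_mem_region_attr_get_physaddr_64(&rattr, &pa);
+  // pa == 0x1C0000000ULL for a 4K-aligned address.
+  @endcode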
+ */ +static inline void qurt_mem_region_attr_get_physaddr_64(qurt_mem_region_attr_t *attr, qurt_paddr_64_t *addr_64){ + (*addr_64) = (unsigned long long)(((unsigned long long)(attr->ppn))<<12); +} + +/**@ingroup func_qurt_mem_map_static_query_64 + Determines if a memory page is statically mapped. + The following attributes specify pages: 64-bit physical address, page size, cache mode, + and memory permissions. \n + If the specified page is statically mapped, vaddr returns the virtual + address of the page. + If the page is not statically mapped (or if it does not exist as specified), vaddr + returns -1 as the virtual address value.\n + QuRT memory maps are defined in the system configuration file. + + @datatypes + #qurt_addr_t \n + #qurt_paddr_64_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[out] vaddr Virtual address corresponding to paddr. + @param[in] paddr_64 64-bit physical address. + @param[in] page_size Size of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Specified page is statically mapped; a virtual address is returned in vaddr. \n + #QURT_EMEM -- Specified page is not statically mapped; -1 is returned in vaddr. \n + #QURT_EVAL -- Specified page does not exist. + + @dependencies + None. + */ +int qurt_mem_map_static_query_64(qurt_addr_t *vaddr, qurt_paddr_64_t paddr_64, unsigned int page_size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + +/**@ingroup func_qurt_mem_region_query_64 + Determines whether a dynamically created memory region (Section @xref{sec:mem_region_create}) exists for the + specified virtual or physical address. When a memory region has been determined to exist, its attributes are + accessible (Section @xref{sec:mem_region_attr_get}). + + @note1hang This function returns QURT_EFATAL if #QURT_EINVALID is passed to both + vaddr and paddr (or to neither). + + @datatypes + #qurt_mem_region_t \n + #qurt_addr_t \n + #qurt_paddr_64_t + + @param[out] region_handle Pointer to the memory region object (if it exists). + @param[in] vaddr Virtual address to query; if vaddr is specified, paddr must be set to + the value #QURT_EINVALID. + @param[in] paddr_64 64-bit physical address to query; if paddr is specified, vaddr must be set to + the value #QURT_EINVALID. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Region not found for the specified address. \n + #QURT_EFATAL -- Invalid input parameters. + + @dependencies + None. + */ +int qurt_mem_region_query_64(qurt_mem_region_t *region_handle, qurt_addr_t vaddr, qurt_paddr_64_t paddr_64); + +/**@ingroup func_qurt_mapping_create_64 + @xreflabel{hdr:qurt_mapping_create_64} + Creates a memory mapping in the page table. + Not supported if called from a user process, always returns QURT_EMEM. + + @datatypes + #qurt_addr_t \n + #qurt_paddr_64_t \n + #qurt_size_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[in] vaddr Virtual address. + @param[in] paddr_64 64-bit physical address. + @param[in] size Size (4K-aligned) of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Failure. + #QURT_EINVALID -- Invalid cache attributes / permissions provided. + + @dependencies + None. 
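+
+  For illustration, creating and removing a 64-bit mapping from a root-process
+  context might look like the following sketch. The addresses are placeholders
+  and must be 4K-aligned, and the QURT_PERM_READ/QURT_PERM_WRITE permission
+  flags are assumed to come from the QuRT types header:
+
+  @code
+  qurt_addr_t va = 0x20000000;
+  qurt_paddr_64_t pa64 = 0x1C0000000ULL;
+  qurt_size_t sz = 0x1000;
+
+  if (qurt_mapping_create_64(va, pa64, sz, QURT_MEM_CACHE_WRITEBACK,
+                             QURT_PERM_READ | QURT_PERM_WRITE) == QURT_EOK) {
+      // ... use the mapping ...
+      (void)qurt_mapping_remove_64(va, pa64, sz);
+  }
+  @endcode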
+*/ +int qurt_mapping_create_64(qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size, + qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + +/**@ingroup func_qurt_mapping_remove_64 + @xreflabel{hdr:qurt_mapping_remove_64} + Deletes the specified memory mapping from the page table. + + @datatypes + #qurt_addr_t \n + #qurt_paddr_64_t \n + #qurt_size_t + + @param[in] vaddr Virtual address. + @param[in] paddr_64 64-bit physical address. + @param[in] size Size of the mapped memory page (4K-aligned). + + @return + #QURT_EOK -- Success. + #QURT_ELOCKED -- Buffer is locked. Mapping delete failed. + + @dependencies + None. + + */ +int qurt_mapping_remove_64(qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size); + +/**@ingroup func_qurt_lookup_physaddr_64 + Translates a virtual memory address to the 64-bit physical memory address it is mapped to. \n + The lookup occurs in the process of the caller. Use qurt_lookup_physaddr2() to lookup the physical + address of another process. + + @datatypes + #qurt_paddr_64_t \n + #qurt_addr_t + + @param[in] vaddr Virtual address. + + @return + Nonzero -- 64-bit physical address to which the virtual address is mapped. \n + 0 -- Virtual address has not been mapped. + + @dependencies + None. +*/ +qurt_paddr_64_t qurt_lookup_physaddr_64 (qurt_addr_t vaddr); +/** @endcond */ + +/** @cond internal_only */ +/**@ingroup func_qurt_mapping_reclaim + Deallocates all QuRT resources associated with the specified virtual + memory area, making it available for user memory management:\n + - The associated physical memory areas are freed and added to the + specified physical pool.\n + - The associated TLB entries are deleted and made available for TLB + management.\n + - The virtual memory area is not freed -- it is left in + place as allocated, but unmapped virtual memory. Access to this + memory area generates an exception.\n + + The virtual memory area must be statically allocated. + If no pool is specified, the freed physical memory is not added to any pool. + + @note1hang The virtual memory area is restricted to being filled with locked + TLB entries that are contiguous within the memory area, and contained by it. + + @datatypes + #qurt_addr_t \n + #qurt_size_t \n + #qurt_mem_pool_t + + @param[in] vaddr Virtual address of the memory area to free. + @param[in] vsize Size (in bytes) of the memory area to free. + @param[in] pool Handle to the physical pool where freed physical memory is added. + If set to 0, freed physical memory is not added to any pool. + + @return + 0 -- Success. \n + Nonzero -- Failure that indicates a partial success, or that the request was malformed. \n @note1hang The expected behavior is that + QuRT logs messages related to the failure, and callers are free to ignore the return value. + + @dependencies + None. +*/ +int qurt_mapping_reclaim(qurt_addr_t vaddr, qurt_size_t vsize, qurt_mem_pool_t pool); +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_mem_configure_cache_partition + Configures the Hexagon cache partition at the system level. + + A partition size value of #SEVEN_EIGHTHS_SIZE is applicable only to the L2 cache. + + The L1 cache partition is not supported in Hexagon processor version V60 or greater. + + @note1hang Call this operation only with QuRT OS privilege. + + @datatypes + #qurt_cache_type_t \n + #qurt_cache_partition_size_t + + @param[in] cache_type Cache type for partition configuration. 
Values: \n + - #HEXAGON_L1_I_CACHE \n + - #HEXAGON_L1_D_CACHE \n + - #HEXAGON_L2_CACHE @tablebulletend + + @param[in] partition_size Cache partition size. Values: \n + - #FULL_SIZE \n + - #HALF_SIZE \n + - #THREE_QUARTER_SIZE \n + - #SEVEN_EIGHTHS_SIZE @tablebulletend + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Error. + + @dependencies + None. + */ +int qurt_mem_configure_cache_partition(qurt_cache_type_t cache_type, qurt_cache_partition_size_t partition_size); + + +/**@ingroup func_qurt_mem_syncht + @xreflabel{hdr:qurt_mem_syncht} + Performs heavy-weight synchronization of memory transactions. + + This operation does not return until all previous memory transactions (cached and uncached load/store, + mem_locked, and so on) that originated from the current thread are complete and globally observable. + + @note1hang This operation is implemented as a wrapper for the Hexagon syncht instruction. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_syncht(void){ + #ifdef __HEXAGON_ARCH__ + __asm__ __volatile__ (" SYNCHT \n"); + #endif +} + +/**@ingroup func_qurt_mem_barrier + @xreflabel{hdr:qurt_mem_barrier} + Creates a barrier for memory transactions. + + This operation ensures that all previous memory transactions are globally observable before any + future memory transactions are globally observable. + + @note1hang This operation is implemented as a wrapper for the Hexagon barrier instruction. + @return + None + + @dependencies + None. + */ +static inline void qurt_mem_barrier(void){ + #ifdef __HEXAGON_ARCH__ + __asm__ __volatile__ (" BARRIER \n"); + #endif +} +/** @endcond */ + +/** @cond internal_only */ +/**@ingroup func_qurt_system_mem_alloc + Requests that the kernel allocates memory from the kernel-owned pool. + + @param[in] size Size in bytes (aligned to 4K) to allocate. + @param[in] align Any alignment that must be considered for the allocation. + @param[in] flags Supports the #QURT_SYSTEM_ALLOC_VIRTUAL flag; allocates + available virtual memory in the address space of all processes. + + @return + #QURT_EFATAL -- Allocation failed \n + Start address of the successful allocation. + + @dependencies + None. +*/ +unsigned qurt_system_mem_alloc(unsigned size, unsigned align, unsigned flags); +/** @endcond */ +/** @cond rest_reg_dist*/ +/**@ingroup func_qurt_lookup_physaddr2 + Translates the virtual memory address of the specified process to the 64-bit + physical memory address to which it is mapped. + + @datatypes + #qurt_addr_t \n + #qurt_paddr_64_t + + @param[in] vaddr Virtual address. + @param[in] pid PID. + + @return + Nonzero -- 64-bit physical address to which the virtual address is mapped. \n + 0 -- Virtual address is not mapped. + + @dependencies + None. +*/ +qurt_paddr_64_t qurt_lookup_physaddr2(qurt_addr_t vaddr, unsigned int pid); +/** @endcond */ + +/**@ingroup func_qurt_mapping_attr_get + Gets the mapping attributes for a given virtual address and PID + + @datatypes + #qurt_addr_t \n + #qurt_mapping_attr_t + + @param[in] vaddr virtual address for which the attributes are required. + @param[in] pid process id for the target process + @param[out] attr Pointer to the mapping attribute structure. + + @return + 0 -- Success. \n + #QURT_EINVALID -- Incorrect virtual address or pid +*/ +int qurt_mapping_attr_get(qurt_addr_t vaddr, unsigned int pid, qurt_mapping_attr_t *attr); + + +/**@ingroup func_qurt_mapping_attr_get_cache_mode + Gets the cache operation mode in the specified memory mapping attribute structure. 
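+
+  For illustration, this getter and its siblings are typically applied to an
+  attribute structure filled in by qurt_mapping_attr_get(), as in the following
+  sketch (vaddr and pid are placeholders supplied by the caller):
+
+  @code
+  void show_cache_mode(qurt_addr_t vaddr, unsigned int pid)
+  {
+      qurt_mapping_attr_t mattr;
+      qurt_mem_cache_mode_t cmode;
+
+      if (qurt_mapping_attr_get(vaddr, pid, &mattr) == 0) {
+          qurt_mapping_attr_get_cache_mode(&mattr, &cmode);
+          // cmode is the cache mode of the TLB entry covering vaddr.
+      }
+  }
+  @endcode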
+ + + @datatypes + #qurt_mapping_attr_t \n + #qurt_mem_cache_mode_t + + @param[in] attr Pointer to the memory mapping attribute structure. + @param[out] cache_mode Pointer to the destination variable for cache mode. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mapping_attr_get_cache_mode(qurt_mapping_attr_t *attr, qurt_mem_cache_mode_t *cache_mode) +{ + (*cache_mode) = attr->cache_mode; +} + +/**@ingroup func_qurt_mapping_attr_get_physaddr + Gets the physical memory address in the specified memory mapping attribute structure. + + + @datatypes + #qurt_mapping_attr_t \n + #qurt_paddr_64_t + + @param[in] attr Pointer to the memory mapping attribute structure. + @param[out] physaddr Pointer to the destination variable for physical address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mapping_attr_get_physaddr(qurt_mapping_attr_t *attr, qurt_paddr_64_t *physaddr) +{ + (*physaddr) = attr->paddr; +} + +/**@ingroup func_qurt_mapping_attr_get_perms + Gets the permissions in the specified memory mapping attribute structure. + + + @datatypes + #qurt_mapping_attr_t \n + #qurt_perm_t + + @param[in] attr Pointer to the memory mapping attribute structure. + @param[out] perms Pointer to the destination variable for permissions. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mapping_attr_get_perms(qurt_mapping_attr_t *attr, qurt_perm_t *perms) +{ + (*perms) = attr->perms; +} + +/**@ingroup func_qurt_mapping_attr_get_size + Gets the size in the specified memory mapping attribute structure.This represents size of the + TLB entry which covers the virtual address. + + + @datatypes + #qurt_mapping_attr_t \n + #unsigned int + + @param[in] attr Pointer to the memory mapping attribute structure. + @param[out] size Pointer to the destination variable for size. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_mapping_attr_get_size(qurt_mapping_attr_t *attr, unsigned int *size) +{ + (*size) = attr->size; +} + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_MEMORY_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mmap.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mmap.h new file mode 100755 index 0000000000000..c3bd875910af7 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mmap.h @@ -0,0 +1,359 @@ +#ifndef QURT_MMAP_H +#define QURT_MMAP_H +/** + @file qurt_mmap.h + @brief Prototypes of memory mapping/unmapping APIs. + The APIs allow the user to map, un-map, and change permissions + on memory regions. + + EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2018-2021, 2022, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_mem_mmap + Creates a memory mapping with the specified attributes. + This API allows the root process caller to create mapping on behalf of a user + process. If the client_handle belongs to a valid user process, the resulting + mapping is created for the process. + If -1 is passed in place of client_handle, the API creates mapping + for the underlying process of the caller. + + @note1hang If the specified attributes are not valid, an error result is returned. 
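+
+  Example -- an anonymous-allocation sketch for the caller's own process
+  (the pool handle, the 4 KB length, and the flag/permission composition are
+  illustrative assumptions only; error handling is omitted):
+  @code
+  qurt_mem_pool_t pool = 0;  // assumption: a valid pool handle obtained elsewhere
+  void *va = qurt_mem_mmap(-1, pool, NULL, NULL, 0x1000,
+                           QURT_PROT_READ | QURT_PROT_WRITE,
+                           QURT_MAP_ANON, -1, 0);
+  if (va == QURT_MAP_FAILED) {
+      // handle mapping failure
+  }
+  @endcode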
+ + @param[out] client_handle Client handle to use for this mapping (optional). + @param[in] pool Optional argument that specifies a pool handle + if the user wants to allocate memory from a specific pool. + The default value for this argument is NULL. + @param[in] pRegion Map region. This argument is unused, and the default value is NULL. + @param[in] addr Virtual memory address. + @param[in] length Size of mapping in bytes. + @param[in] prot Mapping access permissions (R/W/X). + @param[in] flags Mapping modes.\n + - #QURT_MAP_NAMED_MEMSECTION + - #QURT_MAP_FIXED \n + - #QURT_MAP_NONPROCESS_VPOOL \n + - #QURT_MAP_TRYFIXED \n + - #QURT_MAP_ANON \n + - #QURT_MAP_PHYSADDR \n + - #QURT_MAP_VA_ONLY @tablebulletend + @param[in] fd File designator. + @param[in] offset Offset in file. + + @return + Valid virtual address -- Success.\n + #QURT_MAP_FAILED -- Mapping creation failed. + */ +void *qurt_mem_mmap(int client_handle, + qurt_mem_pool_t pool, + qurt_mem_region_t *pRegion, + void *addr, + size_t length, + int prot, + int flags, + int fd, + unsigned long long offset); + +/**@ingroup func_qurt_mem_mmap2 + Creates a memory mapping with the specified attributes. Returns a more descriptive + error code in case of failure. + This API allows the root process caller to create mapping on behalf of a user + process. If the client_handle belongs to a valid user process, the resulting + mapping is created for the process. + If -1 is passed in place of client_handle, the API creates mapping + for the underlying process of the caller. + + @note1hang If the specified attributes are not valid, an error result is returned. + + @param[out] client_handle Client handle to use for this mapping (optional). + @param[in] pool Optional argument that allows the user to specify a pool handle + when the user wants to allocate memory from a specific pool. + Default value for this argument is NULL. + @param[in] pRegion Map region (unused argument); default value is NULL. + @param[in] addr Virtual memory address. + @param[in] length Size of mapping in bytes. + @param[in] prot Mapping access permissions (R/W/X). + Cache attributes, bus attributes, User mode. + @param[in] flags Mapping modes; + Shared, Private, or Anonymous. + @param[in] fd File designator. + @param[in] offset Offset in file. + + @return + Valid virtual address -- Success.\n + #QURT_EMEM -- Physical address is not available. \n + #QURT_EFAILED -- VA is not available or mapping failed.\n + #QURT_EINVALID -- Invalid argument was passed (for example, an unaligned VA/PA). + */ +void *qurt_mem_mmap2(int client_handle, + qurt_mem_pool_t pool, + qurt_mem_region_t *pRegion, + void *addr, + size_t length, + int prot, + int flags, + int fd, + unsigned long long offset); + +/**@ingroup func_qurt_mem_mmap_by_name + Creates a memory mapping for a named-memsection using the specified attributes. + The named memsection should be specified in cust_config.xml. + + @note1hang If the specified attributes are not valid or the named memsection is not found, + an error result is returned. + + @param[in] name Name of the memsection in cust_config.xml that specifies + this mapping. Should be less than 25 characters. + @param[in] addr Virtual memory address. + @param[in] length Size of mapping in bytes. + @param[in] prot Mapping access permissions (R/W/X). + Cache attributes, bus attributes, User mode + @param[in] flags Mapping modes, such as + Shared, Private, or Anonymous. + @param[in] offset Offset relative to the physical address range specified in memsection. 
+                             If offset + length exceeds the size of the memsection, failure is
+                             returned.
+   @return
+   Valid virtual address -- Success.\n
+   #QURT_MAP_FAILED -- Mapping creation failed.
+ */
+void *qurt_mem_mmap_by_name(const char* name,
+                    void *addr,
+                    size_t length,
+                    int prot,
+                    int flags,
+                    unsigned long long offset);
+
+/**@ingroup func_qurt_mem_mprotect2
+  Changes access permissions and attributes on an existing mapping based on the client_handle argument.
+
+  @note1hang If the specified virtual address is not found or invalid attributes are passed,
+             an error code is returned.
+
+  @note2 When an error is returned, it is possible that attributes/permissions were changed for part of the
+  mapping while the rest was left unchanged. Clients should not use such mappings further.
+
+  @param[in] client_handle  Obtained from the current invocation function (Section 3.4.1).
+  @param[in] addr           Virtual memory address.
+  @param[in] length         Size of mapping in bytes.
+  @param[in] prot           Mapping access permissions (R/W/X).
+                            Cache attributes, Bus attributes, User mode.
+  @return
+  #QURT_EOK -- Successfully changed permissions on the mapping.\n
+  #QURT_EFATAL -- Failed to change permissions on the mapping. \n
+  #QURT_EINVALID -- Attributes / permissions requested are invalid.
+  */
+int qurt_mem_mprotect2(int client_handle, const void *addr,
+                      size_t length,
+                      int prot);
+
+/**@ingroup func_qurt_mem_mprotect
+  Changes access permissions and attributes on an existing mapping.
+
+  @note1hang If the specified virtual address is not found or invalid attributes are passed,
+             an error code is returned.\n
+
+  @note2 When an error is returned, it is possible that attributes/permissions were changed for part of the
+  mapping while the rest was left unchanged. Clients should not use such mappings further.
+
+  @param[in] addr    Virtual memory address.
+  @param[in] length  Size of mapping in bytes.
+  @param[in] prot    Mapping access permissions (R/W/X).
+                     Cache attributes, Bus attributes, User mode.
+  @return
+  #QURT_EOK -- Successfully changed permissions on the mapping. \n
+  #QURT_EFATAL -- Failed to change permissions on the mapping. \n
+  #QURT_EINVALID -- Attributes / permissions requested are invalid.
+  */
+int qurt_mem_mprotect(const void *addr,
+                      size_t length,
+                      int prot);
+
+/**@ingroup func_qurt_mem_munmap
+  Removes an existing mapping.
+
+  @note1hang If the specified mapping is not found in the context of the caller process
+             or invalid attributes are passed, an error code is returned.
+
+  @param[in] addr    Virtual memory address.
+  @param[in] length  Size of mapping in bytes.
+
+  @return
+  #QURT_EOK -- Successfully removed the mapping. \n
+  #QURT_EFATAL -- Failed to remove the mapping. \n
+  #QURT_ELOCKED -- Buffer is locked. Mapping delete failed.
+  */
+int qurt_mem_munmap(void *addr,
+                    size_t length);
+
+/**@ingroup func_qurt_mem_munmap2
+  Removes an existing mapping for a specified process.
+
+  @note1hang This API allows a root process entity, such as a driver, to remove a mapping
+  that was created for a user process. If the specified mapping is not found in the context
+  of the client handle or invalid attributes are passed, an error code is returned.
+
+  @param[in] client_handle  Client handle of the user process that owns this mapping.
+  @param[in] addr           Virtual memory address.
+  @param[in] length         Size of mapping in bytes.
+
+  @return
+  #QURT_EOK -- Successfully removed the mapping. \n
+  #QURT_EFATAL -- Failed to remove the mapping. \n
+  #QURT_ELOCKED -- Buffer is locked. Mapping delete failed.
+  */
+int qurt_mem_munmap2(int client_handle,
+                     void *addr,
+                     size_t length);
+
+/**@ingroup func_qurt_mem_munmap3
+  Removes an existing mapping or reservation for a specified process.
+
+  @param[in] client_handle  Client handle of the user process that owns this mapping.
+  @param[in] addr           Pointer to a virtual memory address.
+  @param[in] length         Size of mapping in bytes.
+  @param[in] flags          Specifies the flag.
+
+  @return
+  #QURT_EOK -- Successfully removed the mapping or reservation. \n
+  #QURT_EFATAL -- Failed to remove the mapping or reservation. \n
+  #QURT_ELOCKED -- Buffer is locked. Mapping delete failed.
+  */
+int qurt_mem_munmap3(int client_handle,
+                     void *addr,
+                     size_t length,
+                     int flags);
+
+/*
+|| The macros here follow the style of the standard mmap() macros, but with
+|| QURT_ prepended to avoid name conflicts, and to avoid having a dependency
+|| on sys/mman.h.
+||
+|| Wherever possible, any values here that are also present in sys/mman.h
+|| should have the same value in both places so that we can accept "mmap"
+|| calls without having to remap parameters to new values.
+||
+|| In the future, it would be desirable to have a regression test that
+|| checks, for instance, that these macros match. Example:
+||
+||    assert(QURT_MAP_FAILED == MAP_FAILED);
+||    ... repeat as needed ...
+*/
+
+/** @addtogroup memory_mapping_macros
+@{ */
+/** @cond */
+#define QURT_PROT_NONE       0x00U   /**< */
+#define QURT_PROT_READ       0x01U   /**< */
+#define QURT_PROT_WRITE      0x02U   /**< */
+#define QURT_PROT_EXEC       0x04U   /**< */
+#define QURT_PROT_NODUMP     0x08U   /**< Skip dumping the mapping. During PD dump, must skip
+                                          some mappings on host memory to avoid a race condition
+                                          where the memory is removed from the host and the DSP process
+                                          crashes before the mapping is removed.*/
+#define QURT_PROT_ISLAND     0x10U   /**< Island mapping. */
+
+#define QURT_MAP_SHARED      0x0001U /**< Shared. */
+#define QURT_MAP_PRIVATE     0x0002U /**< Private. */
+/** @endcond */
+#define QURT_MAP_NAMED_MEMSECTION 0x0004U /**< Named memsection. */
+#define QURT_MAP_FIXED       0x0010U /**< Fixed virtual address. */
+#define QURT_MAP_RENAME      0x0020U /**< Rename. */
+#define QURT_MAP_NORESERVE   0x0040U /**< No reserve. */
+#define QURT_MAP_INHERIT     0x0080U /**< Inherit. */
+#define QURT_MAP_NONPROCESS_VPOOL 0x0100U /**< Use a virtual address outside of the default range of the
+                                               processes. This option is only supported in the root process
+                                               and only when virtual memory split is enabled in the XML.
+                                               The root process can use this flag to create mapping for a
+                                               user process, for example, if the virtual address is configured
+                                               for a 3G/1G split, the root process can use this flag to create
+                                               mapping in the top 1 GB area for the user process or the
+                                               lower 3 GB area for the root process. This is useful for
+                                               shared buffer use cases. */
+#define QURT_MAP_HASSEMAPHORE 0x0200U /**< Has semaphore. */
+#define QURT_MAP_TRYFIXED    0x0400U /**< Try to create a mapping for a virtual address that was passed.
+                                          If the passed virtual address fails, use a random virtual address. */
+#define QURT_MAP_WIRED       0x0800U /**< Wired. */
+#define QURT_MAP_FILE        0x0000U /**< File. */
+#define QURT_MAP_ANON        0x1000U /**< Allocate physical memory from the pool that was passed.
+                                          By default, memory is allocated from the default physpool. */
+#define QURT_MAP_VA_ONLY     0x2000U /**< Reserve a virtual address without
+                                          mapping it.
*/ + +/** @cond */ +#define QURT_MAP_ALIGNED(n) ((n) << QURT_MAP_ALIGNMENT_SHIFT) +#define QURT_MAP_ALIGNMENT_SHIFT 24 + + +#define QURT_MAP_ALIGNMENT_MASK QURT_MAP_ALIGNED(0xff) /**< */ +#define QURT_MAP_ALIGNMENT_64KB QURT_MAP_ALIGNED(16) /**< */ +#define QURT_MAP_ALIGNMENT_16MB QURT_MAP_ALIGNED(24) /**< */ +#define QURT_MAP_ALIGNMENT_4GB QURT_MAP_ALIGNED(32) /**< */ +#define QURT_MAP_ALIGNMENT_1TB QURT_MAP_ALIGNED(40) /**< */ +#define QURT_MAP_ALIGNMENT_256TB QURT_MAP_ALIGNED(48) /**< */ +#define QURT_MAP_ALIGNMENT_64PB QURT_MAP_ALIGNED(56) /**< */ +/** @endcond */ +#define QURT_MAP_FAILED ((void *) -1) /**< Mapping creation failed. */ + +/* +|| The macros below are extensions beyond the standard mmap flags, but follow +|| the style of the mmap flags. +*/ +/** @cond */ +// Describe bitfields in (prot) +#define QURT_PROT_CACHE_BOUNDS 16U,19U,7U /**< Bits 16 through 19 are cache attribute, default is 0. */ +#define QURT_PROT_BUS_BOUNDS 20U,21U,0U /**< Bits 20 through 21 are bus attributes, default is 0. */ +#define QURT_PROT_USER_BOUNDS 22U,23U,3U /**< Bits 22 through 23 are user mode, default is 3; + default of 3 means to derive user mode setting from the + default mode of the client. */ + +// Describe bitfields in (flags) +#define QURT_MAP_PHYSADDR_BOUNDS 15U,15U,0U /**< Bits 15 through 15 are physaddr, default is 0. */ +#define QURT_MAP_TYPE_BOUNDS 16U,19U,0U /**< Bits 16 through 19 are mapping type, default is 0. */ +#define QURT_MAP_REGION_BOUNDS 20U,23U,0U /**< Bits 20 through 23 are region type, default is 0. */ +/** @endcond */ + +// These macros get OR'ed into (prot) +#define QURT_PROT_CACHE_MODE(n) QURT_MMAP_BUILD(QURT_PROT_CACHE_BOUNDS,(n)) /**< */ +#define QURT_PROT_BUS_ATTR(n) QURT_MMAP_BUILD(QURT_PROT_BUS_BOUNDS,(n)) /**< */ +#define QURT_PROT_USER_MODE(n) QURT_MMAP_BUILD(QURT_PROT_USER_BOUNDS,(n)) /**< */ +// These macros get OR'ed into (flags) + +#define QURT_MAP_PHYSADDR QURT_MMAP_BUILD(QURT_MAP_PHYSADDR_BOUNDS,1U) /**< Use the physical address that was passed in offset field. + This is allowed only for root process. */ +#define QURT_MAP_TYPE(n) QURT_MMAP_BUILD(QURT_MAP_TYPE_BOUNDS,(n)) /**< */ +#define QURT_MAP_REGION(n) QURT_MMAP_BUILD(QURT_MAP_REGION_BOUNDS,(n)) /**< */ +/** @} */ /* end_addtogroup memory_mapping_macros */ +/** @cond */ +// These macros extract fields from (prot) +#define QURT_PROT_GET_CACHE_MODE(n) QURT_MMAP_EXTRACT(QURT_PROT_CACHE_BOUNDS,(n)) /**< */ +#define QURT_PROT_GET_BUS_ATTR(n) QURT_MMAP_EXTRACT(QURT_PROT_BUS_BOUNDS,(n)) /**< */ +#define QURT_PROT_GET_USER_MODE(n) QURT_MMAP_EXTRACT(QURT_PROT_USER_BOUNDS,(n)) /**< */ + +// These macros extract fields from (flags) +#define QURT_MAP_GET_TYPE(n) QURT_MMAP_EXTRACT(QURT_MAP_TYPE_BOUNDS,(n)) /**< */ +#define QURT_MAP_GET_REGION(n) QURT_MMAP_EXTRACT(QURT_MAP_REGION_BOUNDS,(n)) /**< */ + +// Macros for bitfield insertion and extraction +#define QURT_MMAP_MASK(lo,hi) (~((~0u) << ((hi)-(lo)+1U))) /**< Mask of same size as [lo..hi]. 
 */
+#define QURT_MMAP_BUILD_(lo,hi,def,n)   ((((n)^(def))&QURT_MMAP_MASK((lo),(hi)))<<(lo)) /**< */
+#define QURT_MMAP_EXTRACT_(lo,hi,def,n) ((((n)>>(lo))&QURT_MMAP_MASK((lo),(hi)))^(def)) /**< */
+#define QURT_MMAP_BUILD(a,b)            QURT_MMAP_BUILD_(a,b) /**< */
+#define QURT_MMAP_EXTRACT(a,b)          QURT_MMAP_EXTRACT_(a,b) /**< */
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mq.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mq.h
new file mode 100755
index 0000000000000..580c83d3de41a
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mq.h
@@ -0,0 +1,458 @@
+#ifndef QURT_MQ_H
+#define QURT_MQ_H
+/**
+  @file qurt_mq.h
+
+  @brief  Prototypes of secure message queues API functions.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2019-2023 by Qualcomm Technologies, Inc.  All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+======================================================================*/
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        CONSTANTS AND MACROS
+=============================================================================*/
+#define QURT_MQ_NAME_MAXLEN  16U  /**< Maximum name length. */
+
+
+/*=============================================================================
+                        FORWARD DECLARATIONS & TYPEDEFS
+=============================================================================*/
+/* This enum must be generated in accordance with the process class numbers.
+   For now it is made to match the generated version; do not change this unless
+   there is a corresponding change in process_class.py. Indices start from 0,
+   basically: QURT_MQ_SECURITY_SCOPE_ = (1 << QURTK_process_class_index_)
+*/
+typedef enum {
+    QURT_MQ_SECURITY_SCOPE_KERNEL       = ( 1U << 0 ),
+    QURT_MQ_SECURITY_SCOPE_SRM          = ( 1U << 1 ),
+    QURT_MQ_SECURITY_SCOPE_SECURE       = ( 1U << 2 ),
+    QURT_MQ_SECURITY_SCOPE_CPZ          = ( 1U << 3 ),
+    QURT_MQ_SECURITY_SCOPE_ROOT         = ( 1U << 4 ),
+    QURT_MQ_SECURITY_SCOPE_SIGNED       = ( 1U << 5 ),
+    QURT_MQ_SECURITY_SCOPE_UNSIGNED     = ( 1U << 6 ),
+    QURT_MQ_SECURITY_SCOPE_SECURE_ROOT  = ( 1U << 7 )
+} qurt_mq_security_scope_t;
+
+typedef enum {
+    QURT_MQ_CARDINALITY_PTP = (1U << 0),
+    QURT_MQ_CARDINALITY_MTO = (1U << 1)
+}qurt_mq_cardinality_t;
+
+typedef unsigned int qurt_mqd_t;
+
+typedef union{
+    struct {
+        unsigned int perms:2;
+        unsigned int cardinality:1;
+        unsigned int blocking:1;
+
+        qurt_mq_security_scope_t creator_scope: 8;
+        qurt_mq_security_scope_t allowed_scope: 8; // can be a bitmask in the MTO case
+        unsigned int queue_closed: 1;
+        unsigned int reserved: 11;
+    }; /* anonymous struct */
+    unsigned int raw;
+} qurt_mq_flags_t;
+
+
+/* Permissions are from qurt_types.h, though X (execute) is blocked. */
+#if 0
+/** Memory access permission. */
+typedef enum {
+        QURT_PERM_READ=0x1U, /**< */
+        QURT_PERM_WRITE=0x2U, /**< */
+        QURT_PERM_EXECUTE=0x4U, /**< */
+        QURT_PERM_FULL=QURT_PERM_READ|QURT_PERM_WRITE|QURT_PERM_EXECUTE, /**< */
+} qurt_perm_t;
+#endif
+
+struct qurt_mq_attr {
+    unsigned flags;      /**< Configured flags. Only meaningful with get_attr(), only used for qurt_mq_flags_t.perms. */
+    unsigned mq_maxmsg;  /**< Maximum number of messages. Used with create() and get_attr.
 */
+    unsigned short  mq_send_msgsize;   /**< Maximum size (bytes) of message in receiver facing queue,
+                                            from sender to receiver. */
+    unsigned short  mq_recv_msgsize;   /**< Maximum size (bytes) of message in sender facing queue,
+                                            from receiver to sender. */
+    unsigned client_pid;               /**< Process ID of client that is allowed to open the message queue
+                                            that was created using qurt_mq_create(). */
+    qurt_mq_cardinality_t cardinality; /**< Cardinality of message queue connection, see below. */
+    qurt_mq_security_scope_t scope;    /**< Security scope of the senders to the queue. */
+};
+
+
+/*=============================================================================
+                        EXTERNS & FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_mq_attr_init
+  Initializes attributes to the default values used for creating the queue.
+
+  The initialize operation sets the following default attribute values: \n
+  - flags - QURT_PERM_READ | QURT_PERM_WRITE \n
+  - maxmsg - 1 \n
+  - mq_send_msgsize - 8 \n
+  - mq_recv_msgsize - 8 \n
+  - client_pid - -1 \n
+  - cardinality - QURT_MQ_CARDINALITY_PTP \n
+  - scope - QURT_MQ_SECURITY_SCOPE_SIGNED \n
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr  Pointer to the message queue attribute structure to initialize.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_init(struct qurt_mq_attr * attr);
+
+/**@ingroup qurt_mq_attr_set_send_msgsize
+  Sets the maximum message size, in bytes, that the sender can send.
+  The maximum message length is configurable using the XML configuration; however, it is limited to a maximum of 62 bytes.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr  Pointer to the message queue object.
+  @param[in]     len   Length of message in bytes.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_send_msgsize (struct qurt_mq_attr *attr, size_t len);
+
+/**@ingroup qurt_mq_attr_set_recv_msgsize
+  Sets the maximum message size, in bytes, that the receiver can read.
+  The maximum message length is configurable using the XML configuration; however, it is limited to a maximum of 62 bytes.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr  Pointer to the message queue object.
+  @param[in]     len   Length of message in bytes.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_recv_msgsize (struct qurt_mq_attr *attr, size_t len);
+
+/**@ingroup qurt_mq_attr_set_maxmsg
+  Sets the maximum number of messages that can be queued in the message queue.
+  The message depth is configurable using the XML configuration.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr   Pointer to the message queue object.
+  @param[in]     depth  Maximum number of messages that can be queued.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_maxmsg (struct qurt_mq_attr *attr, unsigned int depth);
+
+/**@ingroup qurt_mq_attr_set_scope
+  Sets the scope of the message queue. A message queue created with a security
+  scope allows only a process class of that scope to open the message queue.
+
+  @datatypes
+  #qurt_mq_attr \n
+  #qurt_mq_security_scope_t
+
+  @param[in,out] attr   Pointer to the message queue object.
+  @param[in]     scope  Scope of the message queue: \n
+         #QURT_MQ_SECURITY_SCOPE_KERNEL \n
+         #QURT_MQ_SECURITY_SCOPE_SRM \n
+         #QURT_MQ_SECURITY_SCOPE_SECURE \n
+         #QURT_MQ_SECURITY_SCOPE_CPZ \n
+         #QURT_MQ_SECURITY_SCOPE_ROOT \n
+         #QURT_MQ_SECURITY_SCOPE_SIGNED \n
+         #QURT_MQ_SECURITY_SCOPE_UNSIGNED
+
+  @return
+  None.
+
+  @dependencies
+  None.
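+
+  Example -- a minimal attribute-configuration sketch using the setters above
+  (the sizes, depth, and scope values are illustrative assumptions only):
+  @code
+  struct qurt_mq_attr attr;
+  qurt_mq_attr_init(&attr);
+  qurt_mq_attr_set_send_msgsize(&attr, 32);
+  qurt_mq_attr_set_recv_msgsize(&attr, 32);
+  qurt_mq_attr_set_maxmsg(&attr, 4);
+  qurt_mq_attr_set_scope(&attr, QURT_MQ_SECURITY_SCOPE_SIGNED);
+  @endcode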
+*/
+void qurt_mq_attr_set_scope (struct qurt_mq_attr *attr, qurt_mq_security_scope_t scope);
+
+
+/**@ingroup qurt_mq_attr_set_client_pid
+  Sets the client PID that is allowed to open this message queue.
+  If client_pid is set, the allowed scope is not considered when opening the MQ.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr        Pointer to the message queue object.
+  @param[in]     client_pid  Valid PID for the client process.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_client_pid (struct qurt_mq_attr *attr, unsigned client_pid);
+
+/**@ingroup qurt_mq_attr_set_flags
+  Sets the properties of the message queue.
+  The current implementation only uses the flag attribute to set the permissions for the message queue.
+  The default is #QURT_PERM_READ | #QURT_PERM_WRITE; explicit permissions are not implemented.
+
+  @datatypes
+  #qurt_mq_attr
+
+  @param[in,out] attr   Pointer to the message queue object.
+  @param[in]     flags  Permissions for the message queue.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_mq_attr_set_flags (struct qurt_mq_attr *attr, unsigned int flags);
+
+/**@ingroup qurt_mq_create
+  Creates a message queue with the provided name and attributes.
+  The calling process becomes the owner of the queue.
+  The name of the message queue is limited to 16 characters, including the NULL terminator.
+
+  @datatypes
+  #qurt_mq_attr \n
+  #qurt_mqd_t
+
+  @param[out] mqd   Returns a pointer to the message queue identifier if
+                    the message queue was successfully created.
+  @param[in]  name  String identifier of the message queue.
+  @param[in]  attr  Pointer to the initialized message queue attribute
+                    structure that specifies the attributes of the created message queue.
+
+  @return
+  #QURT_EOK -- Message queue created. \n
+  #QURT_EINVALID -- Invalid arguments. \n
+  #QURT_ENOSPC -- Maximum number of queues in the system was exceeded.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_create(qurt_mqd_t *mqd, const char *name, struct qurt_mq_attr * attr);
+
+/**@ingroup qurt_mq_open
+  Opens a message queue connection between a process and a created message queue.
+
+  @datatypes
+  #qurt_mq_attr \n
+  #qurt_mqd_t
+
+  @param[out] mqd    Returns a pointer to the message queue
+                     identifier if the message queue was successfully opened.
+  @param[in]  name   String identifier of the message queue.
+  @param[in]  flags  Flags that contain the properties that define the behavior of the message queue connection.
+                     Permissions:\n
+                     #QURT_PERM_READ \n
+                     #QURT_PERM_WRITE \n
+                     #QURT_PERM_READ | QURT_PERM_WRITE @tablebulletend
+                     Default is QURT_PERM_READ | QURT_PERM_WRITE; explicit permissions are not implemented. \n
+                     Cardinality: \n
+                     #QURT_MQ_CARDINALITY_PTP (default) \n
+                     #QURT_MQ_CARDINALITY_MTO (not implemented) \n
+                     Blocking: suspend the thread until the message queue with the specified name is created. \n
+                     Scope: security boundary to which the message queue and its users are constrained.
+                     It is coupled with the process privilege level/scope.\n
+                     #QURT_MQ_SECURITY_SCOPE_KERNEL \n
+                     #QURT_MQ_SECURITY_SCOPE_SRM \n
+                     #QURT_MQ_SECURITY_SCOPE_SECURE \n
+                     #QURT_MQ_SECURITY_SCOPE_CPZ \n
+                     #QURT_MQ_SECURITY_SCOPE_ROOT \n
+                     #QURT_MQ_SECURITY_SCOPE_SIGNED \n
+                     #QURT_MQ_SECURITY_SCOPE_UNSIGNED @tablebulletend
+
+  @return
+  #QURT_EOK -- Message queue connection successfully opened. \n
+  #QURT_EFAILED -- Message queue connection failed, if the message queue is non-blocking. \n
+  #QURT_ENOTALLOWED -- Open failed due to security scope mismatch.
+
+  @dependencies
+  None.
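+
+  Example -- a minimal open sketch (the queue name and flag-field values are
+  illustrative assumptions based on the qurt_mq_flags_t layout above; error
+  handling is omitted):
+  @code
+  qurt_mqd_t mqd;
+  qurt_mq_flags_t flags;
+  flags.raw = 0;
+  flags.perms = QURT_PERM_READ | QURT_PERM_WRITE;
+  flags.cardinality = QURT_MQ_CARDINALITY_PTP;
+  int ret = qurt_mq_open(&mqd, "my_mq", flags);
+  @endcode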
+*/
+int qurt_mq_open (qurt_mqd_t *mqd, const char *name, qurt_mq_flags_t flags);
+
+/**@ingroup qurt_mq_send
+  Sends a message over a message queue.\n
+  - If the message queue is full, the calling thread shall be
+    suspended until space becomes available to enqueue the message. \n
+  - If there exists a thread suspended on an empty queue
+    to receive a message, qurt_mq_send shall resume that thread.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd      Message queue identifier.
+  @param[in] msg_ptr  Pointer to the message buffer.
+  @param[in] msg_len  Length of the message buffer in bytes.
+
+  @return
+  #QURT_EOK -- Message queue send was successful.\n
+  #QURT_EMSGSIZE -- Message size in the msg_len field is greater than the max_message_len specified during queue creation.\n
+  #QURT_ENOTALLOWED -- Send failed due to security scope mismatch.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_send(qurt_mqd_t mqd, const char *msg_ptr, size_t msg_len);
+
+/**@ingroup qurt_mq_send_timed
+  Sends a message over a message queue.\n
+  - If the message queue is full, the calling thread shall be
+    suspended until space becomes available to enqueue the message or until the timeout is reached. \n
+  - If there exists a thread suspended on an empty queue
+    to receive a message, qurt_mq_send_timed shall resume that thread.\n
+  - If the timeout is reached, qurt_mq_send_timed shall return #QURT_ETIMEDOUT.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd       Message queue identifier.
+  @param[in] msg_ptr   Pointer to the message buffer.
+  @param[in] duration  Interval (in microseconds); the duration value must be
+                       between #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+  @param[in] msg_len   Length of the message buffer in bytes.
+
+  @return
+  #QURT_EOK -- Message queue send was successful. \n
+  #QURT_EMSGSIZE -- Message size in the msg_len field is greater than the max_message_len specified during queue creation.\n
+  #QURT_ENOTALLOWED -- Send failed due to security scope mismatch. \n
+  #QURT_ETIMEDOUT -- Timeout.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_send_timed(qurt_mqd_t mqd, const char *msg_ptr, unsigned long long int duration, size_t msg_len);
+
+ /**@ingroup qurt_mq_recv
+  Receives a message from the message queue. \n
+  - If the message queue is empty, the calling thread shall be
+    suspended until a message is enqueued in the message queue. \n
+  - If there exists a thread suspended on a full queue to
+    send a message, qurt_mq_recv shall resume that thread.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in]     mqd      Message queue identifier.
+  @param[out]    msg_ptr  Pointer to the message buffer.
+  @param[in,out] msg_len  Pointer to the length of the message buffer.
+
+  @return
+  #QURT_EOK -- Message received.\n
+  #QURT_EINVALID -- Message pointer or msg_len pointer is NULL. \n
+  #QURT_EBADR -- Message queue descriptor (mqd) is invalid. \n
+  #QURT_EBADF -- Sender closed the message queue.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_recv(qurt_mqd_t mqd, unsigned char *msg_ptr, size_t *msg_len);
+
+ /**@ingroup qurt_mq_recv_timed
+  Receives a message from the message queue. \n
+  - If the message queue is empty, the calling thread shall be
+    suspended until a message is enqueued in the message queue or until the timeout is reached.\n
+  - If there exists a thread suspended on a full queue to
+    send a message, qurt_mq_recv_timed shall resume that thread.\n
+  - If the timeout is reached, qurt_mq_recv_timed shall return #QURT_ETIMEDOUT.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in]     mqd       Message queue identifier.
+  @param[out]    msg_ptr   Pointer to the message buffer.
+  @param[in]     duration  Interval (in microseconds); the duration value must be
+                           between #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+  @param[in,out] msg_len   Pointer to the length of the message buffer.
+
+  @return
+  #QURT_EOK -- Message received.\n
+  #QURT_EINVALID -- Message pointer or msg_len pointer is NULL. \n
+  #QURT_EBADR -- Message queue descriptor (mqd) is invalid.\n
+  #QURT_EBADF -- Sender closed the message queue. \n
+  #QURT_ETIMEDOUT -- Timeout.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_recv_timed(qurt_mqd_t mqd, unsigned char *msg_ptr, unsigned long long int duration, size_t *msg_len);
+
+ /**@ingroup qurt_mq_close
+  Closes the message queue and disassociates the calling process (client) from the message queue
+  under this descriptor. Marks the queue as closed for the receiver.
+  This function is expected to be called from the client side. If called
+  from the server side, the function reduces to a no-op and returns success.
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd  Message queue identifier.
+
+  @return
+  #QURT_EOK -- Message queue closed successfully.\n
+  #QURT_EBADR -- Invalid descriptor.\n
+  #QURT_ENOTALLOWED -- Message queue close was not called from the client side.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_close(qurt_mqd_t mqd);
+
+ /**@ingroup qurt_mq_destroy
+  Destroys the message queue. This function must be
+  called from the process that called qurt_mq_create().
+
+  @datatypes
+  #qurt_mqd_t
+
+  @param[in] mqd  Message queue identifier.
+
+  @return
+  #QURT_EOK -- Message queue destroyed successfully.\n
+  #QURT_EBADR -- Invalid descriptor.\n
+  #QURT_ENOTALLOWED -- Message queue destroy was not called from the process that created the queue.
+
+  @dependencies
+  None.
+*/
+int qurt_mq_destroy(qurt_mqd_t mqd);
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+#endif //QURT_MQ_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mutex.h
new file mode 100755
index 0000000000000..4ad6b270cdde6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_mutex.h
@@ -0,0 +1,211 @@
+#ifndef QURT_MUTEX_H
+#define QURT_MUTEX_H
+/**
+  @file qurt_mutex.h
+  @brief  Prototypes of mutex API.
+  This is mostly a user space mutex, but calls the
+  kernel to block if the mutex is taken.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup mutex_types
+@{ */
+/*=============================================================================
+                        TYPEDEFS
+=============================================================================*/
+
+/** QuRT mutex type.
+
+   Both non-recursive mutex lock and unlock, and recursive
+   mutex lock and unlock can be applied to this type.
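+
+   Example -- a typical lifecycle sketch (illustrative only):
+   @code
+   qurt_mutex_t m;
+   qurt_mutex_init(&m);
+   qurt_mutex_lock(&m);
+   // ... access the shared resource ...
+   qurt_mutex_unlock(&m);
+   qurt_mutex_destroy(&m);
+   @endcode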
+ */ +typedef union qurt_mutex_aligned8{ + /** @cond */ + struct { + unsigned int holder; + unsigned int count; + unsigned int queue; + unsigned int wait_count; + }; + unsigned long long int raw; + /** @endcond */ +} qurt_mutex_t; +/** @} */ /* end_addtogroup mutex_types */ +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/* @addtogroup mutex_const_macros +@{ */ +#define MUTEX_MAGIC 0xfe /**< */ +#define QURTK_FUTEX_FREE_MAGIC 0x1F // 11111 /**< */ +#define QURT_MUTEX_INIT {{MUTEX_MAGIC, 0, QURTK_FUTEX_FREE_MAGIC,0}} /**< Suitable as an initializer for a + variable of type qurt_mutex_t. */ +/* @} */ /* end_addtogroup mutex_const_macros */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_mutex_init + Initializes a mutex object. + The mutex is initially unlocked. + + @note1hang Each mutex-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_mutex_destroy() + when this object is not used anymore + @datatypes + #qurt_mutex_t + + @param[out] lock Pointer to the mutex object. Returns the initialized object. + + @return + None. + + @dependencies + None. + + */ +void qurt_mutex_init(qurt_mutex_t *lock); + +/**@ingroup func_qurt_mutex_destroy + Destroys the specified mutex. + + @note1hang Mutexes must be destroyed when they are no longer in use. Failure to do this + causes resource leaks in the QuRT kernel.\n + @note1cont Mutexes must not be destroyed while they are still in use. If this occurs, the + behavior of QuRT is undefined. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object to destroy. + + @return + None. + + @dependencies + None. + + */ +void qurt_mutex_destroy(qurt_mutex_t *lock); + +/**@ingroup func_qurt_mutex_lock + Locks the specified mutex. + If a thread performs a lock operation on a mutex that is not in use, the thread gains + access to the shared resource that is protected by the mutex, and continues executing. + + If a thread performs a lock operation on a mutex that is already in use by another + thread, the thread is suspended. When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared + resource. + + @note1hang A thread is suspended indefinitely if it locks a mutex that it has already + locked. Avoid this by using recursive mutexes (Section @xref{dox:recursive_mutexes}). + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object. Specifies the mutex to lock. + + @return + None. + + @dependencies + None. + */ +void qurt_mutex_lock(qurt_mutex_t *lock); /* blocking */ + +/**@ingroup func_qurt_mutex_lock_timed + Locks the specified mutex. + When a thread performs a lock operation on a mutex that is not in use, the thread gains + access to the shared resource that is protected by the mutex, and continues executing. + + When a thread performs a lock operation on a mutex that is already in use by another + thread, the thread is suspended. When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared + resource. If the duration of suspension exceeds the timeout duration, wait is + terminated and no access to mutex is granted. 
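+
+   Example -- a bounded-wait sketch (the 1000-microsecond duration is an
+   illustrative assumption; it must lie between #QURT_TIMER_MIN_DURATION and
+   #QURT_TIMER_MAX_DURATION):
+   @code
+   if (qurt_mutex_lock_timed(&m, 1000uLL) == QURT_EOK) {
+       // ... critical section ...
+       qurt_mutex_unlock(&m);
+   }
+   @endcode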
+ + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object; specifies the mutex to lock. + @param[in] duration Interval (in microseconds) that the duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION + + @return + #QURT_EOK -- Success \n + #QURT_ETIMEDOUT -- Timeout + + @dependencies + None. + */ +int qurt_mutex_lock_timed (qurt_mutex_t * lock, unsigned long long int duration); + +/**@ingroup func_qurt_mutex_unlock + Unlocks the specified mutex. \n + More than one thread can be suspended on a mutex. When the mutex is unlocked, only the + highest-priority thread waiting on the mutex is awakened. If the awakened thread has + higher priority than the current thread, a context switch occurs. + + @note1hang The behavior of QuRT is undefined if a thread unlocks a mutex it did not first + lock. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object. Specifies the mutex to unlock. + + @return + None. + + @dependencies + None. + */ +void qurt_mutex_unlock(qurt_mutex_t *lock); /* unlock */ + +/**@ingroup func_qurt_mutex_try_lock + @xreflabel{hdr:qurt_mutex_try_lock} + Attempts to lock the specified mutex. + If a thread performs a try_lock operation on a mutex that is not in use, the thread gains + access to the shared resource that is protected by the mutex, and continues executing. + + @note1hang If a thread performs a try_lock operation on a mutex that it has already locked + or is in use by another thread, qurt_mutex_try_lock immediately returns with a + nonzero result value. + + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object. Specifies the mutex to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + @dependencies + None. + */ +int qurt_mutex_try_lock(qurt_mutex_t *lock); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_MUTEX_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_os_services.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_os_services.h new file mode 100755 index 0000000000000..cbc4c239e9620 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_os_services.h @@ -0,0 +1,24 @@ +/*============================================================================= + + qurt_os_services.c + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved. 
+=============================================================================*/
+
+#define QURT_OS_SERVICE_THREAD     "/os/thread"      /**< Thread service */
+#define QURT_OS_SERVICE_FS_HUB     "/os/fs_hub"      /**< File-system hub service */
+#define QURT_OS_SERVICE_CALLBACK   "/os/callback"    /**< QDI callback service */
+#define QURT_OS_SERVICE_INTERRUPTS "/os/interrupt"   /**< Interrupt service */
+#define QURT_OS_SERVICE_PROXY      "/os/proxy"       /**< QDI proxy service */
+#define QURT_OS_SERVICE_MEMORY     "/os/memory"      /**< Memory management service */
+#define QURT_OS_SERVICE_MEMPOOL    "/os/mempool"     /**< Pool management service */
+#define QURT_OS_SERVICE_PROCESS    "/os/process"     /**< Process management service */
+#define QURT_OS_SERVICE_MMAP       "/os/mem_mapper"  /**< mmapper service */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pimutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pimutex.h
new file mode 100755
index 0000000000000..61aee5cba7ce8
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pimutex.h
@@ -0,0 +1,200 @@
+#ifndef QURT_PIMUTEX_H
+#define QURT_PIMUTEX_H 1
+/**
+  @file qurt_pimutex.h
+  @brief  Prototypes of qurt_pimutex API.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_pimutex_init
+  Initializes a priority inheritance mutex object.
+  The priority inheritance mutex is initially unlocked.
+
+  This function works the same as qurt_mutex_init().
+
+  @note1hang Each pimutex-based object has one or more kernel resources associated with it;
+             to prevent resource leaks, call qurt_pimutex_destroy()
+             when this object is no longer in use.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[out] lock  Pointer to the priority inheritance mutex object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_destroy
+  Destroys the specified priority inheritance mutex.
+
+  @note1hang Priority inheritance mutexes must be destroyed when they are no longer in
+             use. Failure to do this causes resource leaks in the QuRT kernel.\n
+  @note1cont Priority inheritance mutexes must not be destroyed while they are still in use.
+             If this occurs, the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock  Pointer to the priority inheritance mutex object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_lock
+  Requests access to a shared resource. If a thread performs a lock operation on a mutex
+  that is not in use, the thread gains access to the shared resource that the mutex protects,
+  and continues executing.
+
+  If a thread performs a lock operation on a mutex that is already in use by another
+  thread, the thread is suspended. When the mutex becomes available again (because the
+  other thread has unlocked it), the thread is awakened and given access to the shared resource.
+
+  If a thread is suspended on a priority inheritance mutex, and the priority of the suspended
+  thread is higher than the priority of the thread that has locked the mutex, the thread
+  with the mutex acquires the higher priority of the suspended thread. The locker thread blocks
+  until the lock is available.
+
+  @note1hang A thread is not suspended if it locks a priority inheritance mutex that it has
+             already locked. However, the mutex does not become available to other
+             threads until the thread performs a balanced number of unlocks on the mutex.\n
+  @note1cont When multiple threads compete for a mutex, the lock operation for a priority
+             inheritance mutex is slower than it is for a recursive mutex.
+             In particular, it is about 10 times slower when the mutex is available for locking,
+             and slower (with greatly varying times) when the mutex is already locked.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock  Pointer to the priority inheritance mutex object to lock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex_lock(qurt_mutex_t *lock);
+
+
+/**@ingroup func_qurt_pimutex_lock_timed
+  Locks a priority inheritance mutex with a timeout.
+
+  A thread can lock a priority inheritance mutex multiple times. The mutex is not
+  available to other threads until the thread performs the same number of mutex unlock
+  operations.
+
+  If a thread performs a lock operation on a mutex that is already locked by another thread,
+  the thread is moved to the waiting state. When the mutex becomes available again (because the
+  other thread has unlocked the mutex), the thread is awakened and tries to lock the mutex.
+
+  If a thread is waiting on a priority inheritance mutex, and the priority of the waiting thread
+  is higher than the priority of the thread that has locked the mutex, the priority of the thread
+  that has locked the mutex is raised to the priority of the waiting thread.
+
+  If the duration of waiting exceeds the timeout duration, the waiting is terminated, and
+  the function returns QURT_ETIMEDOUT as a failure of the mutex lock.
+
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock      Pointer to the mutex object to lock.
+  @param[in] duration  Duration (in microseconds) to wait. The duration value must be between
+                       #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_ETIMEDOUT -- Timeout \n
+  #QURT_EINVALID -- Duration is out of range
+
+  @dependencies
+  None.
+
+ */
+int qurt_pimutex_lock_timed(qurt_mutex_t *lock, unsigned long long int duration);
+
+
+/**@ingroup func_qurt_pimutex_unlock
+  Releases access to a shared resource; unlocks the specified priority inheritance mutex. \n
+  More than one thread can be suspended on a priority inheritance mutex. When the mutex
+  is unlocked, only the highest-priority thread waiting on the mutex is awakened. If the
+  awakened thread has higher priority than the current thread, a context switch occurs.
+
+  When a thread unlocks a priority inheritance mutex, its thread priority is restored to its
+  original value from any higher priority value that it acquired from another thread
+  suspended on the mutex.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock  Pointer to the priority inheritance mutex object to unlock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_pimutex_unlock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_try_lock
+  Requests access to a shared resource (without suspend).
Attempts to lock the specified priority inheritance mutex.\n + If a thread performs a try_lock operation on a priority inheritance mutex that is not in + use, the thread gains access to the shared resource that is protected by the mutex, and + continues executing. + If a thread performs a try_lock operation on a priority inheritance mutex that is already + in use by another thread, qurt_pimutex_try_lock immediately returns with a + nonzero result value. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the priority inheritance mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + @dependencies + None. + */ +int qurt_pimutex_try_lock(qurt_mutex_t *lock); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_PIMUTEX_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pimutex2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pimutex2.h new file mode 100755 index 0000000000000..b809f163cbfd2 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pimutex2.h @@ -0,0 +1,162 @@ +#ifndef QURT_PIMUTEX2_H +#define QURT_PIMUTEX2_H +/** + @file qurt_pimutex2.h + @brief Prototypes of pimutex2 API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +#include +#include +#include + +/*============================================================================= + FUNCTIONS +=============================================================================*/ +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_pimutex2_init + Initializes a recursive mutex object. + + @deprecated use #qurt_pimutex_init instead. + + The recursive mutex is initially unlocked. + + Objects of type pimutex2 solve a potential race condition between + unlock() and destroy() operations. + + @datatypes + #qurt_rmutex2_t + + @param[out] lock Pointer to the recursive mutex object. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex2_init(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_pimutex2_destroy + + @deprecated use #qurt_pimutex_destroy instead. + + Destroys the specified recursive mutex. \n + @note1cont Recursive mutexes must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + @note1cont In general, application code should destroy an pimutex2 object prior to + deallocating it; calling qurt_pimutex2_destroy() before deallocating it ensures + that all qurt_pimutex2_unlock() calls complete. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to destroy. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex2_destroy(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_pimutex2_lock + + @deprecated use #qurt_pimutex_lock instead. + + Locks the specified recursive mutex. \n + + If a thread performs a lock operation on a recursive mutex that is not being used, the + thread gains access to the shared resource that is protected by the mutex, and continues + executing. + + If a thread performs a lock operation on a recursive mutex that is already being used by + another thread, the thread is suspended. 
When the mutex becomes available again + (because the other thread has unlocked it), the thread is awakened and given access to the + shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked, but the mutex does not become available until the thread performs a + balanced number of unlocks on the mutex. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex2_lock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_pimutex2_unlock + + @deprecated use #qurt_pimutex_unlock instead. + + Unlocks the specified recursive mutex. \n + More than one thread can be suspended on a recursive mutex. When the mutex is + unlocked, only the highest-priority thread waiting on the mutex is awakened. If the + awakened thread has higher priority than the current thread, a context switch occurs. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to unlock. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex2_unlock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_try_lock + + @deprecated use #qurt_pimutex_try_lock instead. + + Attempts to lock the specified recursive mutex.\n + + Non-blocking version of qurt_pimutex2_lock(). If a call to qurt_pimutex2_lock() would + succeed immediately, this function behaves similarly, and returns 0 for success. + If a call to qurt_pimutex2_lock() would not succeed immediately, this function has + no effect and returns non-zero for failure. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + */ +int qurt_pimutex2_try_lock(qurt_rmutex2_t *lock); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_PIMUTEX2_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pipe.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pipe.h new file mode 100755 index 0000000000000..6bdaa044f8640 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pipe.h @@ -0,0 +1,479 @@ +#ifndef QURT_PIPE_H +#define QURT_PIPE_H +/** + @file qurt_pipe.h + @brief Prototypes of the pipe interface API + This is a pipe or message queue + It blocks when too full (send) or empty (receive). + Unless using a nonblocking option, all datagrams are 64 bits. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup pipe_types +@{ */ +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define QURT_PIPE_MAGIC 0xF1FEF1FE /**< Magic. */ +#define QURT_PIPE_ATTR_MEM_PARTITION_RAM 0 /**< RAM. */ +#define QURT_PIPE_ATTR_MEM_PARTITION_TCM 1 /**< TCM. */ + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** QuRT pipe data values type. 
*/ +typedef unsigned long long int qurt_pipe_data_t; + +/** QuRT pipe type.*/ +typedef struct { + /** @cond */ + qurt_mutex_t pipe_lock; + qurt_sem_t senders; + qurt_sem_t receiver; + unsigned int size; + unsigned int sendidx; + unsigned int recvidx; + void (*lock_func)(qurt_mutex_t *); + void (*unlock_func)(qurt_mutex_t *); + int (*try_lock_func)(qurt_mutex_t *); + void (*destroy_lock_func)(qurt_mutex_t *); + unsigned int magic; + qurt_pipe_data_t *data; + /** @endcond */ +} qurt_pipe_t; + +/** QuRT pipe attributes type. */ +typedef struct { + /** @cond */ + qurt_pipe_data_t *buffer; + unsigned int elements; + unsigned char mem_partition; + /** @endcond */ +} qurt_pipe_attr_t; + +/** @} */ /* end_addtogroup pipe_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_pipe_attr_init + @xreflabel{hdr:qurt_pipe_attr_init} + Initializes the structure that sets the pipe attributes when a pipe is created. + + After an attribute structure is initialized, the individual attributes in the structure are + explicitly set using the pipe attribute operations. + + The attribute structure is assigned the following default values: \n + - buffer -- 0 \n + - elements -- 0 \n + - mem_partition -- #QURT_PIPE_ATTR_MEM_PARTITION_RAM + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_init(qurt_pipe_attr_t *attr) +{ + attr->buffer = NULL; + attr->elements = 0; + attr->mem_partition = QURT_PIPE_ATTR_MEM_PARTITION_RAM; +} + +/**@ingroup func_qurt_pipe_attr_set_buffer + @xreflabel{sec:qurt_pipe_attr_set_buffer} + Sets the pipe buffer address attribute.\n + Specifies the base address of the memory area to use for the data buffer of a pipe. + + The base address and size (Section @xref{sec:qurt_pipe_attr_set_elements}) specify the + memory area used as a pipe data buffer. The user is responsible for allocating the + memory area used for the buffer. + + @datatypes + #qurt_pipe_attr_t \n + #qurt_pipe_data_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] buffer Pointer to the buffer base address. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_buffer(qurt_pipe_attr_t *attr, qurt_pipe_data_t *buffer) +{ + attr->buffer = buffer; +} + +/**@ingroup func_qurt_pipe_attr_set_elements + @xreflabel{sec:qurt_pipe_attr_set_elements} + Specifies the length of the memory area to use for the data buffer of a pipe. + + The length is expressed in terms of the number of 64-bit data elements that + can be stored in the buffer. + + The base address (Section @xref{sec:qurt_pipe_attr_set_buffer}) and size specify + the memory area used as a pipe data buffer. The user is responsible for + allocating the memory area used for the buffer. + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] elements Pipe length (64-bit elements). + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_elements(qurt_pipe_attr_t *attr, unsigned int elements) +{ + attr->elements = elements; +} + +/**@ingroup func_qurt_pipe_attr_set_buffer_partition + @xreflabel{sec:qurt_pipe_attr_set_buffer_partition} + Specifies the memory type where a pipe's buffer is allocated. + Allocate pipes in RAM or TCM/LPM. 
+
+  @note1hang If a pipe is specified as allocated in TCM/LPM, it must be created
+  with the qurt_pipe_init() operation. The qurt_pipe_create() operation results in an error.
+
+  @datatypes
+  #qurt_pipe_attr_t
+
+  @param[in,out] attr          Pointer to the pipe attribute structure.
+  @param[in]     mem_partition Pipe memory partition. Values: \n
+                 - #QURT_PIPE_ATTR_MEM_PARTITION_RAM -- Pipe resides in RAM \n
+                 - #QURT_PIPE_ATTR_MEM_PARTITION_TCM -- Pipe resides in TCM/LPM @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_pipe_attr_set_buffer_partition(qurt_pipe_attr_t *attr, unsigned char mem_partition)
+{
+    attr->mem_partition = mem_partition;
+}
+
+/**@ingroup func_qurt_pipe_create
+  Creates a pipe.\n
+  Allocates a pipe object and its associated data buffer, and initializes the pipe object.
+
+  @note1hang The buffer address and size stored in the attribute structure specify how the
+             pipe data buffer is allocated.
+
+  @note1cont If a pipe is specified as allocated in TCM/LPM, it must be created
+             using the qurt_pipe_init() operation. The qurt_pipe_create() operation results in an error.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_attr_t
+
+  @param[out] pipe Pointer to the created pipe object.
+  @param[in]  attr Pointer to the attribute structure used to create the pipe.
+
+  @return
+  #QURT_EOK -- Pipe created. \n
+  #QURT_EFAILED -- Pipe not created. \n
+  #QURT_ENOTALLOWED -- Pipe cannot be created in TCM/LPM.
+
+  @dependencies
+  None.
+ */
+int qurt_pipe_create(qurt_pipe_t **pipe, qurt_pipe_attr_t *attr);
+
+/**@ingroup func_qurt_pipe_init
+  Initializes a pipe object using an existing data buffer.
+
+  @note1hang The buffer address and size stored in the attribute structure must
+             specify a data buffer that the user has already allocated.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_attr_t
+
+  @param[out] pipe Pointer to the pipe object to initialize.
+  @param[in]  attr Pointer to the pipe attribute structure used to initialize the pipe.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EFAILED -- Failure.
+
+  @dependencies
+  None.
+ */
+int qurt_pipe_init(qurt_pipe_t *pipe, qurt_pipe_attr_t *attr);
+
+/**@ingroup func_qurt_pipe_destroy
+  @xreflabel{sec:qurt_pipe_destroy}
+  Destroys the specified pipe.
+
+  @note1hang Pipes must be destroyed when they are no longer in use. Failure
+             to do this causes resource leaks in the QuRT kernel.
+             Pipes must not be destroyed while they are still in use. If this
+             occurs, the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in] pipe Pointer to the pipe object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_pipe_destroy(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_delete
+  Deletes the pipe.\n
+  Destroys the specified pipe (Section @xref{sec:qurt_pipe_destroy}) and deallocates the pipe object and its
+  associated data buffer.
+
+  @note1hang Delete pipes only if they were created using qurt_pipe_create()
+             (and not qurt_pipe_init()). Otherwise the behavior of QuRT is undefined. \n
+  @note1cont Pipes must be deleted when they are no longer in use. Failure to do this
+             causes resource leaks in the QuRT kernel.\n
+  @note1cont Pipes must not be deleted while they are still in use. If this occurs, the
+             behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in] pipe Pointer to the pipe object to delete.
+
+  @return
+  None.
+
+  @dependencies
+  None.
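+
+  The following is an illustrative usage sketch (not part of the original
+  header): it assumes qurt_pipe_create() allocates the internal buffer when
+  only an element count is supplied, and omits error handling for brevity.
+
+  @code
+  qurt_pipe_t *pipe;
+  qurt_pipe_attr_t attr;
+
+  qurt_pipe_attr_init(&attr);
+  qurt_pipe_attr_set_elements(&attr, 16);            // room for 16 64-bit items
+  if (qurt_pipe_create(&pipe, &attr) == QURT_EOK) {
+      qurt_pipe_send(pipe, 0x1234ULL);               // blocking write
+      qurt_pipe_data_t d = qurt_pipe_receive(pipe);  // blocking read
+      (void)d;
+      qurt_pipe_delete(pipe);                        // frees object and buffer
+  }
+  @endcode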
+ */
+void qurt_pipe_delete(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_send
+  Writes a data item to the specified pipe. \n
+  If a thread writes to a full pipe, it is suspended on the pipe. When another thread reads
+  from the pipe, the suspended thread is awakened and can then write data to the pipe.
+
+  Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+  pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_data_t
+
+  @param[in] pipe Pointer to the pipe object to write to.
+  @param[in] data Data item to write.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_pipe_send(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_receive
+  Reads a data item from the specified pipe.
+
+  If a thread reads from an empty pipe, it is suspended on the pipe. When another thread
+  writes to the pipe, the suspended thread is awakened and can then read data from the pipe.
+  Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+  pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in] pipe Pointer to the pipe object to read from.
+
+  @return
+  Integer containing the 64-bit data item from the pipe.
+
+  @dependencies
+  None.
+*/
+qurt_pipe_data_t qurt_pipe_receive(qurt_pipe_t *pipe);
+
+/**@ingroup func_qurt_pipe_try_send
+  Writes a data item to the specified pipe (without suspending the thread if the pipe is full).\n
+
+  If a thread writes to a full pipe, the operation returns immediately with a value of -1.
+  Otherwise, it returns 0 to indicate a successful write operation.
+
+  Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+  pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_data_t
+
+  @param[in] pipe Pointer to the pipe object to write to.
+  @param[in] data Data item to write.
+
+  @return
+  0 -- Success. \n
+  -1 -- Failure (pipe full).
+
+  @dependencies
+  None.
+*/
+int qurt_pipe_try_send(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_try_receive
+  Reads a data item from the specified pipe (without suspending the thread if the pipe is
+  empty).\n
+  If a thread reads from an empty pipe, the operation returns immediately with success set
+  to -1. Otherwise, success is always set to 0 to indicate a successful read operation.\n
+
+  Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+  pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in]  pipe    Pointer to the pipe object to read from.
+  @param[out] success Pointer to the operation status result.
+
+  @return
+  Integer containing a 64-bit data item from the pipe.
+
+  @dependencies
+  None.
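+
+  A minimal polling sketch (illustrative only; pipe is assumed to have been
+  created earlier):
+
+  @code
+  int ok;
+  qurt_pipe_data_t d = qurt_pipe_try_receive(pipe, &ok);
+  if (ok == 0) {
+      // d holds a valid 64-bit item
+  } else {
+      // pipe was empty; no data was read
+  }
+  @endcode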
+*/
+qurt_pipe_data_t qurt_pipe_try_receive(qurt_pipe_t *pipe, int *success);
+
+/**@ingroup func_qurt_pipe_receive_cancellable
+  Reads a data item from the specified pipe (with suspend), cancellable.
+
+  If a thread reads from an empty pipe, it is suspended on the pipe. When another thread
+  writes to the pipe, the suspended thread is awakened and can then read data from the pipe.
+  The operation is canceled if the user process of the calling thread is killed,
+  or if the calling thread must finish its current QDI invocation and return to user space.
+  A root PD thread can use this API to wait on the pipe for receiving; it is resumed with
+  #QURT_EDESTROY if the pipe is destroyed.
+  Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+  pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_data_t
+
+  @param[in]  pipe   Pointer to the pipe object to read from.
+  @param[out] result Pointer to the integer that receives the 64-bit data item from the pipe.
+
+  @return
+  #QURT_EOK -- Receive completed. \n
+  #QURT_ECANCEL -- Receive canceled. \n
+  #QURT_EDESTROY -- Pipe destroyed while waiting. \n
+  #QURT_ENOTALLOWED -- Pipe is not initialized.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+int qurt_pipe_receive_cancellable(qurt_pipe_t *pipe, qurt_pipe_data_t *result);
+
+/**@ingroup func_qurt_pipe_send_cancellable
+  @xreflabel{hdr:qurt_pipe_send_cancellable}
+  Writes a data item to the specified pipe (with suspend), cancellable. \n
+  If a thread writes to a full pipe, it is suspended on the pipe. When another thread reads
+  from the pipe, the suspended thread is awakened and can then write data to the pipe.
+  The operation is canceled if the user process of the calling thread is killed, or if the
+  calling thread must finish its current QDI invocation and return to user space.
+  A root PD thread can use this API to wait on the pipe for sending; it is resumed with
+  #QURT_EDESTROY if the pipe is destroyed.
+
+  Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+  pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_data_t
+
+  @param[in] pipe Pointer to the pipe object to write to.
+  @param[in] data Data item to write.
+
+  @return
+  #QURT_EOK -- Send completed. \n
+  #QURT_ECANCEL -- Send canceled. \n
+  #QURT_EDESTROY -- Pipe destroyed while waiting. \n
+  #QURT_ENOTALLOWED -- Pipe is not initialized.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+int qurt_pipe_send_cancellable(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_is_empty
+  Returns a value indicating whether the specified pipe contains any data.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in] pipe Pointer to the pipe object to check.
+
+  @return
+  1 -- Pipe contains no data. \n
+  0 -- Pipe contains data.
+
+  @dependencies
+  None.
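+
+  For example (illustrative sketch; process_item() is a hypothetical
+  consumer), a drain loop can combine this check with the non-blocking read:
+
+  @code
+  while (!qurt_pipe_is_empty(pipe)) {
+      int ok;
+      qurt_pipe_data_t d = qurt_pipe_try_receive(pipe, &ok);
+      if (ok != 0) break;   // another reader drained the pipe first
+      process_item(d);
+  }
+  @endcode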
+*/
+int qurt_pipe_is_empty(qurt_pipe_t *pipe);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PIPE_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pmem_manager.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pmem_manager.h
new file mode 100755
index 0000000000000..8c8da985228b9
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pmem_manager.h
@@ -0,0 +1,82 @@
+#ifndef QURT_PMEM_MANAGER_H
+#define QURT_PMEM_MANAGER_H
+/**
+  @file qurt_pmem_manager.h
+  Prototypes of kernel physical memory manager APIs
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Constants and macros
+ ======================================================================*/
+
+/* Physical memory API return error codes */
+#define QURT_PMEM_SUCCESS             0
+#define QURT_PMEM_NO_PRIV             1
+#define QURT_PMEM_RETRY               2
+#define QURT_PMEM_OVERLAP             3
+#define QURT_PMEM_NOT_EXIST           4
+#define QURT_PMEM_INIT_FAILURE        5
+#define QURT_PMEM_OUTSTANDING_MAPPING 6
+#define QURT_PMEM_GENERIC_FAILURE     7
+#define QURT_PMEM_ENTRY_FOUND         8
+#define QURT_PMEM_REACH_END           9
+#define QURT_PMEM_UNCLAIMED           10
+#define QURT_PMEM_ALREADY_CLAIMED     11
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_pmem_acquire
+  Acquires the ownership of a specific physical memory region.
+
+  @note1hang On success, the caller becomes the owner of the region.
+
+  @param[in] ppage Starting physical page number
+  @param[in] pnum  Number of physical pages
+
+  @return
+  #QURT_PMEM_NO_PRIV -- Caller has no privilege to claim the ownership. \n
+  #QURT_PMEM_OVERLAP -- All or part of the range is already owned. \n
+  #QURT_PMEM_SUCCESS -- Ownership successfully claimed.
+
+  @dependencies
+  None.
+*/
+int qurt_pmem_acquire(unsigned int ppage, unsigned int pnum);
+
+/**@ingroup func_qurt_pmem_release
+  Releases the ownership of a specific physical memory region.
+
+  @param[in] ppage Starting physical page number
+  @param[in] pnum  Number of physical pages
+
+  @return
+  #QURT_PMEM_NO_PRIV -- Caller has no privilege to release the ownership. \n
+  #QURT_PMEM_NOT_EXIST -- The physical memory range is not usable. \n
+  #QURT_PMEM_OUTSTANDING_MAPPING -- An outstanding mapping exists in this range. \n
+  #QURT_PMEM_SUCCESS -- Ownership successfully released.
+
+  @dependencies
+  None.
+ */
+int qurt_pmem_release(unsigned int ppage, unsigned int pnum);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PMEM_MANAGER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pmu.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pmu.h
new file mode 100755
index 0000000000000..73ea8eba04abf
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_pmu.h
@@ -0,0 +1,121 @@
+#ifndef QURT_PMU_H
+#define QURT_PMU_H
+/**
+  @file qurt_pmu.h
+  Prototypes of the PMU (performance monitoring unit) interface API.
+
+  EXTERNAL FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021 Qualcomm Technologies, Inc.
+  All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_pmu_set
+  Sets the value of the specified PMU register.
+
+  @note1hang Setting PMUEVTCFG automatically clears the PMU registers PMUCNT0
+  through PMUCNT3.
+
+  @param[in] reg_id PMU register. Values:
+  - #QURT_PMUCNT0
+  - #QURT_PMUCNT1
+  - #QURT_PMUCNT2
+  - #QURT_PMUCNT3
+  - #QURT_PMUCFG
+  - #QURT_PMUEVTCFG
+  - #QURT_PMUCNT4
+  - #QURT_PMUCNT5
+  - #QURT_PMUCNT6
+  - #QURT_PMUCNT7
+  - #QURT_PMUEVTCFG1 @tablebulletend
+
+  @param[in] reg_value Register value.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_pmu_set (int reg_id, unsigned int reg_value);
+
+/**@ingroup func_qurt_pmu_get
+  Gets the PMU register.\n
+  Returns the current value of the specified PMU register.
+
+  @param[in] reg_id PMU register. Values:
+  - #QURT_PMUCNT0
+  - #QURT_PMUCNT1
+  - #QURT_PMUCNT2
+  - #QURT_PMUCNT3
+  - #QURT_PMUCFG
+  - #QURT_PMUEVTCFG
+  - #QURT_PMUCNT4
+  - #QURT_PMUCNT5
+  - #QURT_PMUCNT6
+  - #QURT_PMUCNT7
+  - #QURT_PMUEVTCFG1 @tablebulletend
+
+  @return
+  Integer -- Current value of the specified PMU register.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_pmu_get (int reg_id);
+
+/**@ingroup func_qurt_pmu_enable
+  Enables or disables the Hexagon processor PMU.
+  Profiling is disabled by default.
+
+  @note1hang Enabling profiling does not automatically reset the count registers -- this must
+  be done explicitly before starting event counting.
+
+  @param[in] enable Performance monitor. Values: \n
+  - 0 -- Disable performance monitor \n
+  - 1 -- Enable performance monitor @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_pmu_enable (int enable);
+
+/**@ingroup func_qurt_pmu_get_pmucnt
+  Reads PMU counters in a single trap.
+
+  @param[out] buf Pointer to a buffer to save values read from PMU counters.
+                  The buffer size must be at least 32 bytes to read all eight PMU counters.
+
+  @return
+  #QURT_EOK -- Successful read.\n
+  #QURT_EFATAL -- Failure.
+
+  @dependencies
+  None.
+ */
+int qurt_pmu_get_pmucnt (void * buf);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PMU_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_power.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_power.h
new file mode 100755
index 0000000000000..2ee4d29a73976
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_power.h
@@ -0,0 +1,140 @@
+#ifndef QURT_POWER_H
+#define QURT_POWER_H
+/**
+  @file qurt_power.h
+  @brief Prototypes of power API
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+/*=============================================================================
+
+                        EDIT HISTORY FOR MODULE
+
+ This section contains comments describing changes made to the module.
+ Notice that changes are listed in reverse chronological order.
+
+
+when     who   what, where, why
+-------- ---   ------------------------------------------------------------
+03/03/11 op    Add header file
+12/12/12 cm    (Tech Pubs) Edited/added Doxygen comments and markup.
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @cond */
+/**@ingroup func_qurt_power_shutdown_fail_exit
+  Returns from Power Collapse mode when power collapse cannot proceed.
+
+  This function unmasks the global interrupt. This operation is used only when the thread is
+  recovering from a failed power collapse operation (Section @xref{sec:powerShutdownEnter}).
+
+  @return
+  #QURT_EOK -- Operation was successfully performed.
+
+  @dependencies
+  None.
+ */
+#define qurt_power_shutdown_fail_exit qurt_power_exit
+
+/**@ingroup func_qurt_power_shutdown_exit
+  Undoes state changes made preparing for power collapse.\n
+  This function unmasks the global interrupts.
+
+  @return
+  #QURT_EOK -- Operation was successfully performed.
+
+  @dependencies
+  None.
+ */
+#define qurt_power_shutdown_exit qurt_power_exit
+/**@endcond */
+
+/**@ingroup func_qurt_system_ipend_get
+  Gets the IPEND register.\n
+
+  @note1hang Returns the current value of the Hexagon processor IPEND register. The return value
+  is a mask value that identifies the individual interrupts that are pending. \n
+
+  @note1hang The bit order of the mask value is identical to the order defined for the IPEND register. A
+  mask bit value of 1 indicates that the corresponding interrupt is pending, and 0 indicates that the
+  corresponding interrupt is not pending. \n
+
+  @return
+  Returns the IPEND register value.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_system_ipend_get (void);
+
+
+/**@ingroup func_qurt_system_vid_get
+  Gets the VID register. \n
+
+  @note1hang Returns the current value of the Hexagon processor VID register. The return value is
+  the vector number of a second-level interrupt that has been accepted by the Hexagon
+  processor core.\n
+
+  @return
+  Returns the VID register value, that is, the L2 VIC interrupt number accepted by the processor.
+  Valid range is 0 to 1023.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_system_vid_get(void);
+
+/**@ingroup func_qurt_power_shutdown_get_pcycles
+  Gets the number of power collapses and processor cycles for entering and exiting the most
+  recent power collapse.
+
+  @note1hang If no power collapse has occurred yet, the processor cycle numbers are zero.
+
+  @param[out] enter_pcycles Number of processor cycles for entering the most
+                            recent power collapse.
+  @param[out] exit_pcycles  Number of processor cycles for exiting the most
+                            recent power collapse.
+  @return
+  Zero -- No power collapses have occurred. \n
+  Nonzero -- Number of power collapses that have occurred since
+  the processor was reset.
+
+  @dependencies
+  None.
+ */
+int qurt_power_shutdown_get_pcycles( unsigned long long *enter_pcycles, unsigned long long *exit_pcycles );
+
+/**@ingroup func_qurt_system_tcm_set_size
+  Sets the size of TCM to save during full power collapse.
+
+  @note1hang The size aligns to 32 bytes. If the size passed is greater than the maximum size defined in
+  XML, the size is truncated to the size defined in XML.
+
+  @param[in] new_size Size of TCM to save.
+
+  @return
+  Zero -- Size successfully set. \n
+  -1 -- A size of 0 was passed.
+
+  @dependencies
+  None.
+ */
+int qurt_system_tcm_set_size(unsigned int new_size);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_POWER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_printf.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_printf.h
new file mode 100755
index 0000000000000..a775d8a815918
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_printf.h
@@ -0,0 +1,44 @@
+#ifndef QURT_PRINTF_H
+#define QURT_PRINTF_H
+
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+  @file qurt_printf.h
+  Prototypes of printf API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+/*=============================================================================
+        CONSTANTS AND MACROS
+=============================================================================*/
+/** @addtogroup chapter_function_tracing
+@{ */
+
+int qurt_printf(const char* format, ...);
+
+int qurt_vprintf(const char* format, va_list args);
+
+/** @} */ /* end_addtogroup chapter_function_tracing */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PRINTF_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_process.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_process.h
new file mode 100755
index 0000000000000..0df9ddc2d4a70
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_process.h
@@ -0,0 +1,995 @@
+#ifndef QURT_PROCESS_H
+#define QURT_PROCESS_H
+/**
+  @file qurt_process.h
+  @brief Prototypes of QuRT process control APIs.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2009-2013, 2021-2023 Qualcomm Technologies, Inc.
+  All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_callback.h"
+#include "qurt_consts.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup process_types
+@{ */
+#define QURT_PROCESS_ATTR_NAME_MAXLEN     QURT_MAX_NAME_LEN /**< Maximum length of the process name. */
+#define QURT_PROCESS_ATTR_BIN_PATH_MAXLEN 128               /**< Maximum length of the path of binary/ELF for this process. */
+#define QURT_PROCESS_ATTR_CAP_MAXLEN      128               /**< Maximum length for a resource name. */
+
+/** QuRT process capability wildcard strings */
+#define QURT_PROCESS_ATTR_CAP_ALLOW_ALL  "ALLOW_ALL"  /**< Capability wild-card for full access */
+#define QURT_PROCESS_ATTR_CAP_ALLOW_NONE "ALLOW_NONE" /**< Capability wild-card for no access */
+
+/** QuRT process capability states */
+#define QURT_PROCESS_ATTR_CAP_ENABLED  0x1 /**< Capability enabled */
+#define QURT_PROCESS_ATTR_CAP_DISABLED 0x0 /**< Capability disabled */
+
+/* QuRT process thread attributes. */
+#define QURT_PROCESS_DEFAULT_CEILING_PRIO 0  /**< Default ceiling priority of the threads in the new process. */
+#define QURT_PROCESS_DEFAULT_MAX_THREADS  -1 /**< Default number of threads in the new process.
+                                                  -1 indicates that the limit is set to the maximum supported by the system. */
+
+/* QuRT process flags. */
+#define QURT_PROCESS_SUSPEND_ON_STARTUP  (1U)      /**< Suspend the new process just before calling main(). */
+#define QURT_PROCESS_NON_SYSTEM_CRITICAL (1u << 1) /**< Starts the new process as non-system-critical. */
+#define QURT_PROCESS_ISLAND_RESIDENT     (1u << 2) /**< Process is island resident. */
+#define QURT_PROCESS_RESTARTABLE         (1u << 3) /**< Indicates that the process is restartable. */
+#define QURT_PROCESS_UNTRUSTED           (1u << 7) /**< Starts the new process as an unsigned process. */
+
+/* QuRT process debugging session status. */
+#define QURT_DEBUG_NOT_START 0 /**< Debug is not started. */
+#define QURT_DEBUG_START     1 /**< Debug has started. */
+
+/** Process suspend options */
+#define QURT_PROCESS_SUSPEND_DEFAULT 0
+
+/** Process resume options */
+#define QURT_PROCESS_RESUME_DEFAULT 0
+
+
+/* QuRT process types. */
+typedef enum {
+    QURT_PROCESS_TYPE_RESERVED, /**< Process type is reserved. \n */
+    QURT_PROCESS_TYPE_KERNEL,   /**< Kernel process. \n*/
+    QURT_PROCESS_TYPE_SRM,      /**< SRM process. \n*/
+    QURT_PROCESS_TYPE_SECURE,   /**< Secure process. \n*/
+    QURT_PROCESS_TYPE_ROOT,     /**< Root process. \n*/
+    QURT_PROCESS_TYPE_USER,     /**< User process. */
+}qurt_process_type_t;
+
+/** QuRT process callback types. */
+typedef enum {
+    QURT_PROCESS_DUMP_CB_ROOT,   /**< Register the callback that executes in the
+                                      root process context. \n */
+    QURT_PROCESS_DUMP_CB_ERROR,  /**< Register the user process callback that is
+                                      called after threads in the process are frozen. \n */
+    QURT_PROCESS_DUMP_CB_PRESTM, /**< Register the user process callback that is
+                                      called before threads in the process are frozen. \n*/
+    QURT_PROCESS_DUMP_CB_MAX     /**< Reserved for error checking. */
+}qurt_process_dump_cb_type_t;
+
+/** QuRT process dump attributes. */
+typedef struct _qurt_pd_dump_attr{
+   /** @cond */
+   unsigned int enabled;  /**< Process dump is enabled. */
+   const char *path;      /**< Process dump path. */
+   unsigned int path_len; /**< Length of process dump path. */
+   /** @endcond */
+}qurt_pd_dump_attr_t;
+
+/** QuRT process capability resource type */
+enum qurt_process_cap_type_t {
+    QURT_PROCESS_CAP_TYPE_NUM_ENTRIES=0, /**< Number of entries in the capability structure */
+    QURT_PROCESS_CAP_TYPE_DRIVER=1,      /**< Driver resource */
+    QURT_PROCESS_CAP_TYPE_MAX            /**< Maximum identifier */
+};
+
+/** QuRT process capability structure */
+typedef struct _qurt_capability {
+   enum qurt_process_cap_type_t type;       /**< Resource type */
+   char name[QURT_PROCESS_ATTR_CAP_MAXLEN]; /**< Resource name */
+   unsigned long long cap;                  /**< Capabilities allowed for this resource */
+}qurt_capability_t;
+
+/** QuRT process attributes. */
+typedef struct _qurt_process_attr {
+    /** @cond */
+    char name[QURT_PROCESS_ATTR_NAME_MAXLEN];         /**< Name of the new process. */
+    char path[QURT_PROCESS_ATTR_BIN_PATH_MAXLEN];     /**< Path of the binary for the new process. */
+    char dtb_path[QURT_PROCESS_ATTR_BIN_PATH_MAXLEN]; /**< Path of the DTB ELF for the new process. */
+    int flags;                                        /**< Flags as indicated by QuRT process flags. */
+    unsigned int sw_id;                               /**< Software ID of the process to be loaded. */
+    unsigned sid;                                     /**< Stream ID of the process being spawned. */
+    unsigned max_threads;                             /**< Maximum number of threads that the new process can create. */
+    unsigned short ceiling_prio;                      /**< Maximum priority at which threads can be
+                                                           created by the new process. */
+    qurt_process_type_t type;                         /**< Process type as indicated by
+                                                           #qurt_process_type_t. */
+    qurt_pd_dump_attr_t dump_attr;                    /**< Process dump attributes for the new process
+                                                           as indicated by #qurt_pd_dump_attr_t. */
+    qurt_capability_t *capabilities;                  /**< Pointer to an array of structures of type
+                                                           qurt_capability_t */
+    /** @endcond */
+} qurt_process_attr_t;
+
+/** @} */ /* end_addtogroup process_types */
+
+/*=============================================================================
+FUNCTIONS
+=============================================================================*/
+ /** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_create
+  Creates a process with the specified attributes, and starts the process.
+
+  The process executes the code in the specified executable ELF file.
+
+  @datatypes
+  #qurt_process_attr_t
+
+  @param[in] attr Accepts an initialized process attribute structure, which specifies
+                  the attributes of the created process.
+
+  @return
+  Positive return value -- Process ID. \n
+  Negative return value -- One of the following errors: \n
+  #-QURT_EPRIVILEGE -- Caller does not have privilege for this operation \n
+  #-QURT_EMEM -- Not enough memory to perform the operation \n
+  #-QURT_EFAILED -- Operation failed \n
+  #-QURT_ENOTALLOWED -- Operation not allowed \n
+  #-QURT_ENOREGISTERED -- Not registered \n
+  #-QURT_ENORESOURCE -- Resource exhaustion \n
+  #-QURT_EINVALID -- Invalid argument value \n
+  #QURT_EFATAL -- attr is NULL
+
+  @dependencies
+  None.
+*/
+int qurt_process_create (qurt_process_attr_t *attr);
+
+/**@ingroup func_qurt_process_get_id
+  Returns the process identifier for the current thread.
+
+  @return
+  Process identifier for the current thread.
+
+  @dependencies
+  None.
+*/
+int qurt_process_get_id (void);
+/** @endcond */
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_get_uid
+  Returns the user identifier for the current thread.
+
+  @return
+  User identifier for the current thread.
+
+  @dependencies
+  None.
+*/
+int qurt_process_get_uid (void);
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_attr_init
+  Initializes the structure that sets the process attributes when a process is created.
+
+  After an attribute structure is initialized, the individual attributes in the structure can
+  be explicitly set using the process attribute operations.
+
+  Table @xref{tbl:processAttrDefaults} lists the default attribute values set by the initialize
+  operation.
+
+  @inputov{table_process_attribute_defaults}
+
+  @datatypes
+  #qurt_process_attr_t
+
+  @param[out] attr Pointer to the structure to initialize.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_process_attr_init (qurt_process_attr_t *attr)
+{
+    attr->name[0] = '\0';
+    attr->path[0] = '\0';
+    attr->dtb_path[0] = '\0';
+    attr->flags = 0;
+    attr->sw_id = 0;
+    attr->sid = 0;
+    attr->max_threads = (unsigned)QURT_PROCESS_DEFAULT_MAX_THREADS;
+    attr->ceiling_prio = QURT_PROCESS_DEFAULT_CEILING_PRIO;
+    attr->type = QURT_PROCESS_TYPE_RESERVED;
+    attr->dump_attr.enabled = 0;
+    attr->dump_attr.path = NULL;
+    attr->dump_attr.path_len = 0;
+    attr->capabilities = NULL;
+}
+
+/**@ingroup func_qurt_process_attr_set_executable
+  Sets the process name in the specified process attribute structure.
+
+  Process names identify process objects that are already
+  loaded in memory as part of the QuRT system.
+
+  @note1hang Process objects are incorporated into the QuRT system at build time.
+
+  @note1hang The maximum length of the name string is limited to QURT_PROCESS_ATTR_NAME_MAXLEN - 1.
+ + @datatypes + #qurt_process_attr_t + + @param[in] attr Pointer to the process attribute structure. + @param[in] name Pointer to the process name. + + @return + None. + + @dependencies + None. +*/ +void qurt_process_attr_set_executable (qurt_process_attr_t *attr, const char *name); + +/**@ingroup func_qurt_process_attr_set_binary_path + Sets the binary path for the process loading in the specified process attribute structure. + + Path specifies the binary to load for this process. + + @note1hang Max length of path string is limited to QURT_PROCESS_ATTR_BIN_PATH_MAXLEN-1. + + @datatypes + #qurt_process_attr_t + + @param[in] attr Pointer to the process attribute structure. + @param[in] path Pointer to the binary path. + + @return + None. + + @dependencies + None. +*/ +void qurt_process_attr_set_binary_path(qurt_process_attr_t *attr, char *path); + +/**@ingroup func_qurt_process_attr_set_dtb_path + Sets the DTB binary path for the process loading in the specified process attribute structure. + + Path specifies the DTB binary to load for this process. + + @note1hang Max length of path string is limited to QURT_PROCESS_ATTR_BIN_PATH_MAXLEN-1. + + @datatypes + #qurt_process_attr_t + + @param[in] attr Pointer to the process attribute structure. + @param[in] path Pointer to the binary path. + + @return + None. + + @dependencies + None. +*/ +void qurt_process_attr_set_dtb_path(qurt_process_attr_t *attr, char *path); + +/**@ingroup func_qurt_process_attr_set_flags +Sets the process properties in the specified process attribute structure. +Process properties are represented as defined symbols that map into bits +0 through 31 of the 32-bit flag value. Multiple properties are specified by OR'ing +together the individual property symbols. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] flags QURT_PROCESS_NON_SYSTEM_CRITICAL Process is considered as non system-critical. + This attribute will be used by error services, + to decide whether to kill user pd or whole subsystem. + QURT_PROCESS_ISLAND_RESIDENT Process will be marked as island resident. + QURT_PROCESS_RESTARTABLE Process will be marked as restartable. + QURT_PROCESS_UNTRUSTED Process will be marked as unsigned process. +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_flags (qurt_process_attr_t *attr, int flags) +{ + attr->flags = flags; +} +/** @endcond */ +/** @cond internal_only*/ +/**@ingroup func_qurt_process_attr_set_sid +Sets the process streamID in the specified process attribute structure. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] sid streamID to set for this process. + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_sid (qurt_process_attr_t *attr, unsigned sid) +{ + attr->sid = sid; +} +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_process_attr_set_max_threads +Sets the maximum number of threads allowed in the specified process attribute structure. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] max_threads Maximum number of threads allowed for this process. + +@return +None. + +@dependencies +None. 
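+
+A typical setup sketch (illustrative only; the executable name and thread
+limit below are placeholders, not values from this header):
+
+@code
+qurt_process_attr_t attr;
+
+qurt_process_attr_init(&attr);
+qurt_process_attr_set_executable(&attr, "my_dsp_app");  // hypothetical ELF name
+qurt_process_attr_set_max_threads(&attr, 8);            // cap the process at 8 threads
+int pid = qurt_process_create(&attr);
+if (pid < 0) {
+    // negative value is one of the -QURT_E* error codes
+}
+@endcode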
+*/
+static inline void qurt_process_attr_set_max_threads (qurt_process_attr_t *attr, unsigned max_threads)
+{
+    attr->max_threads = max_threads;
+}
+
+/**@ingroup func_qurt_process_attr_set_sw_id
+Sets the software ID of the process to load in the specified process attribute structure.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr  Pointer to the process attribute structure.
+@param[in] sw_id Software ID of the process, used in authentication.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_sw_id(qurt_process_attr_t *attr, unsigned int sw_id)
+{
+    attr->sw_id = sw_id;
+}
+
+/**@ingroup func_qurt_process_attr_set_ceiling_prio
+Sets the highest thread priority allowed in the specified process attribute structure.
+Refer to qurt_thread.h for priority ranges.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr Pointer to the process attribute structure.
+@param[in] prio Priority.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_ceiling_prio (qurt_process_attr_t *attr, unsigned short prio)
+{
+    attr->ceiling_prio = prio;
+}
+/** @endcond */
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_attr_set_dump_status
+Sets the process domain dump-enabled field in the process domain dump attributes.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr    Pointer to the process attribute structure.
+@param[in] enabled 1 -- Process domain dump is collected \n
+                   0 -- Process domain dump is not collected
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_dump_status(qurt_process_attr_t *attr, unsigned int enabled)
+{
+    attr->dump_attr.enabled = enabled;
+}
+
+/**@ingroup func_qurt_process_attr_set_dump_path
+Sets the process domain dump path and type.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr     Pointer to the process attribute structure.
+@param[in] path     Path where the process domain dumps must be saved.
+@param[in] path_len Length of the path string.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_dump_path(qurt_process_attr_t *attr, const char *path, int path_len)
+{
+    attr->dump_attr.path = path;
+    attr->dump_attr.path_len = (unsigned int)path_len;
+}
+
+/**@ingroup func_qurt_process_attr_set_capabilities
+Sets the list of capabilities available to this process.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in] attr         Pointer to the process attribute structure.
+@param[in] capabilities Pointer to an array of structures of type qurt_capability_t defining
+                        resources and capabilities.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+static inline void qurt_process_attr_set_capabilities(qurt_process_attr_t *attr, qurt_capability_t *capabilities)
+{
+    attr->capabilities = capabilities;
+}
+
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_cmdline_get
+Gets the command line string associated with the current process.
+The Hexagon simulator command line arguments are retrieved using
+this function, as long as the call is made from a process of the QuRT
+installation and the program runs in a simulation environment.
+
+If the function modifies the provided buffer, it zero-terminates
+the string. It is possible that the function does not modify the
+provided buffer, so the caller must set buf[0] to a NULL
+byte before making the call. A truncated command line is returned when
+the command line is longer than the provided buffer.
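+
+Example (sketch): pre-terminate the buffer, then read the command line.
+
+@code
+char cmdline[256];
+cmdline[0] = '\0';                 // required: the call may not touch buf
+qurt_process_cmdline_get(cmdline, sizeof(cmdline));
+@endcode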
+
+@param[in] buf     Pointer to a character buffer that must be filled in.
+@param[in] buf_siz Size (in bytes) of the buffer pointed to by the buf argument.
+
+@return
+None.
+
+@dependencies
+None.
+*/
+void qurt_process_cmdline_get(char *buf, unsigned buf_siz);
+
+/**@ingroup func_qurt_process_get_thread_count
+Gets the number of threads present in the process indicated by the PID.
+
+@param[in] pid PID of the process for which the information is required.
+
+@return
+Positive value -- Number of threads in the process indicated by the PID. \n
+Negative error code -- Failure: \n
+#QURT_EFATAL -- Invalid PID \n
+-#QURT_ENOTALLOWED -- The current process does not have access to the target process indicated by the PID
+
+@dependencies
+None.
+*/
+int qurt_process_get_thread_count(unsigned int pid);
+
+/**@ingroup func_qurt_process_get_thread_ids
+Gets the thread IDs for a process indicated by the PID.
+
+@param[in] pid        PID of the process for which the information is required.
+@param[in] ptr        Pointer to a user-passed buffer that must be filled in with thread IDs.
+@param[in] thread_num Number of thread IDs requested.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EFATAL -- Failed; ptr is NULL
+
+@dependencies
+None.
+ */
+int qurt_process_get_thread_ids(unsigned int pid, unsigned int *ptr, unsigned thread_num);
+/** @endcond */
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_dump_get_mem_mappings_count
+Gets the number of mappings present in the process indicated by the PID.
+
+@param[in] pid PID of the process for which the information is required.
+
+@return
+Number of mappings for the process indicated by the PID.
+
+@dependencies
+None.
+*/
+int qurt_process_dump_get_mem_mappings_count(unsigned int pid);
+
+/**@ingroup func_qurt_process_dump_get_mappings
+Gets the mappings for a specified PID.
+
+@note1hang This API skips device type mappings or mappings created by setting the #QURT_PERM_NODUMP attribute.
+
+@param[in] pid   PID of the process for which the information is required.
+@param[in] ptr   Pointer to a buffer that must be filled in with mappings.
+@param[in] count Count of mappings requested.
+
+@return
+Number of mappings filled in the buffer passed by the user.
+
+@dependencies
+None.
+*/
+int qurt_process_dump_get_mappings(unsigned int pid, unsigned int *ptr, unsigned count);
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_attr_get
+Gets the attributes with which the process was created.
+
+@datatypes
+#qurt_process_attr_t
+
+@param[in]     pid  PID of the process for which the information is required.
+@param[in,out] attr Pointer to the user-allocated attribute structure.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EINVALID -- Invalid PID \n
+#QURT_EFATAL -- attr is NULL
+
+@dependencies
+None.
+*/
+int qurt_process_attr_get(unsigned int pid, qurt_process_attr_t *attr);
+
+/**@ingroup func_qurt_process_dump_register_cb
+Registers the process domain dump callback.
+
+@datatypes
+#qurt_cb_data_t \n
+#qurt_process_dump_cb_type_t
+
+@param[in] cb_data  Pointer to the callback information.
+@param[in] type     Callback type; these callbacks are called in the context of the user process domain: \n
+                    #QURT_PROCESS_DUMP_CB_PRESTM -- Before threads of the exiting process are frozen. \n
+                    #QURT_PROCESS_DUMP_CB_ERROR -- After threads are frozen and captured. \n
+                    #QURT_PROCESS_DUMP_CB_ROOT -- After threads are frozen and captured, and CB_ERROR type of callbacks
+                    are called.
+@param[in] priority Priority.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EFATAL -- cb_data is NULL \n
+#QURT_EINVALID -- Invalid cb_type \n
+#QURT_EFAILED -- Invalid cb_data
+
+@dependencies
+None.
+*/
+int qurt_process_dump_register_cb(qurt_cb_data_t *cb_data, qurt_process_dump_cb_type_t type, unsigned short priority);
+
+/**@ingroup func_qurt_process_dump_deregister_cb
+Deregisters the process domain dump callback.
+
+@datatypes
+#qurt_cb_data_t \n
+#qurt_process_dump_cb_type_t
+
+@param[in] cb_data Pointer to the callback information to deregister.
+@param[in] type    Callback type.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EFATAL -- cb_data is NULL \n
+#QURT_EINVALID -- Invalid cb_type \n
+#QURT_EFAILED -- Invalid cb_data
+
+@dependencies
+None.
+*/
+int qurt_process_dump_deregister_cb(qurt_cb_data_t *cb_data,qurt_process_dump_cb_type_t type);
+
+/** @endcond */
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_set_rtld_debug
+Sets rtld_debug for a process.
+
+@param[in] pid     PID of the process for which rtld_debug must be set.
+@param[in] address rtld_debug address.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EINVALID -- Invalid PID \n
+#QURT_EFATAL -- Invalid address
+
+@dependencies
+None.
+*/
+int qurt_process_set_rtld_debug(unsigned int pid,unsigned int address);
+
+/**@ingroup func_qurt_process_get_rtld_debug
+Gets rtld_debug for a process.
+
+@param[in]     pid     PID of the process for which rtld_debug must be returned.
+@param[in,out] address Pointer to the user-passed address in which the rtld_debug address must be returned.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EINVALID -- Invalid PID \n
+#QURT_EFATAL -- Invalid address
+
+@dependencies
+None.
+*/
+int qurt_process_get_rtld_debug(unsigned int pid,unsigned int *address);
+/** @endcond */
+/**@ingroup func_qurt_process_exit
+Exits the current user process with an exit code.
+
+@param[in] exitcode Exit code.
+
+@return
+#QURT_EFATAL -- No client found with the specified PID value \n
+#QURT_EINVALID -- Invalid client \n
+#QURT_ENOTALLOWED -- User does not have permission to perform this operation \n
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_process_exit(int exitcode);
+
+/**@ingroup func_qurt_process_kill
+Kills the process represented by the PID with the exit code.
+
+@param[in] pid      PID of the process to kill.
+@param[in] exitcode Exit code.
+
+@return
+#QURT_EFATAL -- No client found with the specified PID value \n
+#QURT_EINVALID -- Invalid client \n
+#QURT_ENOTALLOWED -- User does not have permission to perform this operation \n
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_process_kill(int pid, int exitcode);
+
+
+/**@ingroup func_qurt_debugger_register_process
+Registers the process indicated by the PID with the debug monitor.
+
+@param[in] pid PID of the process.
+@param[in] adr Address.
+
+@return
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_debugger_register_process(int pid, unsigned int adr);
+
+
+/**@ingroup func_qurt_debugger_deregister_process
+Deregisters the process indicated by the PID with the debug monitor.
+
+@param[in] pid PID of the process.
+
+@return
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_debugger_deregister_process(int pid);
+
+/**@ingroup func_qurt_process_exec_callback
+Executes callbacks in the user process as indicated by the client_handle argument.
+
+@param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1).
+@param[in] callback_fn   Callback function to execute.
+@param[in] stack_base    Stack address to use.
+@param[in] stack_size    Stack size.
+
+@return
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_process_exec_callback(int client_handle,
+                               unsigned callback_fn,
+                               unsigned stack_base,
+                               unsigned stack_size);
+
+/**@ingroup func_qurt_process_get_pid
+Gets the process ID of the process that the client_handle argument represents.
+
+@note1hang This API is not supported for unsigned PDs. For an unsigned PD, use qurt_process_get_id().
+
+@param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1).
+@param[in] pid           Pointer to the address to store the PID.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EFATAL -- pid pointer passed as NULL
+
+@dependencies
+None.
+*/
+int qurt_process_get_pid(int client_handle, int * pid);
+
+/**@ingroup func_qurt_process_get_dm_status
+Gets the debugging session status on the process represented by the pid argument.
+
+@param[in]     pid    Process ID
+@param[in,out] status Address to store the status: \n
+                      #QURT_DEBUG_NOT_START \n
+                      #QURT_DEBUG_START
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EINVALID -- Error
+
+@dependencies
+None.
+*/
+int qurt_process_get_dm_status( unsigned int pid, unsigned int *status);
+
+
+/**@ingroup func_qurt_process_suspend_threads
+ Suspends user threads in a user process with its process identifier.
+ The target user process can be a signed user process or an unsigned user process.
+ The caller is a thread in the guest OS/root process.
+ After the user threads in the target user process are suspended, they cannot be scheduled to run by the kernel
+ until they resume later.
+
+ This function has one optional argument with one default option.
+ #QURT_PROCESS_SUSPEND_DEFAULT suspends user threads in the target user process.
+
+ This function call is synchronous; it returns after the relevant threads are
+ completely suspended.
+
+ If some user threads in the target user process are set as non-suspendable, this function call does
+ not suspend these threads.
+
+ If the target user process is already suspended, this function call returns success as the
+ confirmation of the user process suspension.
+
+ QuRT debugger monitor threads in the target user process are non-suspendable; this function call does
+ not suspend those threads.
+
+ If the target user process is a secure user process or a CPZ process, this function call returns an error
+ without suspending the target user process.
+
+ If a user thread in the target user process runs in the guest OS/root process via a QDI call, this function call
+ does not suspend the thread in the guest OS, but instead marks the thread as pending-suspend. The thread is suspended
+ when it exits the guest OS, before executing the first instruction in the user process.
+ In this case, the function returns success while the user thread can still be running in the guest OS; the thread is
+ suspended when exiting the guest OS.
+
+ @param[in] process_id Process identifier.
+ @param[in] option     Default option; #QURT_PROCESS_SUSPEND_DEFAULT suspends user threads in the target user process.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of invalid process_id input \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, on a secure process/CPZ process.
+
+ @dependencies
+ None.
+ */
+int qurt_process_suspend_threads (unsigned int process_id, unsigned int option);
+
+
+/**@ingroup func_qurt_process_resume_threads
+ Resumes a user process with its process identifier.
+ The target user process can be a signed user process or an unsigned user process.
+ The caller is a thread in the guest OS/root process.
+ After the user threads in the target user process resume, the kernel scheduler
+ can schedule the user threads to run based on their thread priorities.
+
+ This function has an optional argument, #QURT_PROCESS_RESUME_DEFAULT, which
+ resumes user threads in the target user process.
+
+ This is an asynchronous function; it returns after the kernel moves the user threads from
+ the suspended state to the runnable state. The threads are scheduled to run based on their thread priorities.
+
+ This function call does not resume threads in the target user process that have been set as non-resumable.
+
+ If the target user process has already resumed, this function call confirms that the user process resumed
+ by returning success.
+
+ If the target user process is a secure user process or a CPZ process, this function call returns an error without
+ performing the resume operation.
+
+ If user threads in the target user process run in the guest OS/root process via a QDI call, this function
+ call clears the suspend-pending mark on these threads, so that the threads are not suspended when they exit
+ the guest OS.
+
+ @param[in] process_id Process identifier.
+ @param[in] option     Default option; #QURT_PROCESS_RESUME_DEFAULT resumes user threads in the target user process.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of invalid process_id input. \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, on a secure process/CPZ process.
+
+ @dependencies
+ None.
+ */
+int qurt_process_resume_threads (unsigned int process_id, unsigned int option);
+
+/**@ingroup func_qurt_process_vtcm_window_set
+ Sets a VTCM access window for a process.
+ The caller thread must be in the SRM process.
+
+ This is a synchronous function; it ensures that all running threads of the process have the requested
+ window in effect. The requested view for all non-running threads takes effect when they are
+ scheduled.
+
+ @param[in] pid         Process identifier.
+ @param[in] enable      QURT_VTCM_WINDOW_ENABLE enforces the VTCM access window defined by the high and low offsets.
+                        QURT_VTCM_WINDOW_DISABLE ignores the high and low offsets and fully disables VTCM
+                        access for the process.
+ @param[in] high_offset Specifies the high window offset, in 4K increments, from the base address of the VTCM.
+                        QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT restores the high offset to its reset value.
+ @param[in] low_offset  Specifies the low window offset, in 4K increments, from the base address of the VTCM.
+                        QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT restores the low offset to its reset value.
+
+ @note1hang
+ When high_offset is set to QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT and low_offset is set to
+ QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT, the full VTCM range is accessible. Access to VTCM is then
+ controlled via the MMU mapping for the process.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+ #QURT_ENOTSUPPORTED -- Failure because the operation is not supported due to limitations in HW capabilities.
+
+ @dependencies
+ None.
+ */
+int qurt_process_vtcm_window_set(int pid, unsigned int enable, unsigned int high_offset, unsigned int low_offset);
+
+/**@ingroup func_qurt_process_vtcm_window_get
+ Gets the VTCM window for a process.
+ The caller thread must be in the SRM process.
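+
+ Illustrative sketch (pid is the target process identifier, assumed to have
+ been obtained elsewhere; the enable check precedes any use of the offsets):
+
+ @code
+ unsigned int en, hi, lo;
+ if (qurt_process_vtcm_window_get(pid, &en, &hi, &lo) == QURT_EOK) {
+     if (en != 0) {
+         // window spans [lo, hi] in 4K units from the VTCM base
+     }
+ }
+ @endcode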
+
+
+ @param[in]  pid         Process identifier.
+ @param[out] enable      Address to store the enable status.
+ @param[out] high_offset Address to return the high window offset, in 4K increments, from the base address of the VTCM.
+ @param[out] low_offset  Address to return the low window offset, in 4K increments, from the base address of the VTCM.
+
+ @note1hang
+ The user must first check the returned enable value before checking the high and low offsets.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+ #QURT_ENOTSUPPORTED -- Failure because the operation is not supported due to limitations in HW capabilities.
+
+ @dependencies
+ None.
+ */
+int qurt_process_vtcm_window_get(int pid, unsigned int *enable, unsigned int *high_offset, unsigned int *low_offset);
+
+/**@ingroup func_qurt_process_set_group_config
+ Enables thread groups in the process, with the ceiling priorities set up.
+
+ @param[in] process_id         Process identifier.
+ @param[in] group_bitmask      64-bit mask of active thread groups.
+ @param[in] ceiling_priorities Array of ceiling priorities, one per thread group.
+
+ @note1hang
+ This API can only be called by the root PD, and only once for each process; otherwise it is
+ rejected. Group 0 must be enabled in group_bitmask, otherwise QuRT returns an error. After this call, all
+ existing threads are moved to group 0, and any thread whose priority is higher than the ceiling
+ priority of group 0 is lowered to the ceiling value.
+ Example 1:
+   group_bitmask = 0xD7; //'b11010111
+   ceiling_priorities[] = {100, 128, 200, 0, 196, 0, 240, 20}; // 0 - don't care
+ Example 2:
+   group_bitmask = 0x5; //'b101
+   ceiling_priorities[] = {240, 0, 20}; // 0 - don't care
+
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_ENOTALLOWED -- The group has been configured already.
+
+ @dependencies
+ None.
+ */
+int qurt_process_set_group_config(unsigned int process_id, unsigned long long group_bitmask,
+                                  unsigned char *ceiling_priorities);
+
+
+/**@ingroup func_qurt_process_stid_set
+ Sets the specified stid for a process or for a thread group within a process.
+
+ @param[in] pid      Process identifier.
+ @param[in] group_id Group identifier.
+ @param[in] stid     stid to set.
+
+ @note1hang
+ The user can pass the default group_id (QURT_THREAD_DEFAULT_GROUP_ID) if the stid needs to be set at the process level.
+ All threads within the process that have the default stid (QURT_STID_DEFAULT) inherit the stid set for the process.
+ When a non-default group_id is specified, the stid is set only for that thread group.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EFATAL -- Invalid PID \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+
+ @dependencies
+ None.
+ */
+int qurt_process_stid_set(unsigned int pid, unsigned int group_id , unsigned int stid);
+
+/**@ingroup func_qurt_process_stid_get
+ Gets the stid for a process or for a thread group within a process.
+
+ @param[in]  pid      Process identifier.
+ @param[in]  group_id Group identifier.
+ @param[out] stid     Pointer to a variable in which to return the stid.
+
+ @note1hang
+ The user can pass the default group_id (QURT_THREAD_DEFAULT_GROUP_ID) to return the process-level stid.
+ When a non-default group_id is specified, the stid is returned only for that thread group.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EFATAL -- Invalid PID \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+
+ @dependencies
+ None.
+ */
+int qurt_process_stid_get(unsigned int pid, unsigned int group_id , unsigned int *stid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_profile.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_profile.h
new file mode 100755
index 0000000000000..2a50c461440f6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_profile.h
@@ -0,0 +1,98 @@
+#ifndef QURT_PROFILE_H
+#define QURT_PROFILE_H
+/**
+  @file qurt_profile.h
+  QuRT profiling support.
+
+EXTERNAL FUNCTIONS
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup profiling_macros
+@{ */
+#define QURT_PROFILE_DISABLE 0 /**< Disable profiling. */
+#define QURT_PROFILE_ENABLE  1 /**< Enable profiling. */
+
+typedef unsigned int qurt_profile_param_t;
+
+#define QURT_PROFILE_PARAM_THREAD_READY_TIME 0U /**< Profile thread ready time. */
+
+/** @} */ /* end_addtogroup profiling_macros */
+
+/** @addtogroup profiling_types
+  @{ */
+/** Profiling results. */
+typedef union
+{
+    /** Result associated with #QURT_PROFILE_PARAM_THREAD_READY_TIME. */
+    struct
+    {
+        unsigned int ticks; /**< Cumulative ticks the thread was ready. */
+    } thread_ready_time;
+
+} qurt_profile_result_t;
+/** @} */ /* end_addtogroup profiling_types */
+
+/**@ingroup func_qurt_profile_enable2
+ * Enables or disables profiling of a specific parameter on a specific thread (as applicable).
+ *
+ * @param[in] param     Profiling parameter.
+ * @param[in] thread_id ID of the thread (if applicable) for which the specified
+ *                      parameter must be profiled.
+ * @param[in] enable    #QURT_PROFILE_DISABLE -- disable \n #QURT_PROFILE_ENABLE --
+ *                      enable
+ *
+ * @return
+ * #QURT_EOK -- Success \n
+ * #QURT_EALREADY -- Measurement already in progress or already stopped \n
+ * #QURT_ENOTHREAD -- Thread does not exist \n
+ * #QURT_EINVALID -- Invalid profiling parameter \n
+ *
+ * @dependencies
+ * None.
+ */
+extern int qurt_profile_enable2 (
+    qurt_profile_param_t param,
+    qurt_thread_t        thread_id,
+    int                  enable
+);
+
+/**@ingroup func_qurt_profile_get
+ * Gets the value of the profiling parameter that was previously enabled.
+ *
+ * @param[in]  param     Profiling parameter.
+ * @param[in]  thread_id ID of the thread (if applicable) for which the specified
+ *                       profiling parameter must be retrieved.
+ * @param[out] result    Profiling result associated with the parameter for the specified
+ *                       thread (if applicable).
+ *
+ * @return
+ * #QURT_EOK -- Success \n
+ * #QURT_EFAILED -- Operation failed; profiling was not enabled \n
+ * #QURT_ENOTHREAD -- Thread does not exist \n
+ * #QURT_EINVALID -- Invalid profiling parameter \n
+ *
+ * @dependencies
+ * None.
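+ *
+ * A minimal sketch pairing this call with qurt_profile_enable2(); the thread
+ * ID is obtained with qurt_thread_get_id() from qurt_thread.h:
+ *
+ * @code
+ * qurt_profile_result_t res;
+ * qurt_thread_t tid = qurt_thread_get_id();   // profile the current thread
+ *
+ * qurt_profile_enable2(QURT_PROFILE_PARAM_THREAD_READY_TIME, tid, QURT_PROFILE_ENABLE);
+ * // ... workload under measurement ...
+ * qurt_profile_enable2(QURT_PROFILE_PARAM_THREAD_READY_TIME, tid, QURT_PROFILE_DISABLE);
+ *
+ * if (qurt_profile_get(QURT_PROFILE_PARAM_THREAD_READY_TIME, tid, &res) == QURT_EOK) {
+ *     unsigned int ready_ticks = res.thread_ready_time.ticks;
+ *     (void)ready_ticks;
+ * }
+ * @endcode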
+ */ +extern int qurt_profile_get ( + qurt_profile_param_t param, + qurt_thread_t thread_id, + qurt_profile_result_t * result +); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_ptrace.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_ptrace.h new file mode 100755 index 0000000000000..622304dd92865 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_ptrace.h @@ -0,0 +1,37 @@ +/*============================================================================= + + qurt_ptrace.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013 by Qualcomm Technologies, Inc. All Rights Reserved. +=============================================================================*/ +#ifndef __SYS_PTRACE_H__ +#define __SYS_PTRACE_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +enum __ptrace_request +{ + /** + Indicates that the process making this request is requesting to be traced. + */ + PTRACE_TRACEME = 0, + PTRACE_EXT_IS_DEBUG_PERMITTED = 500 +}; + +long ptrace(enum __ptrace_request request, unsigned int pid, void*addr, void *data); + +#ifdef __cplusplus +} +#endif + +#endif //__SYS_PTRACE_H__ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi.h new file mode 100755 index 0000000000000..705408e5cfc6f --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi.h @@ -0,0 +1,185 @@ +#ifndef QDI_H +#define QDI_H + +/** + @file qurt_qdi.h + @brief Prototypes of QuRT Driver Invocation API functions + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + + +#include "qurt_qdi_constants.h" +#include "qurt_qdi_imacros.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_qdi_open + Opens the specified driver for subsequent operations. + qurt_qdi_open() is the primary mechanism by which a driver user can + obtain a QDI handle. The user provides the name of the driver to the + qurt_qdi_open call, and gets back a handle referencing + the named driver. \n + @note1hang For reasons related to the Hexagon standard for varargs functions, the + qurt_qdi_open function prototype is not actually defined as a varargs. + + + @param[in] p Driver name. + @param[in] ... Up to nine additional device-specific arguments can be passed as parameters, + and should follow the POSIX open() convention. \n + - flags -- Optional second parameter (POSIX flags), the handle + access requested (read-only, write-only, or read-write, + for instance) and other flags such as whether the call + should create a new device or only open an existing + device. \n + - mode -- Optional third parameter (POSIX mode); permissions to + configure when a new device is created. @tablebulletend + + @return + Negative value -- Error. \n + Non-negative value -- Success, this result value serves as a handle to the + opened driver. + @dependencies + None. + */ +// int qurt_qdi_open(); +#define qurt_qdi_open(p,...) 
 \
+   qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,QDI_OPEN,(p),##__VA_ARGS__)
+
+#define qurt_qdi_open_dt(p,q,...) \
+   qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,QDI_OPEN_FROM_DT,(p),(q),##__VA_ARGS__)
+
+/**@ingroup func_qurt_qdi_handle_invoke
+  Performs a generic driver operation, which (depending on the specified operation) can be
+  either one of the predefined operations listed in @xhyperref{tbl:functionMapping,QDI function mapping}
+  or a driver-specific operation.
+  The user provides a QDI handle and an integer
+  method number, along with 0 to 8 optional 32-bit arguments.
+  The device driver invocation function is invoked with the
+  same method number and 0 to 8 optional arguments. The
+  return value from the invocation function is passed back to
+  the user as the return value of qurt_qdi_handle_invoke().
+
+  @note1hang For reasons related to the Hexagon standard for varargs functions, the
+  qurt_qdi_handle_invoke() function prototype is not actually defined as a
+  varargs function (and would break if it were defined this way).
+
+  @param[in] h Driver handle.
+  @param[in] m Integer number for the operation to perform.
+  @param[in] ... Up to eight optional arguments can be passed to the device driver as operation-specific parameters: \n
+  arg1 -- First parameter \n
+  arg2 -- Second parameter \n
+  arg3 -- Third parameter \n
+  arg4 -- Fourth parameter \n
+  arg5 -- Fifth parameter \n
+  arg6 -- Sixth parameter \n
+  arg7 -- Seventh parameter \n
+  arg8 -- Eighth parameter
+
+  @return
+  Integer value defined by the device driver. \n
+  -1 -- Error.
+
+  @dependencies
+  None.
+ */
+// int qurt_qdi_handle_invoke();
+#define qurt_qdi_handle_invoke(h,m,...) \
+   _QDMPASTE(_QDMHI,_QDMCNT(QDI_HANDLE_LOCAL_CLIENT,h,m,##__VA_ARGS__))(QDI_HANDLE_LOCAL_CLIENT,h,m,##__VA_ARGS__)
+#define _QDMHI3(a,b,c) qurt_qdi_qhi3(0,b,c)
+#define _QDMHI4(a,b,c,d) qurt_qdi_qhi4(0,b,c,(int)(d))
+#define _QDMHI5(a,b,c,d,e) qurt_qdi_qhi5(0,b,c,(int)(d),(int)(e))
+#define _QDMHI6(a,b,c,d,e,f) qurt_qdi_qhi6(0,b,c,(int)(d),(int)(e),(int)(f))
+#define _QDMHI7(a,b,c,d,e,f,g) qurt_qdi_qhi7(8,b,c,(int)(d),(int)(e),(int)(f),(int)(g))
+#define _QDMHI8(a,b,c,d,e,f,g,h) qurt_qdi_qhi8(8,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h))
+#define _QDMHI9(a,b,c,d,e,f,g,h,i) qurt_qdi_qhi9(16,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i))
+#define _QDMHI10(a,b,c,d,e,f,g,h,i,j) qurt_qdi_qhi10(16,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j))
+#define _QDMHI11(a,b,c,d,e,f,g,h,i,j,k) qurt_qdi_qhi11(24,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k))
+#define _QDMHI12(a,b,c,d,e,f,g,h,i,j,k,l) qurt_qdi_qhi12(24,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k),(int)(l))
+int qurt_qdi_qhi3(int,int,int);
+int qurt_qdi_qhi4(int,int,int,int);
+int qurt_qdi_qhi5(int,int,int,int,int);
+int qurt_qdi_qhi6(int,int,int,int,int,int);
+int qurt_qdi_qhi7(int,int,int,int,int,int,int);
+int qurt_qdi_qhi8(int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi9(int,int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi10(int,int,int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi11(int,int,int,int,int,int,int,int,int,int,int);
+int qurt_qdi_qhi12(int,int,int,int,int,int,int,int,int,int,int,int);
+
+/**@ingroup func_qurt_qdi_write
+  Writes data to the specified driver.
+  A predefined invocation routine for drivers that
+  support a POSIX-like write functionality.
+  qurt_qdi_write(handle, buf, len) is equivalent to:
+  qurt_qdi_handle_invoke(handle, QDI_WRITE, handle, buf, len);
+
+  @param[in] handle Driver handle.
+  @param[in] buf    Pointer to the memory address where the data to write is stored.
+  @param[in] len    Number of bytes of data to write.
+
+  @return
+  Non-negative integer -- Number of bytes written. \n
+  Negative error code -- Write could not take place.
+
+  @dependencies
+  None.
+ */
+int qurt_qdi_write(int handle, const void *buf, unsigned len);
+
+/**@ingroup func_qurt_qdi_read
+  User-visible API to read data from a QDI handle.
+  A predefined invocation routine for drivers that
+  support a POSIX-like read functionality.
+  qurt_qdi_read(handle, buf, len) is equivalent to:
+  qurt_qdi_handle_invoke(handle, QDI_READ, handle, buf, len);
+
+  @param[in] handle Driver handle.
+  @param[in] buf    Pointer to the memory address where the data read is stored.
+  @param[in] len    Number of bytes of data to read.
+
+  @return
+  Non-negative integer -- Number of bytes read. \n
+  Negative error code -- Read could not take place.
+
+  @dependencies
+  None.
+ */
+int qurt_qdi_read(int handle, void *buf, unsigned len);
+
+/**@ingroup func_qurt_qdi_close
+  Closes the specified driver, releasing any resources associated with the open driver.
+  User-visible API to close a QDI handle.
+
+  This API should be called when the user is done using a
+  QDI-based handle. When this function is called, the driver can release
+  any resources held and perform other necessary cleanup
+  operations. qurt_qdi_close(handle) is equivalent to:
+  qurt_qdi_handle_invoke(handle, QDI_CLOSE, handle);
+
+  @param[in] handle Driver handle.
+
+  @return
+  0 -- Success.\n
+  Negative error code -- Failure.
+
+  @dependencies
+  None.
+ */
+int qurt_qdi_close(int handle);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_constants.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_constants.h
new file mode 100755
index 0000000000000..4866fada067f0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_constants.h
@@ -0,0 +1,193 @@
+#ifndef QDI_CONSTANTS_H
+#define QDI_CONSTANTS_H
+
+/**
+  @file qurt_qdi_constants.h
+  @brief Predefined invocation methods for drivers.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2013-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+|| Method numbers used for QDI.
+||
+|| Intended grouping of method numbers for QDI
+|| including future usage:
+||
+|| Method 0 should always be unused and not responded to by
+|| any driver.
+|| Methods 1 and 2 are reserved for name registration and
+|| name lookup.
+|| Methods 3 through 31 are reserved for POSIX-type operations
+|| on open handles.
+|| Methods 32 through 127 are reserved for the QDI infrastructure
+|| and may be extended in the future to provide standard
+|| driver debug services, management services, and system
+|| notifications.
+|| Methods 128 through 255 are reserved for the use of automatically
+|| generated methods such as might be generated by an IDL (interface
+|| definition language).
The infrastructure may be extended to +|| perform services on these methods based on information provided +|| by the IDL, such as automatic buffer validation, etc. These +|| method numbers should not be used for any "ad hoc" methods. +|| Methods with number >= 256 are "private" method numbers that are +|| outside the scope of the QDI infrastructure. Drivers that want +|| to generate and consume their own "ad hoc" methods are free to +|| use these method numbers as they wish. The infrastructure does +|| not generate these method numbers or respond to them, but +|| passes them on unmolested. +|| +|| All driver implementations *should* return a value of +|| -1 when called with an unsupported method. The standard error +|| return value for POSIX APIs is -1, so we emulate that behavior +|| here. +*/ +/** @cond */ +#define QDI_UNUSED 0 +#define QDI_DEVNAME_REGISTER 1 +#define QDI_OPEN 2 +#define QDI_CLOSE 3 +#define QDI_READ 4 +#define QDI_WRITE 5 +#define QDI_IOCTL 6 +#define QDI_MMAP 7 +#define QDI_OS_FILEOPEN 8 +#define QDI_FLEN 9 +#define QDI_UNLINK 10 +#define QDI_FTELL 22 +#define QDI_SEEK 23 +#define QDI_FSTAT 24 + +#define QDI_FSNAME_REGISTER 150 +#define QDI_FS_OPEN 151 +#define QDI_MMAP2 153 +#define QDI_MPROTECT2 154 +#define QDI_MUNMAP2 155 + +#define QDI_CLIENT_HANDLE_OBJREF_GET 10 + +#define QDI_OS_PROCESS_LOAD 12 +#define QDI_OS_PROCESS_CHOOSE_ASID 13 + +#define QDI_OS_SET_GP 26 +#define QDI_CLIENT_HANDLE_CALLBACK 27 + +#define QDI_CLIENT_HANDLE_ISLAND_HANDLE_CREATE_FROM_OBJ_T 19 //reused +#define QDI_CLIENT_HANDLE_HANDLE_CREATE_FROM_OBJ_T 80 +#define QDI_CLIENT_HANDLE_HANDLE_RELEASE 81 +#define QDI_CLIENT_HANDLE_COPY_FROM_USER 82 +#define QDI_CLIENT_HANDLE_COPY_TO_USER 83 +#define QDI_CLIENT_HANDLE_SIGNAL_GROUP_CREATE 86 +#define QDI_CLIENT_HANDLE_SAFE_CACHE_OPS 87 + +#define QDI_CLIENT_HANDLE_BUFFER_LOCK 41 +#define QDI_CLIENT_HLOSPOOL_INFO_GET 90 +#define QDI_CLIENT_HLOSPOOL2_INFO_GET 96 + +#define QDI_CLIENT_PID 44 +#define QDI_CLIENT_ASID QDI_CLIENT_PID + +#define QDI_OS_CLIENT_INFO_GET 48 + +#define QDI_OS_MEM_LOOKUP_PHYSADDR 57 + +#define QDI_OS_THREAD_ITERATOR_CREATE 68 +#define QDI_OS_THREAD_ITERATOR_NEXT 69 + +#define QDI_OS_SYSENV 78 + +#define QDI_REGION_USERMALLOC_INIT 180 // This method is for generic handle + + +#define QDI_CLIENT_HANDLE_USER_MALLOC 84 +#define QDI_CLIENT_HANDLE_USER_FREE 85 + +#define QDI_SIGNAL_GROUP_SIGNAL_CREATE 96 +#define QDI_SIGNAL_GROUP_WAIT 98 +#define QDI_SIGNAL_GROUP_POLL 99 +#define QDI_SIGNAL_SET 96 +#define QDI_SIGNAL_CLEAR 97 +#define QDI_SIGNAL_WAIT 98 +#define QDI_SIGNAL_POLL 99 + +#define QDI_OS_WAIT_FOR_MAIN_REAPER 104 + +#define QDI_CLIENT_HANDLE_REFPROXY_INSTALL 105 +#define QDI_CLIENT_HANDLE_REFPROXY_ADD 106 +#define QDI_CLIENT_HANDLE_REFPROXY_REMOVE 107 + +#define QDI_CLIENT_HANDLE_DETACH 116 + +#define QDI_OS_RESERVED1 139 + +#define QDI_CLIENT_HANDLE_BUFFER_LOCK2 142 + +#define QDI_DT_REGISTER 158 +#define QDI_OPEN_DEVICE 159 +#define QDI_OPEN_FROM_DT 160 + +#define QDI_PRIVATE 256 /* Method numbers beginning at 256 + are private method numbers, which + are device-specific and available + for use by device implementors. */ +/* +|| Permission bitmasks for use with qurt_qdi_lock_buffer(). +|| +|| Make sure these match with permission values from qurt_perm_t. +*/ +/** @endcond */ + +/** @addtogroup driver_support_constants +@{ */ +#define QDI_PERM_W 2 /**< Write access. */ +#define QDI_PERM_R 1 /**< Read access. */ +#define QDI_PERM_RW (QDI_PERM_R | QDI_PERM_W) /**< Read/write access. 
*/ + +#define QDI_HANDLE_LOCAL_CLIENT 3 /**< Local client. */ +#define QDI_HANDLE_GENERIC 4 /**< Generic. */ + +#define QDI_REFCNT_BASE 0x510000 /**< */ +#define QDI_REFCNT_MAXED 0x51FFFD /**< */ +#define QDI_REFCNT_INIT 0x51FFFE /**< Driver object is temporary and is eventually deleted.*/ +#define QDI_REFCNT_PERM 0x51FFFF /**< Driver object is permanent and is never deleted. */ +/** @} */ /* end_addtogroup driver_support_constants */ + +/** @cond */ +/* +|| Flags used by process loaders. +*/ + +#define QDI_OS_PROCESS_FLAGS_ISLAND_RESIDENT 0x1 /* Set this flag to request the loaded process + to have island residency. */ +#define QDI_OS_PROCESS_FLAGS_ROOT_RESIDENT 0x2 /* Set this flag to request the loaded process + to have root residency, for example, DL Pager. */ +/* +|| Constants used for qurt_event register API, type field. +*/ + +#define QURT_PROCESS_EXIT 1 + +/* +|| Constants used by QDI extensions. +*/ + +#define QURT_QDI_SINGLETON_TYPE_TRUE 0 +#define QURT_QDI_SINGLETON_TYPE_FALSE 1 +#define QURT_QDI_SINGLETON_TYPE_PER_PROCESS 2 +/** @endcond */ +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QDI_CONSTANTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_driver.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_driver.h new file mode 100755 index 0000000000000..e044e25f1bb72 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_driver.h @@ -0,0 +1,868 @@ +#ifndef QURT_QDI_DRIVER_H +#define QURT_QDI_DRIVER_H + +/** + @file qurt_qdi_driver.h + @brief Definitions, macros, and prototypes used when writing a + QDI driver. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2018, 2019-2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include "stddef.h" +#include "qurt_qdi.h" +#include "qurt_types.h" +#include "qurt_callback.h" +#include "qurt_qdi_constants.h" +#include "qurt_qdi_imacros.h" +#include "qurt_mutex.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| This gives the canonical form for the arguments to a QDI +|| driver invocation function. The arguments are as follows: +|| +|| int client_handle (R0) QDI handle that represents the client +|| that made this QDI request. If the +|| client is remote, this is a +|| variable handle; if the client is local +|| (same thread and process), this is +|| set to QDI_HANDLE_LOCAL_CLIENT. +|| +|| qurt_qdi_obj_t *obj (R1) Points at the qdi_object_t structure +|| on which this QDI request is being made. +|| The qdi_object_t structure is usually +|| the first element of a larger structure +|| that contains state associated with the +|| object; because it is usually the first +|| element, the object pointers can be freely +|| interchanged through casts. +|| +|| int method (R2) Integer QDI method that represents +|| the request type. +|| +|| qurt_qdi_arg_t arg1 (R3) First three general purpose arguments +|| qurt_qdi_arg_t arg2 (R4) to the invocation function are passed in +|| qurt_qdi_arg_t arg3 (R5) these slots. +|| +|| qurt_qdi_arg_t arg4 (SP+0) Arguments beyond the first three are +|| qurt_qdi_arg_t arg5 (SP+4) passed on the stack. 
+|| qurt_qdi_arg_t arg6 (SP+8) +|| qurt_qdi_arg_t arg7 (SP+12) +|| qurt_qdi_arg_t arg8 (SP+16) +|| qurt_qdi_arg_t arg9 (SP+20) +|| +|| The canonical form of the invocation function takes a +|| total of 12 arguments, but not all of them are used. In general, +|| the QDI infrastructure only passes those arguments provided by +|| the caller; if the invocation function accesses additional +|| arguments beyond those provided by the caller, the values are not +|| useful. +*/ +/** @cond */ +#define QDI_INVOKE_ARGS \ + int, struct qdiobj *, int, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t + +#define QDI_EXT_INVOKE_ARGS \ + int, qurt_qdi_man_obj_t*, int, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t + +#define BUFFER_LOCK 1 +#define BUFFER_UNLOCK 0 + +struct qdiobj; +/** @endcond */ +/** @addtogroup driver_support_types +@{ */ +typedef union { + void *ptr; /**< Pointer to the driver handle. */ + int num; /**< Method number. */ +} qurt_qdi_arg_t; +/** @} */ /* end_addtogroup driver_support_types */ +/** @cond */ +/** QuRT QDI driver version */ +typedef union { + int num; + struct { + short major; /** Driver major version number. */ + short minor; /** Driver minor version number. */ + }; +} qurt_qdi_version_t; + +typedef int (*qurt_qdi_pfn_invoke_t)(QDI_INVOKE_ARGS); +typedef void (*qurt_qdi_pfn_release_t)(struct qdiobj *); +/** @endcond */ +/** @addtogroup driver_support_types +@{ */ +typedef struct qdiobj { + qurt_qdi_pfn_invoke_t invoke; /**< Invocation function that implements the driver methods.*/ + int refcnt; /**< Reference count, an integer value maintained by the QDI infrastructure that tracks the number of + references to a driver instance. 
 */
+   qurt_qdi_pfn_release_t release; /**< Release function that performs the cleanup associated with deleting an instance
+                                        of the driver object.*/
+} qurt_qdi_obj_t;
+/** @} */ /* end_addtogroup driver_support_types */
+/** @cond */
+/** QuRT QDI managed object */
+typedef struct qurt_qdi_man_obj
+{
+   qurt_qdi_obj_t qdi_obj;
+   union
+   {
+      struct qurt_qdi_ext_driver * opener_obj;
+      struct qurt_qdi_ext_device * device_obj;
+   };
+}qurt_qdi_man_obj_t;
+
+typedef int (*qurt_qdi_ext_pfn_create_t)(int client_id, const char *name, qurt_qdi_version_t version, qurt_qdi_man_obj_t **qdi_obj);
+typedef int (*qurt_qdi_ext_pfn_create_device_t)(int client_id, const char *name, qurt_qdi_version_t version, struct qurt_qdi_ext_device * device, qurt_qdi_man_obj_t **qdi_obj);
+typedef int (*qurt_qdi_ext_pfn_invoke_t)(QDI_EXT_INVOKE_ARGS);
+typedef void (*qurt_qdi_ext_pfn_destroy_t)(qurt_qdi_man_obj_t *qdi_obj);
+typedef int (*qurt_qdi_ext_pfn_probe_t)(void *handle, struct qurt_qdi_ext_device **device);
+
+typedef struct qurt_qdi_ext_obj_info{
+   qurt_qdi_man_obj_t *obj;
+   int qdi_client_id;
+   struct qurt_qdi_ext_obj_info *next;
+}qurt_qdi_ext_obj_info_t;
+typedef struct qurt_qdi_ext_obj_info *qurt_qdi_ext_obj_info_ptr;
+
+/** QuRT QDI device */
+//temporarily add this back while there are still drivers that statically define this structure
+struct qurt_qdi_device {
+   qurt_qdi_obj_t opener_obj;
+   const char* name;
+   char island_resident;
+   unsigned char singleton;
+   qurt_qdi_ext_pfn_create_t create;
+   qurt_qdi_ext_pfn_invoke_t invoke;
+   qurt_qdi_ext_pfn_destroy_t destroy;
+   qurt_mutex_t qurt_qdi_ext_list_lock;
+   qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head;
+};
+typedef struct qurt_qdi_device qurt_qdi_man_device;
+
+struct qurt_qdi_ext_driver {
+   qurt_qdi_obj_t opener_obj;
+   const char* name;
+   char island_resident;
+   unsigned char singleton;
+   qurt_qdi_ext_pfn_create_t create;
+   qurt_qdi_ext_pfn_invoke_t invoke;
+   qurt_qdi_ext_pfn_destroy_t destroy;
+   qurt_mutex_t qurt_qdi_ext_list_lock;
+   qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head;
+   qurt_qdi_ext_pfn_create_device_t create_device;
+   qurt_qdi_version_t version;
+   qurt_qdi_ext_pfn_probe_t probe;
+   const char* compatible;
+   struct qurt_qdi_ext_device * device_list;
+   //qurt_qdi_ext_device_ptr device_list;
+};
+typedef struct qurt_qdi_ext_driver qurt_qdi_ext_driver_t;
+//above replaces qurt_qdi_man_device
+
+extern int qurt_qdi_obj_ref_inc(qurt_qdi_obj_t *);
+extern int qurt_qdi_obj_ref_dec(qurt_qdi_obj_t *);
+
+extern int qurt_qdi_ext_opener (QDI_INVOKE_ARGS);
+/** @endcond */
+/**@ingroup func_qurt_qdi_method_default
+  Processes a method that is unrecognized or unsupported in the driver invocation function.
+  All arguments passed to the current invocation function (Section @xref{sec:invocationFunction}) must be forwarded
+  to this function.
+
+  @note1hang Invocation functions must process all unrecognized or unsupported methods
+  by calling this function.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+extern int qurt_qdi_method_default(QDI_INVOKE_ARGS);
+
+/**@ingroup func_qurt_qdi_handle_create_from_obj_t
+  Allocates a new device handle for use with the specified driver object.
+
+  @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+  @param[in] obj           Pointer to the driver object.
+
+  @return
+  Non-negative integer -- Success; this value is the new handle. \n
+  Negative value -- Error.
+
+  @dependencies
+  None.
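+
+  Typical use inside a driver open handler (a sketch only; my_dev_t, my_invoke,
+  and my_release are hypothetical driver-side names, not part of this header):
+  @code
+  my_dev_t *dev = malloc(sizeof(*dev));   // driver-specific object; first member
+                                          // is a qurt_qdi_obj_t named qdiobj
+  dev->qdiobj.invoke  = my_invoke;        // invocation function for this object
+  dev->qdiobj.refcnt  = QDI_REFCNT_INIT;  // temporary object, deleted on release
+  dev->qdiobj.release = my_release;       // cleanup callback
+  int h = qurt_qdi_handle_create_from_obj_t(client_handle, &dev->qdiobj);
+  @endcode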
+*/
+static __inline int qurt_qdi_handle_create_from_obj_t(int client_handle, qurt_qdi_obj_t *obj)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_HANDLE_CREATE_FROM_OBJ_T,
+                                 obj);
+}
+
+/**@ingroup func_qurt_qdi_island_handle_create_from_obj_t
+  Allocates a new island device handle for use with the specified driver object.
+
+  @param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1).
+  @param[in] obj           Pointer to the driver object.
+
+  @return
+  Non-negative integer value that is the new handle -- Success. \n
+  Negative return value -- Error.
+
+  @dependencies
+  None.
+*/
+static __inline int qurt_qdi_island_handle_create_from_obj_t(int client_handle, qurt_qdi_obj_t *obj)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_ISLAND_HANDLE_CREATE_FROM_OBJ_T,
+                                 obj);
+}
+
+/**@ingroup func_qurt_qdi_handle_release
+  Deallocates the specified device handle.
+
+  @param[in] client_handle     Obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+  @param[in] handle_to_release Handle to release.
+
+  @return
+  0 -- Success. \n
+  Negative value -- Error.
+
+  @dependencies
+  None.
+*/
+static __inline int qurt_qdi_handle_release(int client_handle, int handle_to_release)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_HANDLE_RELEASE,
+                                 handle_to_release);
+}
+
+static __inline qurt_qdi_obj_t *
+qurt_qdi_objref_get_from_handle(int client_handle, int object_handle)
+{
+   qurt_qdi_obj_t *ret;
+
+   ret = NULL;
+
+   qurt_qdi_handle_invoke(client_handle,
+                          QDI_CLIENT_HANDLE_OBJREF_GET,
+                          object_handle,
+                          &ret);
+
+   return ret;
+}
+
+/**@ingroup func_qurt_client_add_memory
+  Adds a physical address range to the HLOS physpool of the caller user PD.
+
+  @param[in] client_handle Obtained from the current invocation function (Section 3.4.1).
+  @param[in] phys_addr     Starting address of the physical address range.
+  @param[in] size          Size of the range, in bytes.
+
+  @return
+  #QURT_EOK -- Pages successfully added.
+
+  @dependencies
+  None.
+*/
+int qurt_client_add_memory(int client_handle, qurt_addr_t phys_addr, qurt_size_t size);
+
+/**@ingroup func_qurt_client_add_memory2
+  Adds a physical address range to the HLOS physpool of the caller user PD.
+
+  @param[in] user_client_handle Obtained from the current invocation function (Section 3.4.1).
+  @param[in] phys_addr          Starting 36-bit address of the physical address range.
+  @param[in] size               Size of the range, in bytes.
+
+  @return
+  #QURT_EOK -- Pages successfully added.
+
+  @dependencies
+  None.
+*/
+int qurt_client_add_memory2(int user_client_handle, qurt_paddr_64_t phys_addr, qurt_size_t size);
+
+static __inline qurt_qdi_obj_t *
+qurt_qdi_objref_get_from_pointer(qurt_qdi_obj_t *objptr)
+{
+   qurt_qdi_obj_t * ret = NULL;
+
+   if (qurt_qdi_obj_ref_inc(objptr) < 0) {
+      ret = NULL;
+   } else {
+      ret = objptr;
+   }
+
+   return ret;
+}
+
+static __inline void
+qurt_qdi_objref_release(qurt_qdi_obj_t *objptr)
+{
+   if (qurt_qdi_obj_ref_dec(objptr) == 1) {
+      (*objptr->release)(objptr);
+   }
+}
+
+/**@ingroup func_qurt_qdi_copy_from_user
+  Copies the contents of a user memory buffer into the current driver.
+
+  @note1hang User buffer addresses are valid only for the duration of the current driver
+  invocation.
+
+  @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+  @param[in] dest          Base address of the driver buffer.
+  @param[in] src           Base address of the user buffer.
+  @param[in] len           Number of bytes to copy.
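+
+  For example, inside an invocation function (a sketch only; my_request_t is a
+  hypothetical driver structure, and arg1/arg2 are assumed to carry a user
+  pointer and its length):
+  @code
+  my_request_t req;
+  if (arg2.num == (int)sizeof(req) &&
+      qurt_qdi_copy_from_user(client_handle, &req, arg1.ptr, sizeof(req)) >= 0) {
+      // req now holds a safe driver-side copy of the client data.
+  }
+  @endcode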
+
+  @return
+  Negative value -- Indicates a privilege or security violation; the copy operation
+  has crossed a privilege boundary.
+
+  @dependencies
+  None.
+*/
+static __inline int qurt_qdi_copy_from_user(int client_handle, void *dest, const void *src, unsigned len)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_COPY_FROM_USER,
+                                 dest, src, len);
+}
+
+/**@ingroup func_qurt_qdi_copy_string_from_user
+  Copies the contents of a user memory buffer into the current driver.
+
+  @note1hang User buffer addresses are valid only for the duration of the current driver
+  invocation.
+
+  @param client_handle Obtained from the current invocation function (Section 3.4.1).
+  @param dest          Base address of the driver buffer.
+  @param src           Base address of the user buffer.
+  @param len           Number of bytes to copy. NOTE: This is the destination buffer length.
+
+  @return
+  Negative error result -- Privilege or security violation; the copy operation
+  has crossed a privilege boundary.
+
+  @dependencies
+  None.
+*/
+int qurt_qdi_copy_string_from_user(int client_handle, char *dest, const char *src, unsigned len);
+
+/**@ingroup func_qurt_qdi_copy_to_user
+  Copies the contents of a driver memory buffer to user memory.
+
+  @note1hang User buffer addresses are valid only for the duration of the current driver
+  invocation.
+
+  @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+  @param[in] dest          Base address of the user buffer.
+  @param[in] src           Base address of the driver buffer.
+  @param[in] len           Number of bytes to copy.
+
+  @return
+  Negative value -- Indicates a privilege or security violation; the copy operation has crossed a
+  privilege boundary.
+
+  @dependencies
+  None.
+*/
+static __inline int qurt_qdi_copy_to_user(int client_handle, void *dest, const void *src, unsigned len)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_COPY_TO_USER,
+                                 dest, src, len);
+}
+
+/**@ingroup func_qurt_qdi_safe_cache_ops
+  Performs cache operations on user memory.
+
+  @note1hang User buffer addresses are valid only for the duration of the current driver
+  invocation.
+
+  @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+  @param[in] addr          Base address of the user memory.
+  @param[in] size          Size of the user memory.
+  @param[in] opcode        Cache operation (QURT_MEM_CACHE_FLUSH, QURT_MEM_CACHE_INVALIDATE, and so on).
+  @param[in] type          Cache type (QURT_MEM_ICACHE, QURT_MEM_DCACHE).
+
+  @return
+  Negative value -- Indicates a privilege or security violation; the operation has crossed a
+  privilege boundary.
+
+  @dependencies
+  None.
+*/
+static __inline int qurt_qdi_safe_cache_ops(int client_handle, qurt_addr_t addr, qurt_size_t size,
+                                            qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_SAFE_CACHE_OPS,
+                                 addr, size, opcode, type);
+}
+
+
+/**@ingroup func_qurt_qdi_buffer_lock
+  Prepares for the direct manipulation of a potentially untrusted buffer provided by a QDI
+  client.
+
+  This function is used to permit a trusted driver to safely access memory that is
+  provided by a potentially untrusted client. A driver calls this function to obtain a safe buffer
+  pointer for accessing the memory.
+
+  This function performs the following security checks: \n
+  - Verifies that the entire buffer is accessible to the client.
\n + - Ensures that the pointer remains valid for the remainder of the QDI driver + operation. \n + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] buf Pointer to the base address of the client buffer address. + @param[in] len Buffer length (in bytes). + @param[in] perms Bitmask value that specifies the read or write access to perform on the + client buffer: \n + - #QDI_PERM_R -- Read access \n + - #QDI_PERM_W -- Write access \n + - #QDI_PERM_RW -- Read/write access @tablebulletend + @param[out] obuf Pointer to the buffer address that the driver must use to access the buffer. + + @return + Negative value -- Error; the operation crosses a privilege boundary, indicating a privilege or security violation. \n + Nonzero value -- User passed a buffer that does not fulfill the requested read/write access permission. + In this case the QDI driver call must be terminated cleanly, with an appropriate error code + returned to the client. \n + Zero -- Success; when this occurs the QDI driver must use the pointer at *obuf to access memory, and not the + pointer passed in as buf -- even if the user process changes the mapping of memory at buf, + the mapping of memory at *obuf remains valid until the driver invocation completes. + + @dependencies + None. +*/ +static __inline int qurt_qdi_buffer_lock(int client_handle, void *buf, unsigned len, + unsigned perms, void **obuf) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_BUFFER_LOCK, + buf, len, perms, obuf); +} + +/**@ingroup func_qurt_qdi_buffer_lock2 + Prepares for the direct manipulation of a possibly-untrusted buffer provided by a QDI + client. + This API permits a trusted driver to safely access memory + provided by a possibly-untrusted client. A driver calls this function to obtain a safe buffer + pointer for accessing the memory. + This function performs the following security checks: \n + -- Entire buffer is accessible to the client. \n + -- Entire buffer is mapped with permissions passed in perms field \n + -- Entire buffer is physically contiguous \n + In addition to the security checks, the API also locks the client mapping such that the client + cannot remove the mapping while the physical memory is used by the trusted + driver. \n + + @note1 Drivers are responsible for calling qurt_qdi_buffer_unlock() at appropriate time. Not + pairing qurt_qdi_buffer_unlock() with this API leads to resource leakages and + process exit failures. Drivers can keep track of which buffers are locked for + a particular client. If the client exits abruptly, the buffers can be + unlocked on driver release invocation for the exiting client. + + @note2 This API is supported in limited capacity when called from Island mode. Safe buffer + unmapping or user buffer unlock is not supported in Island mode. + + @param client_handle Obtained from the current invocation function (Section 3.4.1). + @param buf Pointer to the base address of the client buffer address. + @param len Buffer length (in bytes). + @param perms Bitmask value that specifies the read or write access to perform on the + client buffer: \n + -- #QDI_PERM_R -- Read access \n + -- #QDI_PERM_W -- Write access \n + -- #QDI_PERM_RW -- Read/write access \n + @param obuf Optional parameter that returns a pointer to the buffer address that + the driver must use to access the buffer. 
If NULL is passed, the API
+                only performs security checks and does not create a mapping to access the user buffer in
+                a safe way.
+
+  @return
+  QURT_EINVALID -- Arguments passed to the API are invalid. User buffer pointer is NULL or length of the
+                   buffer is 0. \n
+  QURT_EPRIVILEGE -- One of the security checks on the user buffer failed. \n
+  QURT_EFAILED -- Mapping cannot be created for the trusted driver. \n
+  QURT_EOK -- Lock operation was successful. When this occurs, the QDI driver must use the
+              pointer at *obuf to perform its memory accesses, and not the
+              pointer passed in as buf.
+
+  @dependencies
+  None.
+*/
+static __inline int qurt_qdi_buffer_lock2(int client_handle, void *buf, unsigned len,
+                                          unsigned perms, void **obuf)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_BUFFER_LOCK2,
+                                 BUFFER_LOCK, buf, len, perms, obuf);
+}
+
+/**@ingroup func_qurt_qdi_buffer_unlock
+  This API is paired with qurt_qdi_buffer_lock2(). A temporary overlapping mapping
+  created for the driver is removed. Client mapping for the user buffer is
+  unlocked.
+
+  @note1 Drivers are responsible for pairing this with qurt_qdi_buffer_lock2(). Not
+         pairing qurt_qdi_buffer_lock2() with this API leads to resource leakages and
+         process exit failures. Drivers can keep track of which buffers are locked for
+         a particular client, and if the client exits abruptly, all the buffers can be
+         unlocked on driver release invocation for the exiting client.
+
+  @note2 This API is supported in limited capacity when called from Island mode. Actual
+         unmapping of driver accessible memory or unlocking of the buffer is not
+         supported in Island mode.
+
+  @param client_handle Obtained from the current invocation function (Section 3.4.1).
+  @param buf           Pointer to the base address of the client buffer address.
+  @param len           Buffer length (in bytes).
+  @param obuf          Safe buffer address that was returned in the obuf field after calling
+                       qurt_qdi_buffer_lock2().
+
+  @return
+  QURT_EINVALID -- Arguments passed to the API are invalid. User buffer pointer is NULL or length of the
+                   buffer is 0. \n
+  QURT_EOK -- Unlock operation was successful; the temporary safe mapping was removed and the
+              client mapping for the user buffer was unlocked. \n
+  Other results -- Safe buffer unmapping failed or unlocking of the user buffer failed. \n
+
+  @dependencies
+  None.
+*/
+static __inline int qurt_qdi_buffer_unlock(int client_handle, void *buf, unsigned len,
+                                           void *obuf)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_BUFFER_LOCK2,
+                                 BUFFER_UNLOCK, buf, len, obuf);
+}
+
+/**@ingroup func_qurt_qdi_user_malloc
+  Allocates a memory area in the QDI heap that is read/write accessible to both the driver and
+  the client. \n
+  @note1hang The QDI heap has a limited amount of memory available, and only the
+             device driver can free the allocated memory.
+
+  @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+  @param size          Size of the allocation, in bytes.
+
+  @return
+  Non-zero -- Success; this returned value points to the allocated memory area. \n
+  Zero -- Error.
+
+  @dependencies
+  None.
+*/
+void *qurt_qdi_user_malloc(int client_handle, unsigned size);
+
+/**@ingroup func_qurt_qdi_user_free
+  Deallocates a memory area in the QDI heap.
+
+  @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+  @param ptr           Pointer to the memory area to free.
+
+  @dependencies
+  None.
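+
+  Typical pairing (illustrative only):
+  @code
+  // Allocate a shared area visible to both driver and client, then release it.
+  char *area = (char *)qurt_qdi_user_malloc(client_handle, 256);
+  if (area != NULL) {
+      // ... exchange data with the client ...
+      qurt_qdi_user_free(client_handle, area);
+  }
+  @endcode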
+*/
+void qurt_qdi_user_free(int client_handle, void *ptr);
+
+/**@ingroup func_qurt_qdi_client_detach
+  Detaches a client (a process), indicating that the client does not
+  participate in the qurt_wait() mechanism. This behavior
+  is opt-in and irrevocable: once a client is detached, it
+  cannot be reattached.
+
+  @param client_handle Handle of the client to detach.
+
+  @return
+  Zero -- Success; detachable clients always return success. \n
+  Nonzero value -- client_handle did not refer to a
+                   detachable user client.
+
+  @dependencies
+  None.
+*/
+static __inline int qurt_qdi_client_detach(int client_handle)
+{
+   return qurt_qdi_handle_invoke(client_handle, QDI_CLIENT_HANDLE_DETACH);
+}
+
+/**@ingroup func_qurt_qdi_signal_group_create
+  Creates a new signal group for use in a device driver.
+  A QDI signal group contains up to 32 signals, which can be operated on either
+  individually (using the qurt_qdi_signal_* functions) or as a group (using the
+  qurt_qdi_signal_group_* functions). \n
+  @note1hang Driver implementation is responsible for using the proper signal group
+             handle in any given situation. \n
+  For more information on signals, see the Hexagon QuRT RTOS User Guide (80-VB419-78).
+
+  @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}).
+  @param p_signal_group_handle_local Returns a handle intended for use by code that
+         resides in the same context and process as the created signal group
+         (for example, the device driver implementation that allocated the
+         signal group).
+  @param p_signal_group_handle_remote Returns a handle intended for use by code
+         that resides in a different context and process than the created signal group
+         (for example, the user-mode client of an OS driver).
+
+  @return
+  Zero -- Success. \n
+  Negative value -- The signal group could not be created.
+
+  @dependencies
+  None.
+*/
+static __inline int qurt_qdi_signal_group_create(int client_handle,
+                                                 int *p_signal_group_handle_local,
+                                                 int *p_signal_group_handle_remote)
+{
+   return qurt_qdi_handle_invoke(client_handle,
+                                 QDI_CLIENT_HANDLE_SIGNAL_GROUP_CREATE,
+                                 p_signal_group_handle_local,
+                                 p_signal_group_handle_remote);
+}
+
+/**@ingroup func_qurt_qdi_signal_group_wait
+  Suspends the current thread until any of the signals are set in the specified signal group.
+
+  If a signal is set in a signal group object, and a thread waits on the signal group object,
+  the thread is awakened. If the awakened thread has higher priority than the current
+  thread, a context switch can occur.
+
+  @param signal_group_handle Handle of the signal group.
+
+  @return
+  If the client is remote: \n
+  QURT_EOK -- Wait complete. \n
+  QURT_ECANCEL -- Wait cancelled. \n
+  If the client is local, returns a 32-bit word with the current signals.
+
+  @dependencies
+  None.
+*/
+static __inline int qurt_qdi_signal_group_wait(int signal_group_handle)
+{
+   return qurt_qdi_handle_invoke(signal_group_handle,
+                                 QDI_SIGNAL_GROUP_WAIT);
+}
+
+/**@ingroup func_qurt_qdi_signal_group_poll
+  Returns a value that indicates whether any of the signals are set in the specified signal group.
+
+  @param signal_group_handle Handle of the signal group.
+
+  @return
+  1 -- At least one of the signals is set in the signal group. \n
+  0 -- None of the signals are set.
+
+  @dependencies
+  None.
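+
+  End-to-end sketch (illustrative only; error handling elided):
+  @code
+  int grp_local, grp_remote, sig_local, sig_remote;
+  // Create a group plus one signal; the remote handles would be handed to the client.
+  qurt_qdi_signal_group_create(client_handle, &grp_local, &grp_remote);
+  qurt_qdi_signal_create(grp_local, &sig_local, &sig_remote);
+  qurt_qdi_signal_set(sig_local);             // driver side raises the signal
+  if (qurt_qdi_signal_group_poll(grp_local))  // nonzero: at least one signal set
+      qurt_qdi_signal_clear(sig_local);
+  @endcode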
+*/ +static __inline int qurt_qdi_signal_group_poll(int signal_group_handle) +{ + return qurt_qdi_handle_invoke(signal_group_handle, + QDI_SIGNAL_GROUP_POLL); +} + + +/**@ingroup func_qurt_qdi_signal_create + Creates a new signal in the specified signal group. + For more information on signals, see the Hexagon QuRT RTOS User Guide (80-VB419-78). + + @note1hang Driver implementation is responsible for using the proper signal handle in + any given situation. + + @param signal_group_handle Handle of an existing signal group. + @param p_signal_handle_local Returns a handle intended for use by code that resides in + the same context and process as the created signal (for example, + the device driver implementation that allocated the signal). + @param p_signal_handle_remote Returns a handle intended for use by code that resides in + a different context and process than the created signal (for + example, the user-mode client of an OS driver). + + @return + Nonzero value -- No more signals can be created in the specified + signal group. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_create(int signal_group_handle, + int *p_signal_handle_local, + int *p_signal_handle_remote) +{ + return qurt_qdi_handle_invoke(signal_group_handle, + QDI_SIGNAL_GROUP_SIGNAL_CREATE, + p_signal_handle_local, + p_signal_handle_remote); +} + +/**@ingroup func_qurt_qdi_signal_set + Sets the signal in the specified signal object. + + @param signal_handle Handle of the signal. + + @return + Always returns 0. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_set(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_SET); +} + +/**@ingroup func_qurt_qdi_signal_clear + Clears the signal in the specified signal object. + + @param signal_handle Handle of the signal. + + @return + Always returns 0. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_clear(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_CLEAR); +} + +/**@ingroup func_qurt_qdi_signal_wait + Suspends the current thread until the specified signal is set. + If a signal is set in a signal object, and a thread waits on the signal object, the + thread is awakened. If the awakened thread has higher priority than the current thread, a + context switch may occur. + + @param signal_handle Handle of the signal. + + @return + If client is remote: + QURT_EOK -- Wait complete. \n + QURT_ECANCEL -- Wait cancelled.\n + If client is local, return a 32-bit word with current signals. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_wait(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_WAIT); +} + +/**@ingroup func_qurt_qdi_signal_poll + Returns a value that indicates if the specified signal is set. + + @param signal_handle Handle of the signal. + + @return + 1 -- Signal is set. \n + 0 -- Signal is not set. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_poll(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_POLL); +} + +/**@ingroup func_qurt_qdi_devname_register + Registers a QDI device with the generic QDI object in the + current QDI context. + + This function registers an exact name or a directory prefix with a QDI opener object. + Future invocations of qurt_qdi_open() in the context of the caller invokes the + opener object if a match is detected. + + Directory prefix names are specified by ending the name with a forward slash character. 
+
+  Example of an exact name:
+  @code qurt_qdi_devname_register("/dev/foobar", foobar_opener);@endcode
+
+  Example of a directory prefix:
+  @code qurt_qdi_devname_register("/pipedev/", pipedev_opener);@endcode
+
+  Given the two registrations shown above, the only qurt_qdi_open() requests
+  directed to the foobar_opener object are those for the exact name
+  "/dev/foobar". Any request beginning with "/pipedev/" is directed to the
+  pipedev_opener object.
+
+  The pipedev invocation function presumably examines the name argument to
+  determine exactly how to handle the request. The name is passed to the invocation
+  function in the a1.ptr argument (Section @xref{sec:invocationFunction}).
+
+  @param name   Device name or device name prefix.
+  @param opener Pointer to the opener object for the device.
+
+  @return
+  0 -- Device was successfully registered. \n
+  Negative error code -- Device was not registered.
+
+  @dependencies
+  None.
+ */
+static __inline int qurt_qdi_devname_register(const char *name,
+                                              qurt_qdi_obj_t *opener)
+{
+   return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,
+                                 QDI_DEVNAME_REGISTER,
+                                 name,
+                                 opener);
+}
+
+// Macros for backward compatibility with deprecated APIs
+// (These will go away soon)
+
+#define qurt_qdi_register_devname(name, opener) \
+   qurt_qdi_devname_register((name), (void *)(opener))
+#define qurt_qdi_new_handle_from_obj_t(handle, obj) \
+   qurt_qdi_handle_create_from_obj_t((handle), (obj))
+#define qurt_qdi_release_handle(client_handle, handle) \
+   qurt_qdi_handle_release((client_handle), (handle))
+#define qurt_qdi_lock_buffer(handle, buf, len, perms, obuf) \
+   qurt_qdi_buffer_lock((handle), (buf), (len), (perms), (obuf))
+#define qurt_qdi_usermalloc(handle, size) \
+   qurt_qdi_user_malloc((handle), (size))
+#define qurt_qdi_userfree(handle, ptr) \
+   qurt_qdi_user_free((handle), (ptr))
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_ext.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_ext.h
new file mode 100755
index 0000000000000..383e1799a15d6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_ext.h
@@ -0,0 +1,58 @@
+#ifndef QURT_QDI_EXT_H
+#define QURT_QDI_EXT_H
+
+/**
+  @file qurt_qdi_ext.h
+  @brief Definitions, macros, and prototypes used when writing a
+  QDI driver
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2018, 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_qdi_driver.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct qurt_qdi_ext_device {
+   qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head;
+   struct qurt_qdi_ext_device * next;
+   char * instance;
+   fdt_node_handle context;
+};
+typedef struct qurt_qdi_ext_device *qurt_qdi_ext_device_ptr;
+
+/**@ingroup func_qurt_qdi_dt_register
+  Registers a QDI device with the generic QDI object in the current QDI context,
+  if and only if a compatible device node is found in the device tree. This
+  function serves as a device tree aware wrapper for qurt_qdi_devname_register().
+
+  @param name   Device name or device name prefix.
+  @param opener Pointer to QDI ext specialized opener object for the driver.
+
+  @return
+  0 -- Device was successfully registered.
\n + Negative error code -- Device was not registered. +*/ +static __inline int qurt_qdi_dt_register(const char *name, qurt_qdi_obj_t *opener) +{ + return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, QDI_DT_REGISTER, name, opener); +} + +static inline void qurt_qdi_ext_deviceobj_set_name (struct qurt_qdi_ext_device * device, char * name) +{ + device->instance = name; +} + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_imacros.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_imacros.h new file mode 100755 index 0000000000000..c0a8448ac87f8 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_imacros.h @@ -0,0 +1,34 @@ +#ifndef QURT_QDI_IMACROS_H +#define QURT_QDI_IMACROS_H + +/** + @file qurt_qdi_imacros.h + @brief Internal macros used for QDI. Mostly consists of tricky (and ugly) + preprocessor hacks that permit us to do varargs function invocations + where we pass optional arguments in registers and where we can do + type casting and checking automatically. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define _QDMPASTE(a,b) _QDMPASTE_(a,b) +#define _QDMPASTE_(a,b) a##b +#define _QDMCNT(...) _QDMCNT_(__VA_ARGS__,12,11,10,9,8,7,6,5,4,3,2,1,0) +#define _QDMCNT_(a,b,c,d,e,f,g,h,i,j,k,l,cnt,...) cnt + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_proxy.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_proxy.h new file mode 100755 index 0000000000000..f1d8992ea8811 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_qdi_proxy.h @@ -0,0 +1,55 @@ +/*============================================================================= + + qurt_qdi_proxy.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. 
+
+=============================================================================*/
+#ifndef _QURT_QDI_PROXY_H
+#define _QURT_QDI_PROXY_H
+
+#include "qurt_qdi_driver.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* APIs allowing operation on the proxy object directly */
+int qurt_qdi_proxy_ref_create(void);
+
+/* APIs for operating on a proxy given a known proxy handle
+ * 1) using the QDI handle of the object
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_qdi_proxy_ref_add_by_handle(int proxy_handle, int qdi_handle);
+int qurt_qdi_proxy_ref_sub_by_handle(int proxy_handle, int qdi_handle);
+
+/* 2) using an object reference
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_qdi_proxy_ref_add_by_object(int proxy_handle, qurt_qdi_obj_t *obj_ptr);
+int qurt_qdi_proxy_ref_sub_by_object(int proxy_handle, qurt_qdi_obj_t *obj_ptr);
+
+/* API for associating a proxy object with a particular client, given a client handle
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_client_proxy_ref_install (int client_handle, int proxy_handle);
+
+/* APIs allowing operation on the proxy object from a user client
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_client_proxy_ref_add(int qdi_handle);
+int qurt_client_proxy_ref_remove(int qdi_handle);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_QDI_PROXY_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_rmutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_rmutex.h
new file mode 100755
index 0000000000000..a013a0bbddb1d
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_rmutex.h
@@ -0,0 +1,200 @@
+#ifndef QURT_RMUTEX_H
+#define QURT_RMUTEX_H
+/**
+  @file qurt_rmutex.h
+  Prototypes of rmutex API.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2013 - 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+#include
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_rmutex_init
+  Initializes a recursive mutex object.
+  The recursive mutex is initialized in the unlocked state.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[out] lock Pointer to the recursive mutex object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_rmutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_destroy
+  Destroys the specified recursive mutex. \n
+  @note1hang Recursive mutexes must not be destroyed while they are still in use. If this
+             occurs, the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock Pointer to the recursive mutex object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_rmutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_lock
+  Locks the specified recursive mutex. \n
+
+  If a thread performs a lock operation on a mutex that is not in use, the thread
+  gains access to the shared resource that the mutex protects, and continues executing.
+
+  If a thread performs a lock operation on a mutex that is already in use by another
+  thread, the thread is suspended.
When the mutex becomes available again (because the
+  other thread has unlocked it), the thread is awakened and given access to the shared resource.
+
+  @note1hang A thread is not suspended if it locks a recursive mutex that it has already
+             locked. However, the mutex does not become available to other threads until the
+             thread performs a balanced number of unlocks on the mutex.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock Pointer to the recursive mutex object to lock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_rmutex_lock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_lock_timed
+  Locks the specified recursive mutex; the wait is terminated when the specified timeout expires.\n
+
+  If a thread performs a lock operation on a mutex that is not in use, the thread
+  gains access to the shared resource that the mutex is protecting, and continues executing.
+
+  If a thread performs a lock operation on a mutex that is already in use by another
+  thread, the thread is suspended. When the mutex becomes available again (because the
+  other thread has unlocked it), the thread is awakened and given access to the shared resource.
+
+  @note1hang A thread is not suspended if it locks a recursive mutex that it has already
+             locked. However, the mutex does not become available to other threads until the
+             thread performs a balanced number of unlocks on the mutex.
+             If the timeout expires, the wait is terminated and no access to the mutex is granted.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock     Pointer to the recursive mutex object to lock.
+  @param[in] duration Interval (in microseconds); the duration value must be between #QURT_TIMER_MIN_DURATION and
+                      #QURT_TIMER_MAX_DURATION.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_ETIMEDOUT -- Timeout
+
+  @dependencies
+  None.
+
+ */
+int qurt_rmutex_lock_timed(qurt_mutex_t *lock, unsigned long long int duration);
+
+/**@ingroup func_qurt_rmutex_unlock
+  Unlocks the specified recursive mutex. \n
+  More than one thread can be suspended on a mutex. When the mutex is
+  unlocked, the thread waiting on the mutex awakens. If the awakened
+  thread has higher priority than the current thread, a context switch occurs.
+
+  @note1hang When a thread unlocks a recursive mutex, the mutex does not become available until
+             a balanced number of locks and unlocks has been performed on the mutex.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock Pointer to the recursive mutex object to unlock.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+
+ */
+void qurt_rmutex_unlock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_try_lock
+  Attempts to lock the specified recursive mutex.\n
+
+  If a thread performs a try_lock operation on a recursive mutex that is not in use, the
+  thread gains access to the shared resource that is protected by the mutex, and continues
+  executing.\n
+  If a thread performs a try_lock operation on a recursive mutex that another thread has
+  already locked, qurt_rmutex_try_lock immediately returns with a nonzero result
+  value.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock Pointer to the recursive mutex object to lock.
+
+  @return
+  0 -- Success. \n
+  Nonzero -- Failure.
+
+ */
+int qurt_rmutex_try_lock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_try_lock_block_once
+  Attempts to lock a mutex object recursively. If the mutex is available,
+  it locks the mutex. If the mutex is held by the current thread,
+  it increases the internal counter and returns 0. If not, it returns a
+  nonzero value.
+
+  If the mutex is already locked by another thread, the caller thread is
+  suspended. When the mutex becomes available again (because the other
+  thread has unlocked it), the caller thread is awakened and tries to lock
+  the mutex; if that attempt fails, this function returns a nonzero value,
+  and if it succeeds, it returns zero.
+
+  @datatypes
+  #qurt_mutex_t
+
+  @param[in] lock Pointer to the qurt_mutex_t object.
+
+  @return
+  0 -- Success. \n
+  Nonzero -- Failure.
+
+  @dependencies
+  None.
+ */
+int qurt_rmutex_try_lock_block_once(qurt_mutex_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_RMUTEX_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_rmutex2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_rmutex2.h
new file mode 100755
index 0000000000000..a37e7e4458c4b
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_rmutex2.h
@@ -0,0 +1,183 @@
+#ifndef QURT_RMUTEX2_H
+#define QURT_RMUTEX2_H
+/**
+  @file qurt_rmutex2.h
+  @brief Prototypes of rmutex2 API
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup mutex_types
+@{ */
+/*=============================================================================
+                        TYPEDEFS
+=============================================================================*/
+
+/** QuRT rmutex2 type.
+    Mutex type used with rmutex2 APIs.
+ */
+typedef struct {
+   /** @cond */
+   unsigned int holder __attribute__((aligned(8))); /* UGP value of the mutex holder. */
+   unsigned short waiters;    /* Number of waiting threads. */
+   unsigned short refs;       /* Number of references to this mutex. */
+   unsigned int queue;        /* Kernel-maintained futex queue value. */
+   unsigned int excess_locks; /* Number of excess times the holder has locked the mutex. */
+   /** @endcond */
+} qurt_rmutex2_t;
+/** @} */ /* end_addtogroup mutex_types */
+/** @cond internal_only*/
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_rmutex2_init
+
+   @deprecated use #qurt_rmutex_init instead.
+
+   Initializes a recursive mutex object.
+
+   The recursive mutex is initially unlocked.
+
+   Objects of type rmutex2 solve a potential race condition between
+   unlock() and destroy() operations.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[out] lock Pointer to the recursive mutex object.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_rmutex2_init(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_rmutex2_destroy
+
+   @deprecated use #qurt_rmutex_destroy instead.
+
+   Destroys the specified recursive mutex. \n
+   @note1hang Recursive mutexes must not be destroyed while they are still in use. If this
+              occurs, the behavior of QuRT is undefined.
+   @note1cont In general, application code must destroy an rmutex2 object prior to
+              deallocating it; calling qurt_rmutex2_destroy() before deallocating it ensures
+              that all qurt_rmutex2_unlock() calls complete.
+
+   @datatypes
+   #qurt_rmutex2_t
+
+   @param[in] lock Pointer to the recursive mutex object to destroy.
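+
+   Lifecycle sketch (illustrative only):
+   @code
+   qurt_rmutex2_t m;
+   qurt_rmutex2_init(&m);
+   qurt_rmutex2_lock(&m);
+   qurt_rmutex2_lock(&m);    // recursive: the holder may lock again
+   qurt_rmutex2_unlock(&m);
+   qurt_rmutex2_unlock(&m);  // balanced unlocks release the mutex
+   qurt_rmutex2_destroy(&m); // destroy only after all unlocks complete
+   @endcode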
+ + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_destroy(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_lock + + @deprecated use #qurt_rmutex_lock instead. + + Locks the specified recursive mutex. \n + + If a thread performs a lock operation on a recursive mutex that is not in use, the + thread gains access to the shared resource that the mutex protects, and continues + to execute. + + If a thread performs a lock operation on a recursive mutex that another thread is using, + the thread is suspended. When the mutex becomes available again + (because the other thread has unlocked it), the thread is awakened and given access to the + shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked, but the mutex does not become available until the thread performs a + balanced number of unlocks on the mutex. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_lock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_unlock + + @deprecated use #qurt_rmutex_unlock instead. + + Unlocks the specified recursive mutex. \n + More than one thread can be suspended on a recursive mutex. When the mutex is + unlocked, only the highest-priority thread waiting on the mutex awakens. If the + awakened thread has higher priority than the current thread, a context switch occurs. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to unlock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_unlock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_try_lock + + @deprecated use #qurt_rmutex_try_lock instead. + + Attempts to lock the specified recursive mutex.\n + + Non-blocking version of qurt_rmutex2_lock(). When a call to qurt_rmutex2_lock() + succeeds immediately, this function behaves similarly, returning 0 for success. + When a call to qurt_rmutex2_lock() does not succeed immediately, this function has + no effect and returns nonzero for failure. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + */ +int qurt_rmutex2_try_lock(qurt_rmutex2_t *lock); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_RMUTEX2_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_sclk.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_sclk.h new file mode 100755 index 0000000000000..a83cf5f1db889 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_sclk.h @@ -0,0 +1,145 @@ +#ifndef QURT_SCLK_H +#define QURT_SCLK_H +/** + @file qurt_sclk.h + @brief Header file describing the APIs supported by QuRT system SCLK + feature. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2018-2021, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ +=============================================================================*/ + + + + +/*============================================================================= + + INCLUDE FILES + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + + +/** + Conversion from microseconds to sleep ticks. + */ +#define QURT_SYSCLOCK_TIMETICK_FROM_US(us) ((us) * 192ULL / 10UL) +#define qurt_sysclock_timetick_from_us(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us) + + +/** + Conversion from timer ticks to microseconds at the nominal frequency. +*/ +#define QURT_SYSCLOCK_TIMETICK_TO_US(ticks) qurt_timer_timetick_to_us(ticks) + +/** + Maximum microseconds value for Qtimer is 1,042,499 hours. +*/ +#define QURT_SYSCLOCK_MAX_DURATION (1042499uLL * 3600uLL * 1000uLL * 1000uLL) +#define qurt_sysclock_max_duration() QURT_SYSCLOCK_MAX_DURATION +/** + Timer clock for Qtimer is 19.2 MHz. +*/ +#define QURT_SYSCLOCK_MAX_DURATION_TICKS (1042499uLL * 3600uLL * 19200000uLL) +#define qurt_sysclock_max_duration_ticks() QURT_SYSCLOCK_MAX_DURATION_TICKS +/** + Sleep timer error margin for Qtimer is 192 ticks ~10 us. +*/ +#define QURT_SYSCLOCK_ERROR_MARGIN 192U //QURT_TIMER_MIN_DURATION*timer_freq; +#define qurt_sysclock_error_margin() QURT_SYSCLOCK_ERROR_MARGIN + +/*============================================================================= + + DATA DECLARATIONS + +=============================================================================*/ + +/**@ingroup func_qurt_sysclock_get_hw_ticks + @xreflabel{sec:qurt_sysclock_get_hw_ticks} + Gets the hardware tick count.\n + Returns the current value of a 64-bit hardware counter. The value wraps around to zero + when it exceeds the maximum value. + + @note1hang This operation must be used with care because of the wrap-around behavior. + + @return + Integer -- Current value of 64-bit hardware counter. + + @dependencies + None. + */ +unsigned long long qurt_sysclock_get_hw_ticks (void); + + +/**@ingroup func_qurt_sysclock_get_hw_ticks_32 + @xreflabel{sec:qurt_sysclock_get_hw_ticks_32} + Gets the hardware tick count in 32 bits.\n + Returns the current value of a 32-bit hardware counter. The value wraps around to zero + when it exceeds the maximum value. + + @note1hang This operation is implemented as an inline C function, and should be called from a C/C++ program. + The returned 32 bits are the lower 32 bits of the Qtimer counter. + + @return + Integer -- Current value of the 32-bit timer counter. + + @dependencies + None. + */ +static inline unsigned long qurt_sysclock_get_hw_ticks_32 (void) +{ + //Beginning with v61 there is a HW register that can be read directly. + unsigned long count; + __asm__ __volatile__ (" %0 = c30 " : "=r"(count)); + return count; +} + + +/**@ingroup func_qurt_sysclock_get_hw_ticks_16 + @xreflabel{sec:qurt_sysclock_get_hw_ticks_16} + Gets the hardware tick count in 16 bits.\n + Returns the current value of a 16-bit timer counter. The value wraps around to zero + when it exceeds the maximum value. + + @note1hang This operation is implemented as an inline C function, and should be called from a C/C++ program. + The returned 16 bits are based on the value of the lower 32 bits in Qtimer + counter, right shifted by 16 bits. + + @return + Integer -- Current value of the 16-bit timer counter, calculated from the lower 32 bits in the + Qtimer counter, right shifted by 16 bits. + + @dependencies + None. 
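A short sketch of how the conversion macros above are typically combined with the tick counter; it assumes only the declarations in this header (`QURT_SYSCLOCK_TIMETICK_TO_US` forwards to `qurt_timer_timetick_to_us()`, declared near the end of the file):

```c
/* Sketch: timing a code section with the 19.2 MHz Qtimer tick counter. */
#include <qurt.h>

unsigned long long time_section_us(void (*work)(void))
{
    unsigned long long t0 = qurt_sysclock_get_hw_ticks();
    work();
    unsigned long long t1 = qurt_sysclock_get_hw_ticks();
    /* 19.2 ticks per microsecond; ignores 64-bit wrap-around, which the
     * note above cautions about. */
    return QURT_SYSCLOCK_TIMETICK_TO_US(t1 - t0);
}
```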
+ */
+
+
+static inline unsigned short qurt_sysclock_get_hw_ticks_16 (void)
+{
+    unsigned long ticks;
+
+    //Beginning with v61 there is a HW register that can be read directly.
+    __asm__ __volatile__ (" %0 = c30 " : "=r"(ticks));
+    __asm__ __volatile__ ( "%0 = lsr(%0, #16) \n" :"+r"(ticks));
+
+    return (unsigned short)ticks;
+}
+unsigned long long qurt_timer_timetick_to_us(unsigned long long ticks);
+#define qurt_sysclock_timetick_to_us(ticks) qurt_timer_timetick_to_us(ticks)
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif /* __cplusplus */
+
+#endif /* QURT_SCLK_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_secure_proc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_secure_proc.h
new file mode 100755
index 0000000000000..f40c7deb9bca1
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_secure_proc.h
@@ -0,0 +1,53 @@
+#ifndef QURT_SECURE_PROC_H
+#define QURT_SECURE_PROC_H
+
+/**
+  @file qurt_secure_proc.h
+  @brief Definitions, macros, and prototypes used for handling a secure process.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2015, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup qurt_process_migrate_secure_process
+  Migrates the user process to a QuRT secure process.
+
+  @param secure_phy_address Physical starting address of the secure memory.
+  @param secure_memory_size Size of the secure memory.
+  @param entry              Entry function of the secure process.
+
+  @return
+  EOK -- Success. \n
+  Negative return value -- Error.
+
+  @dependencies
+  None.
+*/
+int qurt_process_migrate_secure_process(unsigned long long secure_phy_address, unsigned int secure_memory_size, void entry(unsigned));
+
+/**@ingroup qurt_process_get_migration_mem_size
+  Gets the size of all writable memory regions in a user PD, in preparation for
+  secure process migration.
+
+  @return
+  Size of all writable memory regions in a user PD.
+
+  @dependencies
+  None.
+*/
+int qurt_process_get_migration_mem_size(void);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_sem.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_sem.h
new file mode 100755
index 0000000000000..ee5ce4b2d94ab
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_sem.h
@@ -0,0 +1,252 @@
+#ifndef QURT_SEM_H
+#define QURT_SEM_H
+/**
+  @file qurt_sem.h
+  Prototypes of the semaphore API.
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+        TYPEDEFS
+=============================================================================*/
+/** @addtogroup semaphore_types
+@{ */
+
+/** QuRT semaphore type.
*/ +typedef union { + /** @cond */ + unsigned int raw[2] __attribute__((aligned(8))); + struct { + unsigned short val; /**< */ + unsigned short n_waiting; /**< */ + unsigned int reserved1; /**< */ + unsigned int queue; /**< */ + unsigned int reserved2; /**< */ + }X; /** @endcond */ +} qurt_sem_t; +/** @} */ /* end_addtogroup semaphore_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_sem_add + Releases access to a shared resource (the specified amount increments the semaphore count value).\n + When a thread performs an add operation on a semaphore, the specified value increments the semaphore count. + The result depends on the number of threads waiting + on the semaphore: \n + - When no threads are waiting, the current thread releases access to the shared resource + and continues executing. \n + - When one or more threads are waiting and the semaphore count value is nonzero, + the kernel repeatedly awakens the highest-priority waiting thread and decrements + the semaphore count value until either no waiting threads remain or the + semaphore count value is zero. If any of the awakened threads has higher priority + than the current thread, a context switch can occur. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + @param[in] amt Amount to increment the semaphore count value. + + @return + Unused integer value. + + @dependencies + None. + + */ +int qurt_sem_add(qurt_sem_t *sem, unsigned int amt); + +/**@ingroup func_qurt_sem_up + Releases access to a shared resource. When a thread performs an up operation on a semaphore, + the semaphore count value increments. The result depends on the number of threads waiting + on the semaphore: \n + - When no threads are waiting, the current thread releases access to the shared resource + and continues executing.\n + - When one or more threads are waiting and the semaphore count value is nonzero, + the kernel awakens the highest-priority waiting thread and decrements the + semaphore count value. If the awakened thread has higher priority than the current + thread, a context switch can occur. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + Unused integer value. + + @dependencies + None. + */ +static inline int qurt_sem_up(qurt_sem_t *sem) { return qurt_sem_add(sem,1); } + +/**@ingroup func_qurt_sem_down + Requests access to a shared resource. When a thread performs a down operation on a + semaphore, the result depends on the semaphore count value: \n + - When the count value is nonzero, it is decremented, and the thread gains access to the + shared resource and continues executing.\n + - When the count value is zero, it is not decremented, and the thread is suspended on the + semaphore. When the count value becomes nonzero (because another thread + released the semaphore) it is decremented, and the suspended thread is awakened + and gains access to the shared resource. + + @datatypes + #qurt_sem_t + + @param[in] sem Pointer to the semaphore object to access. + + @return + Unused integer value. + + @dependencies + None. 
+ */
+int qurt_sem_down(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_down_timed
+ When a thread performs a down operation on a semaphore, the result depends on the
+ semaphore count value: \n
+ - When the count value is nonzero, it is decremented, and the thread gains access to the
+ shared resource and continues executing.\n
+ - When the count value is zero, it is not decremented, and the thread is suspended on the
+ semaphore. When the count value becomes nonzero (because another thread
+ released the semaphore) it is decremented, and the suspended thread is awakened
+ and gains access to the shared resource. The wait is terminated when the specified
+ timeout expires; in that case, no access to the shared resource is granted.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem      Pointer to the semaphore object to access.
+ @param[in] duration Interval (in microseconds) to wait; the value must be between
+                     #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_ETIMEDOUT -- Timeout
+
+ @dependencies
+ None.
+ */
+int qurt_sem_down_timed(qurt_sem_t *sem, unsigned long long int duration);
+
+/**@ingroup func_qurt_sem_try_down
+ @xreflabel{hdr:qurt_sem_try_down}
+ Requests access to a shared resource (without suspending). When a thread performs a try down
+ operation on a semaphore, the result depends on the semaphore count value: \n
+ - The count value is decremented when it is nonzero. The down operation returns 0 as
+ the function result, and the thread gains access to the shared resource and is free to
+ continue executing.\n
+ - The count value is not decremented when it is zero. The down operation returns -1
+ as the function result, and the thread does not gain access to the shared resource
+ and should not continue executing.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem Pointer to the semaphore object to access.
+
+ @return
+ 0 -- Success. \n
+ -1 -- Failure.
+
+ @dependencies
+ None.
+
+ */
+int qurt_sem_try_down(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_init
+ Initializes a semaphore object.
+ The default initial value of the semaphore count is 1.
+
+ @param[out] sem Pointer to the initialized semaphore object.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_sem_init(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_destroy
+ Destroys the specified semaphore.\n
+ @note1hang Semaphores must be destroyed when they are no longer in use. Failure to do
+ this causes resource leaks in the QuRT kernel.\n
+ @note1cont Semaphores must not be destroyed while they are still in use. If this occurs,
+ the behavior of QuRT is undefined.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem Pointer to the semaphore object to destroy.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_sem_destroy(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_init_val
+ Initializes a semaphore object with the specified value.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[out] sem Pointer to the initialized semaphore object.
+ @param[in]  val Initial value of the semaphore count.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_sem_init_val(qurt_sem_t *sem, unsigned short val);
+
+/**@ingroup func_qurt_sem_get_val
+ Gets the semaphore count value.\n
+ Returns the current count value of the specified semaphore.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem Pointer to the semaphore object to access.
+
+ @return
+ Integer semaphore count value.
+
+ @dependencies
+ None.
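A minimal sketch of the counting-semaphore pattern these APIs describe, using only functions documented above (the pool size and timeout are illustrative):

```c
/* Sketch: a counting semaphore guarding a pool of 4 buffers. */
#include <qurt.h>

static qurt_sem_t buf_sem;

void pool_init(void)
{
    qurt_sem_init_val(&buf_sem, 4);    /* 4 buffers initially available */
}

int pool_acquire(unsigned long long timeout_us)
{
    /* Block until a buffer is free or the timeout expires. */
    if (qurt_sem_down_timed(&buf_sem, timeout_us) != QURT_EOK)
        return -1;                     /* QURT_ETIMEDOUT: no buffer granted */
    return 0;
}

void pool_release(void)
{
    qurt_sem_up(&buf_sem);             /* wakes the highest-priority waiter */
}
```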
+ */
+static inline unsigned short qurt_sem_get_val(qurt_sem_t *sem ){return sem->X.val;}
+int qurt_sem_down_cancellable(qurt_sem_t *sem);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SEM_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_shmem.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_shmem.h
new file mode 100755
index 0000000000000..980557323708a
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_shmem.h
@@ -0,0 +1,89 @@
+#ifndef QURT_SHMEM_H
+#define QURT_SHMEM_H
+
+/**
+  @file qurt_shmem.h
+
+  @brief
+  Prototypes of the QuRT inter-process shared memory APIs.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef MODE_T
+#define MODE_T
+typedef unsigned int mode_t;
+#endif //MODE_T
+
+/**
+ * The shm_open() function establishes a connection between a shared memory object and a file descriptor.
+ * The file descriptor is used by other functions, such as mmap(), to refer to that shared memory object.
+ *
+ *
+ * @param name  Pointer to a string naming the shared memory object. The name must start with "/shm/".
+ * @param oflag File status flags and file access modes of the open file description. The following
+ *              flags are supported:
+ *              O_RDONLY: Open for read access only.
+ *              O_RDWR:   Open for read or write access.
+ *              O_CREAT:  If the shared memory object does not exist, create one.
+ * @param mode  Permission flags (currently ignored).
+ *
+ * @return File descriptor (positive number) if the operation is successful.
+ *         Negative error code if it failed.
+ *
+*/
+
+int shm_open(const char * name, int oflag, mode_t mode);
+
+/**
+ * The shm_mmap() function creates a shared memory mapping in the virtual address space of
+ * the calling process.
+ *
+ * @param addr   Starting address for the new mapping.
+ * @param len    Length of the shared memory region.
+ * @param prot   Desired memory protection of the mapping. Same as the one in mmap() of POSIX.
+ * @param flags  Determines whether updates to the mapping are visible to other processes. Same as
+ *               the one in mmap() of POSIX.
+ * @param fd     File descriptor of the shared memory object, as returned by shm_open().
+ * @param offset Unused.
+ *
+ * @return The starting address of the new mapping if the operation is successful.
+ *         Negative error code if it failed.
+ *
+*/
+
+void *shm_mmap(void *addr, unsigned int len, int prot, int flags, int fd, unsigned int offset);
+
+/**
+ * The shm_close() function removes a connection between a shared memory object and a file descriptor.
+ * If no file descriptor remains connected to the shared memory object, the shared memory object is
+ * deleted automatically. A shared memory object has the same virtual address in every process; this is
+ * a restriction of the single virtual address space.
+ *
+ *
+ * @param fd File descriptor of the shared memory object.
+ *
+ * @return 0 if the operation is successful.
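A sketch of the open/map/close flow described above. It assumes POSIX-style O_, PROT_, and MAP_ constants are available (for example from `<fcntl.h>` and `<sys/mman.h>`), since the comments above defer to POSIX mmap() semantics; the object name and page size are illustrative:

```c
/* Sketch: publishing one shared page between QuRT processes. */
#include <fcntl.h>
#include <sys/mman.h>
#include "qurt_shmem.h"

int publish_counter(unsigned int value)
{
    int fd = shm_open("/shm/counter", O_CREAT | O_RDWR, 0); /* name must start with "/shm/" */
    if (fd < 0)
        return fd;

    unsigned int *p = (unsigned int *)shm_mmap(0, 4096, PROT_READ | PROT_WRITE,
                                               MAP_SHARED, fd, 0);
    if (p == 0) {
        (void)shm_close(fd);
        return -1;
    }

    *p = value;            /* the same virtual address is seen by every process */
    return shm_close(fd);  /* object persists while other descriptors remain open */
}
```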
+ * negative error code if failed + * +*/ + + +int shm_close(int fd); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_signal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_signal.h new file mode 100755 index 0000000000000..3a89c53394ad5 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_signal.h @@ -0,0 +1,518 @@ +#ifndef QURT_SIGNAL_H +#define QURT_SIGNAL_H + +/** + @file qurt_signal.h + @brief Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup signals_types +@{ */ +#define QURT_SIGNAL_ATTR_WAIT_ANY 0x00000000 /**< Wait any. */ +#define QURT_SIGNAL_ATTR_WAIT_ALL 0x00000001 /**< Wait all. */ + +/*===================================================================== + Typedefs + ======================================================================*/ + + +/** QuRT signal type. + */ +typedef union { + /** @cond */ + unsigned long long int raw; + struct { + unsigned int signals; + unsigned int waiting; + unsigned int queue; + unsigned int attribute; + }X; + /** @endcond */ +} qurt_signal_t; + + +/** QuRT 64-bit signal type. + */ +typedef struct { + /** @cond */ + qurt_signal_t signal_sum; + unsigned long long signals; + unsigned long long waiting; + /** @endcond */ +} qurt_signal_64_t; +/** @} */ /* end_addtogroup signals_types */ +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_signal_init + Initializes a signal object. + Signal returns the initialized object. + The signal object is initially cleared. + + @note1hang Each signal-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_signal_destroy() + when this object is not used anymore + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the initialized object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_init(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_destroy + Destroys the specified signal object. + + @note1hang Signal objects must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_destroy(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_wait + @xreflabel{hdr:qurt_signal_wait} + Suspends the current thread until the specified signals are set. 
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ waiting on a signal, and 0 indicates not waiting on the signal.
+
+ If a thread is waiting on a signal object for any of the specified set of signals to be set,
+ and one or more of those signals is set in the signal object, the thread is awakened.
+
+ If a thread is waiting on a signal object for all of the specified set of signals to be set,
+ and all of those signals are set in the signal object, the thread is awakened.
+
+ The specified set of signals can be cleared when the signal is set.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in] signal    Pointer to the signal object to wait on.
+ @param[in] mask      Mask value identifying the individual signals in the signal object to
+                      wait on.
+ @param[in] attribute Indicates whether the thread waits for any of the signals to be set, or for all of
+                      them to be set. \n
+ @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n
+            - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+            - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+
+ @return
+ A 32-bit word with current signals.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+unsigned int qurt_signal_wait(qurt_signal_t *signal, unsigned int mask,
+                              unsigned int attribute);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_wait_timed
+ @xreflabel{hdr:qurt_signal_wait}
+ Suspends the current thread until the specified signals are set or until the timeout expires.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ waiting on a signal, and 0 indicates not waiting.
+
+ If a thread is waiting on a signal object for any of the specified set of signals to be set,
+ and one or more of those signals is set in the signal object, the thread is awakened.
+
+ If a thread is waiting on a signal object for all of the specified set of signals to be set,
+ and all of those signals are set in the signal object, the thread is awakened.
+
+ The specified set of signals can be cleared after the signal is set.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in]  signal    Pointer to the signal object to wait on.
+ @param[in]  mask      Mask value that identifies the individual signals in the signal object to wait on.
+ @param[in]  attribute Indicates whether the thread must wait until any of the signals are set, or until all of
+                       them are set. \n
+ @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n
+            - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+            - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+ @param[out] signals   Bitmask of signals that are set.
+ @param[in]  duration  Duration (microseconds) to wait. Must be in the range
+                       [#QURT_TIMER_MIN_DURATION ... #QURT_TIMER_MAX_DURATION].
+
+ @return
+ #QURT_EOK -- Success; one or more signals were set \n
+ #QURT_ETIMEDOUT -- Timed out \n
+ #QURT_EINVALID -- Duration out of range
+
+ @dependencies
+ Timed-waiting support in the kernel.
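A sketch of the timed wait just described, using only APIs from this header; the event bits and the 10 ms timeout are illustrative:

```c
/* Sketch: waiting up to 10 ms for either of two event bits. */
#include <qurt.h>

#define EVT_RX_DONE  (1u << 0)
#define EVT_SHUTDOWN (1u << 1)

int wait_for_rx(qurt_signal_t *sig)
{
    unsigned int got = 0;
    int rc = qurt_signal_wait_timed(sig, EVT_RX_DONE | EVT_SHUTDOWN,
                                    QURT_SIGNAL_ATTR_WAIT_ANY, &got, 10000ULL);
    if (rc != QURT_EOK)
        return rc;                       /* QURT_ETIMEDOUT or QURT_EINVALID */
    if (got & EVT_SHUTDOWN)
        return -1;                       /* shutdown requested */
    qurt_signal_clear(sig, EVT_RX_DONE); /* waits do not auto-clear signals */
    return 0;
}
```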
+*/
+/* ======================================================================*/
+int qurt_signal_wait_timed(qurt_signal_t *signal, unsigned int mask,
+                           unsigned int attribute, unsigned int *signals, unsigned long long int duration);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_wait_any
+ Suspends the current thread until any of the specified signals are set.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ to wait on a signal, and 0 indicates not to wait on it.
+
+ If a thread is waiting on a signal object for any of the specified set of signals to be set,
+ and one or more of those signals is set in the signal object, the thread is awakened.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in] signal Pointer to the signal object to wait on.
+ @param[in] mask   Mask value identifying the individual signals in the signal object to
+                   wait on.
+
+ @return
+ 32-bit word with current signals.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+static inline unsigned int qurt_signal_wait_any(qurt_signal_t *signal, unsigned int mask)
+{
+    return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_wait_all
+ Suspends the current thread until all of the specified signals are set.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ to wait on a signal, and 0 indicates not to wait on it.
+
+ If a thread is waiting on a signal object for all of the specified set of signals to be set,
+ and all of those signals are set in the signal object, the thread is awakened.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in] signal Pointer to the signal object to wait on.
+ @param[in] mask   Mask value identifying the individual signals in the signal object to
+                   wait on.
+
+ @return
+ A 32-bit word with current signals.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+static inline unsigned int qurt_signal_wait_all(qurt_signal_t *signal, unsigned int mask)
+{
+    return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ALL);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_set
+ Sets signals in the specified signal object.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ to set the signal, and 0 indicates not to set it.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in] signal Pointer to the signal object to modify.
+ @param[in] mask   Mask value identifying the individual signals to set in the signal
+                   object.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+void qurt_signal_set(qurt_signal_t *signal, unsigned int mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_get
+ Gets a signal from a signal object.
+
+ Returns the current signal values of the specified signal object.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in] *signal Pointer to the signal object to access.
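The set/wait pairing above is the usual producer/consumer handshake; a minimal sketch with illustrative bit names (the signal object is assumed to be initialized elsewhere with `qurt_signal_init()`, and there is a single consumer, per the one-waiter note):

```c
/* Sketch: producers set completion bits; one consumer collects them. */
#include <qurt.h>

#define WORK_A_DONE (1u << 0)
#define WORK_B_DONE (1u << 1)

static qurt_signal_t done_sig;   /* initialized once with qurt_signal_init() */

void producer_a(void) { qurt_signal_set(&done_sig, WORK_A_DONE); }
void producer_b(void) { qurt_signal_set(&done_sig, WORK_B_DONE); }

unsigned int consumer(void)
{
    /* Returns as soon as either bit is set; the caller clears the bits
     * explicitly, because waits do not auto-clear signals. */
    unsigned int got = qurt_signal_wait_any(&done_sig, WORK_A_DONE | WORK_B_DONE);
    qurt_signal_clear(&done_sig, got & (WORK_A_DONE | WORK_B_DONE));
    return got;
}
```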
+ + @return + A 32-bit word with current signals + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal_get(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_clear + Clear signals in the specified signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear it. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_clear(qurt_signal_t *signal, unsigned int mask); + +/**@ingroup func_qurt_signal_wait_cancellable + @xreflabel{hdr:qurt_signal_wait_cancellable} + Suspends the current thread until either the specified signals are set or the wait operation is cancelled. + The operation is cancelled if the user process of the calling thread is killed, or if the calling thread + must finish its current QDI invocation and return to user space. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be waited on, and 0 indicates not to wait on it. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, and one or + more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, and all of + those signals are set in the signal object, the thread is awakened. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @note1cont When the operation is cancelled, the caller must assume that the signal is never set. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @param[out] return_mask Pointer to the 32-bit mask value that was originally passed to the function. + + + @return + #QURT_EOK -- Wait completed. \n + #QURT_ECANCEL -- Wait cancelled. + + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_signal_wait_cancellable(qurt_signal_t *signal, unsigned int mask, + unsigned int attribute, + unsigned int *return_mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_init + Initializes a 64-bit signal object.\n + The signal argument returns the initialized object. + The signal object is initially cleared. + + @note1hang Each signal-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_signal_destroy() + when this object is not used anymore. + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the initialized object. 
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+void qurt_signal_64_init(qurt_signal_64_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_destroy
+  Destroys the specified signal object.
+
+  @note1hang 64-bit signal objects must be destroyed when they are no longer in use. Failure
+  to do this causes resource leaks in the QuRT kernel.\n
+  @note1cont Signal objects must not be destroyed while they are still in use. If this
+  occurs, the behavior of QuRT is undefined.
+
+  @datatypes
+  #qurt_signal_64_t
+
+  @param[in] signal Pointer to the signal object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+void qurt_signal_64_destroy(qurt_signal_64_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_wait
+  Suspends the current thread until the specified signals are set.
+
+  Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 indicates
+  that a signal must be waited on, and 0 indicates not to wait on it.
+
+  If a thread is waiting on a signal object for all of the specified set of signals to be set,
+  and all of those signals are set in the signal object, the thread is awakened.
+
+  @note1hang At most, one thread can wait on a signal object at any given time.
+
+  @datatypes
+  #qurt_signal_64_t
+
+  @param[in] signal    Pointer to the signal object to wait on.
+  @param[in] mask      Mask value, which identifies the individual signals in the signal object to
+                       wait on.
+  @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of
+                       them are set. \n
+  @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n
+             - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+             - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+
+  @return
+  A 64-bit word with current signals.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+unsigned long long qurt_signal_64_wait(qurt_signal_64_t *signal, unsigned long long mask,
+                                       unsigned int attribute);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_set
+  Sets signals in the specified signal object.
+
+  Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 indicates
+  that a signal must be set, and 0 indicates not to set it.
+
+  @datatypes
+  #qurt_signal_64_t
+
+  @param[in] signal Pointer to the signal object to modify.
+  @param[in] mask   Mask value identifying the individual signals to set in the signal
+                    object.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+void qurt_signal_64_set(qurt_signal_64_t *signal, unsigned long long mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_get
+  Gets a signal from a signal object.
+
+  Returns the current signal values of the specified signal object.
+
+  @datatypes
+  #qurt_signal_64_t
+
+  @param[in] *signal Pointer to the signal object to access.
+
+  @return
+  A 64-bit double word with current signals.
+
+  @dependencies
+  None.
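The 64-bit variant is useful once more than 32 flags are needed; a minimal sketch using only the functions documented in this header (bit 40 is an arbitrary illustrative choice):

```c
/* Sketch: one producer and one consumer sharing a 64-bit signal object. */
#include <qurt.h>

#define EVT_CH40_DONE (1ull << 40)

static qurt_signal_64_t ch_sig;

void ch_init(void) { qurt_signal_64_init(&ch_sig); }
void ch_done(void) { qurt_signal_64_set(&ch_sig, EVT_CH40_DONE); }  /* producer */

void ch_wait(void)                                                  /* consumer */
{
    (void)qurt_signal_64_wait(&ch_sig, EVT_CH40_DONE, QURT_SIGNAL_ATTR_WAIT_ALL);
    qurt_signal_64_clear(&ch_sig, EVT_CH40_DONE);  /* waits do not auto-clear */
}
```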
+*/ +/* ======================================================================*/ +unsigned long long qurt_signal_64_get(qurt_signal_64_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_clear + Clears signals in the specified signal object. + + Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear it. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_64_clear(qurt_signal_64_t *signal, unsigned long long mask); + +#ifdef __cplusplus +} +#endif + +#endif /* QURT_SIGNAL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_signal2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_signal2.h new file mode 100755 index 0000000000000..43975100cbf75 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_signal2.h @@ -0,0 +1,340 @@ +#ifndef QURT_SIGNAL2_H +#define QURT_SIGNAL2_H + +/** + @file qurt_signal2.h + @brief Prototypes of kernel signal2 API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define QURT_SIGNAL_ATTR_WAIT_ANY 0x00000000 +#define QURT_SIGNAL_ATTR_WAIT_ALL 0x00000001 + +/*===================================================================== + Typedefs + ======================================================================*/ + +/** @addtogroup signals2_types +@{ */ +/** qurt_signal2 type. + */ +typedef union { + /** @cond */ + struct{ + unsigned int cur_mask; /* Current set of signal bits that are set. */ + unsigned int sig_state; /* Current state. */ + /* Bit 0 -- in anysignal wait. */ + /* Bit 1 -- in allsignal wait. */ + /* Bit 2 -- in interrupt wait. */ + /* Bits 31-3 -- reference count field. */ + unsigned int queue; /* Kernel-maintained futex queue value. */ + unsigned int wait_mask; /* When sig_state indicates a waiter is present, this is the wait mask. */ + }; + unsigned long long int raw; + /** @endcond */ +} qurt_signal2_t; +/* @} */ /* end_addtogroup signals2_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_init + + @deprecated use #qurt_signal_init instead. + + Initializes a signal2 object. + Signal returns the initialized object. + The signal object is initially cleared. + + Objects of type signal2 solve a potential race condition between + set() and destroy() operations. + + @datatypes + #qurt_signal2_t + + @param[in] *signal Pointer to the initialized object. + + @return + None. 
+
+  @dependencies
+  Each signal-based object has one or more kernel resources associated with it;
+  therefore, users must call qurt_signal2_destroy() when this object is no longer in use.
+ */
+/* ======================================================================*/
+void qurt_signal2_init(qurt_signal2_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal2_destroy
+
+  @deprecated use #qurt_signal_destroy instead.
+
+  Destroys the specified signal object.
+
+  @note1cont Signal objects must not be destroyed while they are still in use. If this
+  occurs, the behavior of QuRT is undefined.
+  @note1cont Application code should destroy a signal2 object prior to deallocating it.
+  Calling qurt_signal2_destroy() before deallocating a
+  signal2 object ensures completion of all qurt_signal2_set() calls.
+
+  @datatypes
+  #qurt_signal2_t
+
+  @param[in] signal Pointer to the signal object to destroy.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+/* ======================================================================*/
+void qurt_signal2_destroy(qurt_signal2_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal2_wait
+
+  @deprecated use #qurt_signal_wait instead.
+
+  Suspends the current thread until the specified signals are set.
+
+  Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates
+  a signal to wait on.
+
+  If a thread calls this API with QURT_SIGNAL_ATTR_WAIT_ANY, the thread will be awakened when
+  any of the signals specified in the mask are set.
+
+  If a thread calls this API with QURT_SIGNAL_ATTR_WAIT_ALL, the thread will be awakened only
+  when all the signals specified in the mask are set.
+
+  @note1hang At most, one thread can wait on a signal object at any given time.
+
+  @datatypes
+  #qurt_signal2_t
+
+  @param[in] signal    Pointer to the signal object to wait on.
+  @param[in] mask      Mask value identifying the individual signals in the signal object to wait on.
+  @param[in] attribute Specifies whether the thread waits for any of the signals to be set, or for all of
+                       them to be set. Values:\n
+                       - QURT_SIGNAL_ATTR_WAIT_ANY \n
+                       - QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+  @return
+  A 32-bit word with current signals.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+unsigned int qurt_signal2_wait(qurt_signal2_t *signal, unsigned int mask,
+                               unsigned int attribute);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal2_wait_any
+
+  @deprecated use #qurt_signal_wait_any instead.
+
+  Suspends the current thread until any of the specified signals are set.
+
+  Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates
+  a signal to wait on.
+
+  The thread will be awakened when any of the signals specified in the mask are set.
+
+  @note1hang At most, one thread can wait on a signal object at any given time.
+
+  @datatypes
+  #qurt_signal2_t
+
+  @param[in] signal Pointer to the signal object to wait on.
+  @param[in] mask   Mask value identifying the individual signals in the signal object to
+                    wait on.
+
+  @return
+  32-bit word with current signals.
+
+  @dependencies
+  None.
+*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal2_wait_any(qurt_signal2_t *signal, unsigned int mask) +{ + return qurt_signal2_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_wait_all + + @deprecated use #qurt_signal_wait_all instead. + + Suspends the current thread until all of the specified signals are set. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + a signal to wait on. + + The thread will be awakened only when all the signals specified in the mask are set. + + @note1hang At most one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal2_wait_all(qurt_signal2_t *signal, unsigned int mask) +{ + return qurt_signal2_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ALL); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_set + + @deprecated use #qurt_signal_set instead. + + Sets signals in the specified signal object. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be set, and 0 indicates not to set the signal. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to set in the signal + object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_signal2_set(qurt_signal2_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_get + + @deprecated use #qurt_signal_get instead. + + Gets a signal from a signal object. + + Returns the current signal values of the specified signal object. + + @datatypes + #qurt_signal2_t + + @param[in] *signal Pointer to the signal object to access. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal2_get(qurt_signal2_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_clear + + @deprecated use #qurt_signal_clear instead. + + Clear signals in the specified signal object. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear the signal. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. 
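Since every `qurt_signal2_*` entry point here is deprecated in favor of a `qurt_signal_*` counterpart (per the `@deprecated` tags), migration is a mechanical rename; a minimal sketch:

```c
/* Sketch: moving from the deprecated signal2 API to the current one. */
#include <qurt.h>

void before(void)                 /* deprecated qurt_signal2_* API */
{
    qurt_signal2_t s2;
    qurt_signal2_init(&s2);
    qurt_signal2_set(&s2, 0x1u);
    (void)qurt_signal2_wait_any(&s2, 0x1u);  /* returns at once: bit is set */
    qurt_signal2_destroy(&s2);
}

void after(void)                  /* current API, same mask semantics */
{
    qurt_signal_t s;
    qurt_signal_init(&s);
    qurt_signal_set(&s, 0x1u);
    (void)qurt_signal_wait_any(&s, 0x1u);
    qurt_signal_destroy(&s);
}
```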
+ */
+/* ======================================================================*/
+void qurt_signal2_clear(qurt_signal2_t *signal, unsigned int mask);
+
+/**@ingroup func_qurt_signal2_wait_cancellable
+
+  @deprecated use #qurt_signal_wait_cancellable instead.
+
+  Suspends the current thread until either the specified signals are set or the wait operation is cancelled.
+  The operation is cancelled if the user process of the calling thread is killed, or if the calling thread
+  must finish its current QDI invocation and return to user space.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+  that a signal must be waited on, and 0 indicates not to wait on it.
+
+  If a thread is waiting on a signal object for any of the specified set of signals to be set, and one or
+  more of those signals is set in the signal object, the thread is awakened.
+
+  If a thread is waiting on a signal object for all of the specified set of signals to be set, and all of
+  those signals are set in the signal object, the thread is awakened.
+
+  @note1hang At most, one thread can wait on a signal object at any given time.
+
+  @note1cont When the operation is cancelled, the caller must assume that the signal is never set.
+
+  @datatypes
+  #qurt_signal2_t
+
+  @param[in]  signal       Pointer to the signal object to wait on.
+  @param[in]  mask         Mask value identifying the individual signals in the signal object to
+                           wait on.
+  @param[in]  attribute    Indicates whether the thread must wait until any of the signals are set, or until all of
+                           them are set. Values:\n
+                           - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+                           - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+  @param[out] p_returnmask Pointer to the 32-bit mask value that was originally passed to the function.
+
+
+  @return
+  #QURT_EOK -- Wait completed. \n
+  #QURT_ECANCEL -- Wait cancelled.
+
+
+  @dependencies
+  None.
+*/
+int qurt_signal2_wait_cancellable(qurt_signal2_t *signal,
+                                  unsigned int mask,
+                                  unsigned int attribute,
+                                  unsigned int *p_returnmask);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SIGNAL2_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_space.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_space.h
new file mode 100755
index 0000000000000..2c3f9e4496697
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_space.h
@@ -0,0 +1,230 @@
+#ifndef QURT_SPACE_H
+#define QURT_SPACE_H
+/**
+  @file qurt_space.h
+  @brief Prototypes of the QuRT process control APIs.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** This flag is a request to the OS to suspend the process just before calling main().
+It is obsolete and has been replaced by QURT_PROCESS_SUSPEND_ON_STARTUP. */
+#define SPAWNN_FLAG_SUSPEND_ON_STARTUP QURT_PROCESS_SUSPEND_ON_STARTUP
+
+/**
+ * Creates and starts a process from an ELF file of the specified name. The slash symbols
+ * "/" and "\" are ignored; do not include the directory name in the input. This function
+ * accepts the SPAWN flags. Multiple SPAWN flags can be specified by OR'ing the flags.
+ *
+ * @param name ELF name of the executable.
Name shall not contain directories;
+ *             use "dsp2.elf" instead of "/prj/qct/.../dsp2.elf".
+ *
+ * @return
+ *     Process ID -- Success \n
+ *     Negative error code -- Failure \n
+ *     #QURT_EPRIVILEGE -- Caller does not have enough privilege for this operation \n
+ *     #QURT_EMEM -- Not enough memory to perform the operation \n
+ *     #QURT_EFAILED -- Operation failed \n
+ *     #QURT_ENOTALLOWED -- Operation not allowed \n
+ *     #QURT_ENOREGISTERED -- Not registered \n
+ *     #QURT_ENORESOURCE -- Resource exhaustion \n
+ *     #QURT_EINVALID -- Invalid argument value
+*/
+
+int qurt_spawn_flags(const char * name, int flags);
+
+/**
+  Creates and starts a process from an ELF file of the specified name. The slash symbols
+  "/" and "\" are ignored; do not include the directory name in the input.
+
+  @param name ELF name of the executable. Name shall not contain directories;
+              use "dsp2.elf" instead of "/prj/qct/.../dsp2.elf".
+
+  @return
+  Process ID -- Success. \n
+  Negative error code -- Failure.
+
+*/
+static inline int qurt_spawn(const char *name)
+{
+   return qurt_spawn_flags(name,0);
+}
+
+/**
+ * Returns the process ID of the current process.
+ *
+ * @return
+ * Process ID
+ *
+*/
+#define qurt_getpid qurt_process_get_id
+
+/**
+ * The qurt_wait() function waits for a status change in a child process. A parent
+ * process can use it to block until any child process terminates.
+ *
+ * This API returns an error if there are no user processes or if all user processes
+ * have been detached.
+ *
+ * @param status Pointer to the status variable. The variable receives the status value of the
+ *               child process; the value comes from the exit() system call made by the child process.
+ *
+ * @return
+ *     Process ID of the child process that changed status -- Success \n
+ *     Negative error code -- Failure
+ *
+*/
+
+int qurt_wait(int *status);
+
+
+/** @cond */
+/* APIs that allow registering callbacks on spawn of a user PD. */
+typedef void (*QURT_SPAWN_PFN)(int client_handle, void *data_ptr); //no return, since we won't be error checking it in spawn
+typedef int (*QURT_CB_PFN)(int client_handle, void *user_data, void *info);
+typedef union {
+    QURT_SPAWN_PFN spawn_pfn;
+    QURT_CB_PFN cb_pfn;
+} qurt_process_callback_pfn_t;
+/** @endcond */
+
+/** @cond internal_only */
+
+/**@ingroup func_qurt_event_register
+Sets the bits specified by mask in the signal passed by the caller. The signal is set
+when the client handle indicated by value goes away (at process exit). Multiple clients can
+register for the signal to be set.
+
+@datatypes
+
+@param[in]  type      QURT_PROCESS_EXIT is the only event that can be registered for.
+@param[in]  value     Indicates the client handle of the process for which the event is registered.
+@param[in]  psig      Pointer to the signal object to set when the event occurs.
+@param[in]  mask      Mask bits to set in the signal.
+@param[out] data      Pointer to the variable that receives the exit code of the exiting process.
+@param[in]  data_size Size of the data variable.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EMEM -- Not enough memory to allocate resources \n
+#QURT_EVAL -- Invalid values passed to the API
+
+@dependencies
+None.
+*/
+int qurt_event_register(int type, int value, qurt_signal_t *psig, unsigned int mask, void *data, unsigned int data_size);
+
+/**@ingroup func_qurt_callback_register_onspawn
+Allows registering for a callback on spawn of any user process.
+
+@datatypes
+#QURT_SPAWN_PFN
+
+@param[in] pFn       Callback function to call when any user process is spawned.
+@param[in] user_data Pointer to the argument that the callback must be called with.
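A minimal sketch of the spawn-and-reap flow documented above, using only `qurt_spawn()` and `qurt_wait()`; "dsp2.elf" is the illustrative image name the docs themselves use:

```c
/* Sketch: spawning a child process image and reaping its exit status. */
#include <qurt.h>

int run_child(void)
{
    int pid = qurt_spawn("dsp2.elf");   /* no directory components allowed */
    if (pid < 0)
        return pid;                     /* e.g. QURT_EPRIVILEGE, QURT_EMEM */

    int status = 0;
    int exited = qurt_wait(&status);    /* blocks until a child terminates */
    return (exited == pid) ? status : -1;
}
```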
+
+
+@return A positive value is a handle to use when deregistering the callback.
+        Multiple clients can register a callback on spawn, and individual clients can later deregister.
+
+        On failure, QURT_EFATAL is returned.
+
+@dependencies
+None.
+*/
+int qurt_callback_register_onspawn(QURT_SPAWN_PFN pFn, void *user_data);
+
+/**@ingroup func_qurt_callback_deregister_onspawn
+Allows deregistering a callback on spawn.
+
+@param[in] callback_handle Handle returned by qurt_callback_register_onspawn.
+
+@return
+#QURT_EOK -- Deregistration was successful
+
+@dependencies
+None.
+*/
+int qurt_callback_deregister_onspawn(int callback_handle);
+
+/**@ingroup func_qurt_process_callback_register
+Allows registering for a callback during or after image loading.
+Generic callback types:
+   Function similarly to qurt_callback_register_onspawn(). The callback is called after the
+   process is loaded, before the process thread starts. The callback has no return value and
+   receives no info from the OS.
+   pFn  - QURT_SPAWN_PFN
+   type - QURT_PROCESS_CB_GENERIC
+   arg1 - not used
+   arg2 - not used
+   arg3 - not used
+Note callback types:
+   The callback is called during process loading: before segment loading (QURT_PROCESS_NOTE_CB_PRE_MAP),
+   or after segment loading (QURT_PROCESS_NOTE_CB_POST_MAP). The OS provides info to the callback; the
+   info argument is populated with a pointer to the mapped note corresponding to the callback.
+   The callback has a return value; the loader fails if the callback returns a value that is not QURT_EOK.
+   pFn  - QURT_CB_PFN
+   type - QURT_PROCESS_NOTE_CB_PRE_MAP or QURT_PROCESS_NOTE_CB_POST_MAP
+   arg1 - note type (ex: NOTE_TYPE_POOL_INFO, NOTE_TYPE_SEGMENT_INFO, NOTE_TYPE_ARB_INFO)
+   arg2 - note name
+   arg3 - not used
+
+@datatypes
+
+@param[in] pFn       Callback function to call.
+@param[in] type      Callback type.
+@param[in] user_data Pointer to the argument that the callback must be called with.
+@param[in] arg1      Argument interpreted by the OS based on callback type.
+@param[in] arg2      Argument interpreted by the OS based on callback type.
+@param[in] arg3      Argument interpreted by the OS based on callback type (currently not used).
+
+
+@return A positive value is a handle to use when deregistering the callback.
+        Multiple clients can register a callback, and individual clients can later deregister.
+
+        On failure, QURT_EFATAL is returned.
+
+@dependencies
+None.
+*/
+int qurt_process_callback_register(qurt_process_callback_pfn_t pFn,
+                                   qurt_process_cb_type_t type,
+                                   void *user_data,
+                                   qurt_process_callback_arg_t arg1,
+                                   qurt_process_callback_arg_t arg2,
+                                   qurt_process_callback_arg_t arg3);
+
+
+
+/**@ingroup func_qurt_process_callback_deregister
+Allows deregistering a callback for image loading.
+@param[in] callback_handle Handle returned by qurt_process_callback_register.
+
+@return
+#QURT_EOK -- Deregistration was successful
+
+@dependencies
+None.
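A sketch of registering and later removing an on-spawn hook with the pair of functions above; `on_spawn` and `my_ctx` are illustrative names:

```c
/* Sketch: installing a generic on-spawn callback. */
#include <qurt.h>

static void on_spawn(int client_handle, void *data_ptr)
{
    /* Called when a user process is spawned; no return value is checked. */
    (void)client_handle;
    (void)data_ptr;
}

static int cb_handle;

int install_spawn_hook(void *my_ctx)
{
    cb_handle = qurt_callback_register_onspawn(on_spawn, my_ctx);
    return (cb_handle > 0) ? 0 : cb_handle;   /* QURT_EFATAL on failure */
}

void remove_spawn_hook(void)
{
    (void)qurt_callback_deregister_onspawn(cb_handle);
}
```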
+*/ +int qurt_process_callback_deregister(int callback_handle); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SPACE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_srm_consts.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_srm_consts.h new file mode 100755 index 0000000000000..48a8b6a38c402 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_srm_consts.h @@ -0,0 +1,32 @@ +#ifndef QURT_SRM_CONSTS_H +#define QURT_SRM_CONSTS_H +/** + @file qurt_srm_consts.h + @brief Type definitions for srm + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2020-2021, 2022 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @cond */ +#define QURT_SRM_WAKEUP_REQUEST 1U << 0 /**< Value = 1: Send wakeup request to the SRM server. */ +#define QURT_SRM_SET_HANDLE 1U << 1 /**< Value = 2: Set the client handle for a new SRM client. */ +#define QURT_SRM_ALLOC_KERNEL_PAGES 1U << 2 /**< Value = 4: Allocate pages from the kernel VA space. */ +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SRM_CONSTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_srm_driver.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_srm_driver.h new file mode 100755 index 0000000000000..5489e3dddbcca --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_srm_driver.h @@ -0,0 +1,140 @@ +#ifndef QURT_SRM_DRIVER_H +#define QURT_SRM_DRIVER_H +/** + @file qurt_srm_driver.h + @brief Definitions, macros, and prototypes used by SRM drivers. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + + =============================================================================*/ +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| Define qurt_srm_driver_t structure, which represents +|| the "registration" object for an SRM driver. +*/ +/** @cond internal_only */ +struct _qurt_srm_driver { + const char *name; + qurt_qdi_obj_t *obj; +}; + +typedef struct _qurt_srm_driver qurt_srm_driver_t; + +/* +|| qurt_srm_object_invoke() is an internal equivalent to qurt_qdi_handle_invoke(). +|| It behaves the same, but it takes a QDI object pointer instead of a handle. +*/ + +#define qurt_srm_object_invoke(o,m,...) 
\
+    _QDMPASTE(_QDMSOI,_QDMCNT(QDI_HANDLE_LOCAL_CLIENT,o,m,##__VA_ARGS__))(QDI_HANDLE_LOCAL_CLIENT,o,m,##__VA_ARGS__)
+#define _QDMSOI3(a,b,c) qurt_srm_oi3(a,b,c)
+#define _QDMSOI4(a,b,c,d) qurt_srm_oi4(a,b,c,(int)(d))
+#define _QDMSOI5(a,b,c,d,e) qurt_srm_oi5(a,b,c,(int)(d),(int)(e))
+#define _QDMSOI6(a,b,c,d,e,f) qurt_srm_oi6(a,b,c,(int)(d),(int)(e),(int)(f))
+#define _QDMSOI7(a,b,c,d,e,f,g) qurt_srm_oi7(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g))
+#define _QDMSOI8(a,b,c,d,e,f,g,h) qurt_srm_oi8(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h))
+#define _QDMSOI9(a,b,c,d,e,f,g,h,i) qurt_srm_oi9(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i))
+#define _QDMSOI10(a,b,c,d,e,f,g,h,i,j) qurt_srm_oi10(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j))
+#define _QDMSOI11(a,b,c,d,e,f,g,h,i,j,k) qurt_srm_oi11(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k))
+#define _QDMSOI12(a,b,c,d,e,f,g,h,i,j,k,l) qurt_srm_oi12(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k),(int)(l))
+
+int qurt_srm_oi3(int, qurt_qdi_obj_t *, int);
+int qurt_srm_oi4(int, qurt_qdi_obj_t *, int, int);
+int qurt_srm_oi5(int, qurt_qdi_obj_t *, int, int, int);
+int qurt_srm_oi6(int, qurt_qdi_obj_t *, int, int, int, int);
+int qurt_srm_oi7(int, qurt_qdi_obj_t *, int, int, int, int, int);
+int qurt_srm_oi8(int, qurt_qdi_obj_t *, int, int, int, int, int, int);
+int qurt_srm_oi9(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int);
+int qurt_srm_oi10(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int);
+int qurt_srm_oi11(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int, int);
+int qurt_srm_oi12(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int, int, int);
+
+#define QDI_SRM_INIT 192
+
+/*
+|| QURT_SRM_DECLARE_DRIVER() declares an SRM driver to the SRM infrastructure.
+||
+|| The three arguments are:
+||  unique_id -- Unique C identifier, unused but must be a unique global symbol.
+||  name      -- Name of the driver by which an SRM client attempts to open it.
+||  obj       -- Pointer to the singleton object of the driver, which handles things such as
+||               initialization and QDI_OPEN requests.
+*/
+
+#define QURT_SRM_DECLARE_DRIVER(unique_id, xname, xobj) \
+   __attribute__((section(".srm.rodata.user.main.DECL"))) const qurt_srm_driver_t unique_id = \
+   { .name = xname, .obj = xobj }
+
+
+/**@ingroup func_qurt_srm_mapping_create
+   Creates a memory mapping in the pagetable with the specified attributes.
+
+   @param[in] client_handle Client handle representing the process for which the
+                            mapping is created.
+   @param[in] pageno_virt   Pointer to the virtual page. NULL indicates that SRM
+                            assigns the virtual memory.
+   @param[in] pageno_phys   Physical page to use for the mapping.
+   @param[in] page_count    Number of 4K pages to map.
+   @param[in] cache_attr    Cache attributes for the mapping.
+   @param[in] perm          Permissions to use for the mapping.
+
+   @return A value greater than 0 indicates a handle that can be passed to
+           qdi_close() to remove the mapping. A negative value indicates
+           an error.
+
+   @dependencies
+   None.
+*/
+int qurt_srm_mapping_create(int client_handle,
+                            unsigned *pageno_virt,
+                            unsigned pageno_phys,
+                            unsigned page_count,
+                            qurt_mem_cache_mode_t cache_attr,
+                            qurt_perm_t perm);
+
+
+/**@ingroup func_qurt_srm_get_pid
+   Gets the PID for the client_handle that is passed.
+
+   @param[in] client_handle Client handle for which the PID is required.
+
+   @return PID of the client.
+           A PID value of -1 (as unsigned) is returned in case of error.
+
+   @dependencies
+   None.
+*/
+unsigned qurt_srm_get_pid(int client_handle);
+
+
+/**@ingroup func_qurt_srm_get_thread_id
+   Gets the thread id of the client requesting a service from SRM.
+
+   @param[in] None.
+
+   @return Thread id of the client thread.
+
+   @dependencies
+   None.
+*/
+qurt_thread_t qurt_srm_get_client_thread_id(void);
+
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SRM_DRIVER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_stid.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_stid.h
new file mode 100755
index 0000000000000..379f46aaa4b80
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_stid.h
@@ -0,0 +1,73 @@
+#ifndef QURT_STID_H
+#define QURT_STID_H
+/**
+  @file qurt_stid.h
+  Prototypes of the software thread identifier (stid) interface APIs.
+  An stid is an 8-bit identifier that can be assigned to a software thread.
+  The performance monitor logic uses the stid as a counting match criterion
+  for maskable events. The stid is also used by the hardware debugger
+  (ISDB) to match breakpoints.
+
+  EXTERNAL FUNCTIONS
+   None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+  Copyright (c) 2024 Qualcomm Technologies, Inc.
+  All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_stid_alloc
+  Allocates a unique stid.
+
+  @param[in]  pid  Process identifier
+  @param[out] stid Pointer to a variable to return the stid
+
+  @return
+  QURT_EOK - Allocation success
+  QURT_ENORESOURCE - No stid available for allocation
+  QURT_EINVALID - Invalid input
+
+  @dependencies
+  None.
+ */
+int qurt_stid_alloc(unsigned int pid, unsigned int *stid);
+
+/**@ingroup func_qurt_stid_release
+  Releases the stid.
+
+
+  @param[in] pid  Process identifier
+  @param[in] stid STID to release
+
+  @note1hang
+  The user shall reset the released stid in the process or thread(s) to the
+  default value (QURT_STID_DEFAULT) before releasing that stid.
+
+  @return
+  QURT_EOK - Release success
+  QURT_ENOTALLOWED - Operation not allowed for a pid
+  QURT_EINVALID - Invalid stid
+
+  @dependencies
+  None.
+ */
+int qurt_stid_release(unsigned int pid, unsigned int stid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_STID_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_thread.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_thread.h
new file mode 100755
index 0000000000000..499699e7c72e2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_thread.h
@@ -0,0 +1,1260 @@
+#ifndef QURT_THREAD_H
+#define QURT_THREAD_H
+/**
+  @file qurt_thread.h
+  @brief Prototypes of Thread API
+
+  EXTERNAL FUNCTIONS
+   None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018, 2020-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+/* The following is for C code only */
+#ifndef __ASSEMBLER__
+#include
+#include "qurt_pmu.h"
+#include "qurt_api_version.h"
+#endif /* __ASSEMBLER__ */
+#include "qurt_consts.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        CONSTANTS AND MACROS
+=============================================================================*/
+
+
+/*
+   Bitmask configuration to select DSP hardware threads.
+   To select all the hardware threads, use #QURT_THREAD_CFG_BITMASK_ALL
+   and the following: \n
+   - For QDSP6 V2/V3, all six hardware threads are selected \n
+   - For QDSP6 V3L, all four hardware threads are selected \n
+   - For QDSP6 V4, all three hardware threads are selected
+ */
+
+#define QURT_THREAD_CFG_BITMASK_HT0     0x00000001   /**< HT0. */
+#define QURT_THREAD_CFG_BITMASK_HT1     0x00000002   /**< HT1. */
+#define QURT_THREAD_CFG_BITMASK_HT2     0x00000004   /**< HT2. */
+#define QURT_THREAD_CFG_BITMASK_HT3     0x00000008   /**< HT3. */
+#define QURT_THREAD_CFG_BITMASK_HT4     0x00000010   /**< HT4. */
+#define QURT_THREAD_CFG_BITMASK_HT5     0x00000020   /**< HT5. */
+/** @cond rest_reg_dist */
+/** @addtogroup thread_macros
+@{ */
+/** @xreflabel{sec:qurt_thread_cfg} */
+
+#define QURT_THREAD_CFG_BITMASK_ALL     0x000000ffU  /**< Select all the hardware threads. */
+/** @} */ /* end_addtogroup thread_macros */
+/** @endcond */
+
+#define QURT_THREAD_CFG_USE_RAM         0x00000000   /**< Use RAM. */
+#define QURT_THREAD_CFG_USE_TCM         0x00000100   /**< Use TCM. */
+/** @cond rest_reg_dist */
+/** @addtogroup thread_macros
+@{ */
+#define QURT_THREAD_BUS_PRIO_DISABLED 0 /**< Thread internal bus priority disabled. */
+#define QURT_THREAD_BUS_PRIO_ENABLED  1 /**< Thread internal bus priority enabled. */
+/** @} */ /* end_addtogroup thread_macros */
+/** @endcond */
+
+#define QURT_THREAD_AUTOSTACK_DISABLED 0 /**< Thread has the autostack v2 feature disabled. */
+#define QURT_THREAD_AUTOSTACK_ENABLED  1 /**< Thread has the autostack v2 feature enabled. */
+
+/*
+   Macros for QuRT thread attributes.
+ */
+#define QURT_HTHREAD_L1I_PREFETCH      0x1     /**< Enables hardware L1 instruction cache prefetching. */
+#define QURT_HTHREAD_L1D_PREFETCH      0x2     /**< Enables hardware L1 data cache prefetching. */
+#define QURT_HTHREAD_L2I_PREFETCH      0x4     /**< Enables hardware L2 instruction cache prefetching. */
+#define QURT_HTHREAD_L2D_PREFETCH      0x8     /**< Enables hardware L2 data cache prefetching. */
+#define QURT_HTHREAD_DCFETCH           0x10    /**< Enables DC fetch to the provided virtual address.
+                                                    DC fetch indicates to the hardware that a data memory access is likely.
+                                                    Instructions are dropped when there is high bus utilization. */
+/** @addtogroup thread_macros
+@{ */
+/** @xreflabel{hdr:partition_tcm} */
+/*
+   The value below is used to create legacy QuRT threads by default.
+   If a thread has this as the detach_state, the thread can be joined
+   on until it exits. When we are able to change the default behavior of all
+   QuRT threads to JOINABLE (POSIX default), we can remove this legacy
+   behavior.
+*/
+#define QURT_THREAD_ATTR_CREATE_LEGACY 0U /**< Create a legacy QuRT thread by default. If a thread has this as a detach state, the thread can be joined on until it exits. */
+#define QURT_THREAD_ATTR_CREATE_JOINABLE 1U /**< Create a joinable thread. */
+#define QURT_THREAD_ATTR_CREATE_DETACHED 2U /**< Create a detached thread.
*/ +/** @} */ /* end_addtogroup thread_macros */ + + +#define QURT_THREAD_ATTR_NAME_MAXLEN 16 /**< Maximum name length. */ +#define QURT_THREAD_ATTR_TCB_PARTITION_RAM 0 /**< Creates threads in RAM/DDR. */ +#define QURT_THREAD_ATTR_TCB_PARTITION_TCM 1 /**< Creates threads in TCM. */ +/** @cond rest_reg_dist */ +/** @addtogroup thread_macros +@{ */ +#define QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT QURT_THREAD_ATTR_TCB_PARTITION_RAM /**< Backward compatibility. */ +#define QURT_THREAD_ATTR_PRIORITY_DEFAULT 254 /**< Priority.*/ +#define QURT_THREAD_ATTR_ASID_DEFAULT 0 /**< ASID. */ +#define QURT_THREAD_ATTR_AFFINITY_DEFAULT (-1) /**< Affinity. */ +#define QURT_THREAD_ATTR_BUS_PRIO_DEFAULT 255 /**< Bus priority. */ +#define QURT_THREAD_ATTR_AUTOSTACK_DEFAULT 0 /**< Default autostack v2 disabled thread. */ +#define QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT (-2) /**< Timetest ID. */ +#define QURT_THREAD_ATTR_STID_DEFAULT QURT_STID_DEFAULT /**< STID. */ +#define QURT_THREAD_ATTR_STID_ENABLE 1 /**< Indicate to allocate STID during thread creation. */ + +#define QURT_PRIORITY_FLOOR_DEFAULT 255U /**< Default floor. */ +/** @} */ /* end_addtogroup thread_macros */ + +// Option for suspending thread +#define QURT_THREAD_SUSPEND_SYNCHRONOUS 0x0U // bit#0 +#define QURT_THREAD_SUSPEND_ASYNCHRONOUS 0x1U // bit#0 +#define QURT_THREAD_SUSPEND_KEEP_HMX 0x0U // bit#1 +#define QURT_THREAD_SUSPEND_DETACH_HMX 0x2U // bit#1 + +// Option for resuming thread +#define QURT_THREAD_RESUME_DEFAULT 0x0 + +// Thread property IDs +#define QURT_THREAD_PROPERTY_SUSPENDABLE 0x0U +#define QURT_THREAD_PROPERTY_RESUMABLE 0x1 + +// Thread group +#define QURT_THREAD_DEFAULT_GROUP_ID 0x0U +#define QURT_THREAD_GROUP_ID_MASK 0x3FU + +/** @endcond*/ + + +/* The followings are for C code only */ +#ifndef __ASSEMBLER__ +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup thread_types +@{ */ +/** @cond rest_reg_dist */ +typedef unsigned int qurt_cache_partition_t; /**< QuRT cache partition type. */ + +#define CCCC_PARTITION 0U /**< Use the CCCC page attribute bits to determine the main or auxiliary partition. */ +#define MAIN_PARTITION 1U /**< Use the main partition. */ +#define AUX_PARTITION 2U /**< Use the auxiliary partition. */ +#define MINIMUM_PARTITION 3U /**< Use the minimum. Allocates the least amount of cache (no-allocate policy possible) for this thread. */ +/** @endcond */ + +/** Thread ID type. */ +typedef unsigned int qurt_thread_t; + +/** @cond rest_reg_dist */ +/** Thread attributes. */ +typedef struct _qurt_thread_attr { + + char name[QURT_THREAD_ATTR_NAME_MAXLEN]; /**< Thread name. */ + unsigned char tcb_partition; /**< Indicates whether the thread TCB resides in RAM or + on chip memory (TCM). */ + unsigned char stid; /**< Software thread ID used to configure the stid register + for profiling purposes. */ + unsigned short priority; /**< Thread priority. */ + unsigned char autostack:1; /**< Autostack v2 enabled thread. */ + unsigned char group_id:6; /**< Group ID. */ + unsigned char reserved:1; /**< Reserved bits. */ + unsigned char bus_priority; /**< Internal bus priority. */ + unsigned short timetest_id; /**< Timetest ID. */ + unsigned int stack_size; /**< Thread stack size. */ + void *stack_addr; /**< Pointer to the stack address base. The range of the stack is + (stack_addr, stack_addr+stack_size-1). */ + unsigned short detach_state; /**< Detach state of the thread. 
 */
+
+} qurt_thread_attr_t;
+/** @endcond */
+
+/** @cond rest_reg_dist */
+/** Dynamic TLS attributes. */
+typedef struct qurt_tls_info {
+    unsigned int module_id;     /**< Module ID of the loaded dynamic linked library. */
+    unsigned int tls_start;     /**< Start address of the TLS data. */
+    unsigned int tls_data_end;  /**< End address of the TLS RW data. */
+    unsigned int tls_end;       /**< End address of the TLS data. */
+}qurt_tls_info;
+/** @endcond */
+
+/** @} */ /* end_addtogroup thread_types */
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_thread_attr_init
+  Initializes the structure used to set the thread attributes when a thread is created.
+  After an attribute structure is initialized, explicitly set the individual attributes in the structure
+  using the thread attribute operations.
+
+  The initialize operation sets the following default attribute values: \n
+  - Name -- NULL string \n
+  - TCB partition -- QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT \n
+  - Priority -- QURT_THREAD_ATTR_PRIORITY_DEFAULT \n
+  - Autostack -- QURT_THREAD_ATTR_AUTOSTACK_DEFAULT \n
+  - Bus priority -- QURT_THREAD_ATTR_BUS_PRIO_DEFAULT \n
+  - Timetest ID -- QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT \n
+  - stack_size -- 0 \n
+  - stack_addr -- NULL \n
+  - detach state -- #QURT_THREAD_ATTR_CREATE_LEGACY \n
+  - STID -- #QURT_THREAD_ATTR_STID_DEFAULT
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_init (qurt_thread_attr_t *attr)
+{
+
+    attr->name[0] = '\0';
+    attr->tcb_partition = QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT;
+    attr->priority = QURT_THREAD_ATTR_PRIORITY_DEFAULT;
+    attr->autostack = QURT_THREAD_ATTR_AUTOSTACK_DEFAULT; /* Default attribute for autostack v2*/
+    attr->bus_priority = QURT_THREAD_ATTR_BUS_PRIO_DEFAULT;
+    attr->timetest_id = (unsigned short)QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT;
+    attr->stack_size = 0;
+    attr->stack_addr = NULL;
+    attr->detach_state = QURT_THREAD_ATTR_CREATE_LEGACY;
+    attr->stid = QURT_THREAD_ATTR_STID_DEFAULT;
+    attr->group_id = QURT_THREAD_DEFAULT_GROUP_ID;
+}
+
+/**@ingroup func_qurt_thread_attr_set_name
+  Sets the thread name attribute.\n
+  This function specifies the name used by a thread.
+  Thread names identify a thread during debugging or profiling.
+  The maximum name length is 16 characters. \n
+  @note1hang Thread names differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in] name Pointer to the character string containing the thread name.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_name (qurt_thread_attr_t *attr, const char *name)
+{
+    strlcpy (attr->name, name, QURT_THREAD_ATTR_NAME_MAXLEN);
+    attr->name[QURT_THREAD_ATTR_NAME_MAXLEN - 1] = '\0';
+}
+
+
+/**@ingroup func_qurt_thread_attr_set_tcb_partition
+  Sets the thread TCB partition attribute.
+  Specifies the memory type where the TCB of a thread is allocated.
+  TCBs are allocated in RAM or TCM/LPM.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in] tcb_partition TCB partition.
 Values:\n
+             - 0 -- TCB resides in RAM \n
+             - 1 -- TCB resides in TCM/LCM @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_tcb_partition (qurt_thread_attr_t *attr, unsigned char tcb_partition)
+{
+    attr->tcb_partition = tcb_partition;
+}
+
+/**@ingroup func_qurt_thread_attr_set_priority
+  Sets the thread priority to assign to a thread.
+  Thread priorities are specified as numeric values in the range 1 to 254, with 1 representing
+  the highest priority.
+  Priorities 0 and 255 are used internally by the kernel for special purposes.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in] priority Thread priority.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_priority (qurt_thread_attr_t *attr, unsigned short priority)
+{
+    attr->priority = priority;
+}
+
+/**@ingroup func_qurt_thread_attr_set_detachstate
+  Sets the thread detach state with which the thread is created.
+  The thread detach state is either joinable or detached; it is specified by the following values:
+  - #QURT_THREAD_ATTR_CREATE_JOINABLE \n
+  - #QURT_THREAD_ATTR_CREATE_DETACHED \n
+
+  When a detached thread is created (QURT_THREAD_ATTR_CREATE_DETACHED), its thread
+  ID and other resources are reclaimed as soon as the thread exits. When a joinable thread
+  is created (QURT_THREAD_ATTR_CREATE_JOINABLE), it is assumed that some
+  thread waits to join on it using a qurt_thread_join() call.
+  By default, the detach state is QURT_THREAD_ATTR_CREATE_LEGACY.
+  If the detach state is QURT_THREAD_ATTR_CREATE_LEGACY, another
+  thread can join before the thread exits, but the exiting thread does not wait for another thread to join.
+
+  @note1hang For a joinable thread (QURT_THREAD_ATTR_CREATE_JOINABLE), it is very
+             important that some thread joins on it after it terminates, otherwise
+             the resources of that thread are not reclaimed, causing memory leaks.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in] detachstate Thread detach state.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_detachstate (qurt_thread_attr_t *attr, unsigned short detachstate)
+{
+    if(detachstate == QURT_THREAD_ATTR_CREATE_JOINABLE || detachstate == QURT_THREAD_ATTR_CREATE_DETACHED){
+        attr->detach_state = detachstate;
+    }
+}
+
+
+/**@ingroup func_qurt_thread_attr_set_timetest_id
+  Sets the thread timetest attribute.\n
+  Specifies the timetest identifier used by a thread.
+
+  Timetest identifiers are used to identify a thread during debugging or profiling. \n
+  @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in,out] attr Pointer to the thread attribute structure.
+  @param[in] timetest_id Timetest identifier value.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+static inline void qurt_thread_attr_set_timetest_id (qurt_thread_attr_t *attr, unsigned short timetest_id)
+{
+    attr->timetest_id = timetest_id;
+}
+
+/**@ingroup func_qurt_thread_attr_set_stack_size
+  @xreflabel{sec:set_stack_size}
+  Sets the thread stack size attribute.\n
+  Specifies the size of the memory area to use for the call stack of a thread.
+
+  The thread stack address (Section @xref{sec:set_stack_addr}) and stack size specify the memory area used as a
+  call stack for the thread.
The user is responsible for allocating the memory area used for + the stack. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] stack_size Size (in bytes) of the thread stack. + + @return + None. + + @dependencies + None. +*/ + +static inline void qurt_thread_attr_set_stack_size (qurt_thread_attr_t *attr, unsigned int stack_size) +{ + attr->stack_size = stack_size; +} + +/**@ingroup func_qurt_thread_attr_set_stack_size2 + @xreflabel{sec:set_stack_size} + Sets the thread stack size attribute for island threads that require a higher guest OS stack size than the stack size + defined in the configuration XML.\n + Specifies the size of the memory area to use for a call stack of an island thread in User and Guest mode. + + The thread stack address (Section @xref{sec:set_stack_addr}) and stack size specify the memory area used as a + call stack for the thread. The user is responsible for allocating the memory area used for + the stack. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] user_stack_size Size (in bytes) of the stack usage in User mode. + @param[in] root_stack_size Size (in bytes) of the stack usage in Guest mode. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_stack_size2 (qurt_thread_attr_t *attr, unsigned short user_stack_size, unsigned short root_stack_size) +{ + union qurt_thread_stack_info{ + unsigned int raw_size; + struct{ + unsigned short user_stack; + unsigned short root_stack; + }; + }user_root_stack_size; + user_root_stack_size.user_stack = user_stack_size; + user_root_stack_size.root_stack = root_stack_size; + + attr->stack_size = user_root_stack_size.raw_size; +} + +/**@ingroup func_qurt_thread_attr_set_stack_addr + @xreflabel{sec:set_stack_addr} + Sets the thread stack address attribute. \n + Specifies the base address of the memory area to use for a call stack of a thread. + + stack_addr must contain an address value that is 8-byte aligned. + + The thread stack address and stack size (Section @xref{sec:set_stack_size}) specify the memory area used as a + call stack for the thread. \n + @note1hang The user is responsible for allocating the memory area used for the thread + stack. The memory area must be large enough to contain the stack that the thread + creates. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] stack_addr Pointer to the 8-byte aligned address of the thread stack. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_stack_addr (qurt_thread_attr_t *attr, void *stack_addr) +{ + attr->stack_addr = stack_addr; +} + +/**@ingroup func_qurt_thread_attr_set_bus_priority + Sets the internal bus priority state in the Hexagon core for this software thread attribute. + Memory requests generated by the thread with bus priority enabled are + given priority over requests generated by the thread with bus priority disabled. + The default value of bus priority is disabled. + + @note1hang Sets the internal bus priority for Hexagon processor version V60 or greater. + The priority is not propagated to the bus fabric. + + @datatypes + #qurt_thread_attr_t + + @param[in] attr Pointer to the thread attribute structure. + + @param[in] bus_priority Enabling flag. Values: \n + - #QURT_THREAD_BUS_PRIO_DISABLED \n + - #QURT_THREAD_BUS_PRIO_ENABLED @tablebulletend + + @return + None + + @dependencies + None. 
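+
+   A minimal usage sketch (editorial, not part of the original header):
+
+   @code
+   qurt_thread_attr_t attr;
+   qurt_thread_attr_init(&attr);
+   qurt_thread_attr_set_bus_priority(&attr, QURT_THREAD_BUS_PRIO_ENABLED);
+   @endcode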
+*/
+static inline void qurt_thread_attr_set_bus_priority ( qurt_thread_attr_t *attr, unsigned short bus_priority)
+{
+    attr->bus_priority = (unsigned char)bus_priority;
+}
+
+/**@ingroup func_qurt_thread_attr_set_autostack
+  Enables the autostack v2 feature in the thread attributes.
+
+  When autostack is enabled by the subsystem and an autostack-enabled
+  thread takes a framelimit exception, the kernel allocates more stack
+  for the thread and returns it to normal execution.
+
+  If autostack is not enabled by the subsystem, or it is not enabled
+  for the thread, the framelimit exception is fatal.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] autostack Autostack enable or disable flag. Values: \n
+         - #QURT_THREAD_AUTOSTACK_DISABLED \n
+         - #QURT_THREAD_AUTOSTACK_ENABLED @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_autostack ( qurt_thread_attr_t *attr, unsigned short autostack)
+{
+    attr->autostack = (unsigned char)autostack;
+}
+/**@ingroup qurt_thread_attr_enable_stid
+  Sets the STID in the thread attributes.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] enable_stid STID to be set. Values: \n
+         - #QURT_THREAD_ATTR_STID_DEFAULT (0): Default STID. \n
+         - #QURT_THREAD_ATTR_STID_ENABLE (1): QuRT assigns an STID that is not already in use \n
+         - 2 through 255: User-provided STID. @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_enable_stid ( qurt_thread_attr_t *attr, char enable_stid)
+{
+    if (enable_stid != '\0') {
+        attr->stid = enable_stid;
+    }
+    else
+    {
+        attr->stid = QURT_THREAD_ATTR_STID_DEFAULT;
+    }
+}
+
+/**@ingroup func_qurt_thread_attr_set_stid
+  Sets the stid thread attribute.
+  The default stid value is QURT_THREAD_ATTR_STID_DEFAULT.
+
+  @note1hang When a thread is created with a non-default stid,
+             the stid set in the thread attribute is assigned to the thread.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] stid Stid to be set for a thread.
+
+  @return
+  None
+
+  @dependencies
+  None.
+*/
+static inline void qurt_thread_attr_set_stid( qurt_thread_attr_t *attr, unsigned int stid){
+    attr->stid = stid;
+}
+
+/**@ingroup func_qurt_thread_attr_set_group_id
+  Sets the group id in the thread attributes.
+  The primordial/first thread has group ID 0.
+  If a new thread is created without assigning group_id, it
+  inherits the group ID of its parent thread.
+
+  @note1hang
+  1) The group ID can only be set before creating a thread. It cannot be
+     changed after the thread is created.
+  2) If a non-activated group_id is passed, thread creation fails.
+  3) Only a thread with group ID #0 can set the group ID for its child threads.
+  4) If a thread with a non-zero group ID sets the group ID for its child threads,
+     QuRT ignores this parameter and the child threads inherit the parent
+     thread's group ID. But if the passed group ID is not activated, thread creation
+     still fails.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] attr Pointer to the thread attribute structure.
+  @param[in] group_id Group identifier. The valid range is 0 through 63.
+
+  @return
+  None.
+
+  @dependencies
+  None.
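+
+  A minimal sketch (editorial; group ID 3 is an arbitrary example value that is
+  assumed to be activated, and the caller is assumed to have group ID 0):
+
+  @code
+  qurt_thread_attr_t attr;
+  qurt_thread_attr_init(&attr);
+  qurt_thread_attr_set_group_id(&attr, 3U);
+  @endcode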
+*/
+static inline void qurt_thread_attr_set_group_id(qurt_thread_attr_t *attr, unsigned int group_id)
+{
+    attr->group_id = group_id & QURT_THREAD_GROUP_ID_MASK;
+}
+
+/**@ingroup func_qurt_thread_set_autostack
+  Sets autostack enable in the TCB.
+
+  @param[in] Pointer to UGP
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+
+void qurt_thread_set_autostack(void *);
+
+
+/**@ingroup func_qurt_thread_get_name
+  Gets the thread name of the current thread.\n
+  Returns the thread name of the current thread.
+  Thread names are assigned to threads as thread attributes, see qurt_thread_attr_set_name(). Thread names
+  identify a thread during debugging or profiling.
+
+  @param[out] name Pointer to a character string, which specifies the address where the returned thread name is stored.
+  @param[in] max_len Maximum length of the character string that can be returned.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+void qurt_thread_get_name (char *name, unsigned char max_len);
+
+/**@ingroup func_qurt_thread_create
+  @xreflabel{hdr:qurt_thread_create}
+  Creates a thread with the specified attributes, and makes it executable.
+
+  @datatypes
+  #qurt_thread_t \n
+  #qurt_thread_attr_t
+
+  @param[out] thread_id Returns a pointer to the thread identifier if the thread was
+                        successfully created.
+  @param[in] attr Pointer to the initialized thread attribute structure that specifies
+                  the attributes of the created thread.
+  @param[in] entrypoint C function pointer, which specifies the main function of a thread.
+  @param[in] arg Pointer to a thread-specific argument structure.
+
+
+  @return
+  #QURT_EOK -- Thread created. \n
+  #QURT_EFAILED -- Thread not created.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_create (qurt_thread_t *thread_id, qurt_thread_attr_t *attr, void (*entrypoint) (void *), void *arg);
+
+/**@ingroup func_qurt_thread_stop
+  Stops the current thread, frees the kernel TCB, and yields to the next highest ready thread.
+
+  @return
+  void
+
+  @dependencies
+  None.
+ */
+void qurt_thread_stop(void);
+
+/** @cond internal_only */
+/**@ingroup func_qurt_thread_resume
+  When a demand-loading paging solution is enabled, this function
+  resumes the execution of a thread that was suspended due to
+  a page miss.
+
+  @param[in] thread_id Thread identifier.
+
+  @return
+  #QURT_EOK -- Thread successfully resumed. \n
+  #QURT_EFATAL -- Resume operation failed.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_resume(unsigned int thread_id);
+/** @endcond */
+
+/**@ingroup func_qurt_thread_get_id
+  Gets the identifier of the current thread.\n
+  Returns the thread identifier for the current thread.
+
+  @return
+  Thread identifier -- Identifier of the current thread.
+
+  @dependencies
+  None.
+ */
+qurt_thread_t qurt_thread_get_id (void);
+
+
+/**@ingroup func_qurt_thread_get_l2cache_partition
+  Returns the current value of the L2 cache partition assigned to the caller thread.\n
+
+  @return
+  Value of the #qurt_cache_partition_t data type.
+
+  @dependencies
+  None.
+ */
+qurt_cache_partition_t qurt_thread_get_l2cache_partition (void);
+
+/**@ingroup func_qurt_thread_set_timetest_id
+  Sets the timetest identifier of the current thread.
+  Timetest identifiers are used to identify a thread during debugging or profiling.\n
+  @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @param[in] tid Timetest identifier.
+
+  @return
+  None.
+
+  @dependencies
+  None.
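+
+  A short sketch (editorial; 0x1234 is an arbitrary example identifier):
+
+  @code
+  qurt_thread_set_timetest_id(0x1234);
+  // ... later, for example from a profiling hook:
+  unsigned short tt = qurt_thread_get_timetest_id();
+  @endcode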
+ */
+void qurt_thread_set_timetest_id (unsigned short tid);
+
+/**@ingroup func_qurt_thread_set_cache_partition
+  Sets the cache partition for the current thread. This function uses the qurt_cache_partition_t type
+  to select the cache partition of the current thread for the L1 Icache, L1 Dcache, and L2 cache.
+
+  @datatypes
+  #qurt_cache_partition_t
+
+  @param[in] l1_icache L1 I cache partition.
+  @param[in] l1_dcache L1 D cache partition.
+  @param[in] l2_cache L2 cache partition.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_thread_set_cache_partition(qurt_cache_partition_t l1_icache, qurt_cache_partition_t l1_dcache, qurt_cache_partition_t l2_cache);
+
+
+/**@ingroup func_qurt_thread_get_timetest_id
+  Gets the timetest identifier of the current thread.\n
+  Returns the timetest identifier of the current thread.\n
+  Timetest identifiers are used to identify a thread during debugging or profiling. \n
+  @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+             specify threads in the API thread operations.
+
+  @return
+  Integer -- Timetest identifier.
+
+  @dependencies
+  None.
+ */
+unsigned short qurt_thread_get_timetest_id (void);
+
+/**@ingroup func_qurt_thread_exit
+  @xreflabel{sec:qurt_thread_exit}
+  Stops the current thread, awakens threads joined to it, then destroys the stopped
+  thread.
+
+  Threads that are suspended on the current thread (by performing a thread join,
+  Section @xref{sec:thread_join}) are awakened and passed a user-defined status value
+  that indicates the status of the stopped thread.
+
+  @note1hang Exit must be called in the context of the thread to stop.
+
+  @param[in] status User-defined thread exit status value.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_thread_exit(int status);
+
+/**@ingroup func_qurt_thread_join
+  @xreflabel{sec:thread_join}
+  Waits for a specified thread to finish; the specified thread is another thread within
+  the same process.
+  The caller thread is suspended until the specified thread exits. When the specified thread
+  exits, the caller thread is awakened. \n
+  @note1hang If the specified thread has already exited, this function returns immediately
+             with the result value #QURT_ENOTHREAD. \n
+  @note1cont Two threads cannot call qurt_thread_join to wait for the same thread to finish.
+             If this occurs, QuRT generates an exception (see Section @xref{sec:exceptionHandling}).
+
+  @param[in] tid Thread identifier.
+  @param[out] status Destination variable for thread exit status. Returns an application-defined
+                     value that indicates the termination status of the specified thread.
+
+  @return
+  #QURT_ENOTHREAD -- Thread has already exited. \n
+  #QURT_EOK -- Thread successfully joined with valid status value.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_join(unsigned int tid, int *status);
+
+/**@ingroup qurt_thread_detach
+  @xreflabel{sec:thread_detach}
+  Detaches a joinable thread. The specified thread is another thread within the
+  same process. Create the thread as a joinable thread; only joinable threads
+  can be detached.
+  If a joinable thread is detached, it finishes execution and exits.
+
+  @param[in] tid Thread identifier.
+
+  @return
+  #QURT_ENOTHREAD -- Thread specified by TID does not exist. \n
+  #QURT_EOK -- Thread successfully detached.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_detach(unsigned int tid);
+
+
+/**@ingroup func_qurt_thread_get_priority
+  Gets the priority of the specified thread.
 \n
+  Returns the thread priority of the specified thread.\n
+  Thread priorities are specified as numeric values in a range as large as 1 through 254, with lower
+  values representing higher priorities. 1 represents the highest possible thread priority. \n
+  Priorities 0 and 255 are used internally by the kernel for special purposes.
+
+  @note1hang QuRT can be configured to have different priority ranges.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] threadid Thread identifier.
+
+  @return
+  -1 -- Invalid thread identifier. \n
+  1 through 254 -- Thread priority value.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_get_priority (qurt_thread_t threadid);
+
+/**@ingroup func_qurt_thread_set_priority
+  Sets the priority of the specified thread.\n
+  Thread priorities are specified as numeric values in a range as large as 1 through 254, with lower
+  values representing higher priorities. 1 represents the highest possible thread priority.
+  Priorities 0 and 255 are used internally by the kernel for special purposes.
+
+  @note1hang QuRT can be configured to have different priority ranges. For more
+             information, see Section @xref{sec:AppDev}.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] threadid Thread identifier.
+  @param[in] newprio New thread priority value.
+
+  @return
+  0 -- Priority successfully set. \n
+  -1 -- Invalid thread identifier. \n
+
+  @dependencies
+  None.
+ */
+int qurt_thread_set_priority (qurt_thread_t threadid, unsigned short newprio);
+
+
+
+/**@ingroup func_qurt_thread_attr_get
+  Gets the attributes of the specified thread.
+
+  @datatypes
+  #qurt_thread_t \n
+  #qurt_thread_attr_t
+
+  @param[in] thread_id Thread identifier.
+  @param[out] attr Pointer to the destination structure for thread attributes.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINVALID -- Invalid argument.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_attr_get (qurt_thread_t thread_id, qurt_thread_attr_t *attr);
+
+
+
+/**@ingroup func_qurt_thread_get_tls_base
+  Gets the base address of the thread local storage (TLS) of a dynamically loaded module
+  for the current thread.
+
+  @datatypes
+  #qurt_tls_info
+
+  @param[in] info Pointer to the TLS information for a module.
+
+  @return
+  Pointer to the TLS object for the dynamically loaded module.\n
+  NULL -- TLS information is invalid.
+
+  @dependencies
+  None.
+ */
+void * qurt_thread_get_tls_base(qurt_tls_info* info);
+
+/**@ingroup func_qurt_thread_pktcount_get
+  Gets the PKTCOUNT of a specified thread.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] thread_id Thread identifier.
+
+  @return
+  PKTCOUNT
+
+  @dependencies
+  None.
+ */
+
+long long int qurt_thread_pktcount_get (qurt_thread_t thread_id);
+
+/**@ingroup func_qurt_thread_pktcount_set
+  Sets the PKTCOUNT for the current QuRT thread.
+
+  @return
+  Value to which pktcount is set.
+
+  @dependencies
+  None.
+ */
+
+long long int qurt_thread_pktcount_set (long long int);
+
+/**@ingroup func_qurt_thread_stid_get
+  Gets the STID for a specified thread.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] thread_id Thread identifier.
+
+  @return
+  STID
+
+  @dependencies
+  None.
+ */
+
+char qurt_thread_stid_get(qurt_thread_t thread_id);
+
+/**@ingroup func_qurt_thread_stid_get2
+  Returns the stid set for a thread.
+
+  @param[in] thread_id Thread identifier
+  @param[out] stid Pointer to a variable to return the stid
+
+  @return
+  QURT_EOK - Success
+  QURT_ENOTALLOWED - Operation not allowed for a thread
+  QURT_EINVALID - Invalid input
+
+  @dependencies
+  None.
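+
+  A short sketch (editorial, not part of the original header) reading the
+  caller's own stid:
+
+  @code
+  unsigned int stid = 0U;
+  if (qurt_thread_stid_get2((unsigned int)qurt_thread_get_id(), &stid) == QURT_EOK) {
+      // stid now holds the software thread ID of the calling thread
+  }
+  @endcode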
+ */
+int qurt_thread_stid_get2(unsigned int thread_id, unsigned int *stid);
+
+/**@ingroup func_qurt_thread_stid_set
+  Sets the STID for the current thread.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in] stid STID value to set.
+
+  @return
+  #QURT_EOK -- STID set. \n
+  #QURT_EFAILED -- STID not set.
+
+  @dependencies
+  None.
+ */
+
+int qurt_thread_stid_set(char stid);
+
+/**@ingroup qurt_thread_stid_set2
+  Sets the stid for a specified thread.
+
+  @datatypes
+  #qurt_thread_attr_t
+
+  @param[in] thread_id Thread identifier.
+  @param[in] stid Stid to be set for a thread.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+  #QURT_EVAL -- Failure because of invalid inputs.
+
+  @dependencies
+  None.
+*/
+int qurt_thread_stid_set2(unsigned int thread_id, unsigned int stid);
+
+/** @cond internal_only */
+/**@ingroup func_qurt_thread_get_running_ids
+  Returns the thread IDs of the running threads in the system; use only during fatal error handling.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[in,out] * Array of thread identifiers of size #QURT_MAX_HTHREAD_LIMIT + 1.
+
+  @return
+  #QURT_EINVALID -- Incorrect argument \n
+  #QURT_ENOTALLOWED -- API not called during error handling \n
+  #QURT_EOK -- Success, returns a NULL-terminated array of thread_id
+
+  @dependencies
+  None.
+ */
+int qurt_thread_get_running_ids(qurt_thread_t *);
+/** @endcond */
+
+
+/**@ingroup func_qurt_thread_get_thread_id
+  Gets the thread identifier of the thread with the matching name in the same process
+  as the caller.
+
+  @datatypes
+  #qurt_thread_t
+
+  @param[out] thread_id Pointer to the thread identifier.
+  @param[in] name Pointer to the name of the thread.
+
+  @return
+  #QURT_EINVALID -- No thread with a matching name in the process of the caller \n
+  #QURT_EOK -- Success
+
+  @dependencies
+  None.
+ */
+int qurt_thread_get_thread_id (qurt_thread_t *thread_id, char *name);
+
+/**@ingroup func_qurt_sleep
+  Suspends the current thread for the specified amount of time.
+
+  @note1hang Because QuRT timers are deferrable, this call is guaranteed to block
+             at least for the specified amount of time. If power collapse is
+             enabled, the maximum amount of time this call can block depends on
+             the earliest wakeup from power collapse past the specified duration.
+
+  @param[in] duration Duration (in microseconds) for which the thread is suspended.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_sleep (unsigned long long int duration);
+
+
+/**@ingroup func_qurt_system_set_priority_floor
+  Sets a priority floor to move threads with thread priority lower than the floor out of the running state.
+  Running threads with thread priority lower than the priority floor are moved into the kernel ready queue, and they
+  are not scheduled to run while their thread priority is lower than the floor.
+  Later, the caller should reset the priority floor back to the default value of QURT_PRIORITY_FLOOR_DEFAULT.
+  Threads in the kernel ready queue are scheduled to run when their thread priority is higher than the floor.
+
+  The priority floor is set and associated with the user process of the caller. When the caller gets into QuRTOS and
+  sets a new floor, the new floor is associated with its original user process, not the QuRTOS process.
+  The floor associated with the user process is reset when the user process exits or is killed, but not at the time
+  when the user thread of the caller exits.
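+
+  A typical pattern (editorial sketch; the floor value 100 is an arbitrary
+  example and assumes the caller's own thread priority permits it):
+
+  @code
+  // Hold threads whose priority is lower than the floor in the ready queue.
+  (void)qurt_system_set_priority_floor(100U);
+  // ... time-critical work ...
+  (void)qurt_system_set_priority_floor(QURT_PRIORITY_FLOOR_DEFAULT);
+  @endcode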
+
+  The priority floor cannot be set to a priority higher than the thread priority of the caller.
+
+  The priority floor cannot be set to a priority lower than the default #QURT_PRIORITY_FLOOR_DEFAULT system floor.
+
+  This function is not supported in Island mode.
+
+  After the system floor is set above QURT_PRIORITY_FLOOR_DEFAULT, power collapse is skipped, and the sleep task
+  is not scheduled to run.
+
+  @param[in] priority_floor Priority floor.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_ENOTALLOWED -- Floor setting is not allowed
+
+  @dependencies
+  None.
+ */
+int qurt_system_set_priority_floor (unsigned int priority_floor);
+
+
+/**@ingroup func_qurt_thread_suspend_thread
+  Suspends a QuRT thread using its thread identifier.
+  The target thread can be in a signed user process or an unsigned user process.
+  The caller thread can be a thread from the same user process as the target thread, or from its parent process.
+  After the target thread is suspended, the kernel does not schedule it to run until it is resumed later.
+
+  If the target thread is set as non-suspendable, this function call returns an error without suspending
+  the target thread.
+
+  If the target thread is already suspended, this function call returns success to confirm
+  that the target thread is suspended.
+
+  If the target thread is in a secure user process, or CPZ process, this function call returns an error without
+  suspending the target thread.
+
+  If the target thread is running in the guest OS/root process via a QDI call, this function call does not suspend
+  the target thread in the guest OS, but marks the target thread as suspend-pending. The target thread is
+  suspended when it exits the guest OS, before executing the first instruction in the user process.
+  In this case, the function returns success even with the #QURT_THREAD_SUSPEND_SYNCHRONOUS option, while the target
+  thread can run in the guest OS, and is suspended when exiting the guest OS.
+
+  QuRT debug monitor threads that are in a user process are non-suspendable. This function does not suspend
+  those threads.
+
+  @param[in] thread_id Thread identifier.
+  @param[in] option Optional argument, multiple options can be ORed. \n
+         #QURT_THREAD_SUSPEND_SYNCHRONOUS (default) -- set to a synchronous function call;
+         the function returns after the thread is completely suspended.\n
+         #QURT_THREAD_SUSPEND_ASYNCHRONOUS -- set to an asynchronous function call; the function returns
+         after the kernel acts to suspend the target thread. The target thread
+         might still be running before it is completely suspended. \n
+         #QURT_THREAD_SUSPEND_KEEP_HMX (default) -- keep the HMX attachment on the target thread
+         if it locks the HMX with qurt_hmx_lock(). In this case, the HMX cannot be re-used by other threads. \n
+         #QURT_THREAD_SUSPEND_DETACH_HMX -- detach HMX from the target thread if it locks the HMX with qurt_hmx_lock().
+         Later, when the target thread resumes, the HMX is re-attached to the thread. Note that this option is only
+         supported for a caller from the same user process as the target thread, not for a caller from the parent
+         process of the target thread, or other processes. With the HMX detach option, QuRT does not save the HMX
+         context. Thus, the HMX context state will be lost. It is the responsibility of the caller to ensure HMX operations
+         and its context state saving when calling qurt_thread_suspend_thread() with the HMX detach option.
+         If a thread from another process uses this detach option, QURT_EHMXNOTDETACHABLE will be returned; in this
+         case, if the caller is qualified to suspend the target thread, the target thread will be moved to the suspended
+         state without HMX detached.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EINVALID -- Failure because of invalid thread_id input \n
+  #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process. \n
+  #QURT_EHMXNOTDETACHABLE -- Failure because HMX is not detachable from the target thread.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_suspend_thread (unsigned int thread_id, unsigned int option);
+
+
+/**@ingroup func_qurt_thread_resume_thread
+  Resumes a QuRT thread using its thread identifier.
+  The target thread can be in a signed user process or an unsigned user process.
+  The caller thread can be a thread from the same user process as the target thread, or from its parent
+  process. After the target thread resumes, the kernel scheduler can schedule the thread to run based on
+  the thread priority.
+
+  This function takes an option argument; the only option defined as of now is
+  QURT_THREAD_RESUME_DEFAULT, which resumes the target thread in the default way.
+
+  By default, this is an asynchronous function. The function returns after the kernel moves the
+  target thread from the suspended state to the runnable state. The thread is scheduled to run based on its
+  thread priority.
+
+  If the target thread is set as non-resumable, this function call does not resume the target thread.
+
+  If the target thread has already resumed, this function confirms that the target thread is resumed
+  by returning success.
+
+  If the target thread is in a secure user process or CPZ process, this function call returns an error without
+  resuming the target thread.
+
+  If the target thread runs in the guest OS/root process via a QDI call, this function call clears the mark of
+  suspend-pending on the target thread, and the target thread is not suspended when it exits the
+  guest OS.
+
+  @param[in] thread_id Thread identifier.
+  @param[in] option Optional argument, #QURT_THREAD_RESUME_DEFAULT, which resumes the target thread.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EINVALID -- Failure because of invalid thread_id input \n
+  #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process. \n
+  #QURT_EHMXNOTAVAIL -- Failure because, when resuming an HMX thread, the HMX is not available/free for the thread to resume.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_resume_thread (unsigned int thread_id, unsigned int option);
+
+
+/**@ingroup func_qurt_thread_set_thread_property
+  Sets a QuRT thread property using its thread identifier.
+  The target thread can be in a signed user process or an unsigned user process.
+  The caller thread can be from the same user process as the target thread, or from its parent process.
+
+  If the target thread is in a secure user process, or CPZ process, this function call returns an error without
+  changing the property of the target thread.
+
+  @param[in] thread_id Thread identifier \n
+  @param[in] property_id Thread property identifier \n
+         #QURT_THREAD_PROPERTY_SUSPENDABLE -- thread is suspendable. Default is TRUE. \n
+         #QURT_THREAD_PROPERTY_RESUMABLE -- thread is resumable.
 Default is TRUE.
+  @param[in] value Property value: \n
+         TRUE(1) -- TRUE for the property \n
+         FALSE(0) -- FALSE for the property
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EINVALID -- Failure because of invalid thread_id input \n
+  #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process.
+
+  @dependencies
+  None.
+ */
+int qurt_thread_set_thread_property( unsigned int thread_id, unsigned int property_id, unsigned int value );
+
+/**@ingroup func_qurt_thread_get_group_id
+  Gets the group id of the thread specified by thread_id.\n
+
+  @param[in] thread_id Thread identifier
+  @param[out] group_id Pointer to the variable that receives the group identifier
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EINVALID -- Thread id is invalid, or the process has no groups enabled \n
+  #QURT_ENOTALLOWED -- Operation is not allowed \n
+
+  @dependencies
+  None.
+*/
+int qurt_thread_get_group_id(qurt_thread_t thread_id, unsigned int* group_id);
+
+#endif /* __ASSEMBLER__ */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_THREAD_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_thread_context.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_thread_context.h
new file mode 100755
index 0000000000000..bab09deec8889
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_thread_context.h
@@ -0,0 +1,234 @@
+#ifndef QURT_THREAD_CONTEXT_H
+#define QURT_THREAD_CONTEXT_H
+/**
+  @file qurt_thread_context.h
+  @brief Kernel thread context structure
+
+EXTERNAL FUNCTIONS
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @cond internal_only */
+
+#define THREAD_ITERATOR_END ((qurt_thread_t)(-1)) /**< Thread iterator is complete. */
+
+
+/**@ingroup func_qurt_thread_iterator_create
+Enables the caller to enumerate threads in the system.
+
+@return
+Handle of the newly created iterator; this handle must be passed to
+subsequent operations on the iterator.
+
+@dependencies
+None.
+*/
+static inline int qurt_thread_iterator_create(void)
+{
+    return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, QDI_OS_THREAD_ITERATOR_CREATE);
+}
+
+/**@ingroup func_qurt_thread_iterator_next
+Iterates over the list of threads in the system.
+
+@datatypes
+#qurt_thread_t
+
+@param[in] iter Iterator handle returned by qurt_thread_iterator_create().
+
+@return
+#THREAD_ITERATOR_END -- iterator has reached the end of the thread list. \n
+Other values indicate a valid thread_id.
+
+@dependencies
+None.
+*/
+static inline qurt_thread_t qurt_thread_iterator_next(int iter)
+{
+    return (qurt_thread_t)qurt_qdi_handle_invoke(iter, QDI_OS_THREAD_ITERATOR_NEXT);
+}
+
+/**@ingroup func_qurt_thread_iterator_destroy
+Cleans up thread iterator resources.
+
+@param[in] iter Iterator handle returned by qurt_thread_iterator_create().
+
+@return
+#QURT_EOK -- Successful completion of operation \n
+#QURT_EFATAL -- Invalid handle passed
+
+@dependencies
+None.
+*/
+static inline int qurt_thread_iterator_destroy(int iter)
+{
+    return qurt_qdi_close(iter);
+}
+
+/**@ingroup func_qurt_thread_context_get_tname
+Gets the name of the thread from the specified thread ID.
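+
+A sketch (editorial, not part of the original header) that combines the
+iterator APIs above with this call to list all thread names; QURT_MAX_NAME_LEN
+is assumed to be provided by qurt_consts.h:
+
+@code
+int it = qurt_thread_iterator_create();
+qurt_thread_t tid;
+while ((tid = qurt_thread_iterator_next(it)) != THREAD_ITERATOR_END) {
+    char name[QURT_MAX_NAME_LEN];
+    if (qurt_thread_context_get_tname((unsigned int)tid, name, sizeof(name)) == QURT_EOK) {
+        /* use name */
+    }
+}
+(void)qurt_thread_iterator_destroy(it);
+@endcode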
+ +@param[in] thread_id Thread for which name is returned. +@param[in,out] name Pointer to the local buffer where name is copied back. +@param[in] max_len Size of the local buffer. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_tname(unsigned int thread_id, char *name, unsigned char max_len); + +/**@ingroup func_qurt_thread_context_get_prio +Gets the priority for the specified thread. + +@param[in] thread_id Thread for which priority is returned. +@param[in,out] prio Pointer to the local variable where priority is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_prio(unsigned int thread_id, unsigned char *prio); + +/**@ingroup func_qurt_thread_context_get_pcycles +Gets pcycles for the specified thread. + +@param[in] thread_id Thread for which processor cycles are returned. +@param[in,out] pcycles Pointer to the local variable where processor cycles are written. + +@return +#QURT_EOK -- Success \n +Failure otherwise. + +@dependencies +None. +*/ +int qurt_thread_context_get_pcycles(unsigned int thread_id, unsigned long long int *pcycles); + +/**@ingroup func_qurt_thread_context_get_stack_base +Gets the stack base address for the specified thread. + +@param[in] thread_id Thread for which stack base address is returned. +@param[in,out] sbase Pointer to the local variable where stack base address is written. + +@return +QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_stack_base(unsigned int thread_id, unsigned int *sbase); + +/**@ingroup func_qurt_thread_context_get_stack_size +Gets the stack size for the specified thread. + +@param[in] thread_id Thread for which stack size is returned. +@param[in,out] ssize Pointer to the local variable where stack size is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_stack_size(unsigned int thread_id, unsigned int *ssize); + +/**@ingroup func_qurt_thread_context_get_pid +Gets the process ID for the specified thread. + +@param[in] thread_id Thread for which process ID is returned. +@param[in,out] pid Pointer to the local variable where process id is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_pid(unsigned int thread_id, unsigned int *pid); + +/**@ingroup func_qurt_thread_context_get_pname +Gets the process name for the specified thread. + +@param[in] thread_id Represents the thread for which process name is returned. +@param[in, out] name Pointer to the local buffer where process name is copied back. +@param[in] len Length allocated to the local buffer. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_pname(unsigned int thread_id, char *name, unsigned int len); + +/** @addtogroup thread_types +@{ */ +/** Structure that defines how TCB is interpreted to crash dump tools.*/ +/* Keys are defined in consts.h */ +struct qurt_debug_thread_info { +/** @cond */ + char name[QURT_MAX_NAME_LEN]; /**< Name of the thread. */ + struct { + unsigned key; + unsigned val; + } os_info[40]; + unsigned gen_regs[32]; /**< General mode registers. */ + unsigned user_cregs[32]; /**< User mode registers. */ + unsigned guest_cregs[32]; /**< Guest mode registers. */ + unsigned monitor_cregs[64]; /**< Monitor mode registers. 
*/
+/** @endcond */
+}; /* should add up to 1K */
+/** @} */ /* end_addtogroup thread_types */
+
+
+/**@ingroup func_qurt_system_tcb_dump_get
+Gets the TCB dump for the specified thread; copies the debug thread information structure into the caller's buffer.
+
+@datatypes
+#qurt_thread_t
+
+@param[in] thread_id Thread on which the operation must be performed.
+@param[in, out] ptr Pointer to the local buffer where contents are written.
+@param[in] size Size of the debug thread information structure obtained by calling
+ qurt_system_tcb_dump_get_size().
+
+@return
+#QURT_EOK -- Success \n
+Failure otherwise
+
+@dependencies
+None.
+*/
+int qurt_system_tcb_dump_get(qurt_thread_t thread_id, void *ptr, size_t size);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_THREAD_CONTEXT_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_timer.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_timer.h new file mode 100755 index 0000000000000..7bdfdb8f3c3df --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_timer.h @@ -0,0 +1,560 @@
+#ifndef QURT_TIMER_H
+#define QURT_TIMER_H
+/**
+ @file qurt_timer.h
+ @brief Prototypes of qurt_timer API
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+#include "qurt_anysignal.h"
+#include "qurt_signal2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+/**@addtogroup timer_const_macros
+@{ */
+/**
+ Default values.
+*/
+/** @xreflabel{hdr:QURT_TIMER_ONESHOT}*/
+#define QURT_TIMER_DEFAULT_TYPE QURT_TIMER_ONESHOT /**< One shot.*/
+#define QURT_TIMER_DEFAULT_DURATION 1000uL /**< Default duration. */
+#define QURT_TIMER_DEFAULT_EXPIRY 0uL /**< Default expiration. */
+
+/**
+ Conversion from microseconds to timer ticks.
+ */
+#define QURT_TIMER_TIMETICK_FROM_US(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+/**
+ Conversion from timer ticks to microseconds at the nominal frequency.
+*/
+#define QURT_TIMER_TIMETICK_TO_US(ticks) qurt_timer_timetick_to_us(ticks)
+
+/** Minimum microseconds value is 100 microseconds (sleep timer).*/
+#define QURT_TIMER_MIN_DURATION 100uL
+
+/**
+ Maximum microseconds value for Qtimer is 1,042,499 hours.
+*/
+#define QURT_TIMER_MAX_DURATION QURT_SYSCLOCK_MAX_DURATION
+
+/**
+ Timer clock for Qtimer is 19.2 MHz.
+*/
+#define QURT_TIMER_MAX_DURATION_TICKS QURT_SYSCLOCK_MAX_DURATION_TICKS
+
+/**
+ Sleep timer error margin for Qtimer is 1,000 ticks ~52 us.
+*/
+#define QURT_TIMETICK_ERROR_MARGIN QURT_SYSCLOCK_ERROR_MARGIN
+
+/*
+ qurt_timer group defines.
+*/
+#define QURT_TIMER_MAX_GROUPS 5U /**< Maximum groups.*/
+#define QURT_TIMER_DEFAULT_GROUP 0U /**< Default group. */
+/** @} */ /* end_addtogroup timer_const_macros */
+
+/** @addtogroup timer_types
+@{ */
+/**
+ QuRT timer types.
+ */
+typedef enum
+{
+ QURT_TIMER_ONESHOT = 0, /**< One shot.*/
+ /** @xreflabel{hdr:QURT_TIMER_PERIODIC}*/
+ QURT_TIMER_PERIODIC /**< Periodic.
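+
+ A conversion sketch (hypothetical, not normative) for the tick macros above; the
+ QURT_SYSCLOCK_* backing macros are defined elsewhere in the SDK:
+ @code
+ unsigned long long ticks = QURT_TIMER_TIMETICK_FROM_US(1000uL); /* 1 ms in Qtimer ticks */
+ unsigned long long us    = QURT_TIMER_TIMETICK_TO_US(ticks);    /* back to microseconds */
+ @endcode
+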
*/ +} qurt_timer_type_t; + + +/*============================================================================= + TYPEDEFS +=============================================================================*/ + +/** QuRT timer type.*/ +typedef unsigned int qurt_timer_t; + +/** QuRT timer duration type. */ +typedef unsigned long long qurt_timer_duration_t; + +/** QuRT timer time type. */ +typedef unsigned long long qurt_timer_time_t; + +typedef void (*pfn_t)(void); +/** QuRT timer attribute type. */ +typedef struct +{ + /** @cond */ + unsigned int magic; /**< Magic number to verify the qmsgq_attr_t pointer. */ + + qurt_timer_duration_t duration; /**< Specifies the duration of the new timer. */ + + qurt_timer_time_t expiry; /**< Specifies the absolute expiry of the new timer. */ + + qurt_timer_duration_t remaining; /**< Specifies the remaining time of an active timer. */ + + qurt_timer_type_t type; /**< Specifies the timer type; only #QURT_TIMER_ONESHOT and + #QURT_TIMER_PERIODIC are supported. */ + + unsigned int group; /**< Group number of the timer; the criterion used to disable or enable the set + of timers. */ + pfn_t pFn; /**< Callback other than the signal set */ + /** @endcond */ +} +qurt_timer_attr_t; + +/** @} */ /* end_addtogroup timer_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_timer_stop + @xreflabel{sec:qurt_timer_stop} + Stops a running timer. + The timer must be a one-shot timer. + + @note1hang Restart stopped timers with the timer restart operation, + see Section @xref{sec:qurt_timer_restart}. + + @datatypes + #qurt_timer_t + + @param[in] timer Timer object. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Invalid timer ID or duration value. \n + #QURT_ENOTALLOWED -- Timer is not a one shot timer. \n + #QURT_EMEM -- Out of memory error. + + @dependencies + None. + */ +int qurt_timer_stop (qurt_timer_t timer); + +/**@ingroup func_qurt_timer_restart + @xreflabel{sec:qurt_timer_restart} + Restarts a stopped timer with the specified duration. The timer must be a one-shot timer. + Timers stop after they have expired or after they are explicitly stopped with qurt_timer_stop(). + A restarted timer expires after the specified duration, the starting time is when the function is called. + + @note1hang Timers stop after they have expired or after they are explicitly + stopped with the timer stop operation, see Section @xref{sec:qurt_timer_stop}. + + @datatypes + #qurt_timer_t \n + #qurt_timer_duration_t + + @param[in] timer Timer object. + @param[in] duration Timer duration (in microseconds) before the restarted timer + expires again. + The valid range is #QURT_TIMER_MIN_DURATION to + #QURT_TIMER_MAX_DURATION. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Invalid timer ID or duration value. \n + #QURT_ENOTALLOWED -- Timer is not a one-shot timer. \n + #QURT_EMEM -- Out-of-memory error. + + @dependencies + None. + */ +int qurt_timer_restart (qurt_timer_t timer, qurt_timer_duration_t duration); + + +/**@ingroup func_qurt_timer_create + Creates a timer.\n + Allocates and initializes a timer object, and starts the timer. + + @note1hang A timer event handler must be defined to wait on the specified signal + to handle the timer event. + + @datatypes + #qurt_timer_t \n + #qurt_timer_attr_t \n + #qurt_anysignal_t + + @param[out] timer Pointer to the created timer object. 
+ @param[in] attr Pointer to the timer attribute structure. + @param[in] signal Pointer to the signal object set when timer expires. + @param[in] mask Signal mask, which specifies the signal to set in the signal object when the + time expires. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Not enough memory to create the timer. \n + #QURT_EINVALID -- One of the arguments in the attr field is invalid. \n + Other error code -- Operation failed. \n + + @dependencies + None. + */ +int qurt_timer_create (qurt_timer_t *timer, const qurt_timer_attr_t *attr, + const qurt_anysignal_t *signal, unsigned int mask); + +int qurt_timer_create_sig2 (qurt_timer_t *timer, const qurt_timer_attr_t *attr, + const qurt_signal2_t *signal, unsigned int mask); + +/**@ingroup func_qurt_timer_attr_init + Initializes the specified timer attribute structure with default attribute values: \n + - Timer duration -- #QURT_TIMER_DEFAULT_DURATION (Section @xref{dox:timers}) \n + - Timer type -- #QURT_TIMER_ONESHOT \n + - Timer group -- #QURT_TIMER_DEFAULT_GROUP + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the destination structure for the timer attributes. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_init(qurt_timer_attr_t *attr); + + +/*Tech Comm note: removed qurt_timer_attr_set_pfn from documentation 9/10/2020 +@ingroup func_qurt_timer_attr_set_pfn + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the destination structure for the timer attributes. + @param[in] pFn pFn. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_pfn(qurt_timer_attr_t *attr, pfn_t pFn); + + +/**@ingroup func_qurt_timer_attr_set_duration + Sets the timer duration in the specified timer attribute structure.\n + + The timer duration specifies the interval (in microseconds) between the creation of the + timer object and the generation of the corresponding timer event. + + The timer duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION (Section @xref{dox:timers}). Otherwise, the set operation is ignored. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] duration Timer duration (in microseconds). + Valid range is #QURT_TIMER_MIN_DURATION to + #QURT_TIMER_MAX_DURATION. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_duration(qurt_timer_attr_t *attr, qurt_timer_duration_t duration); + +/**@ingroup func_qurt_timer_attr_set_expiry + Sets the absolute expiry time in the specified timer attribute structure.\n + The timer expiry specifies the absolute time (in microseconds) of the generation of the + corresponding timer event.\n + Timer expiries are relative to when the system first began executing. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_time_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] time Timer expiry. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_expiry(qurt_timer_attr_t *attr, qurt_timer_time_t time); + +/**@ingroup func_qurt_timer_attr_get_duration + Gets the timer duration from the specified timer attribute structure. + The value returned is the duration that was originally set for the timer. + + @note1hang This function does not return the remaining time of an active timer; + use qurt_timer_attr_get_remaining() to get the remaining time. 
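+
+ A lifecycle sketch (hypothetical, not normative): a one-shot timer wired to an
+ anysignal, using the attribute and create calls in this header; the signal call
+ signatures are assumed from qurt_anysignal.h, and mask bit 0x1 is arbitrary:
+ @code
+ qurt_anysignal_t sig;
+ qurt_timer_t timer;
+ qurt_timer_attr_t attr;
+
+ qurt_anysignal_init(&sig);
+ qurt_timer_attr_init(&attr);
+ qurt_timer_attr_set_type(&attr, QURT_TIMER_ONESHOT);
+ qurt_timer_attr_set_duration(&attr, 5000uL);      /* fire once after 5 ms */
+ if (qurt_timer_create(&timer, &attr, &sig, 0x1) == QURT_EOK) {
+     (void)qurt_anysignal_wait(&sig, 0x1);         /* block until expiry */
+     qurt_timer_delete(&timer);
+ }
+ @endcode
+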
+ + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in] attr Pointer to the timer attributes object + @param[out] duration Pointer to the destination variable for timer duration. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_duration(qurt_timer_attr_t *attr, qurt_timer_duration_t *duration); + +/**@ingroup func_qurt_timer_attr_get_remaining + Gets the timer remaining duration from the specified timer attribute structure. \n + + The timer remaining duration indicates (in microseconds) how much time remains before + the generation of the next timer event on the corresponding timer. + In most cases this function assumes that the timer attribute structure was obtained by + calling qurt_timer_get_attr(). + + @note1hang This attribute is read-only and thus has no set operation defined for it. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in] attr Pointer to the timer attribute object. + @param[out] remaining Pointer to the destination variable for remaining time. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_remaining(qurt_timer_attr_t *attr, qurt_timer_duration_t *remaining); + +/**@ingroup func_qurt_timer_attr_set_type + Sets the timer type in the specified timer attribute structure. + + The timer type specifies the functional behavior of the timer: \n + - A one-shot timer (#QURT_TIMER_ONESHOT) waits for the specified timer duration + and then generates a single timer event. After this the timer is nonfunctional. \n + - A periodic timer (#QURT_TIMER_PERIODIC) repeatedly waits for the specified + timer duration and then generates a timer event. The result is a series of timer + events with interval equal to the timer duration. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_type_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] type Timer type. Values are: \n + - #QURT_TIMER_ONESHOT -- One-shot timer. \n + - #QURT_TIMER_PERIODIC -- Periodic timer. @tablebulletend + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_type(qurt_timer_attr_t *attr, qurt_timer_type_t type); + +/**@ingroup func_qurt_timer_attr_get_type + Gets the timer type from the specified timer attribute structure. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_type_t + + @param[in] attr Pointer to the timer attribute structure. + @param[out] type Pointer to the destination variable for the timer type. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_type(qurt_timer_attr_t *attr, qurt_timer_type_t *type); + +/**@ingroup func_qurt_timer_attr_set_group + Sets the timer group identifier in the specified timer attribute structure.\n + The timer group identifier specifies the group that the timer belongs to. Timer groups are + used to enable or disable one or more timers in a single operation. \n + The timer group identifier value must be between 0 and (#QURT_TIMER_MAX_GROUPS - 1). + See Section @xref{dox:timers}. + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the timer attribute object. + @param[in] group Timer group identifier; + Valid range is 0 to (#QURT_TIMER_MAX_GROUPS - 1). + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_group(qurt_timer_attr_t *attr, unsigned int group); + +/**@ingroup func_qurt_timer_attr_get_group + Gets the timer group identifier from the specified timer attribute structure. 
+ + @datatypes + #qurt_timer_attr_t + + @param[in] attr Pointer to the timer attribute structure. + @param[out] group Pointer to the destination variable for the timer group identifier. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_group(qurt_timer_attr_t *attr, unsigned int *group); + +/**@ingroup func_qurt_timer_get_attr + @xreflabel{hdr:qurt_timer_get_attr} + Gets the timer attributes of the specified timer when it was created. + + @datatypes + #qurt_timer_t \n + #qurt_timer_attr_t + + @param[in] timer Timer object. + @param[out] attr Pointer to the destination structure for timer attributes. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Argument passed is not a valid timer. + + @dependencies + None. + */ +int qurt_timer_get_attr(qurt_timer_t timer, qurt_timer_attr_t *attr); + +/**@ingroup func_qurt_timer_delete + Deletes the timer.\n + Destroys the specified timer and deallocates the timer object. + + @datatypes + #qurt_timer_t + + @param[in] timer Timer object. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Argument passed is not a valid timer. + + @dependencies + None. + */ +int qurt_timer_delete(qurt_timer_t timer); + +/**@ingroup func_qurt_timer_sleep + Suspends the current thread for the specified amount of time. + The sleep duration value must be between #QURT_TIMER_MIN_DURATION and + #QURT_TIMER_MAX_DURATION (Section @xref{dox:timers}). + + @datatypes + #qurt_timer_duration_t + + @param[in] duration Interval (in microseconds) between when the thread is suspended + and when it is re-awakened. + + @return + #QURT_EOK -- Success. \n + #QURT_EMEM -- Not enough memory to perform the operation. + + @dependencies + None. + */ + +int qurt_timer_sleep(qurt_timer_duration_t duration); + +/**@ingroup func_qurt_timer_group_disable + Disables all timers that are assigned to the specified timer group. + If a specified timer is already disabled, ignore it. + If a specified timer is expired, do not process it. + If the specified timer group is empty, do nothing. + + @note1hang When a timer is disabled its remaining time does not change, thus it + cannot generate a timer event. + + @param[in] group Timer group identifier. + + @return + #QURT_EOK -- Success. + + @dependencies + None. + */ +int qurt_timer_group_disable (unsigned int group); + +/**@ingroup func_qurt_timer_group_enable + Enables all timers that are assigned to the specified timer group. + If a specified timer is already enabled, ignore it. + If a specified timer is expired, process it. + If the specified timer group is empty, do nothing. + + @param[in] group Timer group identifier. + + @return + #QURT_EOK -- Success. + + @dependencies + None. + */ +int qurt_timer_group_enable (unsigned int group); + + +/** + Notifies the timer server recovery from power collapse. The server + must account for any missed interrupts during power collapse. + */ +void qurt_timer_recover_pc (void); + +/** + Determines whether the Qtimer is initialized. + + @return + 0 -- Not initialized. \n + Nonzero -- Initialized. + */ +static inline int qurt_timer_is_init (void) {return 1;} + +/**@ingroup func_qurt_timer_get_ticks + Gets current ticks. The ticks are accumulated since the RTOS + has started. Each tick is equal to a single timer clock + cycle, where the frequency is 32 KHz on RGPT or 19.2 MHz on Qtimer. + + @return + Ticks since system started. 
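+
+ A timing sketch (hypothetical, not normative): bracketing a qurt_timer_sleep() call
+ with the tick counter described above; assumes qurt_printf() is available:
+ @code
+ unsigned long long t0 = qurt_timer_get_ticks();
+ qurt_timer_sleep(10000uL);                        /* suspend for 10 ms */
+ unsigned long long elapsed = qurt_timer_get_ticks() - t0;
+ qurt_printf("slept for %llu ticks\n", elapsed);
+ @endcode
+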
+ */
+unsigned long long qurt_timer_get_ticks (void);
+
+#define qurt_timer_timetick_from_us(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_TIMER_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_tlb.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_tlb.h new file mode 100755 index 0000000000000..b1b2d261d31c0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_tlb.h @@ -0,0 +1,215 @@
+#ifndef QURT_TLB_H
+#define QURT_TLB_H
+
+/**
+ @file qurt_tlb.h
+ @brief Prototypes of TLB API
+ The TLB APIs allow explicit control of the portion of TLB between TLB_first_replaceable and TLB_LAST_REPLACEABLE.
+ Both are nonconfigurable for the time being. This portion of TLB is permanently assigned/locked unless manually removed
+ by qurt_tlb_remove. Implementation does not change depending on the configuration, such as whether CONFIG_STATIC is set or not.
+ In CONFIG_STATIC=y, TLB_LAST_REPLACEABLE is set to the last TLB index, which indicates that the entire TLB is permanently
+ assigned and is not backed up by page table (page table does not exist). TLB indices are maintained through a 64-bit bitmask.
+ A new entry is placed in the first available slot.
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2013, 2021, 2023
+All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_tlb_entry_create
+ Creates a new TLB entry with the specified mapping attributes in the TLB of the Hexagon processor. \n
+ @note1hang If the specified attributes are not valid (such as if the address is not aligned with the
+ size), the entry is not created and an error result is returned.\n
+ @note1cont To set the G bit in the new TLB entry, set the ASID argument to -1.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_paddr_t \n
+ #qurt_mem_cache_mode_t \n
+ #qurt_perm_t
+
+ @param[out] entry_id TLB entry identifier.
+ @param[in] vaddr Virtual memory address.
+ @param[in] paddr Physical memory address.
+ @param[in] size Size of memory region to map (in bytes).
+ @param[in] cache_attribs Cache mode (writeback, and so on).
+ @param[in] perms Access permissions.
+ @param[in] asid ASID (space ID).
+
+ @return
+ #QURT_EOK -- TLB entry successfully created.\n
+ #QURT_EFATAL -- Entry is not created; the TLB is full. \n
+ #QURT_ETLBCREATESIZE -- Entry is not created; the incorrect size was specified. \n
+ #QURT_ETLBCREATEUNALIGNED -- Entry is not created; an unaligned address was specified. \n
+ #QURT_EINVALID -- Invalid cache attributes / permissions provided.
+
+ */
+int qurt_tlb_entry_create (unsigned int *entry_id, qurt_addr_t vaddr, qurt_paddr_t paddr, qurt_size_t size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perms, int asid);
+
+/**@ingroup func_qurt_tlb_entry_create_64
+ Creates a new TLB entry with the specified mapping attributes in the TLB of the Hexagon processor.
\n + @note1hang If the specified attributes are not valid (the address is not aligned with the + size), the entry is not created, and an error result is returned.\n + @note1cont To set the G bit in the new TLB entry, set the asid argument to -1. + + @param[out] entry_id TLB entry identifier. + @param[in] vaddr Virtual memory address. + @param[in] paddr_64 64-bit physical memory address. + @param[in] size Size of memory region to map (in bytes). + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perms Access permissions. + @param[in] asid ASID (space ID). + + @return + #QURT_EOK -- TLB entry successfully created.\n + #QURT_EFATAL -- Entry was not created; the TLB is full. \n + #QURT_ETLBCREATESIZE -- Entry was not created; the incorrect size was specified. \n + #QURT_ETLBCREATEUNALIGNED -- Entry was not created; an unaligned address was specified. \n + #QURT_EINVALID -- Invalid cache attributes / permissions provided. + + */ +int qurt_tlb_entry_create_64 (unsigned int *entry_id, qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perms, int asid); + +/**@ingroup func_qurt_tlb_entry_delete + Deletes the specified TLB entry from the TLB of the Hexagon processor. + If the specified entry does not exist, no deletion occurs and an error result is returned. + + @param[in] entry_id TLB entry identifier. + + @return + #QURT_EOK -- TLB entry successfully deleted. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_delete (unsigned int entry_id); + +/**@ingroup func_qurt_tlb_entry_query + Searches for the specified TLB entry in the TLB of the Hexagon processor. + If the TLB entry is found, its entry identifier is returned. + + @datatypes + #qurt_addr_t + + @param[out] entry_id TLB entry identifier. + @param[in] vaddr Virtual memory address. + @param[in] asid ASID (space ID). + + @return + #QURT_EOK -- TLB entry successfully returned. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_query (unsigned int *entry_id, qurt_addr_t vaddr, int asid); + +/**@ingroup func_qurt_tlb_entry_set + Sets the TLB entry by storing an entry at the specified location + in the TLB of the Hexagon processor. + + @param[in] entry_id TLB entry identifier. + @param[in] entry 64-bit TLB entry to store. + + @return + #QURT_EOK -- Entry successfully stored in the TLB. \n + #QURT_EFATAL -- Entry not set at the specified location. + + @dependencies + None. + **/ +int qurt_tlb_entry_set (unsigned int entry_id, unsigned long long int entry); + +/**@ingroup func_qurt_tlb_entry_get + Gets the TLB entry. \n + Returns the specified 64-bit TLB entry in the TLB of the Hexagon processor. + + @param[in] entry_id TLB entry identifier. + @param[out] entry 64-bit TLB entry. + + @return + #QURT_EOK -- TLB entry successfully returned. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_get (unsigned int entry_id, unsigned long long int *entry); + +/**@ingroup func_qurt_tlb_get_pager_physaddrs + Searches the TLB of the Hexagon processor, and returns all physical addresses that belong to the pager. + Each returned address indicates the starting address of an active page. + +The function return value indicates the number of addresses returned. + + @param[out] pager_phys_addrs Pointer to the return array of pager physical addresses. + + @return + Integer -- Number of addresses returned in array. + + @dependencies + None. 
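+
+ A usage sketch (hypothetical, not normative): creating, looking up, and deleting a
+ global (G-bit) entry with the TLB calls above; the addresses and size are placeholders,
+ and the cache/permission constants come from qurt_types.h:
+ @code
+ unsigned int id, found;
+ if (qurt_tlb_entry_create(&id, 0x20000000u, 0x20000000u, 0x1000u,
+                           QURT_MEM_CACHE_WRITEBACK,
+                           QURT_PERM_READ | QURT_PERM_WRITE, -1) == QURT_EOK) {
+     if (qurt_tlb_entry_query(&found, 0x20000000u, -1) == QURT_EOK) {
+         qurt_tlb_entry_delete(found);
+     }
+ }
+ @endcode
+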
+*/ + +unsigned int qurt_tlb_get_pager_physaddr(unsigned int** pager_phys_addrs); + +/**@ingroup func_qurt_tlb_get_pager_virtaddr + Searches the TLB of the Hexagon processor, and returns all virtual addresses that belong to the pager. + Each returned address indicates the starting address of an active page. + +The function return value indicates the number of addresses returned. + + @param[out] pager_virt_addrs Pointer to the return array of pager virtual addresses. + + @return + Integer -- Number of addresses returned in the array. + + @dependencies + None. +*/ + +unsigned int qurt_tlb_get_pager_virtaddr(unsigned int** pager_virt_addrs); + + +/**@ingroup func_qurt_tlb_entry_set2 + Sets the TLB entry by storing an entry at the specified location + in the TLB of the Hexagon processor. An additional option can be passed + to lock the TLB entry in the TLB of the Hexagon processor. + + @param[in] id TLB entry identifier. + @param[in] tlb 64-bit TLB entry to store. + @param[in] lock Nonzero value indicates that the TLB entry must be locked in the hardware TLB. + + @return + #QURT_EOK -- Entry successfully stored in the TLB. \n + #QURT_EFATAL -- Entry not set at the specified location. + + @dependencies + None. + **/ +int qurt_tlb_entry_set2(unsigned id, unsigned long long tlb, unsigned lock); + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TLB_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_tls.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_tls.h new file mode 100755 index 0000000000000..6ec3b39ff5cb0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_tls.h @@ -0,0 +1,100 @@ +#ifndef QURT_TLS_H +#define QURT_TLS_H +/** + @file qurt_tls.h + @brief Prototypes of TLS APIs + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_tls_create_key + @xreflabel{sec:tls_create_key} + Creates a key for accessing a thread local storage data item.\n + Subsequent get and set operations use the key value. + + @note1hang The destructor function performs any clean-up operations needed by a thread + local storage item when its containing thread is deleted (Section @xref{sec:qurt_thread_exit}). + + @param[out] key Pointer to the newly created thread local storage key value. + @param[in] destructor Pointer to the key-specific destructor function. Passing NULL + specifies that no destructor function is defined for the key. + + @return + #QURT_EOK -- Key successfully created. \n + #QURT_ETLSAVAIL -- No free TLS key available. + + @dependencies + None. + */ +int qurt_tls_create_key (int *key, void (*destructor)(void *)); + +/**@ingroup func_qurt_tls_set_specific + Stores a data item to thread local storage along with the specified key. + + @param[in] key Thread local storage key value. + @param[in] value Pointer to user data value to store. + + @return + #QURT_EOK -- Data item successfully stored. \n + #QURT_EINVALID -- Invalid key. \n + #QURT_EFAILED -- Invoked from a non-thread context. 
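+
+ A lifecycle sketch (hypothetical, not normative) for the TLS calls documented above
+ and below; no destructor is registered:
+ @code
+ int key;
+ if (qurt_tls_create_key(&key, NULL) == QURT_EOK) {
+     static int value = 42;
+     qurt_tls_set_specific(key, &value);           /* store a per-thread pointer */
+     int *p = (int *)qurt_tls_get_specific(key);   /* read it back: p == &value */
+     qurt_tls_delete_key(key);
+ }
+ @endcode
+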
+ */ +int qurt_tls_set_specific (int key, const void *value); + +/**@ingroup func_qurt_tls_get_specific + Loads the data item from thread local storage. \n + Returns the data item that is stored in thread local storage with the specified key. + The data item is always a pointer to user data. + + @param[in] key Thread local storage key value. + + @return + Pointer -- Data item indexed by key in thread local storage. \n + 0 (NULL) -- Key out of range. + + @dependencies + None. + */ +void * __attribute__((section(".text.qurt_tls_get_specific "))) qurt_tls_get_specific (int key); + + +/**@ingroup func_qurt_tls_delete_key + Deletes the specified key from thread local storage. + + @note1hang Explicitly deleting a key does not execute any destructor function that is + associated with the key (Section @xref{sec:tls_create_key}). + + @param[in] key Thread local storage key value to delete. + + @return + #QURT_EOK -- Key successfully deleted. \n + #QURT_ETLSENTRY -- Key already free. + + @dependencies + None. + */ +int qurt_tls_delete_key (int key); + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TLS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_trace.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_trace.h new file mode 100755 index 0000000000000..541f8f1d34bf6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_trace.h @@ -0,0 +1,317 @@ +#ifndef QURT_TRACE_H +#define QURT_TRACE_H +/** + @file qurt_trace.h + @brief Prototypes of system call tracing helpers API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021-2023 by Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + GLOBAL VARIABLES +=============================================================================*/ +/** @cond internal_only */ +/** @addtogroup etm_macros +@{ */ +/* ETM trace types. */ +#define QURT_ETM_TYPE_PC_ADDR (1U<<0) /**< PC address.*/ +#define QURT_ETM_TYPE_MEMORY_ADDR (1U<<1) /**< Memory address. */ +#define QURT_ETM_TYPE_TESTBUS (1U<<2) /**< Test bus. */ +#define QURT_ETM_TYPE_CYCLE_ACCURATE (1U<<3) /**< Cycle accurate. */ +#define QURT_ETM_TYPE_CYCLE_COARSE (1U<<4) /**< Cycle coarse. */ +#define QURT_ETM_TYPE_PC_AND_MEMORY_ADDR (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_MEMORY_ADDR) /**< PC and memory address. */ +#define QURT_ETM_TYPE_PC_ADDR_AND_TESTBUS (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_TESTBUS) /**< PC address and test bus. */ +#define QURT_ETM_TYPE_MEMORY_ADDR_AND_TESTBUS (QURT_ETM_TYPE_MEMORY_ADDR|QURT_ETM_TYPE_TESTBUS) /**< Memory address and test bus.*/ +#define QURT_ETM_TYPE_PC_AND_MEMORY_ADDR_AND_TESTBUS (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_MEMORY_ADDR|QURT_ETM_TYPE_TESTBUS) /**< PC, memory address, and test bus. */ + +/* ETM routes. */ +#define QURT_ETM_ROUTE_TO_QDSS 0U /**< ETM route to QDSS. */ +#define QURT_ETM_ROUTE_TO_Q6ETB 1U /**< ETM route to Q6ETB. */ + +/* ETM filters. */ +#define QURT_ETM_TRACE_FILTER_ALL_DEFAULT 0U /*< Filter all as default. */ +#define QURT_ETM_TRACE_FILTER_HNUM0 (1U<<0) /*< Filter HNUM0. */ +#define QURT_ETM_TRACE_FILTER_HNUM1 (1U<<1) /*< Filter HNUM1. */ +#define QURT_ETM_TRACE_FILTER_HNUM2 (1U<<2) /*< Filter HNUM2. 
*/ +#define QURT_ETM_TRACE_FILTER_HNUM3 (1U<<3) /*< Filter HNUM3. */ +#define QURT_ETM_TRACE_FILTER_HNUM4 (1U<<4) /*< Filter HNUM4. */ +#define QURT_ETM_TRACE_FILTER_HNUM5 (1U<<5) /*< Filter HNUM5. */ +#define QURT_ETM_TRACE_FILTER_HNUM6 (1U<<6) /*< Filter HNUM6. */ +#define QURT_ETM_TRACE_FILTER_HNUM7 (1U<<7) /*< Filter HNUM7. */ +#define QURT_ETM_TRACE_FILTER_HNUM8 (1U<<8) /*< Filter HNUM8. */ +#define QURT_ETM_TRACE_FILTER_HNUM9 (1U<<9) /*< Filter HNUM9. */ +#define QURT_ETM_TRACE_FILTER_HNUM10 (1U<<10) /*< Filter HNUM10. */ +#define QURT_ETM_TRACE_FILTER_HNUM11 (1U<<11) /*< Filter HNUM11. */ +#define QURT_ETM_TRACE_FILTER_HNUM12 (1U<<12) /*< Filter HNUM12. */ +#define QURT_ETM_TRACE_FILTER_HNUM13 (1U<<13) /*< Filter HNUM13. */ +#define QURT_ETM_TRACE_FILTER_HNUM14 (1U<<14) /*< Filter HNUM14. */ +#define QURT_ETM_TRACE_FILTER_HNUM15 (1U<<15) /*< Filter HNUM15. */ +#define QURT_ETM_TRACE_FILTER_ALL QURT_ETM_TRACE_FILTER_ALL_DEFAULT + +#define QURT_ETM_TRACE_FILTER_CLUSTER0 (1<<16) /*< Filter trace cluster0 address. */ +#define QURT_ETM_TRACE_FILTER_CLUSTER1 (1<<17) /*< Filter trace cluster1 address. */ +#define QURT_ETM_TRACE_FILTER_PC_RANGE (1<<19) /*< Filter PC address range. */ + +/* ETM memory source - PC or data access */ +#define QURT_ETM_SOURCE_PC 0U /**< ETM memory source of SAC* is PC. */ +#define QURT_ETM_SOURCE_DATA 1U /**< ETM memory source of SAC* is data. */ + +/* Period between synchronization traces */ +#define QURT_ETM_ASYNC_PERIOD 0 /**< Async.*/ +#define QURT_ETM_ISYNC_PERIOD 1 /**< Isync.*/ +#define QURT_ETM_GSYNC_PERIOD 2 /**< Gsync. */ + +/* ETM enable flags */ +#define QURT_ETM_OFF 0U /**< ETM off. */ +#define QURT_ETM_ON 1U /**< ETM on. */ +/** @endcond */ +/** @} */ /* end_addtogroup etm_macros */ + +/** @addtogroup function_tracing_macro +@{ */ +/* ETM setup return values */ +#define QURT_ETM_SETUP_OK 0 /**< ETM setup OK. */ +#define QURT_ETM_SETUP_ERR 1 /**< ETM setup error. */ +/** @} */ /* end_addtogroup function_tracing_macro */ +/* ETM breakpoint types */ +#define QURT_ETM_READWRITE_BRKPT 0U /**< ETM read/write breakpoint. */ +#define QURT_ETM_READ_BRKPT 1U /**< ETM read breakpoint. */ +#define QURT_ETM_WRITE_BRKPT 2U /**< ETM write breakpoint. */ +#define QURT_ETM_BRKPT_INVALIDATE 3U /**< Invalidate breakpoint. */ +/** @addtogroup function_tracing_macro +@{ */ +/* ATB status flags */ +#define QURT_ATB_OFF 0 /**< ATB off. */ +#define QURT_ATB_ON 1 /**< ATB on. */ +/** @} */ /* end_addtogroup function_tracing_macro */ +/* DTM enable flags */ +#define QURT_DTM_OFF 0 /**< DTM off. */ +#define QURT_DTM_ON 1 /**< DTM on. */ + +/** @addtogroup function_tracing_datatypes +@{ */ +/**STM trace information. */ +typedef struct qurt_stm_trace_info { + /** @cond */ + unsigned int stm_port_addr[6]; /* STM port address to which trace data must be written.*/ + unsigned int thread_event_id; /* Event ID for context switches.*/ + unsigned int interrupt_event_id; /* Event ID for interrupts. */ + unsigned int marker; /* Marker value that must be written at the beginning of the trace. */ + /** @endcond */ +} qurt_stm_trace_info_t; +/** @} */ /* end_addtogroup function_tracing_datatypes */ +/*============================================================================= + GLOBAL FUNCTIONS +=============================================================================*/ + + +/**@ingroup func_qurt_trace_get_marker + Gets the kernel trace marker.\n + Returns the current value of the kernel trace marker. 
+ The marker consists of a hardware thread identifier and an index into the kernel trace + buffer. The trace buffer records kernel events. + + @note1hang Using this function with qurt_trace_changed() + determines whether certain kernel events occurred in a block of code. + + @return + Integer -- Kernel trace marker. + + @dependencies + None. +*/ +unsigned int qurt_trace_get_marker(void); + +/**@ingroup func_qurt_trace_changed + Determines whether specific kernel events have occurred. \n + Returns a value that indicates whether the specified kernel events are recorded in the + kernel trace buffer since the specified kernel trace marker was obtained. + + The prev_trace_marker parameter specifies a kernel trace marker that was obtained by calling + qurt_trace_get_marker(). + @cond rest_dist For more information on the mask value, see the description of the trace_mask element in + @xhyperref{80VB41992,80-VB419-92}. \n @endcond + + @note1hang Used with qurt_trace_get_marker(), this function determines whether + certain kernel events occurred in a block of code.\n + @note1cont This function cannot determine whether a specific kernel event type has + occurred unless that event type has been enabled in the trace_mask element + of the system configuration file. \n + @note1cont QuRT supports the recording of interrupt and context switch events only (such as + a trace_mask value of 0x3). + + @param[in] prev_trace_marker Previous kernel trace marker. + @param[in] trace_mask Mask value that indicates which kernel events to check for. + + @returns + 1 -- Kernel events of the specified type have occurred since the + specified trace marker was obtained.\n + 0 -- No kernel events of the specified type have occurred since the + specified trace marker was obtained. + + @dependencies + None. +*/ +int qurt_trace_changed(unsigned int prev_trace_marker, unsigned int trace_mask); + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/** @addtogroup function_tracing_macro +@{ */ +#ifndef QURT_DEBUG +#define QURT_TRACE(str, ...) __VA_ARGS__ + /**< Function tracing is implemented with the QURT_TRACE debug macro, which + optionally generates printf statements both before and after every function call that is + passed as a macro argument. + + For example, in the following macro calls in the source code: + @code + QURT_TRACE(myfunc, my_func(33)) + + @endcode + generates the following debug output: + @code + myfile:nnn: my_func >>> calling my_func(33) + myfile:nnn: my_func >>> returned my_func(33) + @endcode + The debug output includes the source file and line number of the function call, along with + the text of the call. Compile the client source file with -D __FILENAME__ + defined for its file name. + + The library function qurt_printf() generates the debug output. + The QURT_DEBUG symbol controls generation of the debug output. If this symbol is + not defined, function tracing is not generated.\n + @note1hang The debug macro is accessed through the QuRT API header file. + */ +#else +#define QURT_TRACE(str, ...) \ + do { \ + qurt_printf("%s:%d: %s: >>> calling %s\n",__FILENAME__,__LINE__,(str),#__VA_ARGS__); \ + __VA_ARGS__; \ + qurt_printf("%s:%d: %s: <<< %s returned\n",__FILENAME__,__LINE__,(str),#__VA_ARGS__); \ + } while (0); +#endif +/** @} */ /* end_addtogroup function_tracing_macro */ + +/**@ingroup func_qurt_etm_set_pc_range + Sets the PC address range for ETM filtering. 
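+
+ A usage sketch (hypothetical, not normative): detecting kernel events across a code
+ block with the marker pair described above; 0x3 is the interrupt-plus-context-switch
+ mask mentioned in the notes, and do_work() is a placeholder:
+ @code
+ unsigned int marker = qurt_trace_get_marker();
+ do_work();
+ if (qurt_trace_changed(marker, 0x3)) {
+     qurt_printf("interrupt or context switch occurred\n");
+ }
+ @endcode
+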
+
+ Depending on the Hexagon core design, a maximum of four PC ranges are supported.
+
+ @param[in] range_num 0 to 3.
+ @param[in] low_addr Lower boundary of PC address range.
+ @param[in] high_addr Higher boundary of PC address range.
+
+ @returns
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_etm_set_pc_range(unsigned int range_num, unsigned int low_addr, unsigned int high_addr);
+
+/**@ingroup func_qurt_etm_set_range
+ Sets the address range for ETM filtering.
+ Allows the user to select the address source type: #QURT_ETM_SOURCE_PC or #QURT_ETM_SOURCE_DATA.
+
+ @param[in] addr_source_type Type of the address source:\n
+ - #QURT_ETM_SOURCE_PC \n
+ - #QURT_ETM_SOURCE_DATA @tablebulletend
+ @param[in] trig_block_num 0 to 3.
+ @param[in] pid PID of the process:
+ 1. Any valid PID number enables ASID-based trace filtering.
+ 2. QURT_ETM_NO_PID -- Disables ASID-based trace filtering.
+ @param[in] low_addr Lower boundary of PC address range.
+ @param[in] high_addr Higher boundary of PC address range.
+
+ @returns
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_etm_set_range(unsigned int addr_source_type, unsigned int trig_block_num, unsigned int pid, unsigned int low_addr, unsigned int high_addr);
+
+/**@ingroup func_qurt_etm_set_atb
+ Sets the advanced trace bus (ATB) state to notify QuRT that the ATB is actively enabled or disabled.
+ QuRT performs the corresponding actions at low power management.
+
+ @param[in] flag Values: \n
+ #QURT_ATB_ON \n
+ #QURT_ATB_OFF
+
+ @returns
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_etm_set_atb(unsigned int flag);
+
+/**@ingroup func_qurt_etm_set_sync_period
+ Sets the period for types of synchronization trace packets. \n
+ ASYNC defines the period between alignment synchronization packets.
+ Period is in terms of bytes in the packet stream. \n
+ ISYNC defines the period between instruction synchronization packets.
+ Period is per thread and is defined as the bytes sent out for that thread. \n
+ GSYNC defines the period, in thread cycles, between GSYNC packets.
+
+ @param[in] sync_type Type of synchronization packets: \n
+ #QURT_ETM_ASYNC_PERIOD \n
+ #QURT_ETM_ISYNC_PERIOD \n
+ #QURT_ETM_GSYNC_PERIOD
+ @param[in] period Period value.
+
+ @return
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+ */
+unsigned int qurt_etm_set_sync_period(unsigned int sync_type, unsigned int period);
+
+/**@ingroup func_qurt_stm_trace_set_config
+ Sets up an STM port for tracing events.
+
+ @datatypes
+ #qurt_stm_trace_info_t
+
+ @param[in] stm_config_info Pointer to the STM trace information used to set up the trace
+ in the kernel.
+ The structure must have the following:\n
+ - One port address per hardware thread \n
+ - Event ID for context switches \n
+ - Event ID for interrupt tracing \n
+ - Header or marker to identify the beginning of the trace. @tablebulletend
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EINVALID -- Failure; possibly because the passed port address is not in the page table.
+
+ @dependencies
+ None.
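+
+ A configuration sketch (hypothetical, not normative) for the STM structure described
+ above; the port addresses, event IDs, and marker value are placeholders:
+ @code
+ qurt_stm_trace_info_t stm = {
+     .stm_port_addr      = { 0xA0000000u, 0xA0000100u, 0xA0000200u,
+                             0xA0000300u, 0xA0000400u, 0xA0000500u },
+     .thread_event_id    = 1u,
+     .interrupt_event_id = 2u,
+     .marker             = 0x53544D31u,              /* "STM1" */
+ };
+ unsigned int rc = qurt_stm_trace_set_config(&stm);  /* QURT_EOK on success */
+ @endcode
+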
+ */ +unsigned int qurt_stm_trace_set_config(qurt_stm_trace_info_t *stm_config_info); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TRACE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_types.h new file mode 100755 index 0000000000000..bdb83a3fe2fb2 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_types.h @@ -0,0 +1,294 @@ +#ifndef QURT_TYPES_H +#define QURT_TYPES_H +/** + @file qurt_types.h + @brief Contains types common to all configurations + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +//#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define PGA_BITFIELD_MASK(hi,lo) (((~0u)>>(31U-((hi)-(lo))))<<(lo)) +#define PGA_BITFIELD_GET(x,hi,lo) (((x)&PGA_BITFIELD_MASK((hi),(lo)))>>(lo)) +#define PGA_BITFIELD_INS(hi,lo,v) (((v)<<(lo))&PGA_BITFIELD_MASK((hi),(lo))) +#define PGA_BITFIELD_SET(x,hi,lo,v) ((x)=((x)&~PGA_BITFIELD_MASK((hi),(lo)))|PGA_BITFIELD_INS((hi),(lo),(v))) +#define QURT_PGATTR_C_GET(pga) PGA_BITFIELD_GET((pga).pga_value, 3U, 0U) /* Bits 3-0: cache */ +#define QURT_PGATTR_A_GET(pga) PGA_BITFIELD_GET((pga).pga_value, 5U, 4U) /* Bits 5-4: bus attr */ +#define QURT_PGATTR_C_SET(pga,v) PGA_BITFIELD_SET((pga).pga_value, 3U, 0U, (v)) /* Bits 3-0: cache */ +#define QURT_PGATTR_A_SET(pga,v) PGA_BITFIELD_SET((pga).pga_value, 5U, 4U, (v)) /* Bits 5-4: bus attr */ +#define QURT_PGATTR_MKRAW(v) ((qurt_pgattr_t){.pga_value = (v)}) +#define QURT_PGATTR_MK(c,a) QURT_PGATTR_MKRAW(PGA_BITFIELD_INS(3U,0U,(c))|PGA_BITFIELD_INS(5U,4U,(a))) + +/*return types for qurt_island_get_status2*/ +#define QURT_ISLAND_MODE_NORMAL 0U /**< Normal operating mode */ +#define QURT_ISLAND_MODE_ISLAND 1U /**< Island mode */ +#define QURT_ISLAND_MODE_EXITING 2U /**< In transition from Island mode to Normal mode */ + +/*============================================================================= + FORWARD DECLARATIONS & TYPEDEFS +=============================================================================*/ +/** @addtogroup memory_management_types +@{ */ +typedef unsigned int qurt_addr_t; /**< QuRT address type.*/ +typedef unsigned int qurt_paddr_t; /**< QuRT physical memory address type. */ +/** @cond rest_reg_dist */ +typedef unsigned long long qurt_addr_64_t; /**< QuRT 64-bit memory address type. */ +typedef unsigned long long qurt_paddr_64_t; /**< QuRT 64-bit physical memory address type. */ +typedef unsigned int qurt_mem_region_t; /**< QuRT memory regions type. */ +typedef unsigned int qurt_mem_fs_region_t; /**< QuRT memory FS region type. */ +/**@endcond */ +typedef unsigned int qurt_mem_pool_t; /**< QuRT memory pool type.*/ +typedef unsigned int qurt_size_t; /**< QuRT size type. */ +/** @cond */ +typedef unsigned long long qurt_mmu_entry_t;/**< QuRT MMU entry type. 
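+
+ A usage sketch (hypothetical, not normative) for the page-attribute macros above:
+ @code
+ qurt_pgattr_t pga = QURT_PGATTR_MK(7u, 0u);  /* cache field = 7, bus attribute = 0 */
+ unsigned int c = QURT_PGATTR_C_GET(pga);     /* c == 7 */
+ QURT_PGATTR_A_SET(pga, 1u);                  /* rewrite bits 5-4 in place */
+ @endcode
+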
*/ +#define QURT_PHYSPOOL_NAME_LEN (32) +typedef char qurt_physpool_name_t[QURT_PHYSPOOL_NAME_LEN]; + + +/* + * Mapping type + * + * QMEM_MAPPING_VIRTUAL is the default mode, in which the system + * picks up the available range of the virtual address, and maps it to + * available contiguous physical addresses. Physical-to-virtual + * is not guaranteed to be 1:1; both virtual and physical memory is + * contiguous. + * + * In QMEM_MAPPING_IDEMPOTENT mode, the user provides the physical address; + * the kernel allocates 1:1 physical-to-virtual memory. Primary use of + * of this mapping is to allocate physical-to-virtual memory 1:1. + * + * In QMEM_MAPPING_PHYS_CONTIGUOUS mode, the virtual address might + * not be the same as the physical address. But the physical address of the + * memory region is guaranteed to be contiguous starting at the provided + * address, it is required to provide a fixed physical address. The primary + * use of this mapping is to allocate physical memory from a particular + * address, where 1:1 physical-to-virtual is not required. + * + * QMEM_MAPPING_NONE mode must be used to reserve a virtual memory + * area (VMA); no physical memory is reserved or mapped to this virtual + * space; all standard qmem_region APIs apply to a VMA, however physical + * address is always INVALID_ADDR. qmem_region_create() in this mode + * returns a handle to the VMA, both virt_addr and phys_addr must + * be set to INVALID_ADDR, kernel allocates any available virtual + * memory of the specified size. Obtain the starting virtual address + * of VMA through qmem_region_attr_getvirtaddr(). + * Primary purpose of this mapping mode is to provide a mechanism for + * delayed binding in QuRT, for example reserve virtual memory and map it at + * some later time to possibly discontiguous physical blocks. Thus, a + * single VMA can be partitioned among several physical-virtual mappings + * created via qmem_region_create() with QMEM_VIRTUAL_FIXED mapping mode. + * Each VMA keeps track of associated mapped regions. + * Deletion of VMA succeeds only if all associated "virtual_fixed" + * regions are freed prior to VMA deletion. + * + * Use QMEM_MAPPING_VIRTUAL_FIXED mode to create a region + * from virtual space that has been reserved via qmem_region_create() + * with QMEM_MAPPING_NONE mapping. A valid virt_add is required, if + * phys_addr is specified, the kernel attempts to map it accordingly, + * if no phys_addr is specified, kernel maps any available physical + * memory. All standard qmem_region APIs apply to such region. Remapping + * a virtual range without prior freeing of the region is not permitted. + * When such region is deleted its corresponding VMA remains intact. + * + * QMEM_MAPPING_PHYS_DISCONTIGUOUS mode can obtain contiguous + * virtual memory but physical memory can be discontiguous. This method + * tries to club small physical memory blocks to obtain requested + * memory and is useful in case where there is no contiguous full block + * of requested size. If client does not need contiguous physical memory, + * (for example, if client does not use physical addressing), this helps + * use smaller physical memory blocks rather than using contiguous memory. + * Note: When memory is allocated through this method, physical address is + * not returned to the caller using the qurt_mem_region_attr_get() API as there might + * not be a single physical address. + * + */ +/**@endcond */ +/** QuRT memory region mapping type. */ +typedef enum { + QURT_MEM_MAPPING_VIRTUAL=0, /**< Default mode. 
The region virtual address range maps to an + available contiguous area of physical memory. For the most + efficient use of virtual memory, the QuRT system + chooses the base address in physical memory. This works for most memory + use cases.*/ + QURT_MEM_MAPPING_PHYS_CONTIGUOUS = 1, /**< The region virtual address space must be mapped to a + contiguous area of physical memory. This is necessary when the + memory region is accessed by external devices that bypass Hexagon + virtual memory addressing. The base address in physical + memory must be explicitly specified.*/ + QURT_MEM_MAPPING_IDEMPOTENT=2, /**< Region virtual address space maps + to the identical area of physical memory. */ + QURT_MEM_MAPPING_VIRTUAL_FIXED=3, /**< Virtual address space of the region maps either to the + specified area of physical memory or (if no area is specified) + to available physical memory. Use this mapping to create + regions from virtual space that was reserved by calling + qurt_mem_region_create() with mapping. */ + QURT_MEM_MAPPING_NONE=4, /**< Reserves a virtual memory area (VMA). Remapping a virtual range is not + permitted without first deleting the memory region. When such a region is + deleted, its corresponding virtual memory addressing remains intact. */ + QURT_MEM_MAPPING_VIRTUAL_RANDOM=7, /**< System chooses a random virtual address and + maps it to available contiguous physical addresses.*/ + QURT_MEM_MAPPING_PHYS_DISCONTIGUOUS=8, /**< While virtual memory is contiguous, allocates in discontiguous physical + memory blocks. This helps when there are smaller contiguous blocks + than the requested size. + Physical address is not provided as part of the get_attr call */ + QURT_MEM_MAPPING_INVALID=10, /**< Reserved as an invalid mapping type. */ +} qurt_mem_mapping_t; + + +/** QuRT cache mode type. */ +typedef enum { + QURT_MEM_CACHE_WRITEBACK=7, /**< Write back. */ + QURT_MEM_CACHE_NONE_SHARED=6, /**< Normal uncached memory that can be shared with other subsystems.*/ + QURT_MEM_CACHE_WRITETHROUGH=5, /**< Write through. */ + QURT_MEM_CACHE_WRITEBACK_NONL2CACHEABLE=0, /**< Write back non-L2-cacheable.*/ + QURT_MEM_CACHE_WRITETHROUGH_NONL2CACHEABLE=1, /**< Write through non-L2-cacheable. */ + QURT_MEM_CACHE_WRITEBACK_L2CACHEABLE=QURT_MEM_CACHE_WRITEBACK, /**< Write back L2 cacheable. */ + QURT_MEM_CACHE_WRITETHROUGH_L2CACHEABLE=QURT_MEM_CACHE_WRITETHROUGH, /**< Write through L2 cacheable. */ + QURT_MEM_CACHE_DEVICE = 4, /**< Volatile memory-mapped device. Access to device memory cannot be cancelled by interrupts, re-ordered, or replayed.*/ + QURT_MEM_CACHE_NONE = 4, /**< Deprecated -- use #QURT_MEM_CACHE_DEVICE instead. */ + QURT_MEM_CACHE_DEVICE_SFC = 2, /**< Enables placing limitations on the number of outstanding transactions. */ + QURT_MEM_CACHE_INVALID=10, /**< Reserved as an invalid cache type. */ +} qurt_mem_cache_mode_t; + +/** Memory access permission. */ +#define QURT_PERM_NONE 0x0U /**< No permission. */ +#define QURT_PERM_READ 0x1U /**< Read permission. */ +#define QURT_PERM_WRITE 0x2U /**< Write permission. */ +#define QURT_PERM_EXECUTE 0x4U /**< Execution permission. */ +#define QURT_PERM_NODUMP 0x8U + /**< Skip dumping the mapping. During process domain dump, must skip + some mappings on host memory to avoid a race condition + where the memory is removed from the host and DSP process + crashed before the mapping is removed. */ +#define QURT_PERM_FULL QURT_PERM_READ | QURT_PERM_WRITE | QURT_PERM_EXECUTE /**< Read, write, and execute permission. 
*/ + +typedef unsigned char qurt_perm_t; + + +/** @cond rest_reg_dist*/ +/** QuRT cache type; specifies data cache or instruction cache. */ +typedef enum { + QURT_MEM_ICACHE, /**< Instruction cache.*/ + QURT_MEM_DCACHE /**< Data cache.*/ +} qurt_mem_cache_type_t; + +/** QuRT cache operation code type. */ +typedef enum { + QURT_MEM_CACHE_FLUSH, /**< Flush. */ + QURT_MEM_CACHE_INVALIDATE, /**< Invalidate */ + QURT_MEM_CACHE_FLUSH_INVALIDATE, /**< Flush invalidate. */ + QURT_MEM_CACHE_FLUSH_ALL, /**< Flush all. */ + QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL, /**< Flush invalidate all. */ + QURT_MEM_CACHE_TABLE_FLUSH_INVALIDATE, /**< Table flush invalidate. */ + QURT_MEM_CACHE_FLUSH_INVALIDATE_L2, /**< L2 flush invalidate.*/ +} qurt_mem_cache_op_t; + +/** QuRT memory region type. */ +typedef enum { + QURT_MEM_REGION_LOCAL=0, /**< Local. */ + QURT_MEM_REGION_SHARED=1, /**< Shared.*/ + QURT_MEM_REGION_USER_ACCESS=2, /**< User access. */ + QURT_MEM_REGION_FS=4, /**< FS. */ + QURT_MEM_REGION_INVALID=10, /**< Reserved as an invalid region type. */ +} qurt_mem_region_type_t; + +/* Cache and bus attributes are combined into a value of this type for convenience, + and macros for combining and extracting fields are defined here. */ +/** @cond */ +struct qurt_pgattr { + unsigned pga_value; /**< PGA value.*/ +}; +typedef struct qurt_pgattr qurt_pgattr_t; +/** @endcond */ +/** QuRT memory region attributes type.*/ +/* QMEM_MAPPING_IDEMPOTENT and QMEM_MAPPING_PHYS_CONTIGUOUS mode can specify physaddr. + virtaddr cannot be specified for a memory region, it can only be queried by the + qmem_attr_getvirtaddr() function. + */ +typedef struct { + /** @cond */ + qurt_mem_mapping_t mapping_type; + unsigned char perms; + unsigned short owner; + qurt_pgattr_t pga; + unsigned ppn; //physical page number (physical>>12) + qurt_addr_t virtaddr; + qurt_mem_region_type_t type; + qurt_size_t size; + /** @endcond */ +} qurt_mem_region_attr_t; + + +/** QuRT user physical memory pool type. */ +typedef struct { + /** @cond */ + char name[32]; + struct ranges{ + unsigned int start; + unsigned int size; + } ranges[MAX_POOL_RANGES]; + /** @endcond */ +} qurt_mem_pool_attr_t; + +/** QuRT memory pool status type.*/ +typedef struct _qurt_mem_pool_status { + + qurt_size_t contig_size; /**< Largest contiguous free memory in bytes. */ + qurt_size_t free_size; /**< Total free memory in bytes. */ + qurt_size_t total_size; /**< Total declared memory in bytes. */ + +} qurt_mem_pool_status_t; + +typedef enum { + HEXAGON_L1_I_CACHE = 0, /**< Hexagon L1 instruction cache. */ + HEXAGON_L1_D_CACHE = 1, /**< Hexagon L1 data cache. */ + HEXAGON_L2_CACHE = 2 /**< Hexagon L2 cache. */ +} qurt_cache_type_t; + +typedef enum { + FULL_SIZE = 0, /**< Fully shared cache, without partitioning. */ + HALF_SIZE = 1, /**< 1/2 for main, 1/2 for auxiliary. */ + THREE_QUARTER_SIZE = 2, /**< 3/4 for main, 1/4 for auxiliary. */ + SEVEN_EIGHTHS_SIZE = 3 /**< 7/8 for main, 1/8 for auxiliary; for L2 cache only. */ +} qurt_cache_partition_size_t; + +typedef enum { + QURT_PROCESS_CB_GENERIC, /**< generic unconditional cb called after image loading. */ + QURT_PROCESS_NOTE_CB_PRE_MAP, /**< note cb called before segment loading. */ + QURT_PROCESS_NOTE_CB_POST_MAP /**< note cb called after segment loading. 
*/ +} qurt_process_cb_type_t; + +typedef union { + void *ptr; + int num; +} qurt_process_callback_arg_t; + + +/**@endcond*/ + +/** @} */ /* end_addtogroup memory_management_types */ +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TYPES_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_user_dma.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_user_dma.h new file mode 100755 index 0000000000000..e05a6429fd703 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_user_dma.h @@ -0,0 +1,44 @@ +#ifndef QURT_USER_DMA_H +#define QURT_USER_DMA_H + +/** + @file qurt_user_dma.h + @brief Definitions, macros, and prototypes used for handling user DMA. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup qurt_user_dma_dmsyncht + Sends the DMSyncht command to the user DMA engine. + + Call this function to ensure all posted DMA memory operations are + complete. + + This stalls the current thread until the instruction + is complete and returns. + + @return + QURT_EOK - On dmsyncht completion \n + QURT_ENOTSUPPORTED - User DMA not supported + + @dependencies + None. +*/ +int qurt_user_dma_dmsyncht(void); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_vtlb.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_vtlb.h new file mode 100755 index 0000000000000..e064042e447ac --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/include/qurt/qurt_vtlb.h @@ -0,0 +1,76 @@ +/*============================================================================= + + qurt_vtlb.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2019, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +=============================================================================*/ +#ifndef QURT_VTLB_H +#define QURT_VTLB_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| Names starting with "qurt_i_vtlb" are the internal low-level functions. +|| These should be considered subject to change. 
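+||
+|| A usage sketch (hypothetical, not normative): reading the VTLB statistics triple per
+|| the layout documented at the declaration below, assuming the usual QuRT convention
+|| that 0 (QURT_EOK) indicates success and that qurt_printf() is available:
+||
+||   unsigned stats[3];
+||   if (qurt_i_vtlb_statistics(stats) == 0) {
+||       qurt_printf("vtlb: %u total, %u free, max tree size %u\n",
+||                   stats[0], stats[1], stats[2]);
+||   }
+||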
+*/ + +int qurt_i_vtlb_entry_create(unsigned *pIndex, + unsigned tlb_lo, + unsigned tlb_hi, + unsigned extension); + +int qurt_i_vtlb_entry_create_with_pid(unsigned *pIndex, + unsigned tlb_lo, + unsigned tlb_hi, + unsigned extension, + unsigned target_pid); + +int qurt_i_vtlb_entry_delete(unsigned index); + +int qurt_i_vtlb_entry_read(unsigned index, unsigned *tlbinfo); + +int qurt_i_vtlb_entry_write(unsigned index, unsigned tlb_lo, unsigned tlb_hi, unsigned extension); + +int qurt_i_vtlb_entry_write_with_pid(unsigned index, unsigned tlb_lo, unsigned tlb_hi, unsigned extension, unsigned target_pid); + +int qurt_i_vtlb_entry_probe(const void *vaddr, unsigned *tlbinfo, unsigned *pIndex); + +int qurt_i_vtlb_entry_probe_with_pid(const void *vaddr, unsigned *tlbinfo, unsigned *pIndex, unsigned target_pid); + + +int qurt_i_vtlb_statistics(unsigned *stats); // Returns stats[0] -- total number of VTLB entries + // stats[1] -- number of available VTLB entries + // stats[2] -- max size of VTLB tree since boot + +//can return index to an entry that was specialed, change it to take addresses instead of pages +int qurt_i_vtlb_set_special(int index, unsigned pageno, unsigned asid, unsigned size); + +int qurt_i_vtlb_queue_ppage(unsigned pageno, unsigned vtlb_index); + +#define QURT_VTLB_EXT_DEFAULT 0U +#define QURT_VTLB_EXT_LOCKED 1U +#define QURT_VTLB_EXT_EXCLUDE_DUMP 2U /* Temporary ability to skip certain mappings in pd dump */ +#define QURT_VTLB_EXT_FREELIST 0x800000u + +#define QURT_VTLB_ERR_OVERLAP -64 +#define QURT_VTLB_ERR_TREE_NO_SPACE -65 +#define QURT_VTLB_ERR_INVALID_SIZE -68 +#define QURT_VTLB_ERR_INVALID_EXT -69 +#define QURT_VTLB_ERR_DEL_PGT_LOCKED -70 +#define QURT_VTLB_ERR_PGT_LOCK_CNT -71 + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif // QURT_VTLB_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libposix.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libposix.a new file mode 100755 index 0000000000000..6d29c02c51601 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libposix.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libqurt.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libqurt.a new file mode 100755 index 0000000000000..8d97bbd7c3b58 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libqurt.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libqurtcfs.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libqurtcfs.a new file mode 100755 index 0000000000000..eac612a670347 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libqurtcfs.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libtimer_island.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libtimer_island.a new file mode 100755 index 0000000000000..7e5653a98850c Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libtimer_island.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libtimer_main.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libtimer_main.a new file mode 100755 index 0000000000000..f01114822787c Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/libtimer_main.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libposix.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libposix.a new file mode 100755 
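A minimal usage sketch for the two headers above; illustrative only. The hypothetical helper below assumes that 0 (QURT_EOK, declared elsewhere in the SDK) is the success code for both calls, and note the header's own warning that the qurt_i_vtlb_* functions are internal and subject to change.

    #include <stdio.h>
    #include "qurt_user_dma.h"
    #include "qurt_vtlb.h"

    static void drain_dma_and_dump_vtlb(void)   /* hypothetical helper */
    {
        /* Stall this thread until all posted user-DMA operations complete. */
        if (qurt_user_dma_dmsyncht() != 0) {    /* 0 == QURT_EOK assumed */
            printf("user DMA not supported on this target\n");
            return;
        }
        /* stats[0] = total VTLB entries, stats[1] = available entries,
           stats[2] = max size of the VTLB tree since boot. */
        unsigned stats[3] = {0, 0, 0};
        if (qurt_i_vtlb_statistics(stats) == 0) {   /* success code assumed */
            printf("VTLB: %u total, %u available, max tree size %u\n",
                   stats[0], stats[1], stats[2]);
        }
    }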
index 0000000000000..e8007300d0e4a Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libposix.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libqurt.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libqurt.a new file mode 100755 index 0000000000000..c5977b8c3cc5e Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libqurt.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libqurtcfs.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libqurtcfs.a new file mode 100755 index 0000000000000..eac612a670347 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libqurtcfs.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libtimer.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libtimer.a new file mode 100755 index 0000000000000..a8bd4da88cace Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev75/lib/pic/libtimer.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/bits/confname.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/bits/confname.h new file mode 100755 index 0000000000000..d9ca3135501e3 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/bits/confname.h @@ -0,0 +1,528 @@ +#ifndef CONFNAME_H +#define CONFNAME_H +/** + @file confname.h + @brief Named literals for 'name' argument of sysconf, pathconf + +EXTERNAL FUNCTIONS + None + +INITIALIZATION AND SEQUENCING REQUIREMENTS + DONT include this header directly. Instead include unistd.h. For now since + toolchain doesnt provide a hook by including bits/confname.h, we stick this + header in QuRT's sys/types.h + +Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ +/* Values for the NAME argument to `pathconf' and `fpathconf'. */ +enum +{ + _PC_LINK_MAX, +#define _PC_LINK_MAX _PC_LINK_MAX + _PC_MAX_CANON, +#define _PC_MAX_CANON _PC_MAX_CANON + _PC_MAX_INPUT, +#define _PC_MAX_INPUT _PC_MAX_INPUT + _PC_NAME_MAX, +#define _PC_NAME_MAX _PC_NAME_MAX + _PC_PATH_MAX, +#define _PC_PATH_MAX _PC_PATH_MAX + _PC_PIPE_BUF, +#define _PC_PIPE_BUF _PC_PIPE_BUF + _PC_CHOWN_RESTRICTED, +#define _PC_CHOWN_RESTRICTED _PC_CHOWN_RESTRICTED + _PC_NO_TRUNC, +#define _PC_NO_TRUNC _PC_NO_TRUNC + _PC_VDISABLE, +#define _PC_VDISABLE _PC_VDISABLE + _PC_SYNC_IO, +#define _PC_SYNC_IO _PC_SYNC_IO + _PC_ASYNC_IO, +#define _PC_ASYNC_IO _PC_ASYNC_IO + _PC_PRIO_IO, +#define _PC_PRIO_IO _PC_PRIO_IO + _PC_SOCK_MAXBUF, +#define _PC_SOCK_MAXBUF _PC_SOCK_MAXBUF + _PC_FILESIZEBITS, +#define _PC_FILESIZEBITS _PC_FILESIZEBITS + _PC_REC_INCR_XFER_SIZE, +#define _PC_REC_INCR_XFER_SIZE _PC_REC_INCR_XFER_SIZE + _PC_REC_MAX_XFER_SIZE, +#define _PC_REC_MAX_XFER_SIZE _PC_REC_MAX_XFER_SIZE + _PC_REC_MIN_XFER_SIZE, +#define _PC_REC_MIN_XFER_SIZE _PC_REC_MIN_XFER_SIZE + _PC_REC_XFER_ALIGN, +#define _PC_REC_XFER_ALIGN _PC_REC_XFER_ALIGN + _PC_ALLOC_SIZE_MIN, +#define _PC_ALLOC_SIZE_MIN _PC_ALLOC_SIZE_MIN + _PC_SYMLINK_MAX, +#define _PC_SYMLINK_MAX _PC_SYMLINK_MAX + _PC_2_SYMLINKS +#define _PC_2_SYMLINKS _PC_2_SYMLINKS +}; + +/* Values for the argument to `sysconf'. 
*/ +enum +{ + _SC_ARG_MAX, +#define _SC_ARG_MAX _SC_ARG_MAX + _SC_CHILD_MAX, +#define _SC_CHILD_MAX _SC_CHILD_MAX + _SC_CLK_TCK, +#define _SC_CLK_TCK _SC_CLK_TCK + _SC_NGROUPS_MAX, +#define _SC_NGROUPS_MAX _SC_NGROUPS_MAX + _SC_OPEN_MAX, +#define _SC_OPEN_MAX _SC_OPEN_MAX + _SC_STREAM_MAX, +#define _SC_STREAM_MAX _SC_STREAM_MAX + _SC_TZNAME_MAX, +#define _SC_TZNAME_MAX _SC_TZNAME_MAX + _SC_JOB_CONTROL, +#define _SC_JOB_CONTROL _SC_JOB_CONTROL + _SC_SAVED_IDS, +#define _SC_SAVED_IDS _SC_SAVED_IDS + _SC_REALTIME_SIGNALS, +#define _SC_REALTIME_SIGNALS _SC_REALTIME_SIGNALS + _SC_PRIORITY_SCHEDULING, +#define _SC_PRIORITY_SCHEDULING _SC_PRIORITY_SCHEDULING + _SC_TIMERS, +#define _SC_TIMERS _SC_TIMERS + _SC_ASYNCHRONOUS_IO, +#define _SC_ASYNCHRONOUS_IO _SC_ASYNCHRONOUS_IO + _SC_PRIORITIZED_IO, +#define _SC_PRIORITIZED_IO _SC_PRIORITIZED_IO + _SC_SYNCHRONIZED_IO, +#define _SC_SYNCHRONIZED_IO _SC_SYNCHRONIZED_IO + _SC_FSYNC, +#define _SC_FSYNC _SC_FSYNC + _SC_MAPPED_FILES, +#define _SC_MAPPED_FILES _SC_MAPPED_FILES + _SC_MEMLOCK, +#define _SC_MEMLOCK _SC_MEMLOCK + _SC_MEMLOCK_RANGE, +#define _SC_MEMLOCK_RANGE _SC_MEMLOCK_RANGE + _SC_MEMORY_PROTECTION, +#define _SC_MEMORY_PROTECTION _SC_MEMORY_PROTECTION + _SC_MESSAGE_PASSING, +#define _SC_MESSAGE_PASSING _SC_MESSAGE_PASSING + _SC_SEMAPHORES, +#define _SC_SEMAPHORES _SC_SEMAPHORES + _SC_SHARED_MEMORY_OBJECTS, +#define _SC_SHARED_MEMORY_OBJECTS _SC_SHARED_MEMORY_OBJECTS + _SC_AIO_LISTIO_MAX, +#define _SC_AIO_LISTIO_MAX _SC_AIO_LISTIO_MAX + _SC_AIO_MAX, +#define _SC_AIO_MAX _SC_AIO_MAX + _SC_AIO_PRIO_DELTA_MAX, +#define _SC_AIO_PRIO_DELTA_MAX _SC_AIO_PRIO_DELTA_MAX + _SC_DELAYTIMER_MAX, +#define _SC_DELAYTIMER_MAX _SC_DELAYTIMER_MAX + _SC_MQ_OPEN_MAX, +#define _SC_MQ_OPEN_MAX _SC_MQ_OPEN_MAX + _SC_MQ_PRIO_MAX, +#define _SC_MQ_PRIO_MAX _SC_MQ_PRIO_MAX + _SC_VERSION, +#define _SC_VERSION _SC_VERSION + _SC_PAGESIZE, +#define _SC_PAGESIZE _SC_PAGESIZE +#define _SC_PAGE_SIZE _SC_PAGESIZE + _SC_RTSIG_MAX, +#define _SC_RTSIG_MAX _SC_RTSIG_MAX + _SC_SEM_NSEMS_MAX, +#define _SC_SEM_NSEMS_MAX _SC_SEM_NSEMS_MAX + _SC_SEM_VALUE_MAX, +#define _SC_SEM_VALUE_MAX _SC_SEM_VALUE_MAX + _SC_SIGQUEUE_MAX, +#define _SC_SIGQUEUE_MAX _SC_SIGQUEUE_MAX + _SC_TIMER_MAX, +#define _SC_TIMER_MAX _SC_TIMER_MAX + + /* Values for the argument to `sysconf' + corresponding to _POSIX2_* symbols. 
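+     (Illustration, not in the original header: all of these enumerators are
+     passed directly to sysconf(), e.g. "long lm = sysconf(_SC_LINE_MAX);",
+     which returns -1 when the queried option is unsupported.)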
*/ + _SC_BC_BASE_MAX, +#define _SC_BC_BASE_MAX _SC_BC_BASE_MAX + _SC_BC_DIM_MAX, +#define _SC_BC_DIM_MAX _SC_BC_DIM_MAX + _SC_BC_SCALE_MAX, +#define _SC_BC_SCALE_MAX _SC_BC_SCALE_MAX + _SC_BC_STRING_MAX, +#define _SC_BC_STRING_MAX _SC_BC_STRING_MAX + _SC_COLL_WEIGHTS_MAX, +#define _SC_COLL_WEIGHTS_MAX _SC_COLL_WEIGHTS_MAX + _SC_EQUIV_CLASS_MAX, +#define _SC_EQUIV_CLASS_MAX _SC_EQUIV_CLASS_MAX + _SC_EXPR_NEST_MAX, +#define _SC_EXPR_NEST_MAX _SC_EXPR_NEST_MAX + _SC_LINE_MAX, +#define _SC_LINE_MAX _SC_LINE_MAX + _SC_RE_DUP_MAX, +#define _SC_RE_DUP_MAX _SC_RE_DUP_MAX + _SC_CHARCLASS_NAME_MAX, +#define _SC_CHARCLASS_NAME_MAX _SC_CHARCLASS_NAME_MAX + + _SC_2_VERSION, +#define _SC_2_VERSION _SC_2_VERSION + _SC_2_C_BIND, +#define _SC_2_C_BIND _SC_2_C_BIND + _SC_2_C_DEV, +#define _SC_2_C_DEV _SC_2_C_DEV + _SC_2_FORT_DEV, +#define _SC_2_FORT_DEV _SC_2_FORT_DEV + _SC_2_FORT_RUN, +#define _SC_2_FORT_RUN _SC_2_FORT_RUN + _SC_2_SW_DEV, +#define _SC_2_SW_DEV _SC_2_SW_DEV + _SC_2_LOCALEDEF, +#define _SC_2_LOCALEDEF _SC_2_LOCALEDEF + + _SC_PII, +#define _SC_PII _SC_PII + _SC_PII_XTI, +#define _SC_PII_XTI _SC_PII_XTI + _SC_PII_SOCKET, +#define _SC_PII_SOCKET _SC_PII_SOCKET + _SC_PII_INTERNET, +#define _SC_PII_INTERNET _SC_PII_INTERNET + _SC_PII_OSI, +#define _SC_PII_OSI _SC_PII_OSI + _SC_POLL, +#define _SC_POLL _SC_POLL + _SC_SELECT, +#define _SC_SELECT _SC_SELECT + _SC_UIO_MAXIOV, +#define _SC_UIO_MAXIOV _SC_UIO_MAXIOV + _SC_IOV_MAX = _SC_UIO_MAXIOV, +#define _SC_IOV_MAX _SC_IOV_MAX + _SC_PII_INTERNET_STREAM, +#define _SC_PII_INTERNET_STREAM _SC_PII_INTERNET_STREAM + _SC_PII_INTERNET_DGRAM, +#define _SC_PII_INTERNET_DGRAM _SC_PII_INTERNET_DGRAM + _SC_PII_OSI_COTS, +#define _SC_PII_OSI_COTS _SC_PII_OSI_COTS + _SC_PII_OSI_CLTS, +#define _SC_PII_OSI_CLTS _SC_PII_OSI_CLTS + _SC_PII_OSI_M, +#define _SC_PII_OSI_M _SC_PII_OSI_M + _SC_T_IOV_MAX, +#define _SC_T_IOV_MAX _SC_T_IOV_MAX + + /* Values according to POSIX 1003.1c (POSIX threads). 
*/ + _SC_THREADS, +#define _SC_THREADS _SC_THREADS + _SC_THREAD_SAFE_FUNCTIONS, +#define _SC_THREAD_SAFE_FUNCTIONS _SC_THREAD_SAFE_FUNCTIONS + _SC_GETGR_R_SIZE_MAX, +#define _SC_GETGR_R_SIZE_MAX _SC_GETGR_R_SIZE_MAX + _SC_GETPW_R_SIZE_MAX, +#define _SC_GETPW_R_SIZE_MAX _SC_GETPW_R_SIZE_MAX + _SC_LOGIN_NAME_MAX, +#define _SC_LOGIN_NAME_MAX _SC_LOGIN_NAME_MAX + _SC_TTY_NAME_MAX, +#define _SC_TTY_NAME_MAX _SC_TTY_NAME_MAX + _SC_THREAD_DESTRUCTOR_ITERATIONS, +#define _SC_THREAD_DESTRUCTOR_ITERATIONS _SC_THREAD_DESTRUCTOR_ITERATIONS + _SC_THREAD_KEYS_MAX, +#define _SC_THREAD_KEYS_MAX _SC_THREAD_KEYS_MAX + _SC_THREAD_STACK_MIN, +#define _SC_THREAD_STACK_MIN _SC_THREAD_STACK_MIN + _SC_THREAD_THREADS_MAX, +#define _SC_THREAD_THREADS_MAX _SC_THREAD_THREADS_MAX + _SC_THREAD_ATTR_STACKADDR, +#define _SC_THREAD_ATTR_STACKADDR _SC_THREAD_ATTR_STACKADDR + _SC_THREAD_ATTR_STACKSIZE, +#define _SC_THREAD_ATTR_STACKSIZE _SC_THREAD_ATTR_STACKSIZE + _SC_THREAD_PRIORITY_SCHEDULING, +#define _SC_THREAD_PRIORITY_SCHEDULING _SC_THREAD_PRIORITY_SCHEDULING + _SC_THREAD_PRIO_INHERIT, +#define _SC_THREAD_PRIO_INHERIT _SC_THREAD_PRIO_INHERIT + _SC_THREAD_PRIO_PROTECT, +#define _SC_THREAD_PRIO_PROTECT _SC_THREAD_PRIO_PROTECT + _SC_THREAD_PROCESS_SHARED, +#define _SC_THREAD_PROCESS_SHARED _SC_THREAD_PROCESS_SHARED + + _SC_NPROCESSORS_CONF, +#define _SC_NPROCESSORS_CONF _SC_NPROCESSORS_CONF + _SC_NPROCESSORS_ONLN, +#define _SC_NPROCESSORS_ONLN _SC_NPROCESSORS_ONLN + _SC_PHYS_PAGES, +#define _SC_PHYS_PAGES _SC_PHYS_PAGES + _SC_AVPHYS_PAGES, +#define _SC_AVPHYS_PAGES _SC_AVPHYS_PAGES + _SC_ATEXIT_MAX, +#define _SC_ATEXIT_MAX _SC_ATEXIT_MAX + _SC_PASS_MAX, +#define _SC_PASS_MAX _SC_PASS_MAX + + _SC_XOPEN_VERSION, +#define _SC_XOPEN_VERSION _SC_XOPEN_VERSION + _SC_XOPEN_XCU_VERSION, +#define _SC_XOPEN_XCU_VERSION _SC_XOPEN_XCU_VERSION + _SC_XOPEN_UNIX, +#define _SC_XOPEN_UNIX _SC_XOPEN_UNIX + _SC_XOPEN_CRYPT, +#define _SC_XOPEN_CRYPT _SC_XOPEN_CRYPT + _SC_XOPEN_ENH_I18N, +#define _SC_XOPEN_ENH_I18N _SC_XOPEN_ENH_I18N + _SC_XOPEN_SHM, +#define _SC_XOPEN_SHM _SC_XOPEN_SHM + + _SC_2_CHAR_TERM, +#define _SC_2_CHAR_TERM _SC_2_CHAR_TERM + _SC_2_C_VERSION, +#define _SC_2_C_VERSION _SC_2_C_VERSION + _SC_2_UPE, +#define _SC_2_UPE _SC_2_UPE + + _SC_XOPEN_XPG2, +#define _SC_XOPEN_XPG2 _SC_XOPEN_XPG2 + _SC_XOPEN_XPG3, +#define _SC_XOPEN_XPG3 _SC_XOPEN_XPG3 + _SC_XOPEN_XPG4, +#define _SC_XOPEN_XPG4 _SC_XOPEN_XPG4 + + _SC_CHAR_BIT, +#define _SC_CHAR_BIT _SC_CHAR_BIT + _SC_CHAR_MAX, +#define _SC_CHAR_MAX _SC_CHAR_MAX + _SC_CHAR_MIN, +#define _SC_CHAR_MIN _SC_CHAR_MIN + _SC_INT_MAX, +#define _SC_INT_MAX _SC_INT_MAX + _SC_INT_MIN, +#define _SC_INT_MIN _SC_INT_MIN + _SC_LONG_BIT, +#define _SC_LONG_BIT _SC_LONG_BIT + _SC_WORD_BIT, +#define _SC_WORD_BIT _SC_WORD_BIT + _SC_MB_LEN_MAX, +#define _SC_MB_LEN_MAX _SC_MB_LEN_MAX + _SC_NZERO, +#define _SC_NZERO _SC_NZERO + _SC_SSIZE_MAX, +#define _SC_SSIZE_MAX _SC_SSIZE_MAX + _SC_SCHAR_MAX, +#define _SC_SCHAR_MAX _SC_SCHAR_MAX + _SC_SCHAR_MIN, +#define _SC_SCHAR_MIN _SC_SCHAR_MIN + _SC_SHRT_MAX, +#define _SC_SHRT_MAX _SC_SHRT_MAX + _SC_SHRT_MIN, +#define _SC_SHRT_MIN _SC_SHRT_MIN + _SC_UCHAR_MAX, +#define _SC_UCHAR_MAX _SC_UCHAR_MAX + _SC_UINT_MAX, +#define _SC_UINT_MAX _SC_UINT_MAX + _SC_ULONG_MAX, +#define _SC_ULONG_MAX _SC_ULONG_MAX + _SC_USHRT_MAX, +#define _SC_USHRT_MAX _SC_USHRT_MAX + + _SC_NL_ARGMAX, +#define _SC_NL_ARGMAX _SC_NL_ARGMAX + _SC_NL_LANGMAX, +#define _SC_NL_LANGMAX _SC_NL_LANGMAX + _SC_NL_MSGMAX, +#define _SC_NL_MSGMAX _SC_NL_MSGMAX + _SC_NL_NMAX, +#define _SC_NL_NMAX _SC_NL_NMAX + 
_SC_NL_SETMAX, +#define _SC_NL_SETMAX _SC_NL_SETMAX + _SC_NL_TEXTMAX, +#define _SC_NL_TEXTMAX _SC_NL_TEXTMAX + + _SC_XBS5_ILP32_OFF32, +#define _SC_XBS5_ILP32_OFF32 _SC_XBS5_ILP32_OFF32 + _SC_XBS5_ILP32_OFFBIG, +#define _SC_XBS5_ILP32_OFFBIG _SC_XBS5_ILP32_OFFBIG + _SC_XBS5_LP64_OFF64, +#define _SC_XBS5_LP64_OFF64 _SC_XBS5_LP64_OFF64 + _SC_XBS5_LPBIG_OFFBIG, +#define _SC_XBS5_LPBIG_OFFBIG _SC_XBS5_LPBIG_OFFBIG + + _SC_XOPEN_LEGACY, +#define _SC_XOPEN_LEGACY _SC_XOPEN_LEGACY + _SC_XOPEN_REALTIME, +#define _SC_XOPEN_REALTIME _SC_XOPEN_REALTIME + _SC_XOPEN_REALTIME_THREADS, +#define _SC_XOPEN_REALTIME_THREADS _SC_XOPEN_REALTIME_THREADS + + _SC_ADVISORY_INFO, +#define _SC_ADVISORY_INFO _SC_ADVISORY_INFO + _SC_BARRIERS, +#define _SC_BARRIERS _SC_BARRIERS + _SC_BASE, +#define _SC_BASE _SC_BASE + _SC_C_LANG_SUPPORT, +#define _SC_C_LANG_SUPPORT _SC_C_LANG_SUPPORT + _SC_C_LANG_SUPPORT_R, +#define _SC_C_LANG_SUPPORT_R _SC_C_LANG_SUPPORT_R + _SC_CLOCK_SELECTION, +#define _SC_CLOCK_SELECTION _SC_CLOCK_SELECTION + _SC_CPUTIME, +#define _SC_CPUTIME _SC_CPUTIME + _SC_THREAD_CPUTIME, +#define _SC_THREAD_CPUTIME _SC_THREAD_CPUTIME + _SC_DEVICE_IO, +#define _SC_DEVICE_IO _SC_DEVICE_IO + _SC_DEVICE_SPECIFIC, +#define _SC_DEVICE_SPECIFIC _SC_DEVICE_SPECIFIC + _SC_DEVICE_SPECIFIC_R, +#define _SC_DEVICE_SPECIFIC_R _SC_DEVICE_SPECIFIC_R + _SC_FD_MGMT, +#define _SC_FD_MGMT _SC_FD_MGMT + _SC_FIFO, +#define _SC_FIFO _SC_FIFO + _SC_PIPE, +#define _SC_PIPE _SC_PIPE + _SC_FILE_ATTRIBUTES, +#define _SC_FILE_ATTRIBUTES _SC_FILE_ATTRIBUTES + _SC_FILE_LOCKING, +#define _SC_FILE_LOCKING _SC_FILE_LOCKING + _SC_FILE_SYSTEM, +#define _SC_FILE_SYSTEM _SC_FILE_SYSTEM + _SC_MONOTONIC_CLOCK, +#define _SC_MONOTONIC_CLOCK _SC_MONOTONIC_CLOCK + _SC_MULTI_PROCESS, +#define _SC_MULTI_PROCESS _SC_MULTI_PROCESS + _SC_SINGLE_PROCESS, +#define _SC_SINGLE_PROCESS _SC_SINGLE_PROCESS + _SC_NETWORKING, +#define _SC_NETWORKING _SC_NETWORKING + _SC_READER_WRITER_LOCKS, +#define _SC_READER_WRITER_LOCKS _SC_READER_WRITER_LOCKS + _SC_SPIN_LOCKS, +#define _SC_SPIN_LOCKS _SC_SPIN_LOCKS + _SC_REGEXP, +#define _SC_REGEXP _SC_REGEXP + _SC_REGEX_VERSION, +#define _SC_REGEX_VERSION _SC_REGEX_VERSION + _SC_SHELL, +#define _SC_SHELL _SC_SHELL + _SC_SIGNALS, +#define _SC_SIGNALS _SC_SIGNALS + _SC_SPAWN, +#define _SC_SPAWN _SC_SPAWN + _SC_SPORADIC_SERVER, +#define _SC_SPORADIC_SERVER _SC_SPORADIC_SERVER + _SC_THREAD_SPORADIC_SERVER, +#define _SC_THREAD_SPORADIC_SERVER _SC_THREAD_SPORADIC_SERVER + _SC_SYSTEM_DATABASE, +#define _SC_SYSTEM_DATABASE _SC_SYSTEM_DATABASE + _SC_SYSTEM_DATABASE_R, +#define _SC_SYSTEM_DATABASE_R _SC_SYSTEM_DATABASE_R + _SC_TIMEOUTS, +#define _SC_TIMEOUTS _SC_TIMEOUTS + _SC_TYPED_MEMORY_OBJECTS, +#define _SC_TYPED_MEMORY_OBJECTS _SC_TYPED_MEMORY_OBJECTS + _SC_USER_GROUPS, +#define _SC_USER_GROUPS _SC_USER_GROUPS + _SC_USER_GROUPS_R, +#define _SC_USER_GROUPS_R _SC_USER_GROUPS_R + _SC_2_PBS, +#define _SC_2_PBS _SC_2_PBS + _SC_2_PBS_ACCOUNTING, +#define _SC_2_PBS_ACCOUNTING _SC_2_PBS_ACCOUNTING + _SC_2_PBS_LOCATE, +#define _SC_2_PBS_LOCATE _SC_2_PBS_LOCATE + _SC_2_PBS_MESSAGE, +#define _SC_2_PBS_MESSAGE _SC_2_PBS_MESSAGE + _SC_2_PBS_TRACK, +#define _SC_2_PBS_TRACK _SC_2_PBS_TRACK + _SC_SYMLOOP_MAX, +#define _SC_SYMLOOP_MAX _SC_SYMLOOP_MAX + _SC_STREAMS, +#define _SC_STREAMS _SC_STREAMS + _SC_2_PBS_CHECKPOINT, +#define _SC_2_PBS_CHECKPOINT _SC_2_PBS_CHECKPOINT + + _SC_V6_ILP32_OFF32, +#define _SC_V6_ILP32_OFF32 _SC_V6_ILP32_OFF32 + _SC_V6_ILP32_OFFBIG, +#define _SC_V6_ILP32_OFFBIG _SC_V6_ILP32_OFFBIG + _SC_V6_LP64_OFF64, +#define 
_SC_V6_LP64_OFF64 _SC_V6_LP64_OFF64 + _SC_V6_LPBIG_OFFBIG, +#define _SC_V6_LPBIG_OFFBIG _SC_V6_LPBIG_OFFBIG + + _SC_HOST_NAME_MAX, +#define _SC_HOST_NAME_MAX _SC_HOST_NAME_MAX + _SC_TRACE, +#define _SC_TRACE _SC_TRACE + _SC_TRACE_EVENT_FILTER, +#define _SC_TRACE_EVENT_FILTER _SC_TRACE_EVENT_FILTER + _SC_TRACE_INHERIT, +#define _SC_TRACE_INHERIT _SC_TRACE_INHERIT + _SC_TRACE_LOG, +#define _SC_TRACE_LOG _SC_TRACE_LOG + + _SC_LEVEL1_ICACHE_SIZE, +#define _SC_LEVEL1_ICACHE_SIZE _SC_LEVEL1_ICACHE_SIZE + _SC_LEVEL1_ICACHE_ASSOC, +#define _SC_LEVEL1_ICACHE_ASSOC _SC_LEVEL1_ICACHE_ASSOC + _SC_LEVEL1_ICACHE_LINESIZE, +#define _SC_LEVEL1_ICACHE_LINESIZE _SC_LEVEL1_ICACHE_LINESIZE + _SC_LEVEL1_DCACHE_SIZE, +#define _SC_LEVEL1_DCACHE_SIZE _SC_LEVEL1_DCACHE_SIZE + _SC_LEVEL1_DCACHE_ASSOC, +#define _SC_LEVEL1_DCACHE_ASSOC _SC_LEVEL1_DCACHE_ASSOC + _SC_LEVEL1_DCACHE_LINESIZE, +#define _SC_LEVEL1_DCACHE_LINESIZE _SC_LEVEL1_DCACHE_LINESIZE + _SC_LEVEL2_CACHE_SIZE, +#define _SC_LEVEL2_CACHE_SIZE _SC_LEVEL2_CACHE_SIZE + _SC_LEVEL2_CACHE_ASSOC, +#define _SC_LEVEL2_CACHE_ASSOC _SC_LEVEL2_CACHE_ASSOC + _SC_LEVEL2_CACHE_LINESIZE, +#define _SC_LEVEL2_CACHE_LINESIZE _SC_LEVEL2_CACHE_LINESIZE + _SC_LEVEL3_CACHE_SIZE, +#define _SC_LEVEL3_CACHE_SIZE _SC_LEVEL3_CACHE_SIZE + _SC_LEVEL3_CACHE_ASSOC, +#define _SC_LEVEL3_CACHE_ASSOC _SC_LEVEL3_CACHE_ASSOC + _SC_LEVEL3_CACHE_LINESIZE, +#define _SC_LEVEL3_CACHE_LINESIZE _SC_LEVEL3_CACHE_LINESIZE + _SC_LEVEL4_CACHE_SIZE, +#define _SC_LEVEL4_CACHE_SIZE _SC_LEVEL4_CACHE_SIZE + _SC_LEVEL4_CACHE_ASSOC, +#define _SC_LEVEL4_CACHE_ASSOC _SC_LEVEL4_CACHE_ASSOC + _SC_LEVEL4_CACHE_LINESIZE, +#define _SC_LEVEL4_CACHE_LINESIZE _SC_LEVEL4_CACHE_LINESIZE + /* Leave room here, maybe we need a few more cache levels some day. */ + + _SC_IPV6 = _SC_LEVEL1_ICACHE_SIZE + 50, +#define _SC_IPV6 _SC_IPV6 + _SC_RAW_SOCKETS, +#define _SC_RAW_SOCKETS _SC_RAW_SOCKETS + + _SC_V7_ILP32_OFF32, +#define _SC_V7_ILP32_OFF32 _SC_V7_ILP32_OFF32 + _SC_V7_ILP32_OFFBIG, +#define _SC_V7_ILP32_OFFBIG _SC_V7_ILP32_OFFBIG + _SC_V7_LP64_OFF64, +#define _SC_V7_LP64_OFF64 _SC_V7_LP64_OFF64 + _SC_V7_LPBIG_OFFBIG, +#define _SC_V7_LPBIG_OFFBIG _SC_V7_LPBIG_OFFBIG + + _SC_SS_REPL_MAX, +#define _SC_SS_REPL_MAX _SC_SS_REPL_MAX + + _SC_TRACE_EVENT_NAME_MAX, +#define _SC_TRACE_EVENT_NAME_MAX _SC_TRACE_EVENT_NAME_MAX + _SC_TRACE_NAME_MAX, +#define _SC_TRACE_NAME_MAX _SC_TRACE_NAME_MAX + _SC_TRACE_SYS_MAX, +#define _SC_TRACE_SYS_MAX _SC_TRACE_SYS_MAX + _SC_TRACE_USER_EVENT_MAX, +#define _SC_TRACE_USER_EVENT_MAX _SC_TRACE_USER_EVENT_MAX + + _SC_XOPEN_STREAMS, +#define _SC_XOPEN_STREAMS _SC_XOPEN_STREAMS + + _SC_THREAD_ROBUST_PRIO_INHERIT, +#define _SC_THREAD_ROBUST_PRIO_INHERIT _SC_THREAD_ROBUST_PRIO_INHERIT + _SC_THREAD_ROBUST_PRIO_PROTECT +#define _SC_THREAD_ROBUST_PRIO_PROTECT _SC_THREAD_ROBUST_PRIO_PROTECT + +}; +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/bits/posix1_lim.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/bits/posix1_lim.h new file mode 100755 index 0000000000000..0739958c5a6c4 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/bits/posix1_lim.h @@ -0,0 +1,34 @@ +#ifndef POSIX1_LIM_H +#define POSIX1_LIM_H +/** + @file posix1_lim.h + @brief POSIX Minimum values + +EXTERNAL FUNCTIONS + None + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None + +TODO + This header should be ideally relocated under api/posix/bits (something that + doesnt exist today) and be included from api/posix/bits/limits.h which inturn + should be 
included from toolchain's limits.h + +Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ + +#ifndef _POSIX_PATH_MAX +/** @brief Maximum number of bytes in a pathname, including the terminating + nul character */ +#define _POSIX_PATH_MAX 256 +#endif + +#ifndef _POSIX_SEM_NSEMS_MAX +/** @brief Maximum number of semaphores that a process may have */ +#define _POSIX_SEM_NSEMS_MAX 16 +#endif +#endif + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/common/time.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/common/time.h new file mode 100755 index 0000000000000..76b0d39ab7039 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/common/time.h @@ -0,0 +1 @@ +#include \ No newline at end of file diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/fcntl.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/fcntl.h new file mode 100755 index 0000000000000..c80ec98a449b6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/fcntl.h @@ -0,0 +1,51 @@ +#ifndef _FCNTL_H +#define _FCNTL_H + +/*========================================================================== + * FILE: fcntl.h + * + * SERVICES: POSIX fcntl.h + * + * DESCRIPTION: The header is needed by the open() and fcntl() + * system calls, which have a variety of parameters and + * flags. They are described here. + * + * The formats of the calls to each of these are: + * + * open(path, oflag [,mode]) open a file + * fcntl(fd, cmd [,arg]) get or set file attributes + * + * Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Oflag values for open(). POSIX Table 6-4. */ +#define POSIX_O_CREAT 0x100 /* creat file if it doesn't exist */ +#define POSIX_O_EXCL 0x200 /* exclusive use flag */ +#define POSIX_O_NOCTTY 0x400 /* do not assign a controlling terminal */ +#define POSIX_O_TRUNC 0x1000 /* truncate flag */ + +/* File status flags for open() and fcntl(). POSIX Table 6-5. */ +#define POSIX_O_APPEND 0x2000 /* set append mode */ +#define POSIX_O_NONBLOCK 0x4000 /* no delay */ + +/* File access modes for open() and fcntl(). POSIX Table 6-6. */ +#define POSIX_O_RDONLY 0 /* open(name, POSIX_O_RDONLY) opens read only */ +#define POSIX_O_WRONLY 1 /* open(name, POSIX_O_WRONLY) opens write only */ +#define POSIX_O_RDWR 2 /* open(name, POSIX_O_RDWR) opens read/write */ + +/* Mask for use with file access modes. POSIX Table 6-7. 
*/ +#define POSIX_O_ACCMODE 0x3 /* mask for file access modes */ + +#ifdef __cplusplus +} +#endif + +#endif /* _FCNTL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/hooks/unistd.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/hooks/unistd.h new file mode 100755 index 0000000000000..1c618bfe36b4f --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/hooks/unistd.h @@ -0,0 +1,115 @@ +#ifndef UNISTD_H +#define UNISTD_H +/** + @file posix/hooks/unistd.h + @brief POSIX related declarations in that are missing in toolchain + header + +EXTERNAL FUNCTIONS + None + +INITIALIZATION AND SEQUENCING REQUIREMENTS + DONT include this header directly! Instead include unistd.h. + +Copyright (c) 2018, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +==============================================================================*/ +#include /* For various POSIX ID types from toolchain headers */ + +#ifdef __cplusplus +extern "C" { +#endif +extern long pathconf (char const * path, int name); + +/* Process*/ + +/** The getppid() function shall return the parent process ID of the calling process. + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] the parent process ID + */ +pid_t getppid(void); + +/** The getpgid() function shall return the process group ID of the process whose process ID is equal to pid + * Please refer to POSIX standard for details. + * @param thread [in] process ID + * @param value_ptr [out] process group ID + */ +pid_t getpgid(pid_t pid); + +/** The getpgrp() function shall return the process group ID of the calling process + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] process group ID of the calling process + */ +pid_t getpgrp(void); + +/**The getuid() function shall return the real user ID of the calling process. + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] the real user ID of the calling process. + */ +uid_t getuid(void); + +/** The geteuid() function shall return the effective user ID of the calling process + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] effective user ID of the calling process + */ +uid_t geteuid(void); + +/** The getegid() function shall return the effective group ID of the calling process. + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] effective group ID of the calling process. + */ +gid_t getegid(void); + +/** The getgid() function shall return the real group ID of the calling process + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] real group ID of the calling process. + */ + gid_t getgid(void); + +/** seteuid set effective user ID + * Please refer to POSIX standard for details. + * @param thread [in] effective user ID + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. + */ +int seteuid(uid_t uid); + +/** setpgrp - set the process group ID + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. 
+ */ +pid_t setpgrp(void); + +/** setuid - set user ID + * Please refer to POSIX standard for details. + * @param thread [in] user ID + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. + */ +int setuid(uid_t uid); + +/** setpgid - set process group ID for job control + * Please refer to POSIX standard for details. + * @param thread [in] PID of process, PGID to be set + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. + */ +int setpgid(pid_t pid, pid_t pgid); + +/** setsid - create session and set process group ID + * Please refer to POSIX standard for details. + * @param thread [in] none + * @param value_ptr [out] Upon successful completion, 0 shall be returned; otherwise, -1 shall be returned and errno set to indicate the error. + */ +pid_t setsid(void); + +#ifdef __cplusplus +} +#endif +#endif + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/mqueue.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/mqueue.h new file mode 100755 index 0000000000000..74dcc2fa202c6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/mqueue.h @@ -0,0 +1,203 @@ +#ifndef _POSIX_MQUEUE_H_ +#define _POSIX_MQUEUE_H_ + +/*========================================================================== + * FILE: mqueue.h + * + * SERVICES: POSIX Message Queue API interface + * + * DESCRIPTION: POSIX Message Queue API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016, 2023 Qualcomm Technologies, Inc. + * All Rights Reserved. + * Confidential and Proprietary - Qualcomm Technlogies, Inc. + *==========================================================================*/ + +#include /*ssize_t */ +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define MQ_PRIO_MAX 255 /* max priority */ +#define MQ_PRIO_DEFAULT 0 /* default priority */ + +typedef int mqd_t; + +struct mq_attr +{ + long mq_flags; /* message queue flags */ + long mq_maxmsg; /* maximum number of messages */ + long mq_msgsize; /* maximum message size */ + long mq_curmsgs; /* number of messages currently queued */ +}; + +typedef struct mq_attr mqueue_attr; + +/** \details + * This provides POSIX Message Queue API. + * + * mq_notify is not supported. + * + * Since this implementation of POSIX kernel API is a subset of PSE51, + * it only supports Message sending and receiving within one process. + * Message sending and receiving among processes are not supported. + */ + +/** \defgroup mqueue POSIX Message Queue API */ +/** \ingroup mqueue */ +/** @{ */ + +/** Open a message queue. + * Please refer to POSIX standard for details. + */ +mqd_t mq_open(const char *name, int oflag, /* mode_t mode, struct mq_attr *attr */...); + +/** Close a message queue. + * Please refer to POSIX standard for details. + */ +int mq_close(mqd_t mq_desc); + +/** Remove a message queue. + * Please refer to POSIX standard for details. + */ +int mq_unlink(const char *name); + +/** Send a message to a message queue. + * Please refer to POSIX standard for details. + * + * If the queue is full, instead of blocking the sender, this function + * will return -1 with errno EAGAIN, in this implementation. This behavior + * may change in the future. + */ +int mq_send(mqd_t mqdes, const char *msg_ptr, size_t msg_len, unsigned int msg_prio); + +/** Send a message to a message queue with timeout. 
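+ *
+ * (Illustrative sketch, not part of the original header, assuming mqdes was
+ * obtained earlier from mq_open():
+ *   char buf[] = "ping";
+ *   if (mq_send(mqdes, buf, sizeof buf, MQ_PRIO_DEFAULT) == -1) {
+ *       // with a full queue this implementation sets errno to EAGAIN
+ *   }
+ * )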
+ * Please refer to POSIX standard for details. + * @param abs_timeout [in] Only abs_timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +int mq_timedsend(mqd_t mqdes, const char *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout); + +/** Receive a message from a message queue. + * Please refer to POSIX standard for details. + */ +ssize_t mq_receive(mqd_t mqdes, char *msg_ptr, size_t msg_len, unsigned int *msg_prio); + +/** Receive a message from a message queue with timeout. + * Please refer to POSIX standard for details. + * @param abs_timeout [in] Only abs_timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +ssize_t mq_timedreceive(mqd_t mqdes, char *restrict msg_ptr, size_t msg_len, unsigned int *restrict msg_prio, const struct timespec *restrict abs_timeout); + +/** Get message queue attributes. + * Please refer to POSIX standard for details. + */ +int mq_getattr(mqd_t mqdes, struct mq_attr *mqstat); + +/** Set message queue attributes. + * Please refer to POSIX standard for details. + */ +int mq_setattr(mqd_t mqdes, const struct mq_attr *restrict mqstat, struct mq_attr *restrict omqstat); + +/** @} */ + +#define NBBY 8U /* number of bits in a byte */ + +/* + * Select uses bit masks of file descriptors in longs. These macros + * manipulate such bit fields (the filesystem macros use chars). + * FD_SETSIZE may be defined by the user, but the default here should + * be enough for most uses. + */ +#ifndef FD_SETSIZE +#define FD_SETSIZE 256U +#endif + +typedef unsigned long fd_mask; +#define NFDBITS (sizeof(fd_mask) * (unsigned int)NBBY) /* bits per mask */ + +#ifndef howmany +#define howmany(x, y) (((x) + ((y) - 1U)) / (y)) +#endif + +//equivalent of fd_set fpr WINNT env +typedef struct fd_set +{ + fd_mask fds_bits[howmany(FD_SETSIZE, NFDBITS)]; +} fd_set; + +/** \addtogroup mqueue */ +/** @{ */ + +/** Sets the bit for the file descriptor fd in the file descriptor set fdset. + */ +#define FD_SET(n, p) ((p)->fds_bits[((unsigned int) (n)) / NFDBITS] |= (1UL << (((unsigned int) (n)) % NFDBITS))) + +/** Clears the bit for the file descriptor fd in the file descriptor set fdset. + */ +#define FD_CLR(n, p) ((p)->fds_bits[((unsigned int) (n)) / NFDBITS] &= ~(1UL << (((unsigned int) (n)) % NFDBITS))) + +/** Returns a non-zero value if the bit for the file descriptor fd is set in the file descriptor set pointed to by fdset, and 0 otherwise. + */ +#define FD_ISSET(n, p) ((unsigned long)(p)->fds_bits[((unsigned int) (n)) / NFDBITS] & (unsigned long)((unsigned)1U << (((unsigned int) (n)) % NFDBITS))) + +/** Copies the file descriptor set. + */ +#define FD_COPY(f, t) (void)(memcpy)((t), (f), sizeof(*(f))) + +/** Initializes the file descriptor set fdset to have zero bits for all file descriptors. + */ +#define FD_ZERO(p) (void)memset((p), 0, sizeof(*(p))) + +/** Error check the file descriptor set. + */ +#define FD_BAD(fd) ((fd) < 0 /*|| fd >= fd_arraylen || fd_array[fd].obj == 0*/) + +/*! Wait for both message queues and signals. In this implementation, only + * message queue file descriptors are supported. + * @param nfds [in] This is an integer one more than the maximum of any file + * descriptor in any of the sets. In other words, while you are busy + * adding file descriptors to your sets, you must calculate the maximum + * integer value of all of them, then increment this value by one, and + * then pass this as nfds to select(). 
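+ * For illustration (not in the original header), with two message-queue
+ * descriptors mq1 and mq2 and the required zero timeout:
+ *   fd_set rfds;
+ *   FD_ZERO(&rfds);
+ *   FD_SET(mq1, &rfds);
+ *   FD_SET(mq2, &rfds);
+ *   int n = pselect(((mq1 > mq2) ? mq1 : mq2) + 1,
+ *                   &rfds, NULL, NULL, &zero_ts, NULL);
+ *   // zero_ts is an assumed struct timespec initialized to {0,0}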
+ * @param readfds [in] the file descriptor set on all message queues. + * @param writefds [in] ignored in this implementation. + * @param errorfds [in] ignored in this implementation. + * @param timeout [in] Only timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +int pselect(int nfds, fd_set *restrict readfds, + fd_set *restrict writefds, fd_set *restrict errorfds, + const struct timespec *restrict timeout, + const sigset_t *restrict sigmask); + +/*! Wait for multiple message queues. In this implementation, only + * message queue file descriptors are supported. + * @param nfds [in] This is an integer one more than the maximum of any file + * descriptor in any of the sets. In other words, while you are busy + * adding file descriptors to your sets, you must calculate the maximum + * integer value of all of them, then increment this value by one, and + * then pass this as nfds to select(). + * @param readfds [in] the file descriptor set on all message queues. + * @param writefds [in] ignored in this implementation. + * @param errorfds [in] ignored in this implementation. + * @param timeout [in] Only timeout={0,0} is supported in this + * implementation. This behavior may change in the future. + */ +int select(int nfds, fd_set *restrict readfds, + fd_set *restrict writefds, fd_set *restrict errorfds, + struct timeval *restrict timeout); + +/** @} */ + +/* this function is needed for test framework which needs to clean up memory when teardown */ +void _mq_teardown(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/pthread.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/pthread.h new file mode 100755 index 0000000000000..f64242e8dc683 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/pthread.h @@ -0,0 +1,287 @@ +#ifndef QURT_PTHREAD_H +#define QURT_PTHREAD_H + +/*========================================================================== + * FILE: pthread.h + * + * SERVICES: POSIX pthread API interface + * + * DESCRIPTION: POSIX pthread API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013,2016,2023 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + *========================================================================== + * + * EDIT HISTORY FOR MODULE + * + * This section contains comments describing changes made to the module. + * Notice that changes are listed in reverse chronological order. + * + * + * + * when who what, where, why + * -------- --- ------------------------------------------------------- + * 10/13/08 cz Initial version. + *==========================================================================*/ + +#include +#include "sys/sched.h" /* For struct sched_param */ +#include "sys/errno.h" /* error values */ +#include +#include +#include +#include +#include +#include "pthread_types.h" +#ifdef __cplusplus +extern "C" { +#endif + +/* the range of the set supported by the kernel data type used to represent CPU sets. */ +#define CONFIG_NR_CPUS QURT_THREAD_CFG_BITMASK_ALL + +#define UNIMPLEMENTED(FUNC, RETURNTYPE, ARGS) static inline RETURNTYPE FUNC ARGS { qurt_printf("Unimplemented: %s... 
exiting\n", __FUNCTION__); exit(1); } + +/** @brief Magic (non-portable) value for a stack's address to enable usage + of auto-stack feature (if available) */ +#define PTHREAD_AUTO_STACK_MAGIC_ADDR_NP ((void *)0xFFF) + +/** \details + * This provides POSIX thread API. + * + */ + +/** \defgroup pthread POSIX pthread API */ +/** \ingroup pthread */ +/** @{ */ + +/** Compare Two Threads. + * Please refer to POSIX standard for details. + */ +static inline int pthread_equal(pthread_t t1, pthread_t t2) +{ + return (t1 == t2) ? 1 : 0; +} + +/** Create Thread. + * Please refer to POSIX standard for details. + */ +int pthread_create(pthread_t * tid, const pthread_attr_t * attr, void *(*start)(void *), void *arg); + +/** Terminate Calling Thread. + * Please refer to POSIX standard for details. + */ +void pthread_exit(void *value_ptr); + +/** Wait for thread termination. + * Please refer to POSIX standard for details. + * @param thread [in] the thread to be joined + * @param value_ptr [out] pointer to the exit status + */ +int pthread_join(pthread_t thread, void **value_ptr); + +/** Detach a joinable thread. + * Please refer to POSIX standard for details. + * @param id [in] ID of the thread to be detached. + */ +int pthread_detach(pthread_t id); + +/** Dynamic package initialisation + * Please refer to POSIX standard for details. + */ +int pthread_once(pthread_once_t *once_control, void (*init_routine)(void)); + +pthread_t pthread_self(void); +int pthread_cancel(pthread_t thread); +static inline void pthread_yield(void) +{ + return; +} + +int pthread_kill(pthread_t thread, int sig); + +/** + * @brief Return name of thread + * @warning Do not call this in the error handling path as it may cause deadlock + * due to underlying OS calls + * @param thread [in] thread Thread whose name is to be retrieved + * @param name [out] name Buffer used to return thread name + * @param len [in] len Number of bytes available in name + * @return 0 on success, ESRCH, ERANGE on failure + */ +extern int pthread_getname_np (pthread_t thread, char * name, size_t len); + +int pthread_getschedparam(pthread_t thread, int *restrict policy, struct sched_param *restrict param); +int pthread_setschedparam(pthread_t thread, int policy, const struct sched_param *param); +int pthread_setschedprio(pthread_t thread, int prio); +int pthread_setcancelstate(int state, int *oldstate); +int pthread_setcanceltype(int type, int *oldtype); + +/* Attribute functions */ +int pthread_attr_init(pthread_attr_t *attr); +int pthread_attr_destroy(pthread_attr_t *attr); +int pthread_attr_setschedparam(pthread_attr_t *restrict attr, const sched_param *restrict param); +int pthread_attr_getschedparam(const pthread_attr_t *restrict attr, sched_param *restrict param); +int pthread_attr_setstacksize(pthread_attr_t *attr, size_t stacksize); +int pthread_attr_getstacksize(const pthread_attr_t *attr, size_t *stacksize); +int pthread_attr_setstackaddr(pthread_attr_t *attr, void * stackaddr); +int pthread_attr_getstackaddr(const pthread_attr_t *attr, void ** stackaddr); +int pthread_attr_getdetachstate(const pthread_attr_t *attr, int *detachstate); +int pthread_attr_setdetachstate(pthread_attr_t *attr, int detachstate); +int pthread_attr_setstack(pthread_attr_t *attr, void *stackaddr, size_t stacksize); +int pthread_attr_getstack(const pthread_attr_t *attr, void **stackaddr, size_t *stacksize); +int pthread_attr_setscope(pthread_attr_t *attr, int scope); +int pthread_attr_getscope(const pthread_attr_t *attr, int *scope); +int
pthread_attr_setinheritsched(pthread_attr_t *attr, int inheritsched); +int pthread_attr_getinheritsched(const pthread_attr_t *attr, int *inheritsched); +int pthread_attr_getguardsize(const pthread_attr_t * attr, size_t * guardsize); +int pthread_attr_setautostack(pthread_attr_t *attr); +int pthread_attr_setbuspriority(pthread_attr_t *attr, unsigned short bus_priority); + +/* Qualcomm additions to pthread get/set attribute functions */ +int pthread_attr_setthreadname(pthread_attr_t *attr, const char * name); +int pthread_attr_getthreadname(const pthread_attr_t *attr, char * name, int size); +int pthread_attr_settimetestid(pthread_attr_t *attr, unsigned int tid); +int pthread_attr_gettimetestid(const pthread_attr_t *attr, unsigned int* tid); + +/* Mutexes */ +int pthread_mutex_init(pthread_mutex_t *mutex, pthread_mutexattr_t *attr); +int pthread_mutex_lock(pthread_mutex_t *mutex); +int pthread_mutex_unlock(pthread_mutex_t *mutex); +int pthread_mutex_trylock(pthread_mutex_t *mutex); +int pthread_mutex_destroy(pthread_mutex_t *mutex); +int pthread_mutex_getprioceiling(const pthread_mutex_t *restrict mutex, int *restrict prioceiling); +int pthread_mutex_setprioceiling(pthread_mutex_t *restrict mutex, int prioceiling, int *restrict old_ceiling); + +/* For Mutex with type PTHREAD_MUTEX_NORMAL, Priority Inheritance is not + * supported even PTHREAD_PRIO_INHERIT is defined since QURT does not support + * this kind of Mutex */ +int pthread_mutexattr_init(pthread_mutexattr_t *attr); +int pthread_mutexattr_destroy(pthread_mutexattr_t *attr); +int pthread_mutexattr_gettype(const pthread_mutexattr_t *restrict, int *restrict); +int pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type); +int pthread_mutexattr_getprotocol(const pthread_mutexattr_t *restrict, int *restrict); +int pthread_mutexattr_setprotocol(pthread_mutexattr_t *attr, int protocol); +int pthread_mutexattr_getpshared(const pthread_mutexattr_t *restrict, int *restrict); +int pthread_mutexattr_setpshared(pthread_mutexattr_t *, int); +int pthread_mutexattr_getprioceiling(const pthread_mutexattr_t *restrict attr, int *restrict prioceiling); +int pthread_mutexattr_setprioceiling(pthread_mutexattr_t *attr, int prioceiling); + +/* Spinlocks */ +int pthread_spin_init(pthread_spinlock_t *lock, int pshared); +int pthread_spin_destroy(pthread_spinlock_t *lock); +int pthread_spin_lock(pthread_spinlock_t *lock); +int pthread_spin_trylock(pthread_spinlock_t *lock); +int pthread_spin_unlock(pthread_spinlock_t *lock); + +/* Condition variables */ +int pthread_condattr_init(pthread_condattr_t *attr); +int pthread_condattr_destroy(pthread_condattr_t *attr); +int pthread_condattr_setpshared(pthread_condattr_t *attr, int pshared); +int pthread_condattr_getpshared(const pthread_condattr_t *restrict attr, int *restrict pshared); +int pthread_condattr_setclock(pthread_condattr_t *attr, clockid_t clock); +int pthread_condattr_getclock(const pthread_condattr_t *restrict attr, clockid_t *restrict clock); +int pthread_cond_init(pthread_cond_t *cond, pthread_condattr_t *attr); +int pthread_cond_destroy(pthread_cond_t *cond); +int pthread_cond_signal(pthread_cond_t *cond); +int pthread_cond_broadcast(pthread_cond_t *cond); +int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex); +int pthread_cond_timedwait(pthread_cond_t * cond, pthread_mutex_t * mutex, const struct timespec *time); + +/* Barriers */ +int pthread_barrier_init(pthread_barrier_t *restrict barrier, const pthread_barrierattr_t *restrict attr, unsigned count); +int 
pthread_barrier_destroy(pthread_barrier_t *barrier); +int pthread_barrier_wait(pthread_barrier_t *barrier); +int pthread_barrierattr_init(pthread_barrierattr_t *attr); +int pthread_barrierattr_destroy(pthread_barrierattr_t *attr); +int pthread_barrierattr_getpshared(const pthread_barrierattr_t *restrict attr, int *restrict pshared); + + +/*Read-Write locks*/ +int pthread_rwlock_init(pthread_rwlock_t *, const pthread_rwlockattr_t *); +int pthread_rwlock_destroy(pthread_rwlock_t *); +int pthread_rwlockattr_init(pthread_rwlockattr_t *); +int pthread_rwlockattr_destroy(pthread_rwlockattr_t *); +int pthread_rwlockattr_getpshared(const pthread_rwlockattr_t *, int *); +int pthread_rwlockattr_setpshared(pthread_rwlockattr_t *, int); +int pthread_rwlock_rdlock(pthread_rwlock_t *); +int pthread_rwlock_tryrdlock(pthread_rwlock_t *); +int pthread_rwlock_wrlock(pthread_rwlock_t *); +int pthread_rwlock_trywrlock(pthread_rwlock_t *); +int pthread_rwlock_unlock(pthread_rwlock_t *); + + +/** please refer to POSIX standard document + */ +int pthread_barrierattr_setpshared(pthread_barrierattr_t *attr, int pshared); + +/** set CPU affinity attribute in thread attributes object. + + * @param attr [in] pthread attributes + * @param cpusetsize [in] The argument cpusetsize is the length (in bytes) + of the buffer pointed to by cpuset. Typically, + this argument would be specified as + sizeof(cpu_set_t). + * @param cpuset [in] This data set is a bitset where each bit represents + a CPU (hw thread). How the system's CPUs are mapped + to bits in the bitset is system dependent. + For QURT kernel, Bit 0 is corresponding to hw + thread 0, and so on. If the corresponding bit is + set to 1, then the software thread is eligible to + run this hw thread. 0x3f means it can run any hw + threads 0x0 also means it can run on any hw threads. + @return On success, this function returns 0; on error, it returns a + non-zero error number. + EINVAL - cpuset specified a CPU that was outside the set supported + by the kernel. (The kernel configuration option + CONFIG_NR_CPUS defines the range of the set supported by + the kernel data type used to represent CPU sets.) + * @note This function is non-standard GNU extensions; hence the suffix "_np" + (non-portable) in the names. + */ +int pthread_attr_setaffinity_np(pthread_attr_t *attr, size_t cpusetsize, const cpu_set_t *cpuset); + +/** get CPU affinity attribute in thread attributes object. + * @param attr [in] pthread attributes + * @param cpusetsize [in] The argument cpusetsize is the length (in bytes) + of the buffer pointed to by cpuset. Typically, + this argument would be specified as + sizeof(cpu_set_t). + * @param cpuset [out] This data set is a bitset where each bit represents + a CPU (hw thread). How the system's CPUs are mapped + to bits in the bitset is system dependent. + For QURT kernel, Bit 0 is corresponding to hw + thread 0, and so on. If the corresponding bit is + set to 1, then the software thread is eligible to + run this hw thread. 0x3f means it can run any hw + threads 0x0 also means it can run on any hw threads. + @return On success, this function returns 0; on error, it returns a + non-zero error number. + EINVAL - cpusetsize is smaller than the size of the affinity mask + used by the kernel. + * @note This function is non-standard GNU extensions; hence the suffix "_np" + (non-portable) in the names. 
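+ *
+ * Illustration (not in the original header; relies on cpu_set_t being the
+ * plain bitmask typedef from pthread_types.h):
+ *   pthread_attr_t attr;
+ *   cpu_set_t mask = 0x3;                    // eligible on hw threads 0 and 1
+ *   pthread_attr_init(&attr);
+ *   pthread_attr_setaffinity_np(&attr, sizeof(mask), &mask);
+ *   pthread_attr_getaffinity_np(&attr, sizeof(mask), &mask);  // read it back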
+ */ +int pthread_attr_getaffinity_np(pthread_attr_t *attr, size_t cpusetsize, cpu_set_t *cpuset); + +/* TLS */ +int pthread_key_create(pthread_key_t *key, void (*destructor)(void*)); +int pthread_key_delete(pthread_key_t key); +int pthread_setspecific(pthread_key_t key, const void *value); +void *pthread_getspecific(pthread_key_t key); +int pthread_getattr_np(pthread_t thread, pthread_attr_t * restrict attr); + +/** @} */ + +/* Non-pthread callers use this function to create a pthread TCB without creating an actual thread */ +int pthread_fake(pthread_t * restrict thread, const pthread_attr_t * restrict attr); +int pthread_fake_destroy(pthread_t thread); + +//amitkulk: move these to unistd.h after we move that header within qurt +int posix_memalign(void **memptr, size_t alignment, size_t size); +void exit(int status); +#ifdef __cplusplus +} +#endif + +#endif /* QURT_PTHREAD_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/pthread_types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/pthread_types.h new file mode 100755 index 0000000000000..51c3b9dbca243 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/pthread_types.h @@ -0,0 +1,193 @@ +#ifndef _PTHREAD_TYPES_H_ +#define _PTHREAD_TYPES_H_ + +/*========================================================================== + * FILE: pthread_types.h + * + * SERVICES: types used in the POSIX API interface + * + * DESCRIPTION: POSIX API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2016, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __GNUC__ +#define restrict __restrict__ +#else +#define restrict +#endif + +#define _SSIZE_T + +#ifndef TRUE +#define TRUE 1 +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + +#define PTHREAD_MAX_THREADS 512U + +#define PTHREAD_NAME_LEN 16 +#define PTHREAD_MIN_STACKSIZE 512 //4096 +#define PTHREAD_MAX_STACKSIZE 1048576 +#define PTHREAD_DEFAULT_STACKSIZE 16384 + +#define PTHREAD_STACK_MIN (4096U*2U) +#define PTHREAD_MIN_PRIORITY 0U +#define PTHREAD_MAX_PRIORITY 255U +#define PTHREAD_DEFAULT_PRIORITY 1 + +/*Mutex initialization status*/ +#define PTHREAD_MUTEX_ATTR_UNINITIALIZED 0 +#define PTHREAD_MUTEX_ATTR_INITIALIZED 1 + +/*Conditional attributes initialization status*/ +#define PTHREAD_COND_ATTR_UNINITIALIZED 0 +#define PTHREAD_COND_ATTR_INITIALIZED 1 + +#define PTHREAD_DEFAULT_NAME "Anonymous" + +#define PTHREAD_MUTEX_INITIALIZER ((pthread_mutex_t) 0xFFFFFFFFU) + +#define PTHREAD_COND_INITIALIZER ((pthread_cond_t) 0xFFFFFFFFU) + +/* mutex and cond_var shared */ +#define PTHREAD_PROCESS_PRIVATE 0 +#define PTHREAD_PROCESS_SHARED 1 + +/* mutex type */ +#define PTHREAD_MUTEX_ERRORCHECK 0 +#define PTHREAD_MUTEX_NORMAL 1 +#define PTHREAD_MUTEX_RECURSIVE 2 +#define PTHREAD_MUTEX_DEFAULT 3 + +/* mutex protocol */ +#define PTHREAD_PRIO_NONE 0 +#define PTHREAD_PRIO_INHERIT 1 +#define PTHREAD_PRIO_PROTECT 2 + +#define PTHREAD_SPINLOCK_UNLOCKED 0 +#define PTHREAD_SPINLOCK_LOCKED 1 + +#define PTHREAD_ONCE_INIT (0) + +#define PTHREAD_MUTEX_OPAQUE //ToDo: amitkulk: debug + +typedef signed int ssize_t; + +/*detachstate of a pthread*/ +#define PTHREAD_CREATE_JOINABLE 1 +#define PTHREAD_CREATE_DETACHED 0 + +/*contention scope*/ +#define PTHREAD_SCOPE_PROCESS 1 +#define PTHREAD_SCOPE_SYSTEM 0 + +/*scheduler*/ +#define PTHREAD_INHERIT_SCHED 1
+#define PTHREAD_EXPLICIT_SCHED 0 + +/* + * Types and structure definitions + * + */ +typedef unsigned int cpu_set_t; + +typedef unsigned int pthread_t; + +typedef struct pthread_attr_t +{ + void *stackaddr; + int internal_stack; /* this flag==1 means the stack needs to be freed by posix */ + size_t stacksize; + int priority; + unsigned short timetest_id; + /* This flag indicate if thread will be autostack thread*/ + unsigned short autostack:1; + /* This flag is to indicate thread's bus_priority high/low + bus_priority = 0 -- Bus_priority is low + bus_priority = 1 -- Bus_priority is high + bus_priority = 3 -- Bus_priority is default (takes the default set for the process) + */ + unsigned short bus_priority:2; + unsigned short reserved:13; + cpu_set_t cpumask; + char name[PTHREAD_NAME_LEN]; + /* This flag indicates whether pthread lib should create thread contexts for other OSALs */ + /* This is used internally by POSIX and not available for general usage */ + int ext_context; + int detachstate; +} pthread_attr_t; + +//mutex attr +typedef struct pthread_mutexattr_t pthread_mutexattr_t; +struct pthread_mutexattr_t +{ + int is_initialized; + int type; + int pshared; + int protocol; +}; + +typedef unsigned int pthread_mutex_t; + +typedef unsigned int pthread_spinlock_t; + +typedef struct pthread_condattr_t +{ + int is_initialized; + int pshared; + clockid_t clock_id; +} pthread_condattr_t; + +typedef unsigned int pthread_cond_t; + +typedef struct pthread_barrierattr_t +{ + int is_initialized; + int pshared; +} pthread_barrierattr_t; + +typedef unsigned int pthread_barrier_t; + +typedef int pthread_key_t; + +typedef int pthread_once_t; + + +/*Read-Write locks*/ +#define PTW32_RWLOCK_MAGIC 0xfacade2 +#define PTHREAD_RWLOCK_INITIALIZER ((pthread_rwlock_t)(size_t) -1) + +struct pthread_rwlockattr_t_ +{ + int pshared; +}; + +struct pthread_rwlock_t_ +{ + pthread_mutex_t mtxExclusiveAccess; + pthread_mutex_t mtxSharedAccessCompleted; + pthread_cond_t cndSharedAccessCompleted; + int nSharedAccessCount; + int nExclusiveAccessCount; + int nCompletedSharedAccessCount; + int nMagic; +}; + +typedef struct pthread_rwlock_t_ * pthread_rwlock_t; +typedef struct pthread_rwlockattr_t_ * pthread_rwlockattr_t; +#ifdef __cplusplus +} +#endif + +#endif /* _PTHERAD_TYPES_H_ */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sched.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sched.h new file mode 100755 index 0000000000000..faf3365be9f82 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sched.h @@ -0,0 +1,21 @@ +/*============================================================================= + + sched.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. 
+=============================================================================*/ +#ifndef __SCHED_H__ +#define __SCHED_H__ + +#include "sys/sched.h" + +#endif //__SCHED_H__ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/semaphore.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/semaphore.h new file mode 100755 index 0000000000000..d9145b295ae62 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/semaphore.h @@ -0,0 +1,114 @@ +#ifndef SEMAPHORE_H +#define SEMAPHORE_H + +/*========================================================================== + * FILE: semaphore.h + * + * SERVICES: POSIX semaphore API interface + * + * DESCRIPTION: POSIX semaphore API interface based upon POSIX 1003.1-2004 + * + * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential. + + *==========================================================================*/ +#include // Get all C sys types - includes POSIX specific +#include "sys/errno.h" // error values + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** User facing semaphore container with opaque pointer to implementation */ +typedef struct +{ + unsigned int *opaque; +} sem_t; +#define _SEM_T + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/* constant definitions */ +#define SEM_FAILED ((sem_t*) 0) + +/* @todo siqbal Should we put such configuration items in a common place + instead of this user-facing header? */ +#define SEM_VALUE_MAX ((unsigned int) 30) // If need be increase this + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/** \details + * POSIX standard comes with two kinds of semaphores: named and unnamed + * semaphores. + * + * This implementation of POSIX kernel API provide unnamed & named semaphore. + * + * + * sem_timedwait() is not provided. + */ + +/** \defgroup semaphore POSIX Semaphore API */ + +/** \ingroup semaphore */ +/** @{ */ + +/** Initialize an unnamed semaphore. + * Please refer to POSIX standard for details. + * @param pshared [in] This implementation does not support non-zero value, + * i.e., semaphore cannot be shared between processes in this implementation. + */ +int sem_init(sem_t *sem, int pshared, unsigned int value); + +/** Lock a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_wait(sem_t *sem); + +/** Lock a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_trywait(sem_t *sem); + +/** Unlock a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_post(sem_t *sem); + +/** Get the value of a semaphore. + * Please refer to POSIX standard for details. + */ +int sem_getvalue(sem_t *sem, int *value); + +/** Destroy an unnamed semaphore. + * Please refer to POSIX standard for details. + */ +int sem_destroy(sem_t *sem); + +/** creates and initializes a named semaphore. + * Please refer to POSIX standard for details. + */ +sem_t * sem_open(const char* name , int oflag , ...); + +/** closes a semaphore. + * Please refer to POSIX standard for details. 
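+ *
+ * (Illustrative unnamed-semaphore round trip, not in the original header:
+ *   sem_t s;
+ *   sem_init(&s, 0, 1);      // pshared must be 0 in this implementation
+ *   sem_wait(&s);
+ *   // ... critical section ...
+ *   sem_post(&s);
+ *   sem_destroy(&s);
+ * )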
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/signal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/signal.h
new file mode 100755
index 0000000000000..35cb1f1a9a319
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/signal.h
@@ -0,0 +1,201 @@
+#ifndef _SIGNAL_H_
+#define _SIGNAL_H_
+
+/*==========================================================================
+ * FILE: signal.h
+ *
+ * SERVICES: POSIX Signal API interface
+ *
+ * DESCRIPTION: POSIX Signal API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016, 2023 Qualcomm Technologies, Inc.
+ * All Rights Reserved.
+ * Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+ *==========================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* POSIX signal bits */
+
+#define POSIX_MSG   7   /* POSIX msg type used in Qube API */
+#define POSIX_NOTIF 8   /* POSIX msg type used in Qube API */
+#define SIGKILL     9   /* kill (cannot be caught or ignored) */
+
+#define SIGRTMIN    10
+#define SIGRTMAX    32
+
+/* Notification Types. */
+/* No asynchronous notification is delivered when the event of interest occurs. */
+#define SIGEV_NONE   0
+/* The signal specified in sigev_signo shall be generated for the process when
+   the event of interest occurs. */
+#define SIGEV_SIGNAL 1
+/* A notification function is called to perform notification. */
+#define SIGEV_THREAD 2
+#define SA_SIGINFO   1
+
+/*
+ * Flags for sigprocmask:
+ */
+#define SIG_BLOCK   1 /* block specified signal set */
+#define SIG_UNBLOCK 2 /* unblock specified signal set */
+#define SIG_SETMASK 3 /* set specified signal set */
+
+typedef unsigned long int sigset_t;
+
+union sigval
+{
+    int   sival_int; /* Integer signal value. */
+    void *sival_ptr; /* Pointer signal value. */
+};
+
+typedef struct sigevent sigevent;
+struct sigevent
+{
+    int sigev_notify;                            /* Notification type. */
+    int sigev_signo;                             /* Signal number. */
+    union sigval sigev_value;                    /* Signal value. */
+    void (*sigev_notify_function)(union sigval); /* Notification function. */
+    pthread_attr_t *sigev_notify_attributes;
+};
+
+typedef struct siginfo_t siginfo_t;
+struct siginfo_t
+{
+    int si_signo;
+    int si_code;
+    union sigval si_value;
+/*  int si_errno;
+    pid_t si_pid;
+    uid_t si_uid;
+    void *si_addr;
+    int si_status;
+    long si_band;*/
+};
+struct sigaction
+{
+    void (*sa_handler)(int);
+    sigset_t sa_mask;
+    int sa_flags;
+    void (*sa_sigaction)(int, siginfo_t *, void *);
+};
+
+/* Signal functions */
+
+/** \details
+ * This provides the POSIX Signal API. Please note that this
+ * implementation does not fully comply with the POSIX standard.
+ *
+ * In the POSIX standard, a signal can be used as an 'interrupt', which means
+ * an incoming signal will interrupt a running thread. After the
+ * registered signal handler is executed, the thread will resume.
+ * This behavior cannot be implemented without modifying the L4 or QuRT kernel.
+ * On the other hand, applications need to be carefully written to avoid
+ * problems caused by 'interrupting' signals.
+ *
+ * Therefore, in this implementation of POSIX signals, a thread will
+ * only receive signals when it explicitly waits for them, i.e., when
+ * the thread calls either sigwait() or sigsuspend().
+ *
+ * Therefore, pthread_sigmask(), which sets or gets the signal mask of a
+ * thread, is not supported, since the signal mask is set by sigwait() and
+ * sigsuspend().
+ *
+ * Since this implementation of the POSIX kernel API is a subset of PSE51,
+ * only threads can send and receive signals. The functions related to
+ * signal operations with processes, such as kill(), sigqueue(),
+ * sigprocmask(), are not provided.
+ *
+ * Queued signals are not supported.
+ *
+ * Applications will use signals from SIGRTMIN to SIGRTMAX.
+ *
+ * SIGEV_SIGNAL and SIGEV_THREAD are supported. SIGEV_NONE is not
+ * supported.
+ *
+ */
+
+/** \defgroup signal POSIX Signal API */
+/** \ingroup signal */
+/** @{ */
+
+/** Wait for signals. This implementation does not support queued signals.
+ *
+ * Please refer to POSIX standard for details.
+ */
+int sigwait(const sigset_t *restrict set, int *restrict sig);
+
+/** Examine and Change Signal Action.
+ * Please refer to POSIX standard for details.
+ *
+ * @param act [in] A pointer to the sigaction structure that describes the
+ * action to be taken for the signal. Can be NULL.
+ * The following flags for the sa_flags field in struct sigaction are not
+ * supported: SA_NOCLDSTOP, SA_ONSTACK, SA_RESETHAND, SA_RESTART,
+ * SA_NOCLDWAIT and SA_NODEFER. Only flag SA_SIGINFO is supported.
+ *
+ * @note Define sigaction as macro to avoid a warning when included from
+ * C++ code - it's causing a "sigaction(...) hides constructor for
+ * 'struct sigaction'" warning.
+ */
+/*lint -esym(123,sigaction) Suppress "macro used with no arguments" */
+#define sigaction(sig,act,oact) _sigaction((sig),(act),(oact))
+
+/** Wait for signals.
+ * Please refer to POSIX standard for details.
+ */
+int sigsuspend(const sigset_t *sigmask);
+
+/** Add Signal to Signal Set.
+ * Please refer to POSIX standard for details.
+ */
+int sigaddset(sigset_t *set, int signo);
+
+/** Delete Signal from Signal Set.
+ * Please refer to POSIX standard for details.
+ */
+int sigdelset(sigset_t *set, int signo);
+
+/** Initialize and Empty Signal Set.
+ * Please refer to POSIX standard for details.
+ */
+int sigemptyset(sigset_t *set);
+
+/** Initialize and Fill Signal Set.
+ * Please refer to POSIX standard for details.
+ */
+int sigfillset(sigset_t *set);
+
+/** Test for Signal in Signal Set.
+ * Please refer to POSIX standard for details.
+ */
+int sigismember(const sigset_t *set, int signo);
+
+/** @} */
+
+/* this is not a public api function */
+int _sigaction(int sig, const struct sigaction *act, struct sigaction *oact);
+
+/* have to move #include here to solve circular include problems between time.h and signal.h */
+#include <time.h>
+
+/** Wait for the time interval specified in the timespec structure referenced
+ * by timeout. This implementation does not support queued signals.
+ * For struct siginfo_t, si_code and si_value are ignored in this implementation.
+ *
+ * Please refer to POSIX standard for details.
+ */
+int sigtimedwait(const sigset_t *restrict set, siginfo_t *restrict info,
+                 const struct timespec *restrict timeout);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SIGNAL_H_ */
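Given the wait-based model described in the header comment (signals are delivered only inside sigwait()/sigsuspend()), a consumer thread would look roughly like the sketch below. The function name is hypothetical; only APIs declared above are used.

```c
/* Sketch: a thread that explicitly waits for an application signal.
 * SIGRTMIN..SIGRTMAX is the application signal range defined above. */
#include <signal.h>

int wait_for_event(void)
{
    sigset_t set;
    int sig = 0;

    sigemptyset(&set);
    sigaddset(&set, SIGRTMIN);     /* the one event we care about */

    if (sigwait(&set, &sig) != 0)  /* blocks here; no async interruption */
        return -1;
    return sig;                    /* == SIGRTMIN on success */
}
```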
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sys/errno.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sys/errno.h
new file mode 100755
index 0000000000000..b9edf57bab6c3
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sys/errno.h
@@ -0,0 +1,20 @@
+#ifndef _SYS_ERRNO_H_
+#define _SYS_ERRNO_H_
+
+/*==========================================================================
+ * FILE: errno.h
+ *
+ * SERVICES: POSIX errno header file
+ *
+ * DESCRIPTION: POSIX errno based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#include
+#ifndef EOK
+#define EOK 0
+#endif
+
+#endif /* _SYS_ERRNO_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sys/sched.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sys/sched.h
new file mode 100755
index 0000000000000..2acc34d821725
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sys/sched.h
@@ -0,0 +1,67 @@
+#ifndef _POSIX_SCHED_H_
+#define _POSIX_SCHED_H_
+
+/*==========================================================================
+ * FILE: sched.h
+ *
+ * SERVICES: POSIX Thread sched API interface
+ *
+ * DESCRIPTION: POSIX Thread sched API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+
+ *==========================================================================*/
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SCHED_FIFO     0 /* First in, first out (FIFO) scheduling policy. */
+#define SCHED_RR       1 /* Round robin scheduling policy. */
+#define SCHED_SPORADIC 2 /* Sporadic server scheduling policy. */
+#define SCHED_OTHER    3 /* Another scheduling policy. */
+
+typedef struct sched_param sched_param;
+struct sched_param
+{
+    void *unimplemented;
+    int sched_priority;
+};
+
+/** \details
+ * This provides the POSIX sched API.
+ */
+
+/** \defgroup sched POSIX sched API */
+/** \ingroup sched */
+/** @{ */
+
+/** Relinquish the CPU.
+ * Please refer to POSIX standard for details.
+ */
+static inline int sched_yield(void)
+{
+    return 0;
+}
+
+/** Get the maximum priority.
+ * Please refer to POSIX standard for details.
+ * @param policy [in] SCHED_FIFO is the only valid input for this implementation.
+ */
+int sched_get_priority_max(int policy);
+
+/** Get the minimum priority.
+ * Please refer to POSIX standard for details.
+ * @param policy [in] SCHED_FIFO is the only valid input for this implementation.
+ */
+int sched_get_priority_min(int policy);
+
+/** @} */
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _POSIX_SCHED_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sys/types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sys/types.h
new file mode 100755
index 0000000000000..700026f9f9e4e
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/sys/types.h
@@ -0,0 +1,35 @@
+#ifndef _SYS_TYPES_H_
+#define _SYS_TYPES_H_
+
+/*==========================================================================
+ * FILE: types.h
+ *
+ * SERVICES: types used in the POSIX API interface
+ *
+ * DESCRIPTION: POSIX API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013, 2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+
+ *==========================================================================*/
+
+#if !defined( _PID_T ) || !defined( __pid_t_defined )
+/* POSIX defines pid_t as signed 32-bit type. Hexagon toolchain's header
+   defines it as unsigned 32-bit type citing conflict with QuRT POSIX
+   compatibility later. If any such conflicts exist, we should fix them.
+   pid_t is being defined *BEFORE* inclusion of generic/sys/types.h
+   *INTENTIONALLY* to fix this */
+typedef int pid_t;
+#define _PID_T
+#define __pid_t_defined
+#endif
+#include
+#include
+#include
+#include
+
+#ifndef __DEFINED_off_t
+typedef long off_t;
+#define __DEFINED_off_t
+#endif
+
+#endif /* _SYS_TYPES_H_ */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/time.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/time.h
new file mode 100755
index 0000000000000..13aeb1ea9920d
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/posix/time.h
@@ -0,0 +1,142 @@
+#ifndef _POSIX_TIME_H_
+#define _POSIX_TIME_H_
+
+/*==========================================================================
+ * FILE: time.h
+ *
+ * SERVICES: POSIX Timer API interface
+ *
+ * DESCRIPTION: POSIX Timer API interface based upon POSIX 1003.1-2004
+ *
+ * Copyright (c) 2013,2016 by Qualcomm Technologies, Inc. All Rights Reserved. QUALCOMM Proprietary and Confidential.
+ *==========================================================================*/
+
+
+#include
+
+typedef int clockid_t; /* ignored */
+#define _CLOCKID_T
+#define _PROVIDE_POSIX_TIME_DECLS 1
+#include
+/* @todo anandj sys/time.h has definition for struct timeval but is not
+   included by generic/time.h */
+#include <sys/time.h>
+
+#define CLOCK_FREQ_NOT_DEFINED -1
+/* Frequency of Sclk used */
+#define TIME_CONV_SCLK_FREQ 19200000
+
+#define RES_CONV_FACTOR1 1
+#define RES_CONV_FACTOR2 1000000000
+
+#if !defined(CLOCK_REALTIME)
+# define CLOCK_REALTIME 0
+#endif
+
+#if !defined(CLOCK_MONOTONIC)
+# define CLOCK_MONOTONIC 1
+#endif
+
+#if !defined(CLOCK_THREAD_CPUTIME_ID)
+# define CLOCK_THREAD_CPUTIME_ID 2
+#endif
+
+#if !defined(CLOCK_PROCESS_CPUTIME_ID)
+# define CLOCK_PROCESS_CPUTIME_ID 3
+#endif
+
+#if !defined(CLOCK_MONOTONIC_RAW)
+# define CLOCK_MONOTONIC_RAW 4
+#endif
+
+#if !defined(CLOCK_REALTIME_COARSE)
+# define CLOCK_REALTIME_COARSE 5
+#endif
+
+#if !defined(CLOCK_MONOTONIC_COARSE)
+# define CLOCK_MONOTONIC_COARSE 6
+#endif
+
+#if !defined(CLOCK_BOOTTIME)
+# define CLOCK_BOOTTIME 7
+#endif
+
+struct itimerspec
+{
+    struct timespec it_interval; /* Timer period. */
+    struct timespec it_value;    /* Timer expiration. */
+};
+
+/* have to move #include here to solve circular include problems between time.h and signal.h */
+#include <signal.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Timer functions */
+
+/** \details
+ * POSIX timers can be either of two types: a one-shot type or a periodic
+ * type.
+ *
+ * A one-shot timer is armed with an expiration time relative to either
+ * the current time or an absolute time. The timer expires once and
+ * is then disarmed.
+ *
+ * A periodic timer is armed with an initial expiration time and a repetition
+ * interval. Every time the interval timer
+ * expires, the timer is reloaded with the repetition interval. The timer
+ * is then rearmed.
+ */
+
+/** \defgroup timer POSIX Timer API */
+
+/** \ingroup timer */
+/** @{ */
+
+/** Create a POSIX timer.
+ * Please refer to POSIX standard for details.
+ * @param clockid [in] ignored in this implementation
+ * @param evp [in] if non-NULL, points to a sigevent structure. This
+ * structure, allocated by the application, defines the asynchronous
+ * notification to occur when the timer expires. If the evp argument is
+ * NULL, the effect is as if the evp argument pointed to a sigevent
+ * structure with the sigev_notify member having the value SIGEV_SIGNAL,
+ * the sigev_signo having a default signal number (SIGALRM), and the
+ * sigev_value member having the value of the timer ID.
+ */
+int timer_create(clockid_t clockid, struct sigevent *restrict evp,
+                 timer_t *restrict timerid);
+
+/** Delete a POSIX timer.
+ * Please refer to POSIX standard for details.
+ */
+int timer_delete(timer_t timerid);
+
+/** Get the time remaining on a POSIX timer.
+ * Please refer to POSIX standard for details.
+ */
+int timer_gettime(timer_t timerid, struct itimerspec *value);
+
+
+/** Set the time remaining on a POSIX timer.
+ * Please refer to POSIX standard for details.
+ * @param flags [in] ignored in this implementation
+ */
+int timer_settime(timer_t timerid, int flags,
+                  const struct itimerspec *restrict value,
+                  struct itimerspec *restrict ovalue);
+/** Obtain the ID of a process CPU-time clock.
+ * @param pid [in] Process ID
+ * @param clock_id [out] Clock ID
+ * @return Error values as per POSIX standard
+ */
+int clock_getcpuclockid (pid_t pid, clockid_t * clock_id);
+/** @} */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _POSIX_TIME_H_ */
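Putting struct sigevent, struct itimerspec, and the timer calls together, arming a periodic timer looks roughly as follows. This is a sketch under the constraints documented above (the clockid and flags arguments are ignored by this implementation); `start_tick` is an illustrative name, and the usual `tv_sec` field of struct timespec from the included time declarations is assumed.

```c
/* Sketch: create and arm a periodic timer that raises SIGRTMIN,
 * per the one-shot/periodic description above. */
#include <signal.h>
#include <time.h>

int start_tick(timer_t *out)
{
    struct sigevent   ev = {0};
    struct itimerspec ts = {0};

    ev.sigev_notify = SIGEV_SIGNAL;
    ev.sigev_signo  = SIGRTMIN;

    if (timer_create(CLOCK_MONOTONIC, &ev, out) != 0)   /* clockid is ignored here */
        return -1;

    ts.it_value.tv_sec    = 1;   /* first expiration after 1 s */
    ts.it_interval.tv_sec = 1;   /* reload every 1 s => periodic */
    return timer_settime(*out, 0, &ts, (struct itimerspec *)0);
}
```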
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qube/qube.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qube/qube.h
new file mode 100755
index 0000000000000..1e31e2deedb38
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qube/qube.h
@@ -0,0 +1,51 @@
+#ifndef QUBE_H
+#define QUBE_H
+/*=============================================================================
+
+                 qube.h -- H E A D E R  F I L E
+
+GENERAL DESCRIPTION
+   Prototypes of qpd API
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+   Copyright (c) 2013 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+=============================================================================*/
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include
+
+/* Define error codes as the QuRT error codes prefixed with QURT_ */
+#ifndef EOK
+#define EOK       QURT_EOK
+#endif /* EOK */
+#ifndef EVAL
+#define EVAL      QURT_EVAL
+#endif /* EVAL */
+#ifndef EMEM
+#define EMEM      QURT_EMEM
+#endif /* EMEM */
+#ifndef EINVALID
+#define EINVALID  QURT_EINVALID
+#endif /* EINVALID */
+
+
+/*=============================================================================
+                        FUNCTION DECLARATIONS
+=============================================================================*/
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QUBE_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/atomic_ops.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/atomic_ops.h
new file mode 100755
index 0000000000000..0a9a9f8ba7db5
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/atomic_ops.h
@@ -0,0 +1,197 @@
+#ifndef ATOMIC_OPS_H
+#define ATOMIC_OPS_H
+/**
+  @file atomic_ops.h
+
+  @brief Backwards-compatible type definitions.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+
+/*
+ * Australian Public Licence B (OZPLB)
+ *
+ * Version 1-0
+ *
+ * Copyright (c) 2007, Open Kernel Labs, Inc.
+ *
+ * All rights reserved.
+ *
+ * Developed by: Embedded, Real-time and Operating Systems Program (ERTOS)
+ *               National ICT Australia
+ *               http://www.ertos.nicta.com.au
+ *
+ * Permission is granted by National ICT Australia, free of charge, to
+ * any person obtaining a copy of this software and any associated
+ * documentation files (the "Software") to deal with the Software without
+ * restriction, including (without limitation) the rights to use, copy,
+ * modify, adapt, merge, publish, distribute, communicate to the public,
+ * sublicense, and/or sell, lend or rent out copies of the Software, and
+ * to permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimers.
+ *
+ *     * Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimers in the documentation and/or other materials provided
+ *       with the distribution.
+ *
+ *     * Neither the name of National ICT Australia, nor the names of its
+ *       contributors, may be used to endorse or promote products derived
+ *       from this Software without specific prior written permission.
+ *
+ * EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT
+ * PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS", AND
+ * NATIONAL ICT AUSTRALIA AND ITS CONTRIBUTORS MAKE NO REPRESENTATIONS,
+ * WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS
+ * REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE,
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT,
+ * THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF
+ * ERRORS, WHETHER OR NOT DISCOVERABLE.
+ * + * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL + * NATIONAL ICT AUSTRALIA OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL + * THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION OF CONTRACT, + * NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER + * LIABILITY, INCLUDING (WITHOUT LIMITATION) LOSS OF PRODUCTION OR + * OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS + * OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR + * OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT, + * CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN + * CONNECTION WITH THIS LICENCE, THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS WITH THE SOFTWARE, EVEN IF NATIONAL ICT AUSTRALIA OR ITS + * CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS, + * DAMAGES OR OTHER LIABILITY. + * + * If applicable legislation implies representations, warranties, or + * conditions, or imposes obligations or liability on National ICT + * Australia or one of its contributors in respect of the Software that + * cannot be wholly or partly excluded, restricted or modified, the + * liability of National ICT Australia or the contributor is limited, to + * the full extent permitted by the applicable legislation, at its + * option, to: + * a. in the case of goods, any one or more of the following: + * i. the replacement of the goods or the supply of equivalent goods; + * ii. the repair of the goods; + * iii. the payment of the cost of replacing the goods or of acquiring + * equivalent goods; + * iv. the payment of the cost of having the goods repaired; or + * b. in the case of services: + * i. the supplying of the services again; or + * ii. the payment of the cost of having the services supplied again. + * + * The construction, validity and performance of this licence is governed + * by the laws in force in New South Wales, Australia. + */ + +/* + * Author: Malcolm Purvis + * Author: Carlos Dyonisio + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef unsigned int atomic_plain_word_t; + +/*-------------------------------------------------------------------------*/ + /* Atomic Ops API. */ + +/* + * IMPORTANT! + * If you plan to change the structure atomic_word_t, please add the new + * elements after value. For more information, read the comment in + * arch/arm/libs/atomic_ops/v5/src/arm_atomic_ops.spp:66 + */ + +typedef struct { + volatile atomic_plain_word_t value; +} atomic_word_t; + +#define ATOMIC_INIT(i) { (i) } + +static inline void +atomic_init(atomic_word_t *a, atomic_plain_word_t v) +{ + a->value = v; +} + +#if defined(ARCH_ARM) && defined(ARCH_VER) && (ARCH_VER < 6) && \ + (!defined(__ATOMIC_OPS_IN_KERNEL__) || defined(MACHINE_SMP)) + +/* + * If it is ARMv4/v5, the function declarations may change + * and are defined in the arch specific header file, + * as some of then cannot be declared static because of + * the assembler implementation. + */ + +#else + +/* Arithmetic operations. */ + +void atomic_sub(atomic_word_t *target, atomic_plain_word_t v); + +/* Architecture independent definitions. 
*/ + +static inline atomic_plain_word_t atomic_read(atomic_word_t *target) +{ + return target->value; +} + +typedef unsigned long long atomic64_plain_word_t; + +typedef struct { + volatile atomic64_plain_word_t value; +} atomic64_word_t; + +static inline void +atomic64_init(atomic64_word_t *a, atomic64_plain_word_t v) +{ + a->value = v; +} + +/********************* + Support 64-bit + *********************/ + +atomic64_plain_word_t atomic64_set(atomic64_word_t* target, + atomic64_plain_word_t value); + +void atomic64_xor(atomic64_word_t* target, + atomic64_plain_word_t mask); + +/*---------------------------------------------------------------------------*/ + +/* Architecture independent definitions. */ + +static inline atomic64_plain_word_t atomic64_read(atomic64_word_t *target) +{ + return target->value; +} + +#endif + + +/* Architecture dependent definitions. */ +#include + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* ATOMIC_OPS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/atomic_ops_plat.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/atomic_ops_plat.h new file mode 100755 index 0000000000000..b54b3ff83d978 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/atomic_ops_plat.h @@ -0,0 +1,86 @@ +#ifndef ATOMIC_OPS_PLAT_H +#define ATOMIC_OPS_PLAT_H +/** + @file atomic_ops_plat.h + + @brief Prototypes of atomic operations API backwards compatible. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. +=============================================================================*/ + + +#include + +#ifdef __cplusplus +extern "C" { +#endif +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define atomic_set(a,b) qurt_atomic_set((unsigned int *)(a),(unsigned int)(b)) +#define atomic_and(a,b) qurt_atomic_and((unsigned int *)(a),(unsigned int)(b)) +#define atomic_and_return(a,b) qurt_atomic_and_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_or(a,b) qurt_atomic_or((unsigned int *)(a),(unsigned int)(b)) +#define atomic_or_return(a,b) qurt_atomic_or_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_xor(a,b) qurt_atomic_xor((unsigned int *)(a),(unsigned int)(b)) +#define atomic_xor_return(a,b) qurt_atomic_xor_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_set_bit(a,b) qurt_atomic_set_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_clear_bit(a,b) qurt_atomic_clear_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_change_bit(a,b) qurt_atomic_change_bit((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add(a,b) qurt_atomic_add((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add_return(a,b) qurt_atomic_add_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_add_unless(a,b,c) qurt_atomic_add_unless((unsigned int *)(a),(unsigned int)(b),(unsigned int)(c)) +#define atomic_sub(a,b) qurt_atomic_sub((unsigned int *)(a),(unsigned int)(b)) +#define atomic_sub_return(a,b) qurt_atomic_sub_return((unsigned int *)(a),(unsigned int)(b)) +#define atomic_inc(a) qurt_atomic_inc((unsigned int *)(a)) +#define atomic_inc_return(a) qurt_atomic_inc_return((unsigned int *)(a)) +#define atomic_dec(a) qurt_atomic_dec((unsigned 
int *)(a)) +#define atomic_dec_return(a) qurt_atomic_dec_return((unsigned int *)(a)) +#define atomic_compare_and_set(a,b,c) qurt_atomic_compare_and_set((unsigned int *)(a),(unsigned int)(b),(unsigned int)(c)) +#define atomic_barrier qurt_atomic_barrier +#define atomic_barrier_write qurt_atomic_barrier_write +#define atomic_barrier_write_smp qurt_atomic_barrier_write_smp +#define atomic_barrier_read_smp qurt_atomic_barrier_read_smp +#define atomic_barrier_smp qurt_atomic_barrier_smp + +/*============================ + * 64 bits support + *============================ */ +#define atomic64_set(a,b) qurt_atomic64_set((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_and(a,b) qurt_atomic64_and((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_and_return(a,b) qurt_atomic64_and_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_or(a,b) qurt_atomic64_or((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_or_return(a,b) qurt_atomic64_or_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_xor(a,b) qurt_atomic64_xor((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_xor_return(a,b) qurt_atomic64_xor_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_set_bit(a,b) qurt_atomic64_set_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_clear_bit(a,b) qurt_atomic64_clear_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_change_bit(a,b) qurt_atomic64_change_bit((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_add(a,b) qurt_atomic64_add((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_add_return(a,b) qurt_atomic64_add_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_sub(a,b) qurt_atomic64_sub((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_sub_return(a,b) qurt_atomic64_sub_return((unsigned long long *)(a),(unsigned long long)(b)) +#define atomic64_inc(a) qurt_atomic64_inc((unsigned long long *)(a)) +#define atomic64_inc_return(a) qurt_atomic64_inc_return((unsigned long long *)(a)) +#define atomic64_dec(a) qurt_atomic64_dec((unsigned long long *)(a)) +#define atomic64_dec_return(a) qurt_atomic64_dec_return((unsigned long long *)(a)) +#define atomic64_compare_and_set(a,b,c) qurt_atomic64_compare_and_set((unsigned long long *)(a),(unsigned long long )(b),(unsigned long long )(c)) +#define atomic64_barrier qurt_atomic64_barrier +#define atomic64_barrier_write qurt_atomic64_barrier_write +#define atomic64_barrier_write_smp qurt_atomic64_barrier_write_smp +#define atomic64_barrier_read_smp qurt_atomic64_barrier_read_smp +#define atomic64_barrier_smp qurt_atomic64_barrier_smp + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* ATOMIC_OPS_PLAT_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt.h new file mode 100755 index 0000000000000..4d25c9b2b6243 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt.h @@ -0,0 +1,111 @@ +#ifndef QURT_H +#define QURT_H + +/** + @file qurt.h + @brief Contains kernel header files that provide kernel OS API functions, constants, and + definitions + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013,2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved. 
+ Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ +/*====================================================================== + * + * EDIT HISTORY FOR FILE + * + * This section contains comments describing changes made to the + * module. Notice that changes are listed in reverse chronological + * order. + * + * + * + * + * when who what, where, why + * ---------- --- ------------------------------------------------ + * 2011-02-25 op Add Header file + 2012-12-16 cm (Tech Pubs) Edited/added Doxygen comments and markup. + ======================================================================*/ + + +#ifdef __cplusplus +extern "C" { +#endif + +#include "qurt_consts.h" +#include "qurt_api_version.h" +#include "qurt_alloc.h" +#include "qurt_futex.h" +#include "qurt_mutex.h" +#include "qurt_pipe.h" +#include "qurt_printf.h" +#include "qurt_assert.h" +#include "qurt_thread.h" +#include "qurt_trace.h" +#include "qurt_cycles.h" +#include "qurt_profile.h" +#include "qurt_sem.h" +#include "qurt_cond.h" +#include "qurt_barrier.h" +#include "qurt_fastint.h" +#include "qurt_allsignal.h" +#include "qurt_anysignal.h" +#include "qurt_signal.h" +#include "qurt_rmutex.h" +#include "qurt_pimutex.h" +#include "qurt_signal2.h" +#include "qurt_rmutex2.h" +#include "qurt_pimutex2.h" +#include "qurt_int.h" +#include "qurt_lifo.h" +#include "qurt_power.h" +#include "qurt_event.h" +#include "qurt_pmu.h" +#include "qurt_stid.h" +//#include "qurt_version.h" +#include "qurt_tlb.h" +#include "qurt_vtlb.h" +#include "qurt_memory.h" +#include "qurt_qdi.h" +#include "qurt_sclk.h" +#include "qurt_space.h" +#include "qurt_process.h" +#include "qurt_timer.h" +#include "qurt_tls.h" +#include "qurt_thread_context.h" +#include "qurt_hvx.h" +#include "qurt_hmx.h" +#include "qurt_mailbox.h" +#include "qurt_island.h" +#include "qurt_qdi_proxy.h" +#include "qurt_l2cfg.h" +#include "qurt_mmap.h" +#include "qurt_isr.h" +#include "qurt_busywait.h" +#include "qurt_ecc.h" +#include "qurt_callback.h" +#include "qurt_error.h" +#include "qurt_except.h" +#include "qurt_mq.h" +#include "qurt_user_dma.h" +#include "qurt_fs_hub.h" +#include "qurt_os_services.h" + +#ifndef MAIN_ONLY +#define INCLUDE_ISLAND_CONTENTS +#endif +#ifndef ISLAND_ONLY +#define INCLUDE_MAIN_CONTENTS +#endif + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_alloc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_alloc.h new file mode 100755 index 0000000000000..da37a4c0a714e --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_alloc.h @@ -0,0 +1,145 @@ +#ifndef QURT_ALLOC_H +#define QURT_ALLOC_H + +/** + @file qurt_alloc.h + @brief Prototypes of kernel memory allocation API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +/*======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_malloc + Dynamically allocates the specified array on the QuRT system heap. + The return value is the address of the allocated memory area. 
+ + @note1hang The allocated memory area is automatically initialized to zero. + + @param[in] size Size (in bytes) of the memory area. + + @return + Nonzero -- Pointer to the allocated memory area. \n + 0 -- Not enough memory in heap to allocate memory area. + + @dependencies + None. + + */ +/* ======================================================================*/ +void *qurt_malloc( unsigned int size); + +/*======================================================================*/ +/**@ingroup func_qurt_calloc + Dynamically allocates the specified array on the QuRT system heap. + The return value is the address of the allocated array. + + @note1hang The allocated memory area is automatically initialized to zero. + + @param[in] elsize Size (in bytes) of each array element. + @param[in] num Number of array elements. + + @return + Nonzero -- Pointer to allocated array.\n + Zero -- Not enough memory in heap to allocate array. + + @dependencies + None. + + */ + /* ======================================================================*/ +void *qurt_calloc(unsigned int elsize, unsigned int num); + +/*======================================================================*/ +/**@ingroup func_qurt_realloc + Reallocates memory on the heap. \n + Changes the size of a memory area that is already allocated on the QuRT system heap. + The reallocate memory operation is functionally similar to realloc. It accepts a pointer + to an existing memory area on the heap, and resizes the memory area to the specified size + while preserving the original contents of the memory area. + + @note1hang This function might change the address of the memory area. + If the value of ptr is NULL, this function is equivalent to + qurt_malloc(). + If the value of new_size is 0, it is equivalent to qurt_free(). + If the memory area is expanded, the added memory is not initialized. + + @param[in] *ptr Pointer to the address of the memory area. + @param[in] newsize Size (in bytes) of the reallocated memory area. + + @return + Nonzero -- Pointer to reallocated memory area. \n + 0 -- Not enough memory in heap to reallocate the memory area. + + @dependencies + None. + + */ + /* ======================================================================*/ +void *qurt_realloc(void *ptr, int newsize); + +/*======================================================================*/ +/**@ingroup func_qurt_free + Frees allocated memory from the heap.\n + Deallocates the specified memory from the QuRT system heap. + + @param[in] *ptr Pointer to the address of the memory to deallocate. + + @return + None. + + @dependencies + The memory item that the ptr value specifies must have been previously + allocated using one of the qurt_calloc(), + qurt_malloc(), or qurt_realloc() memory allocation functions. + Otherwise the behavior of QuRT is undefined. + + */ + /* ======================================================================*/ +void qurt_free( void *ptr); + + +void *qurt_memalign(unsigned int alignment, unsigned int size); + +/* +|| Macro to define a static heap for a QuRT program. +|| +|| Usage: +|| Declare at the top-level of any C source file that +|| is part of the build (and is guaranteed +|| to actually be pulled into the build). Place +|| it in the same function with main(): +|| +|| QURT_DECLARE_STATIC_HEAP(512000); +|| +|| The only argument is the size in bytes, and it is +|| rounded up to the nearest 64 bytes (size of an +|| L2 cache block). 
+|| +*/ + +#define QURT_DECLARE_STATIC_HEAP(sz) \ + static struct qurt_static_heap { \ + char space[(sz)] __attribute__((aligned(64))); \ + } static_heap[1]; \ + void * const override_heap_Base = &static_heap[0]; \ + void * const override_heap_Limit = &static_heap[1] + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ALLOC_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_allsignal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_allsignal.h new file mode 100755 index 0000000000000..5dc89e495130d --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_allsignal.h @@ -0,0 +1,176 @@ + +#ifndef QURT_ALLSIGNAL_H +#define QURT_ALLSIGNAL_H + +/** + @file qurt_allsignal.h + @brief Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup all_signal_types +@{ */ +/*===================================================================== + Typedefs + ======================================================================*/ + +/** +qurt_signal_t supersedes qurt_allsignal_t. This type definition was added for backwards compatibility. */ +typedef union { + /** @cond */ + unsigned long long int raw; + struct { + unsigned int waiting; /**< */ + unsigned int signals_in; /**< */ + unsigned int queue; /**< */ + unsigned int reserved; /**< */ + }X; + /** @endcond */ +} qurt_allsignal_t; +/** @} */ /* end_addtogroup all_signal_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_init + Initializes an all-signal object.\n + The all-signal object is initially cleared. + + @datatypes + #qurt_allsignal_t + + @param[out] signal Pointer to the all-signal object to initialize. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_allsignal_init(qurt_allsignal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_destroy + Destroys the specified all-signal object.\n + @note1hang All-signal objects must be destroyed when they are no longer in use. + Failure to do this causes resource leaks in the QuRT kernel. \n + @note1cont All-signal objects must not be destroyed while they are still in use. + If this occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_allsignal_destroy(qurt_allsignal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_get + Gets signal values from the all-signal object. + + Returns the current signal values of the specified all-signal object. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to access. 
+ + @return + Bitmask with current signal values. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_allsignal_get(qurt_allsignal_t *signal) +{ return signal->X.signals_in; } + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_wait + Waits on the all-signal object.\n + Suspends the current thread until all of the specified signals are set. + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 that it is not to be waited on. + + If a signal is set in an all-signal object, and a thread is waiting on the all-signal object for + that signal, the thread is awakened. If the awakened thread has higher priority than + the current thread, a context switch can occur. + + Unlike any-signals, all-signals do not need to explicitly clear any set signals in an all-signal + object before waiting on them again -- clearing is done automatically by the wait + operation. + + @note1hang At most, one thread can wait on an all-signal object at any given time. + Because signal clearing is done by the wait operation, no clear operation is + defined for all-signals. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to wait on. + @param[in] mask Signal mask value, which identifies the individual signals in the all-signal object + to wait on. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_allsignal_wait(qurt_allsignal_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_allsignal_set + Set signals in the specified all-signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit + value of 1 indicates that a signal must be set, and 0 indicates not to set the signal. + + @datatypes + #qurt_allsignal_t + + @param[in] signal Pointer to the all-signal object to modify. + @param[in] mask Signal mask value identifying the individual signals to + set in the all-signal object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_allsignal_set(qurt_allsignal_t *signal, unsigned int mask); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ALLSIGNAL_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_anysignal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_anysignal.h new file mode 100755 index 0000000000000..9619e2de562b4 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_anysignal.h @@ -0,0 +1,225 @@ +#ifndef QURT_ANYSIGNAL_H +#define QURT_ANYSIGNAL_H +/** + @file qurt_anysignal.h + Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + +Copyright (c) 2021 Qualcomm Technologies, Inc. +All rights reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+ ======================================================================*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*===================================================================== +Typedefs +======================================================================*/ + +/**@ingroup anysignals_types + qurt_signal_t supersedes qurt_anysignal_t. This type definition was added for backwards compatibility. */ +typedef qurt_signal_t qurt_anysignal_t; + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_init + Initializes an any-signal object.\n + The any-signal object is initially cleared. + + @datatypes + #qurt_anysignal_t + + @param[out] signal Pointer to the initialized any-signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline void qurt_anysignal_init(qurt_anysignal_t *signal) +{ + qurt_signal_init(signal); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_destroy + Destroys the specified any-signal object. + + @note1hang Any-signal objects must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Any-signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_anysignal_t + + @param[in] signal Pointer to the any-signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline void qurt_anysignal_destroy(qurt_anysignal_t *signal) +{ + qurt_signal_destroy(signal); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_wait + Wait on the any-signal object. \n + Suspends the current thread until any one of the specified signals is set. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 indicates not to wait on the signal. + If a signal is set in an any-signal object, and a thread is waiting on the any-signal object for + that signal, the thread is awakened. If the awakened thread has higher priority than + the current thread, a context switch can occur. + + @note1hang At most, one thread can wait on an any-signal object at any given time. + + @datatypes + #qurt_anysignal_t + + @param[in] signal Pointer to the any-signal object to wait on. + @param[in] mask Signal mask value, which specifies the individual signals in the any-signal + object to wait on. + + @return + Bitmask of current signal values. + + @dependencies + None. + */ +/* ======================================================================*/ +static inline unsigned int qurt_anysignal_wait(qurt_anysignal_t *signal, unsigned int mask) +{ + return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY); +} + +/*======================================================================*/ +/**@ingroup func_qurt_anysignal_set + Sets signals in the specified any-signal object. \n + Signals are represented as bits 0 through 31 in the 32-bit mask value. 
A mask bit value of 1
+ indicates that a signal must be set, and 0 indicates not to set the signal.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to modify.
+ @param[in] mask   Signal mask value identifying the individual signals to
+                   set in the any-signal object.
+
+ @return
+ Bitmask of old signal values (before set).
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+unsigned int qurt_anysignal_set(qurt_anysignal_t *signal, unsigned int mask);
+
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_get
+   Gets signal values from the any-signal object.\n
+   Returns the current signal values of the specified any-signal object.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to access.
+
+ @return
+ A bitmask with the current signal values of the specified any-signal object.
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+static inline unsigned int qurt_anysignal_get(qurt_anysignal_t *signal)
+{
+    return qurt_signal_get(signal);
+}
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_clear
+   @xreflabel{sec:anysignal_clear}
+   Clears signals in the specified any-signal object.\n
+   Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+   indicates that a signal must be cleared, and 0 indicates not to clear the signal.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in] signal Pointer to the any-signal object to modify.
+ @param[in] mask   Signal mask value identifying the individual signals to
+                   clear in the any-signal object.
+
+ @return
+ Bitmask -- Old signal values (before clear).
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+unsigned int qurt_anysignal_clear(qurt_anysignal_t *signal, unsigned int mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_anysignal_wait_timed
+   Waits on the any-signal object. \n
+   Suspends the current thread until any of the specified signals is set or the timeout expires.
+
+   Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+   indicates that a signal must be waited on, and 0 indicates not to wait on the signal.
+   If a signal is set in an any-signal object, and a thread is waiting on the any-signal object for
+   that signal, the thread is awakened. If the awakened thread has higher priority than
+   the current thread, a context switch can occur.
+
+   @note1hang At most, one thread can wait on an any-signal object at any given time.
+
+ @datatypes
+ #qurt_anysignal_t
+
+ @param[in]  signal   Pointer to the any-signal object to wait on.
+ @param[in]  mask     Signal mask value, which specifies the individual signals in the any-signal
+                      object to wait on.
+ @param[out] signals  Bitmask of current signal values.
+ @param[in]  duration Timeout interval (in microseconds); the value must be between
+                      #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_ETIMEDOUT -- Timeout. \n
+ #QURT_EINVALID -- Duration out of range.
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+
+int qurt_anysignal_wait_timed(qurt_anysignal_t *signal, unsigned int mask, unsigned int *signals, unsigned long long int duration);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ANYSIGNAL_H */
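As a usage sketch of the any-signal API just defined: one thread blocks on a mask of event bits with qurt_anysignal_wait(), and, because any-signals are not cleared automatically by the wait, clears what it consumed. The bit names and the quoted include name are illustrative.

```c
/* Sketch: any-signal event flags shared between two threads. */
#include "qurt_anysignal.h"   /* assumed header name */

#define EVT_RX (1u << 0)      /* illustrative event bits */
#define EVT_TX (1u << 1)

static qurt_anysignal_t events;

void events_init(void) { qurt_anysignal_init(&events); }

void rx_done(void)     { (void)qurt_anysignal_set(&events, EVT_RX); }

unsigned int wait_any_event(void)
{
    /* Returns when EVT_RX or EVT_TX (or both) is set. */
    unsigned int sigs = qurt_anysignal_wait(&events, EVT_RX | EVT_TX);

    /* Any-signals stay set until cleared explicitly (see clear() above). */
    (void)qurt_anysignal_clear(&events, sigs & (EVT_RX | EVT_TX));
    return sigs;
}
```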
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_api_version.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_api_version.h
new file mode 100755
index 0000000000000..dfe53ae755054
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_api_version.h
@@ -0,0 +1,77 @@
+#ifndef QURT_API_VERSION_H
+#define QURT_API_VERSION_H
+/*==============================================================================
+
+qurt_api_version.h
+
+GENERAL DESCRIPTION
+  API version file
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) Qualcomm Technologies, Inc.
+All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+
+/*==============================================================================
+  CONSTANTS AND DEFINITIONS
+==============================================================================*/
+/**
+ * Each field of the QURT_API_VERSION definitions is an 8-bit unsigned integer.
+ * A main release updates the first three fields: Major, Minor, and Release.
+ *   - QURT_API_VERSION = Major, Minor, Release.
+ * Patch releases are supported by adding the extra Patch field.
+ *   - QURT_API_VERSION = Major, Minor, Release, Patch.
+ */
+// Major version is incremented for incompatible API changes.
+#define QURT_API_VER_MAJOR 1
+
+// Minor version is incremented for backward-compatible enhancements in the API
+// set.
+#define QURT_API_VER_MINOR 4
+
+// Release version is incremented for each release within a `MAJOR.MINOR`
+// release.
+#define QURT_API_VER_RELEASE 1
+
+// Patch version is incremented when new API content is introduced on an older
+// LTS release.
+#define QURT_API_VER_PATCH 0
+
+/* Encode a version as a single comparable integer. */
+#define QURT_API_VERSION_ENCODE(major, minor, release, patch) \
+    ((((major) & 0xFF) << 24) | (((minor) & 0xFF) << 16) |    \
+     (((release) & 0xFF) << 8) | ((patch) & 0xFF))
+
+/* The current QuRT API version, encoded. */
+#define QURT_API_VERSION \
+    QURT_API_VERSION_ENCODE(QURT_API_VER_MAJOR, QURT_API_VER_MINOR, \
+                            QURT_API_VER_RELEASE, QURT_API_VER_PATCH)
+
+/** Usage:
+ *
+ * #if QURT_API_VERSION >= QURT_API_VERSION_ENCODE(1,4,0,0)
+ *    qurt_func_2(a,b,c);
+ * #else
+ *    qurt_func(a);
+ * #endif
+ *
+ */
+/*
+  Gets the QuRT API version.
+
+  @return
+  QuRT API version.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_api_version(void);
+
+#endif /* QURT_API_VERSION_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_assert.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_assert.h
new file mode 100755
index 0000000000000..13cc2afd2e973
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_assert.h
@@ -0,0 +1,51 @@
+#ifndef QURT_ASSERT_H
+#define QURT_ASSERT_H
+/**
+  @file qurt_assert.h
+  @brief Prototypes of the qurt_assert API
+
+  EXTERNAL FUNCTIONS
+   None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+  Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/**@ingroup func_qurt_assert_error + Writes diagnostic information to the debug buffer, and raises an error to the QuRT kernel. + + @datatypes + None. + + @param[in] filename Pointer to the file name string. + @param[in] lineno Line number. + + @return + None. + + @dependencies + None. + */ +void qurt_assert_error(const char *filename, int lineno) __attribute__((noreturn)); + +#define qurt_assert(cond) ((cond)?(void)0:qurt_assert_error(__QURTFILENAME__,__LINE__)) + +/** @} */ /* end_ingroup func_qurt_assert */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ASSERT_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_atomic_ops.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_atomic_ops.h new file mode 100755 index 0000000000000..d9b2cff7d737c --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_atomic_ops.h @@ -0,0 +1,1298 @@ +#ifndef QURT_ATOMIC_OPS_H +#define QURT_ATOMIC_OPS_H +/** + @file qurt_atomic_ops.h + @brief Prototypes of kernel atomic operations API. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021, 2022 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +/* + * Australian Public Licence B (OZPLB) + * + * Version 1-0 + * + * Copyright (c) 2007, Open Kernel Labs, Inc. + * + * All rights reserved. + * + * Developed by: Embedded, Real-time and Operating Systems Program (ERTOS) + * National ICT Australia + * http://www.ertos.nicta.com.au + * + * Permission is granted by National ICT Australia, free of charge, to + * any person obtaining a copy of this software and any associated + * documentation files (the "Software") to deal with the Software without + * restriction, including (without limitation) the rights to use, copy, + * modify, adapt, merge, publish, distribute, communicate to the public, + * sublicense, and/or sell, lend or rent out copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject + * to the following conditions: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimers. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimers in the documentation and/or other materials provided + * with the distribution. + * + * * Neither the name of National ICT Australia, nor the names of its + * contributors, may be used to endorse or promote products derived + * from this Software without specific prior written permission. 
+ * + * EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT + * PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS", AND + * NATIONAL ICT AUSTRALIA AND ITS CONTRIBUTORS MAKE NO REPRESENTATIONS, + * WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS + * REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, + * THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF + * ERRORS, WHETHER OR NOT DISCOVERABLE. + * + * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL + * NATIONAL ICT AUSTRALIA OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL + * THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION OF CONTRACT, + * NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER + * LIABILITY, INCLUDING (WITHOUT LIMITATION) LOSS OF PRODUCTION OR + * OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS + * OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR + * OTHER ECONOMIC LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT, + * CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN + * CONNECTION WITH THIS LICENCE, THE SOFTWARE OR THE USE OF OR OTHER + * DEALINGS WITH THE SOFTWARE, EVEN IF NATIONAL ICT AUSTRALIA OR ITS + * CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH CLAIM, LOSS, + * DAMAGES OR OTHER LIABILITY. + * + * If applicable legislation implies representations, warranties, or + * conditions, or imposes obligations or liability on National ICT + * Australia or one of its contributors in respect of the Software that + * cannot be wholly or partly excluded, restricted or modified, the + * liability of National ICT Australia or the contributor is limited, to + * the full extent permitted by the applicable legislation, at its + * option, to: + * a. in the case of goods, any one or more of the following: + * i. the replacement of the goods or the supply of equivalent goods; + * ii. the repair of the goods; + * iii. the payment of the cost of replacing the goods or of acquiring + * equivalent goods; + * iv. the payment of the cost of having the goods repaired; or + * b. in the case of services: + * i. the supplying of the services again; or + * ii. the payment of the cost of having the services supplied again. + * + * The construction, validity and performance of this licence is governed + * by the laws in force in New South Wales, Australia. + */ + +/* + * Author: Malcolm Purvis + * + * This file is only included by the main atomic_ops.h, so all of that + * file's definitions are available. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + +///* Sanity check to ensure the smp flag is set in machines.py */ +//#if defined(__ATOMIC_OPS_IN_KERNEL__) && !defined(MACHINE_SMP) && CONFIG_NUM_UNITS > 1 +//#error CONFIG_NUM_UNITS > 1 but smp not defined in machines.py. +//#endif +#define QURT_INLINE __attribute__((always_inline)) + +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_atomic_set + Sets the atomic variable with the specified value. 
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] value Value to set.
+
+   @return
+   Value successfully set.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_set(unsigned int* target, unsigned int value)
+{
+    unsigned long tmp;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       memw_locked(%2, p0) = %3\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (tmp),"+m" (*target)
+        : "r" (target), "r" (value)
+        : "p0");
+    return value;
+}
+
+/**@ingroup func_qurt_atomic_and
+   Bitwise AND operation of the atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask Mask for bitwise AND.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_and(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = and(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target),"r" (mask)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_and_return
+   Bitwise AND operation of the atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask Mask for bitwise AND.
+
+   @return
+   AND result of the atomic variable with mask.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_and_return(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = and(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_or
+   Bitwise OR operation of the atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask Mask for bitwise OR.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_or(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = or(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_or_return
+   Bitwise OR operation of the atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask Mask for bitwise OR.
+
+   @return
+   OR result of the atomic variable with mask.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_or_return(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = or(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_xor
+   Bitwise XOR operation of the atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask Mask for bitwise XOR.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_xor(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = xor(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_xor_return
+   Bitwise XOR operation of the atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask Mask for bitwise XOR.
+
+   @return
+   XOR result of the atomic variable with mask.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_xor_return(unsigned int* target, unsigned int mask)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = xor(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_set_bit
+   Sets a bit in the atomic variable at a specified position.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] bit Bit position to set.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_set_bit(unsigned int *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit % ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int *wtarget = (unsigned int *)&target[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = setbit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget), "r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_clear_bit
+   Clears a bit in the atomic variable at a specified position.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] bit Bit position to clear.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_clear_bit(unsigned int *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit % ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int *wtarget = (unsigned int *)&target[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = clrbit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget), "r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_change_bit
+   Toggles a bit in an atomic variable at a specified position.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] bit Bit position to toggle.
+
+   @return
+   None.
+
+   @dependencies
+   None.
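+
+   Illustrative usage (a sketch, not part of the API; "flags" is a
+   hypothetical variable):
+
+       static unsigned int flags = 0;
+       qurt_atomic_change_bit(&flags, 3U);   // atomically toggles bit 3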
+*/
+static inline QURT_INLINE void
+qurt_atomic_change_bit(unsigned int *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit & 0x1fU;
+    unsigned int *wtarget = (unsigned int *)&target[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = togglebit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget),"r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_add
+   Adds an integer to an atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v Integer value to add.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_add(unsigned int *target, unsigned int v)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (v)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic_add_return
+   Adds an integer to an atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v Integer value to add.
+
+   @return
+   Result of arithmetic sum.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_add_return(unsigned int *target, unsigned int v)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (v)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_add_unless
+   Adds the delta value to an atomic variable unless the current value in the target
+   matches the unless variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] delta  Value to add to the current value.
+   @param[in] unless Perform the addition only when the current value is not
+                     equal to this unless value.
+   @return
+   TRUE -- 1 -- Addition was performed. \n
+   FALSE -- 0 -- Addition was not performed.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_add_unless(unsigned int* target,
+                       unsigned int delta,
+                       unsigned int unless)
+{
+    unsigned int current_val;
+    unsigned int new_val;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%3)\n"
+        "       p0 = cmp.eq(%0, %5)\n"
+        "       if p0 jump 2f\n"
+        "       %1 = add(%0, %4)\n"
+        "       memw_locked(%3, p0) = %1\n"
+        "       if !p0 jump 1b\n"
+        "2:\n"
+        : "=&r" (current_val),"=&r" (new_val),"+m" (*target)
+        : "r" (target), "r" (delta), "r" (unless)
+        : "p0");
+
+    return (unsigned int)(current_val != unless);
+}
+
+/**@ingroup func_qurt_atomic_sub
+   Subtracts an integer from an atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v Integer value to subtract.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/ +static inline QURT_INLINE void +qurt_atomic_sub(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); +} + +/**@ingroup func_qurt_atomic_sub_return + Subtracts an integer from an atomic variable. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + @param[in] v Integer value to subtract. + + @return + Result of arithmetic subtraction. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_sub_return(unsigned int *target, unsigned int v) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = sub(%0, %3)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target), "r" (v) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_inc + Increments an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_inc(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); +} + +/**@ingroup func_qurt_atomic_inc_return + Increments an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Incremented value. + + @dependencies + None. +*/ +static inline QURT_INLINE unsigned int +qurt_atomic_inc_return(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); + + return result; +} + +/**@ingroup func_qurt_atomic_dec + Decrements an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + None. + + @dependencies + None. +*/ +static inline QURT_INLINE void +qurt_atomic_dec(unsigned int *target) +{ + unsigned int result; + + __asm__ __volatile__( + "1: %0 = memw_locked(%2)\n" + " %0 = add(%0, #-1)\n" + " memw_locked(%2, p0) = %0\n" + " if !p0 jump 1b\n" + : "=&r" (result),"+m" (*target) + : "r" (target) + : "p0"); +} + +/**@ingroup func_qurt_atomic_dec_return + Decrements an atomic variable by one. + + @note1hang The function retries until load lock and store conditional + is successful. + + @param[in,out] target Pointer to the atomic variable. + + @return + Decremented value. + + @dependencies + None. 
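+
+   Illustrative reference-count release (a sketch, not part of the API;
+   "refs" and "release_object" are hypothetical):
+
+       if (qurt_atomic_dec_return(&refs) == 0U) {
+           release_object();   // last reference dropped
+       }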
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_dec_return(unsigned int *target)
+{
+    unsigned int result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = add(%0, #-1)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic_compare_and_set
+   Compares the current value of the atomic variable with the
+   specified value and sets it to a new value when the compare is successful.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] old_val Old value to compare.
+   @param[in] new_val New value to set.
+
+   @return
+   FALSE -- Specified value is not equal to the current value. \n
+   TRUE -- Specified value is equal to the current value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned int
+qurt_atomic_compare_and_set(unsigned int* target,
+                            unsigned int old_val,
+                            unsigned int new_val)
+{
+    unsigned int current_val;
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       p0 = cmp.eq(%0, %3)\n"
+        "       if !p0 jump 2f\n"
+        "       memw_locked(%2, p0) = %4\n"
+        "       if !p0 jump 1b\n"
+        "2:\n"
+        : "=&r" (current_val),"+m" (*target)
+        : "r" (target), "r" (old_val), "r" (new_val)
+        : "p0");
+
+    return (unsigned int)(current_val == old_val);
+}
+
+/**@ingroup func_qurt_atomic_barrier
+   Allows the compiler to enforce an ordering constraint on memory operations issued
+   before and after the function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic_barrier(void)
+{
+    __asm__ __volatile__ (
+        ""
+        :
+        :
+        :
+        "memory");
+}
+
+
+/**@ingroup func_qurt_atomic64_set
+   Sets the 64-bit atomic variable with the specified value.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] value 64-bit value to set.
+
+   @return
+   Successfully set value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_set(unsigned long long* target, unsigned long long value)
+{
+    unsigned long long tmp;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       memd_locked(%2, p0) = %3\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (tmp),"+m" (*target)
+        : "r" (target), "r" (value)
+        : "p0");
+    return value;
+}
+
+/**@ingroup func_qurt_atomic64_and_return
+   Bitwise AND operation of a 64-bit atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask 64-bit mask for bitwise AND.
+
+   @return
+   AND result of the 64-bit atomic variable with mask.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_and_return(unsigned long long* target, unsigned long long mask)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = and(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_or
+   Bitwise OR operation of a 64-bit atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask 64-bit mask for bitwise OR.
+
+   @return
+   None.
+
+   @dependencies
+   None.
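+
+   Illustrative usage (a sketch, not part of the API; "event_mask" and the
+   chosen bit are hypothetical):
+
+       static unsigned long long event_mask = 0ULL;
+       qurt_atomic64_or(&event_mask, 1ULL << 40);   // set bit 40 atomically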
+*/
+static inline QURT_INLINE void
+qurt_atomic64_or(unsigned long long* target, unsigned long long mask)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = or(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_or_return
+   Bitwise OR operation of a 64-bit atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask 64-bit mask for bitwise OR.
+
+   @return
+   OR result of the atomic variable with mask.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_or_return(unsigned long long* target, unsigned long long mask)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = or(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_xor_return
+   Bitwise XOR operation of a 64-bit atomic variable with mask.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] mask 64-bit mask for bitwise XOR.
+
+   @return
+   XOR result of the atomic variable with mask.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_xor_return(unsigned long long* target, unsigned long long mask)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = xor(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (mask)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_set_bit
+   Sets a bit in a 64-bit atomic variable at a specified position.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] bit Bit position to set.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_set_bit(unsigned long long *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int *wtarget;
+    unsigned int *pwtarget = (unsigned int *)target;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit & 0x1FU;
+    wtarget = (unsigned int *)&pwtarget[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = setbit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget), "r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_clear_bit
+   Clears a bit in a 64-bit atomic variable at a specified position.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] bit Bit position to clear.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_clear_bit(unsigned long long *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int *wtarget;
+    unsigned int *pwtarget = (unsigned int *)target;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit & 0x1FU;
+    wtarget = (unsigned int *)&pwtarget[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = clrbit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget), "r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_change_bit
+   Toggles a bit in a 64-bit atomic variable at a specified position.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] bit Bit position to toggle.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_change_bit(unsigned long long *target, unsigned int bit)
+{
+    unsigned int result;
+    unsigned int *wtarget;
+    unsigned int *pwtarget = (unsigned int *)target;
+    unsigned int aword = bit / ((unsigned int)sizeof(unsigned int) * 8U);
+    unsigned int sbit = bit & 0x1FU;
+    wtarget = (unsigned int *)&pwtarget[aword];
+
+    __asm__ __volatile__(
+        "1:     %0 = memw_locked(%2)\n"
+        "       %0 = togglebit(%0, %3)\n"
+        "       memw_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*wtarget)
+        : "r" (wtarget),"r" (sbit)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_add
+   Adds a 64-bit integer to a 64-bit atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v 64-bit integer value to add.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_add(unsigned long long *target, unsigned long long v)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (v)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_add_return
+   Adds a 64-bit integer to a 64-bit atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v 64-bit integer value to add.
+
+   @return
+   Result of arithmetic sum.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_add_return(unsigned long long *target, unsigned long long v)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (v)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_sub_return
+   Subtracts a 64-bit integer from a 64-bit atomic variable.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] v 64-bit integer value to subtract.
+
+   @return
+   Result of arithmetic subtraction.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_sub_return(unsigned long long *target, unsigned long long v)
+{
+    unsigned long long result;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = sub(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target), "r" (v)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_inc
+   Increments a 64-bit atomic variable by one.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_inc(unsigned long long *target)
+{
+    unsigned long long result;
+    unsigned long long inc = 1;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target),"r" (inc)
+        : "p0");
+}
+
+/**@ingroup func_qurt_atomic64_inc_return
+   Increments a 64-bit atomic variable by one.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+
+   @return
+   Incremented value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_inc_return(unsigned long long *target)
+{
+    unsigned long long result;
+    unsigned long long inc = 1;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target),"r" (inc)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_dec_return
+   Decrements a 64-bit atomic variable by one.
+
+   @note1hang The function retries until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+
+   @return
+   Decremented value.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE unsigned long long
+qurt_atomic64_dec_return(unsigned long long *target)
+{
+    unsigned long long result;
+    long long minus1 = 0xFFFFFFFFFFFFFFFFLL;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       %0 = add(%0, %3)\n"
+        "       memd_locked(%2, p0) = %0\n"
+        "       if !p0 jump 1b\n"
+        : "=&r" (result),"+m" (*target)
+        : "r" (target),"r" (minus1)
+        : "p0");
+
+    return result;
+}
+
+/**@ingroup func_qurt_atomic64_compare_and_set
+   Compares the current value of a 64-bit atomic variable with
+   the specified value and sets it to a new value when the compare is successful.
+
+   @note1hang The function keeps retrying until load lock and store conditional
+              is successful.
+
+   @param[in,out] target Pointer to the atomic variable.
+   @param[in] old_val 64-bit old value to compare.
+   @param[in] new_val 64-bit new value to set.
+
+   @return
+   FALSE -- Specified value is not equal to the current value. \n
+   TRUE -- Specified value is equal to the current value.
+
+   @dependencies
+   None.
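+
+   Illustrative retry loop that atomically records a running maximum
+   (a sketch, not part of the API; "peak" and "sample" are hypothetical):
+
+       unsigned long long old;
+       do {
+           old = peak;                  // re-read on every attempt
+           if (sample <= old) break;    // nothing to update
+       } while (!qurt_atomic64_compare_and_set(&peak, old, sample));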
+*/
+static inline QURT_INLINE int
+qurt_atomic64_compare_and_set(unsigned long long *target,
+                              unsigned long long old_val,
+                              unsigned long long new_val)
+{
+    unsigned long long current_val;
+
+    __asm__ __volatile__(
+        "1:     %0 = memd_locked(%2)\n"
+        "       p0 = cmp.eq(%0, %3)\n"
+        "       if !p0 jump 2f\n"
+        "       memd_locked(%2, p0) = %4\n"
+        "       if !p0 jump 1b\n"
+        "2:\n"
+        : "=&r" (current_val),"+m" (*target)
+        : "r" (target), "r" (old_val), "r" (new_val)
+        : "p0");
+
+    return (int)(current_val == old_val);
+}
+
+/**@ingroup func_qurt_atomic64_barrier
+   Allows the compiler to enforce an ordering constraint on memory operations issued
+   before and after the function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+*/
+static inline QURT_INLINE void
+qurt_atomic64_barrier(void)
+{
+    /** @cond */
+    __asm__ __volatile__ (
+        ""
+        :
+        :
+        :
+        "memory");
+    /** @endcond */
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ATOMIC_OPS_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_barrier.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_barrier.h
new file mode 100755
index 0000000000000..7c6f787d43bc2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_barrier.h
@@ -0,0 +1,140 @@
+#ifndef QURT_BARRIER_H
+#define QURT_BARRIER_H
+
+/**
+  @file qurt_barrier.h
+  @brief Prototypes of kernel barrier API functions.
+
+  EXTERNALIZED FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2021 Qualcomm Technologies, Inc. All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup barrier_types
+@{ */
+/*=====================================================================
+ Constants and macros
+======================================================================*/
+#define QURT_BARRIER_SERIAL_THREAD 1 /**< Serial thread. */
+#define QURT_BARRIER_OTHER         0 /**< Other. */
+
+#ifndef ASM
+#include
+
+/*=====================================================================
+Typedefs
+======================================================================*/
+
+/** QuRT barrier type.
+ */
+typedef union {
+    /** @cond */
+    struct {
+        unsigned short threads_left;
+        unsigned short count;
+        unsigned int threads_total;
+        unsigned int queue;
+        unsigned int reserved;
+    };
+    unsigned long long int raw;
+    /** @endcond */
+} qurt_barrier_t;
+
+/** @} */ /* end_addtogroup barrier_types */
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/*======================================================================*/
+/**@ingroup func_qurt_barrier_init
+   Initializes a barrier object.
+
+   @datatypes
+   #qurt_barrier_t
+
+   @param[out] barrier      Pointer to the barrier object to initialize.
+   @param[in] threads_total Total number of threads to synchronize on the barrier.
+
+
+   @return
+   Unused integer value.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_barrier_init(qurt_barrier_t *barrier, unsigned int threads_total);
+
+/*======================================================================*/
+/**@ingroup func_qurt_barrier_destroy
+   Destroys the specified barrier.
+
+   @note1hang Barriers must be destroyed when they are no longer in use. Failure
+              to do this causes resource leaks in the QuRT kernel.\n
+   @note1cont Barriers must not be destroyed while they are still in use. If this
+              occurs, the behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_barrier_t
+
+   @param[in] barrier Pointer to the barrier object to destroy.
+
+   @return
+   Unused integer value.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_barrier_destroy(qurt_barrier_t *barrier);
+
+/*======================================================================*/
+/**@ingroup func_qurt_barrier_wait
+   Waits on the barrier.\n
+   Suspends the current thread on the specified barrier. \n
+   The function return value indicates whether the thread was the last one to
+   synchronize on the barrier.
+   When a thread waits on a barrier, it is suspended on the barrier: \n
+   - If the total number of threads waiting on the barrier is less than the assigned value
+     of the barrier, no other action occurs. \n
+   - If the total number of threads waiting on the barrier equals the assigned value of the
+     barrier, all threads currently waiting on the barrier are awakened, allowing them to
+     execute past the barrier.
+
+   @note1hang After its waiting threads are awakened, a barrier is automatically reset
+              and can be used again in the program without the need for re-initialization.
+
+   @datatypes
+   #qurt_barrier_t
+
+   @param[in] barrier Pointer to the barrier object to wait on.
+
+   @return
+   #QURT_BARRIER_OTHER -- Current thread awakened from the barrier. \n
+   #QURT_BARRIER_SERIAL_THREAD -- Current thread is the last caller of the barrier.
+
+   @dependencies
+   None.
+*/
+/* ======================================================================*/
+int qurt_barrier_wait(qurt_barrier_t *barrier);
+
+
+#endif
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_BARRIER_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_busywait.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_busywait.h
new file mode 100755
index 0000000000000..a4dab80a2520a
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_busywait.h
@@ -0,0 +1,62 @@
+#ifndef QURT_BUSYWAIT_H
+#define QURT_BUSYWAIT_H
+
+/**
+  @file qurt_busywait.h
+  @brief Implementation of the busywait() function for
+         hardware-based blocking waits that use the QTIMER as a reference.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ============================================================================*/
+/*=============================================================================
+ *
+ *                       EDIT HISTORY FOR FILE
+ *
+ *   This section contains comments describing changes made to the
+ *   module. Changes are listed in reverse chronological
+ *   order.
+ *
+ *
+ * when       who     what, where, why
+ * ---------- ---     -------------------------------------------------------
+ * 2018-03-20 pg      Add Header file
+ ============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                            FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_busywait
+   Pauses the execution of a thread for a specified time.\n
+   Use it for small microsecond delays.
+
+   @note1hang The function does not return to the caller until
+              the time duration has expired.
+
+   @param[in] pause_time_us Time to pause in microseconds.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+void qurt_busywait (unsigned int pause_time_us);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_BUSYWAIT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_callback.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_callback.h
new file mode 100755
index 0000000000000..dc9b896c63454
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_callback.h
@@ -0,0 +1,235 @@
+#ifndef QURT_CALLBACK_H
+#define QURT_CALLBACK_H
+
+/**
+  @file qurt_callback.h
+  Definitions, macros, and prototypes for the QuRT callback framework.
+
+  The QDI framework allows the development of root process drivers and services that
+  a user process client can interact with in a secure manner. The QDI framework does
+  this by elevating the privilege of the user process thread, temporarily allowing
+  the thread to execute in root context and letting it fall back to user context once
+  the QDI invocation is finished.
+
+  The QuRT callback framework provides a safe mechanism for root process drivers
+  to execute callback functions in a user process. The framework hosts
+  dedicated worker threads in corresponding processes that handle the execution
+  of the callback function. This ensures that the callbacks occur in the context of
+  the appropriate process thread, as a result maintaining privilege boundaries.
+
+  Prerequisites for use of this framework are:
+  1. The driver is a QDI driver and the client communicates with drivers using QDI
+     invocations.
+  2. An appropriate callback configuration is specified in cust_config.xml for
+     the user process that intends to use this framework.
+
+  qurt_cb_data_t is the public data structure that allows a client to store all
+  the required information about the callback, including the callback function
+  and the arguments to pass to this function when it executes.
+  The client uses the QDI interface to register this structure with the root driver.
+
+  The callback framework provides the following APIs that a root driver can use to
+  invoke a callback. These functions are described in the qurt_qdi_driver.h header file.
+
+  qurt_qdi_cb_invoke_async() triggers an asynchronous callback wherein the
+  invoking thread does not wait for the callback to finish executing.
+
+  qurt_qdi_cb_invoke_sync() triggers a synchronous callback. Upon invocation
+  the invoking thread is suspended until the callback function finishes execution.
+
+  qurt_qdi_cb_invoke_sync_with_data() invokes a synchronous callback similar to
+  qurt_qdi_cb_invoke_sync(). It allows the user to pass large data along with
+  the callback invocation to be utilized during the callback execution.
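+
+  A minimal registration sketch (illustrative only; "my_cb" and the QDI
+  invocation that transports the structure to the driver are hypothetical):
+
+      qurt_cb_data_t cb;
+      qurt_cb_data_init(&cb);
+      qurt_cb_data_set_cbfunc(&cb, (void *)my_cb);
+      qurt_cb_data_set_cbarg(&cb, 42U);
+      // ... hand &cb to the root driver through a QDI invocation; the
+      // driver later runs it with qurt_qdi_cb_invoke_async()/_sync().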
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+#include "qurt_qdi.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int qurt_cb_result_t;
+
+/* Callback framework error codes.
+   The callback framework returns a nonzero value if a callback invocation is
+   unsuccessful. The following macros describe the cause of failure in more detail.
+*/
+#define QURT_CB_ERROR            -1 /* Callback registration failed.\n*/
+#define QURT_CB_OK                0 /* Success.\n*/
+#define QURT_CB_MALLOC_FAILED    -2 /* QuRTOS malloc failure.\n*/
+#define QURT_CB_WAIT_CANCEL      -3 /* Process exit cancelled wait operation.\n*/
+#define QURT_CB_CONFIG_NOT_FOUND -4 /* Callback configuration for process was not found.\n*/
+#define QURT_CB_QUEUE_FULL       -5 /* Callback queue is serving at maximum capacity.*/
+/** @addtogroup cb_types
+@{ */
+/** Callback registration data structure.
+    This data structure is used by a client attempting to register a callback with a QDI driver.
+    It holds the address of the callback function and the argument supplied to the callback
+    function when it executes.
+*/
+typedef struct {
+    /** @cond */
+    void* cb_func;   /*< Pointer to the callback function. */
+    unsigned cb_arg; /*< Not interpreted by the framework. */
+    /** @endcond */
+} qurt_cb_data_t;
+
+/** @cond */
+/* Defines used as default if cust_config does not specify them. */
+#define CALLBACK_WORKER_STACK_SIZE 0x2000
+/** @endcond */
+/** @} */ /* end_addtogroup cb_types */
+/**@ingroup func_qurt_cb_data_init
+   Initializes the callback data structure.
+   An entity registering a callback with the root process driver must call this function
+   to initialize the callback registration data structure to its default value.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data Pointer to the callback data structure.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_init (qurt_cb_data_t* cb_data){
+    cb_data->cb_func = NULL;
+    cb_data->cb_arg = 0;
+}
+
+/**@ingroup func_qurt_cb_data_set_cbfunc
+   Sets up the callback function in the callback registration data structure.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data Pointer to the callback data structure.
+   @param[in] cb_func Pointer to the callback function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_set_cbfunc (qurt_cb_data_t* cb_data, void* cb_func){
+    cb_data->cb_func = cb_func;
+}
+
+/**@ingroup func_qurt_cb_data_set_cbarg
+   Sets up the callback argument.
+   This function sets up the argument passed to the callback function when it executes.
+
+   @datatypes
+   #qurt_cb_data_t
+
+   @param[in] cb_data Pointer to the callback data structure.
+   @param[in] cb_arg  Argument for the callback function.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+static inline void qurt_cb_data_set_cbarg (qurt_cb_data_t* cb_data, unsigned cb_arg){
+    cb_data->cb_arg = cb_arg;
+}
+
+/** @cond */
+/**@ingroup driver_support_functions
+   Invokes an asynchronous callback for a specified process.
+   A driver that resides in the root process calls this API to launch a callback in
+   a process described by the client_handle.
+   After the callback is invoked, the framework queues the callback as per its
+   priority and subsequently executes it.
+   The caller of this function is not suspended during the callback execution period.
+   The API returns immediately with a success/failure error code.
+
+   @note1hang This function is only accessible to drivers in the root process.
+              User process invocations shall fail with a negative error code return value.
+
+   @param client_handle Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data       Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio          Priority at which the callback should execute.
+                        This parameter is optional. If -1 is passed, the callback framework
+                        executes the callback at the priority of the API caller.
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_async(int client_handle,
+                                          qurt_cb_data_t* cb_data,
+                                          int prio);
+
+
+/**@ingroup driver_support_functions
+   Invokes a synchronous callback for a specified process.
+   A driver that resides in a root process calls this API to launch a synchronous callback in
+   a process described by the client_handle.
+   After the callback is invoked, the framework queues the callback as per its
+   priority and subsequently executes it.
+   The caller of this function is suspended during the callback execution period.
+   If the process in which to execute the callback exits or terminates, the caller is
+   woken up with error code #QURT_CB_WAIT_CANCEL (refer to qurt_callback.h).
+
+   @note1hang This function is only accessible to drivers in the root process.
+              User process invocations shall fail with a negative error code return value.
+
+   @param client_handle Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data       Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio          Priority at which the callback should execute.
+                        This parameter is optional. If -1 is passed, the callback framework
+                        executes the callback at the priority of the API caller.
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_sync(int client_handle,
+                                         qurt_cb_data_t* cb_data,
+                                         int prio);
+
+/**@ingroup driver_support_functions
+   Invokes a synchronous callback for a specified process, passing driver data to the user PD.
+   This function is similar to qurt_qdi_cb_invoke_sync() and allows the driver to pass arbitrary data to
+   the user process as part of the callback invocation.
+
+   @param client_handle Obtained from the current invocation function (Section 4.3.1).
+   @param cb_data       Pointer to the callback data structure (refer to qurt_callback.h).
+   @param prio          Priority at which the callback should execute.
+                        This parameter is optional. If -1 is passed, the callback framework
+                        executes the callback at the priority of the API caller.
+   @param data          Driver arbitrary data to pass to the user process. Memory pointed to by data
+                        must be accessible to the user PD. The root driver can allocate such memory by
+                        using qurt_mem_mmap().
+   @param data_len      Driver arbitrary data length.
+
+   @return
+   QURT_EOK -- Callback was successfully communicated to the framework.
+   Negative error code -- Callback cannot be communicated to the framework.
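+
+   Illustrative call from a root driver (a sketch; "cb", "buf", and "len" are
+   hypothetical, and client_handle comes from the current QDI invocation):
+
+       int rc = qurt_qdi_cb_invoke_sync_with_data(client_handle, &cb, -1,
+                                                  buf, len);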
+ */
+qurt_cb_result_t qurt_qdi_cb_invoke_sync_with_data( int client_handle,
+                                                    qurt_cb_data_t* cb_data,
+                                                    int prio,
+                                                    void *data,
+                                                    unsigned data_len
+                                                  );
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_clade.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_clade.h
new file mode 100755
index 0000000000000..d7442cf98dd94
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_clade.h
@@ -0,0 +1,62 @@
+#ifndef QURT_CLADE_H
+#define QURT_CLADE_H
+/**
+  @file qurt_clade.h
+  @brief Prototypes of the Cache Line Accelerated Decompression Engine (CLADE) API.
+         CLADE is a cache line level memory compression system that is used to
+         decrease DRAM usage.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+  Copyright (c) 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_clade2_get
+   Reads the value of the clade2 register.
+
+   @param[in]  offset  Offset from the clade2 cfg base.
+   @param[out] *value  Pointer to the register value read from the offset.
+
+   @return
+   #QURT_EOK -- Successfully read the value from the register at the offset. \n
+   #QURT_EINVALID -- The offset passed is incorrect.
+
+   @dependencies
+   None.
+ */
+int qurt_clade2_get(unsigned short offset, unsigned int *value);
+
+/**@ingroup func_qurt_clade2_set
+   Sets the PMU register; only the PMU_SEL register can be set.
+
+   @param[in] offset  Offset from the QURTK_clade2_cfg_base.
+   @param[in] value   Value to set at the offset.
+
+   @return
+   #QURT_EOK -- Successfully set the value at the offset. \n
+   #QURT_ENOTALLOWED -- Set operation performed at an offset other than CLADE2_PMU_SELECTION_REG.
+
+   @dependencies
+   None.
+ */
+int qurt_clade2_set(unsigned short offset, unsigned int value);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_CLADE_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_cond.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_cond.h
new file mode 100755
index 0000000000000..6e65ed82a8393
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_cond.h
@@ -0,0 +1,219 @@
+#ifndef QURT_COND_H
+#define QURT_COND_H
+/**
+  @file qurt_cond.h
+  @brief Prototypes of kernel condition variable object API functions.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2021 Qualcomm Technologies, Inc.
+  All rights reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup condition_variables_types
+@{ */
+/*=====================================================================
+  Typedefs
+ ======================================================================*/
+
+/** QuRT condition variable type.
+ */
+typedef union {
+    /** @cond */
+    unsigned long long raw;
+    struct {
+        unsigned int count;
+        unsigned int n_waiting;
+        unsigned int queue;
+        unsigned int reserved;
+    } X;
+    /** @endcond */
+} qurt_cond_t;
+
+/** @} */ /* end_addtogroup condition_variables_types */
+
+/*=====================================================================
+  Functions
+======================================================================*/
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_init
+   Initializes a condition variable object.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[out] cond Pointer to the condition variable object to initialize.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+/* ======================================================================*/
+void qurt_cond_init(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_destroy
+   Destroys the specified condition variable.
+
+   @note1hang Conditions must be destroyed when they are no longer in use. Failure to do
+              this causes resource leaks in the QuRT kernel.\n
+   @note1cont Conditions must not be destroyed while they are still in use. If this occurs,
+              the behavior of QuRT is undefined.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[in] cond Pointer to the condition variable object to destroy.
+
+   @return
+   None.
+
+ */
+/* ======================================================================*/
+void qurt_cond_destroy(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_signal
+   Signals a waiting thread that the specified condition is true. \n
+
+   When a thread wishes to signal that a condition is true on a shared data item, it must
+   perform the following procedure: \n
+   -# Lock the mutex that controls access to the data item. \n
+   -# Perform the signal condition operation. \n
+   -# Unlock the mutex.
+
+   @note1hang Failure to properly lock and unlock a mutex of a condition variable can cause
+              the threads to never be suspended (or suspended but never awakened).
+
+   @note1cont Use condition variables only with regular mutexes -- attempting to use
+              recursive mutexes or priority inheritance mutexes results in undefined behavior.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[in] cond Pointer to the condition variable object to signal.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+/* ======================================================================*/
+void qurt_cond_signal(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_broadcast
+   Signals multiple waiting threads that the specified condition is true.\n
+   When a thread wishes to broadcast that a condition is true on a shared data item, it must
+   perform the following procedure: \n
+   -# Lock the mutex that controls access to the data item. \n
+   -# Perform the broadcast condition operation. \n
+   -# Unlock the mutex.\n
+
+   @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause
+              the threads to never be suspended (or suspended but never awakened).
+
+   @note1cont Use condition variables only with regular mutexes -- attempting to use
+              recursive mutexes or priority inheritance mutexes results in undefined behavior.
+
+   @datatypes
+   #qurt_cond_t
+
+   @param[in] cond Pointer to the condition variable object to signal.
+
+   @return
+   None.
+
+   @dependencies
+   None.
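+
+   Illustrative pairing with qurt_cond_wait() (a sketch; "lock", "cv", and
+   "ready" are hypothetical, and the usual qurt_mutex_lock()/qurt_mutex_unlock()
+   API from qurt_mutex.h is assumed):
+
+       // signaling thread
+       qurt_mutex_lock(&lock);
+       ready = 1;
+       qurt_cond_broadcast(&cv);
+       qurt_mutex_unlock(&lock);
+
+       // waiting thread re-checks its predicate in a loop
+       qurt_mutex_lock(&lock);
+       while (ready == 0)
+           qurt_cond_wait(&cv, &lock);
+       qurt_mutex_unlock(&lock);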
+ */
+/* ======================================================================*/
+void qurt_cond_broadcast(qurt_cond_t *cond);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_wait
+   Suspends the current thread until the specified condition is true.
+   When a thread wishes to wait for a specific condition on a shared data item, it must
+   perform the following procedure: \n
+   -# Lock the mutex that controls access to the data item. \n
+   -# If the condition is not satisfied, perform the wait condition operation on the
+      condition variable (suspends the thread and unlocks the mutex).
+
+   @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause
+              the threads to never be suspended (or suspended but never awakened).
+
+   @note1cont Use condition variables only with regular mutexes -- attempting to use
+              recursive mutexes or priority inheritance mutexes results in undefined behavior.
+
+   @datatypes
+   #qurt_cond_t \n
+   #qurt_mutex_t
+
+   @param[in] cond  Pointer to the condition variable object to wait on.
+   @param[in] mutex Pointer to the mutex associated with the condition variable to wait on.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+/* ======================================================================*/
+void qurt_cond_wait(qurt_cond_t *cond, qurt_mutex_t *mutex);
+
+/*======================================================================*/
+/**@ingroup func_qurt_cond_wait2
+   Suspends the current thread until the specified condition is true.
+   When a thread wishes to wait for a specific condition on a shared data item, it must
+   perform the following procedure: \n
+   -# Lock the mutex that controls access to the data item. \n
+   -# If the condition is not satisfied, perform the wait condition operation on the
+      condition variable, which suspends the thread and unlocks the mutex.
+
+   @note1hang Failure to properly lock and unlock the mutex of a condition variable can cause
+              the threads to never be suspended (or suspended but never awakened).
+
+   @note1cont Use condition variables only with regular mutexes -- attempting to use
+              recursive mutexes or priority inheritance mutexes results in undefined behavior.
+
+   @note1cont This is the same API as qurt_cond_wait(); use this version
+              when using mutexes of type #qurt_rmutex2_t.
+
+   @datatypes
+   #qurt_cond_t \n
+   #qurt_rmutex2_t
+
+   @param[in] cond  Pointer to the condition variable object to wait on.
+   @param[in] mutex Pointer to the mutex associated with the condition variable to wait on.
+
+   @return
+   None.
+
+   @dependencies
+   None.
+ */
+/* ======================================================================*/
+void qurt_cond_wait2(qurt_cond_t *cond, qurt_rmutex2_t *mutex);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_COND_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_consts.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_consts.h
new file mode 100755
index 0000000000000..b1e35998e73b6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_consts.h
@@ -0,0 +1,315 @@
+#ifndef QURT_CONSTS_H
+#define QURT_CONSTS_H
+
+/**
+  @file qurt_consts.h
+  @brief QuRT constants and definitions
+
+  EXTERNAL FUNCTIONS
+  None.
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2013-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Constants and macros
+ ======================================================================*/
+
+/* Definitions of system events. System events suspend
+   a thread and put it into suspending_list.
+   The system event number is saved in the CONTEXT::error::cause field
+   of the suspended thread. An event handler thread, such as a
+   page fault handler or system error handler, can wake up the suspended
+   thread.
+ */
+#define QURT_EVENT_PAGEFAULT    0x1 /* Page fault event. */
+#define QURT_EVENT_SYSTEM_ERR   0x2 /* System error event. */
+#define QURT_EVENT_SUSPEND      0x3 /* Suspend event. */
+#define QURT_EVENT_PROCESS_EXIT 0x4 /* Process termination event. */
+
+#define QURT_SYSENV_MAX_THREADS_TYPE         1  /* Maximum threads object. */
+#define QURT_SYSENV_PROCNAME_TYPE            2  /* Process name object. */
+#define QURT_SYSENV_MAX_PI_PRIO_TYPE         3  /* Maximum pi priority object. */
+#define QURT_SYSENV_ARCH_REV_TYPE            4  /* Architecture version object. */
+#define QURT_SYSENV_APP_HEAP_TYPE            5  /* Application heap object. */
+#define QURT_SYSENV_REGION_ATTR_DEFAULT      7  /* Default region attributes. */
+#define QURT_SYSENV_STACK_PROFILE_COUNT_TYPE 8  /* Stack profile count type. */
+#define QURT_SYSENV_ISLAND_CONFIG_TYPE       9  /* Island configuration check. */
+#define QURT_SYSENV_HTHREADS_TYPE            10 /* Active threads object. */
+#define QURT_SYSENV_CONFIG_IMAGE_START_LO    11 /* Config image start address for DTB parsing. */
+#define QURT_SYSENV_CONFIG_IMAGE_START_HI    12 /* Config image start address for DTB parsing. */
+#define QURT_SYSENV_CHIPPARAMS_LO            13 /* ChipParams for DTB parsing. */
+#define QURT_SYSENV_CHIPPARAMS_HI            14 /* ChipParams for DTB parsing. */
+#define QURT_SYSENV_PLATPARAMS               15 /* PlatformParams for DTB parsing. */
+#define QURT_SYSENV_CONFIG_IMAGE_SIZE        16 /* Config image size for DTB parsing. */
+#define QURT_SYSENV_L2_CACHE_LINE_SIZE       17 /* L2 cache line size. */
+
+/* Get Q6 registers. */
+#define QURT_GET_SSR     1
+#define QURT_GET_CCR     2
+#define QURT_GET_CFGBASE 3
+#define QURT_GET_SYSCFG  4
+#define QURT_GET_REV     5
+
+
+/** @cond rest_reg_dist */
+/** @addtogroup performance_monitor_macros
+@{ */
+
+/* PMU */
+#define QURT_PMUCNT0   0 /**< */
+#define QURT_PMUCNT1   1 /**< */
+#define QURT_PMUCNT2   2 /**< */
+#define QURT_PMUCNT3   3 /**< */
+#define QURT_PMUCFG    4 /**< */
+#define QURT_PMUEVTCFG 5 /**< */
+
+/* New since V55. */
+#define QURT_PMUCNT4    6  /**< */
+#define QURT_PMUCNT5    7  /**< */
+#define QURT_PMUCNT6    8  /**< */
+#define QURT_PMUCNT7    9  /**< */
+#define QURT_PMUEVTCFG1 10 /**< */
+
+/* New since V61. */
+#define QURT_PMUSTID0 11 /**< */
+#define QURT_PMUSTID1 12 /**< */
+
+#define QURT_PMUCNTSTID0 13 /**< */
+#define QURT_PMUCNTSTID1 14 /**< */
+#define QURT_PMUCNTSTID2 15 /**< */
+#define QURT_PMUCNTSTID3 16 /**< */
+#define QURT_PMUCNTSTID4 17 /**< */
+#define QURT_PMUCNTSTID5 18 /**< */
+#define QURT_PMUCNTSTID6 19 /**< */
+#define QURT_PMUCNTSTID7 20 /**< */
+
+/** @} */ /* end_addtogroup performance_monitor_macros */
+/** @endcond */
+
+/*
+ Power collapse operations.
+*/
+#define QURT_POWER_SHUTDOWN                 0 /**< */
+#define QURT_TCXO_SHUTDOWN                  1 /**< */
+#define QURT_POWER_CMD_PREPARE              0 /**< */
+#define QURT_POWER_CMD_PERFORM              1 /**< */
+#define QURT_POWER_CMD_EXIT                 2 /**< */
+#define QURT_POWER_CMD_FAIL_EXIT            3 /**< */
+#define QURT_POWER_CMD_PERFORM_L2_RETENTION 4 /**< */
+#define QURT_POWER_CMD_PERFORM_SAVE_TCM     5 /**< */
+#define QURT_POWER_CMD_DEEP_SLEEP           6 /**< */
+
+
+/** 
@addtogroup thread_macros +@{ */ +#define QURT_MAX_HTHREAD_LIMIT 8U /**< Limit on the maximum number of hardware threads supported by QuRT for any + Hexagon version. Use this definition to define arrays, and so on, in + target independent code. */ +/** @} */ /* end_addtogroup thread_macros */ + +/** @cond internal_only */ +/** @addtogroup power_management_macros +@{ */ +/** + L2 cache retention mode +*/ +#define QURT_POWER_SHUTDOWN_TYPE_L2NORET QURT_POWER_CMD_PERFORM /**< */ +#define QURT_POWER_SHUTDOWN_TYPE_L2RET QURT_POWER_CMD_PERFORM_L2_RETENTION /**< */ +#define QURT_POWER_SHUTDOWN_TYPE_SAVETCM QURT_POWER_CMD_PERFORM_SAVE_TCM /**< */ +/** @} */ /* end_addtogroup power_management_macros */ +/** @endcond */ + +/* + QURT_system_state + Use for debugging the shutdown/startup process. + + State transition for cold boot: + QURT_BOOT_SETUP_ISDB --> QURT_CBOOT_BSP_INIT --> + QURT_CBOOT_END_CLEAN_INIT --> QURT_CBOOT_END_OS_INIT --> + QURT_CBOOT_KERNEL_INIT_DONE --> QURT_CBOOT_PLAT_CONFIG_DONE --> + QURT_CBOOT_ROOT_TASK_STARTED + + State transition for power collapse: + QURT_PREPARE_SINGLE_MODE --> QURT_PERFORM_IPEND --> + QURT_PERFORM_SAVE_TLB --> QURT_PERFORM_SWITCH_PC --> + cache flush states (dependent on L2 retention config) + + State transition for warm boot: + QURT_BOOT_SETUP_ISDB --> QURT_WBOOT_INIT_TLB --> + QURT_WBOOT_SET_1TO1_MAP --> QURT_WBOOT_REMOVE_1TO1_MAP --> + QURT_CBOOT_END_CLEAN_INIT --> QURT_CBOOT_END_OS_INIT +*/ +#define QURT_PREPARE_SINGLE_MODE 1 /**< */ +#define QURT_PREPARE_END 2 /**< */ +#define QURT_PERFORM_IPEND 3 /**< */ +#define QURT_PERFORM_SAVE_ISDP 4 /**< */ +#define QURT_PERFORM_SAVE_PMU 5 /**< */ +#define QURT_PERFORM_SAVE_TLB 6 /**< */ +#define QURT_PERFORM_SWITCH_PC 7 /**< */ +#define QURT_PERFORM_EXIT 8 /**< */ +#define QURT_FLUSH_L1CACHE 9 /**< */ +#define QURT_FLUSH_L2CACHE 0xA /**< */ +#define QURT_FLUSH_CACHE_DONE 0xB /**< */ +#define QURT_SWITCH_PC_DONE 0xC /**< */ +#define QURT_BOOT_SETUP_ISDB 0xD /**< */ +#define QURT_WBOOT_INIT_TLB 0xE /**< */ +#define QURT_WBOOT_SET_1TO1_MAP 0xF /**< */ +#define QURT_WBOOT_CFG_ADV_SYSCFG 0x10 /**< */ +#define QURT_WBOOT_REMOVE_1TO1_MAP 0x11 /**< */ +#define QURT_CBOOT_BSP_INIT 0x12 /**< */ +#define QURT_CBOOT_END_CLEAN_L1CACHE 0x13 /**< */ +#define QURT_CBOOT_END_CLEAN_INIT 0x14 /**< */ +#define QURT_CBOOT_END_OS_INIT 0x15 /**< */ +#define QURT_CBOOT_TLB_DUMP_LOAD 0x16 /**< */ +#define QURT_CBOOT_TLB_STATIC_LOAD 0x17 /**< */ +#define QURT_CBOOT_KERNEL_INIT_DONE 0x18 /**< */ +#define QURT_CBOOT_PLAT_CONFIG_DONE 0x19 /**< */ +#define QURT_CBOOT_ROOT_TASK_STARTED 0x1A /**< */ +#define QURT_IMPRECISE_EXCEPTION 0x1B /**< */ +#define QURT_WBOOT_DEBUG_L2_START 0x1C /**< */ +#define QURT_WBOOT_DEBUG_L2_END 0x1D /**< */ +#define QURT_NMI_SAVE_L2VIC_COMPLETE 0x1E /**< */ +#define QURT_NMI_HANDLER_COMPLETE 0x1F /**< */ +#define QURT_NMI_AFTER_SAVE_GLOBAL 0x20 /**< */ +#define QURT_WBOOT_START 0x21 /**< */ +#define QURT_ENTER_ISLAND 0x22 /**< */ +#define QURT_EXIT_ISLAND 0x23 /**< */ +#define QURT_LOAD_NOTIFIER_TCB 0x24 /**< */ +#define QURT_ABNORMAL_RESET 0x25 /**< */ +/* + Thread attributes +*/ + +#define QURT_THREAD_ATTR_GP 0x00000002 /*< */ +#define QURT_THREAD_ATTR_UGP 0x00000003 /*< User general pointer (UGP)*/ +#define QURT_THREAD_ATTR_PREFETCH 0x00000004 /*< */ +#define QURT_THREAD_ATTR_TID 0x00000005 /*< */ +#define QURT_THREAD_ATTR_CACHE_PART 0x00000007 /*< */ +#define QURT_THREAD_ATTR_COPROCESSOR 0x00000008 /*< */ +#define QURT_THREAD_ATTR_GET_L2CACHE_PART 0x00000009 /*< */ +#define QURT_THREAD_ATTR_SET_FRML 
0x0000000A /*< */ +#define QURT_THREAD_ATTR_STID_GET 0x0000000B /*< */ +#define QURT_THREAD_ATTR_STID_SET 0x0000000C /*< */ +#define QURT_THREAD_ATTR_AUTOSTACK 0x0000000D /*< */ +#define QURT_THREAD_ATTR_SYSTEM_THREAD 0x0000000E /*< */ +#define QURT_THREAD_ATTR_STID_SET2 0x0000000F /*< */ +#define QURT_THREAD_ATTR_STID_SET2_ACKNOWLEDGE 0x00000010 /*< */ +#define QURT_THREAD_ATTR_STID_GET2 0x00000011 /*< */ + +/** Cache operations*/ +#define QURT_DCCLEAN 0U /* Clean Dcache. */ +#define QURT_DCINV 1U /* Invalidate Dcache. */ +#define QURT_DCCLEANINV 2U /* Clean and invalidate Dcache. */ +#define QURT_ICINV 3U /* Invalidate Icache. */ +#define QURT_DUMP_DCTAGS 4U /* For testing purpose. */ +#define QURT_FLUSH_ALL 5U /* Flush entire L1 and L2 cache. */ +#define QURT_TABLE_FLUSH 6U /* Flush based on table of physical pages */ +#define QURT_CLEAN_INVALIDATE_ALL 7U /* Flush and invalidate entire L1 and L2 cache. */ +#define QURT_L2CACHE_LOCK_LINES 8U /* l2 cache lock lines */ +#define QURT_L2CACHE_UNLOCK_LINES 9U /* l2 cache unlock lines */ +#define QURT_CLEAN 10U /* Flush L1 and L2 cache */ +#define QURT_CLEAN_INVALIDATE 11U /* Flush and invalidate L1 and L2 cache. */ +#define QURT_CLEAN_INVALIDATE_L2 12U /* Flush and invalidate entire L2 cache. */ + +/**@ingroup chapter_prefined_symbols */ +/**@xreflabel{hdr:QURT_API_VERSION}*/ + + +/* Process state. */ +#define QURT_UPDATE_PROCESS_STATE 0 /**< */ +#define QURT_MP_INIT 1 /*< */ +#define QURT_MP_RUNNING 2 /*< */ +#define QURT_MP_STOPPED 3 /*< */ + +/* QuRT reset reason. */ +#define QURT_NORMAL_BOOT 0 /* Normal boot. */ +#define QURT_WARM_BOOT 1 /* Power collapse warm boot. */ +#define QURT_WARM_BOOT_L2_RETENTION 2 /* Power collapse with L2 retention warm boot. */ +#define QURT_WARM_BOOT_SAVE_TCM 3 /* Power collapse with saving TCM. */ +#define QURT_QUICK_BOOT 4 /* Deep sleep. */ + +/* QuRT Wait for Idle command */ +#define QURT_WAIT_FOR_IDLE_DISABLE 0 /*< */ +#define QURT_WAIT_FOR_IDLE_ENABLE 1 /*< */ +#define QURT_WAIT_FOR_IDLE 2 /*< */ +#define QURT_WAIT_FOR_IDLE_CANCEL 3 /*< */ + +/*QuRT island exit stages */ +#define QURT_ISLAND_EXIT_STAGE1 1 /*< */ +#define QURT_ISLAND_EXIT_STAGE2 2 /*< */ + +#define QURT_MAX_NAME_LEN 64 /*< */ + +#define MAX_POOL_RANGES 16 /*< */ + +/* key definitions for debug thread info */ +//#define MAX_TCB_KEY 40 //whatever is a good number or makes debug thread structure be 1K +#define KEY_SCHDULER_STATE 1 /*< */ +#define KEY_PRIORITY 2 /*< */ +#define KEY_PRIORITY_ORIG 3 /*< */ +#define KEY_STACK_BOTTOM 4 // Currently not populated +#define KEY_STACK_TOP 5 // Currently not populated +#define KEY_HVX_STATE 6 /*< */ +#define KEY_FUTEX_OBJECT 7 /*< */ +#define KEY_THREAD_ID 8 /*< */ +#define KEY_PROFILE_CYCLE_LO 9 // Currently not populated +#define KEY_PROFILE_CYCLE_HI 10 // Currently not populated +#define KEY_ERROR_ADDRESS 11 // This holds the BADVA +#define KEY_ERROR_CAUSE 12 // This is the same as QURT_error_info.cause +#define KEY_ERROR_CAUSE2 13 // This is the same as QURT_error_info.cause2 +#define KEY_ERROR_SSR 14 /*< Holds the SSR value */ +#define QURT_RESERVED -1 + +/* VTLB method IDs. 
*/ +#define QURT_VTLB_ENTRY_CREATE 0U +#define QURT_VTLB_ENTRY_DELETE 1U +#define QURT_VTLB_ENTRY_READ 2U +#define QURT_VTLB_ENTRY_WRITE 3U +#define QURT_VTLB_ENTRY_PROBE 4U +#define QURT_VTLB_ENTRY_SPLIT 5U +#define QURT_VTLB_ENTRY_MERGE 6U +#define QURT_VTLB_ENTRY_STATISTICS 7U +#define QURT_VTLB_ENTRY_SET_SPECIAL 8U +#define QURT_VTLB_QUEUE_PPAGE 9U +#define QURT_VTLB_RECLAIM_STACK_PAGES 10U +#define QURT_VTLB_ASID_SET_STATE_FAST 11U +#define QURT_VTLB_ASID_SET_STATE 12U +#define QURT_VTLB_ENTRY_SET_EXTENSION 13U +#define QURT_VTLB_ENTRY_CLEAR_EXTENSION 14U + +/* VTCM window access control HWIO programming. */ +#define QURT_VTCM_WINDOW_ENABLE 1U +#define QURT_VTCM_WINDOW_DISABLE 0U +#define QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT 0xFFFU +#define QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT 0U + +/** @cond */ +/* ETM source - PC or data access */ +#define QURT_ETM_SOURCE_PC 0U /**< Memory source of SAC* is PC. */ +#define QURT_ETM_SOURCE_DATA 1U /**< Memory source of SAC* is data. */ + +/* ETM PID status flags */ +#define QURT_ETM_NO_PID 0xFFFFFFFF /**< No PID is selected. */ +/** @endcond */ + +/* execution context */ +#define QURT_CTX_USER 1 +#define QURT_CTX_GUEST 2 + +/* Profiling STID */ +#define QURT_STID_DEFAULT 0U + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_CONSTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_cycles.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_cycles.h new file mode 100755 index 0000000000000..b599493f5d563 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_cycles.h @@ -0,0 +1,301 @@ + +#ifndef QURT_CYCLES_H +#define QURT_CYCLES_H 1 +/** + @file qurt_cycles.h + Prototypes of kernel pcycle API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + /*===================================================================== + Functions + ======================================================================*/ + +/*======================================================================*/ + +/**@ingroup func_qurt_profile_reset_idle_pcycles + @xreflabel{hdr:qurt_profile_reset_idle_pcycles} + Sets the per-hardware-thread idle cycle counts to zero. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_profile_reset_idle_pcycles (void); + +/*======================================================================*/ +/**@ingroup func_qurt_profile_get_thread_pcycles + @xreflabel{hdr:qurt_profile_get_thread_pcycles} + Gets the count of the running processor cycles for the current thread.\n + Returns the current running processor cycle count for the current QuRT thread. + + @note1hang Profiling shall be enabled first to start the cycle counting. + The cycles are accumulated once the profiling is enabled and + resets on #qurt_profile_reset_threadid_pcycles + + @return + Integer -- Running processor cycle count for current thread. + + @dependencies + None. 
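+
+ @note1cont Illustrative sketch (not part of the API) of bracketing a workload with
+ this counter; do_work() is a hypothetical function:
+ @code
+ unsigned long long t0, t1;
+
+ qurt_profile_enable(1);                  // start cycle accounting
+ t0 = qurt_profile_get_thread_pcycles();
+ do_work();                               // hypothetical workload
+ t1 = qurt_profile_get_thread_pcycles();
+ qurt_profile_enable(0);                  // stop cycle accounting
+ // t1 - t0 is the number of cycles this thread ran during do_work()
+ @endcode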
+*/
+/* ======================================================================*/
+unsigned long long int qurt_profile_get_thread_pcycles(void);
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_core_pcycles
+ @xreflabel{hdr:qurt_get_core_pcycles}
+ Gets the count of core processor cycles executed.\n
+ Returns the current number of running processor cycles executed since the Hexagon
+ processor was last reset.
+
+ This value is based on the hardware core clock, which varies in speed according to the
+ processor clock frequency.
+
+ @note1hang Because the hardware core clock stops running when the processor shuts
+ down (due to all of the hardware threads being idle), treat the cycle values returned
+ by this operation as relative rather than absolute.
+
+ @note1cont Thread cycle counts are valid only in the V4 Hexagon processor version.
+
+ @return
+ Integer -- Current count of core processor cycles.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+unsigned long long int qurt_get_core_pcycles(void);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_idle_pcycles
+
+ @deprecated Use #qurt_profile_get_idle_pcycles2 instead.
+
+ Gets the current idle processor cycle counts for a maximum of 6 hardware threads. Use
+ #qurt_profile_get_idle_pcycles2 to read pcycles without a limitation on the maximum
+ number of hardware threads.
+
+ This operation accepts a pointer to a user-defined array, and writes to the array the current
+ idle cycle count for each hardware thread.
+
+ Each count value represents the number of processor cycles that have elapsed on the
+ corresponding hardware thread while that thread has been in Wait mode.\n
+
+
+ @note1hang This operation does not return the idle cycles that occur when the Hexagon
+ processor shuts down (due to all of the hardware threads being idle).
+ Idle cycle counts accumulate regardless of whether profiling is enabled,
+ and reset on #qurt_profile_reset_idle_pcycles.
+
+ @param[out] pcycles User array where the function stores the current idle cycle count values.
+ Array size must be at least the number of hardware threads intended.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+void qurt_profile_get_idle_pcycles (unsigned long long *pcycles);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_idle_pcycles2
+ Gets the current idle processor cycle counts for the maximum available hardware threads.
+
+ This operation accepts a pointer to a user-defined array with a length in bytes, and writes
+ to the array the current idle cycle count for each hardware thread.
+
+ Each count value represents the number of processor cycles that have elapsed on the
+ corresponding hardware thread while that thread has been in Wait mode.\n
+
+ @note1hang This operation does not return the idle cycles that occur when the Hexagon
+ processor shuts down (due to all of the hardware threads being idle).
+ Idle cycle counts accumulate regardless of the profiling enable status, and
+ reset on #qurt_profile_reset_idle_pcycles.
+
+ @param[out] pcycles User array where the function stores the current idle cycle count values.
+ Array size should be equivalent to the number of hardware threads intended.
+ Call #qurt_sysenv_get_max_hw_threads to determine the array size required.
+
+ @param[in] length_in_bytes Length of the pcycles array in bytes. If the array is smaller
+ than required for the maximum available hardware threads,
+ an error code is returned.
+
+ @return
+ #QURT_EOK -- Successful operation; all data was stored to the destination array. \n
+ #QURT_EFAILED -- Operation failed due to a too-small #pcycles array.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+int qurt_profile_get_idle_pcycles2 (unsigned long long *pcycles, unsigned int length_in_bytes);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_threadid_pcycles
+
+ @deprecated Use #qurt_profile_get_threadid_pcycles2 instead.
+
+ Gets the current per-hardware-thread running cycle counts for the specified QuRT
+ thread for a maximum of 6 hardware threads.
+
+ Each count value represents the number of processor cycles that have elapsed on the
+ corresponding hardware thread while that thread has been scheduled for the specified
+ QuRT thread.
+
+ @note1hang Profiling must be enabled first to start the cycle counting.
+ The cycles accumulate once profiling is enabled, and reset on
+ #qurt_profile_reset_threadid_pcycles.
+
+ @param[in] thread_id Valid thread identifier.
+ @param[out] pcycles Pointer to a user array where the function stores the current running
+ cycle count values. Array size must be at least the number of
+ hardware threads intended.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+void qurt_profile_get_threadid_pcycles (int thread_id, unsigned long long *pcycles);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_get_threadid_pcycles2
+
+ Gets the current per-hardware-thread running cycle counts for the specified QuRT
+ thread for the maximum available hardware threads.
+
+ Each count value represents the number of processor cycles that have elapsed on the
+ corresponding hardware thread while that thread has been scheduled for the specified
+ QuRT thread.
+
+ @note1hang Profiling must be enabled first to start the cycle counting.
+ The cycles accumulate once profiling is enabled, and reset on
+ #qurt_profile_reset_threadid_pcycles.
+
+ @param[in] thread_id Thread identifier.
+ @param[out] pcycles Pointer to a user array where the function stores the current running
+ cycle count values. Array size should be equivalent to the number of
+ hardware threads intended.
+ Call #qurt_sysenv_get_max_hw_threads to determine the array size required.
+ @param[in] length_in_bytes Length of the pcycles array in bytes. If the array is smaller
+ than required for the maximum available hardware threads, an
+ error code is returned.
+
+ @return
+ #QURT_EOK -- Successful operation; all data was stored to the destination array. \n
+ #QURT_EFAILED -- Operation failed due to a too-small #pcycles array. \n
+ #QURT_ENOTHREAD -- Operation failed due to an invalid #thread_id.
+
+ @dependencies
+ None.
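+
+ @note1cont Illustrative call sequence (a sketch, with most error handling elided);
+ tid is a hypothetical thread identifier:
+ @code
+ qurt_sysenv_max_hthreads_t mhwt;
+ unsigned long long pcycles[QURT_MAX_HTHREAD_LIMIT];
+
+ qurt_sysenv_get_max_hw_threads(&mhwt);   // declared in qurt_event.h
+ if (qurt_profile_get_threadid_pcycles2(tid, pcycles,
+         mhwt.max_hthreads * sizeof(pcycles[0])) != QURT_EOK) {
+     // array too small (QURT_EFAILED) or invalid tid (QURT_ENOTHREAD)
+ }
+ @endcode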
+*/
+/* ======================================================================*/
+int qurt_profile_get_threadid_pcycles2 (int thread_id, unsigned long long *pcycles, unsigned int length_in_bytes);
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_reset_threadid_pcycles
+ @xreflabel{hdr:qurt_profile_reset_threadid_pcycles}
+ Sets the per-hardware-thread running cycle counts to zero for the specified QuRT thread.
+
+ @param[in] thread_id Thread identifier.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+void qurt_profile_reset_threadid_pcycles (int thread_id);
+
+/*======================================================================*/
+/**@ingroup func_qurt_profile_enable
+ @xreflabel{hdr:qurt_profile_enable}
+ Enables profiling.\n
+ Enables or disables cycle counting of the running and idle processor cycles.
+ Profiling is disabled by default. \n
+
+ @note1hang Enabling profiling does not automatically reset the cycle counts -- this must be
+ done explicitly by calling the reset operations before starting cycle counting.
+ Cycle counting starts the instant profiling is enabled with this API, and
+ halts when profiling is disabled.
+
+ @param[in] enable Profiling. Values: \n
+ - 0 -- Disable profiling \n
+ - 1 -- Enable profiling @tablebulletend
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+void qurt_profile_enable (int enable);
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_hthread_pcycles
+ @xreflabel{hdr:qurt_get_hthread_pcycles}
+ Reads the GCYCLE_nT register to allow performance measurement when n threads are in run mode.\n
+
+ @note1hang Returns 0 when the architecture is earlier than v67, or for an invalid hardware thread ID.
+
+ @param[in] n Threads in run mode. Valid values are 1 through .
+
+
+ @return
+ Value read from the GCYCLE_nT register. This value indicates the total number of pcycles executed
+ from reset to the current point of execution while n threads are in run mode.
+
+ @dependencies
+ PMU must be enabled.
+*/
+/* ======================================================================*/
+unsigned int qurt_get_hthread_pcycles(int n);
+
+/*======================================================================*/
+/**@ingroup func_qurt_get_hthread_commits
+ @xreflabel{hdr:qurt_get_hthread_commits}
+ Reads the GCOMMIT_nT register to allow performance measurement when n threads are in run mode.\n
+
+ @note1hang Returns 0 when the architecture is earlier than v67, or for an invalid hardware thread ID.
+
+ @param[in] n Threads in run mode. Valid values: 1 through .
+
+ @return
+ Value read from the GCOMMIT_nT register. This value indicates the total number of packets
+ committed from reset to the current point of execution while n threads are in run mode.
+
+ @dependencies
+ PMU must be enabled.
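+
+ @note1cont Illustrative sketch (not part of the API): pairing this register with
+ GCYCLE_nT to estimate packets committed per cycle while two hardware threads are in
+ run mode (both reads return 0 on pre-v67 targets):
+ @code
+ unsigned int cycles  = qurt_get_hthread_pcycles(2);
+ unsigned int commits = qurt_get_hthread_commits(2);
+ double packets_per_cycle = (cycles != 0U) ? (double)commits / (double)cycles : 0.0;
+ @endcode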
+*/
+/* ======================================================================*/
+unsigned int qurt_get_hthread_commits(int n);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_devtree.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_devtree.h
new file mode 100755
index 0000000000000..4adee45bb44a2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_devtree.h
@@ -0,0 +1,161 @@
+#ifndef QURT_DEVTREE_H
+#define QURT_DEVTREE_H
+/**
+ @file qurt_devtree.h
+ @brief Prototypes and structures for device tree aware QuRT library functions.
+
+Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+*/
+/* qurt_callback is included by qurt_qdi_driver.h and depends on NULL being defined.
+   The callback is not used here, so define NULL here to avoid including the world. */
+#ifndef NULL
+#define NULL ((void *) 0)
+#endif
+
+#include "libfdt.h"
+#include "DTBExtnLib.h"
+#include "qurt_qdi_ext.h"
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define INVALID_BLOB_ID (-1)
+#define DEFAULT_BLOB_ID 0
+
+/** QuRT device tree mapping macros. */
+#define QURT_DT_MAPPING_FAILED (-1)
+#define QURT_DT_FLAG_ISLAND    0x1
+#define QURT_DT_FLAG_PHYSADDR  0x2
+
+/** Device tree type for the root PD device tree.
+    The root PD device tree will typically describe the hardware in the subsystem.
+    This is the /soc portion of the device tree. */
+#define QURT_DT_BLOB_TYPE_ROOT 0
+
+/** Device tree type for the local device tree.
+    The local device tree will typically contain the software settings.
+    This is the /sw portion of the device tree. */
+#define QURT_DT_BLOB_TYPE_LOCAL 1
+
+int qurt_devtree_init(void);
+
+/**@ingroup func_qurt_dt_mapping_create
+ Creates a memory mapping from the specified property of the specified device
+ tree node. Returns virtual addresses and sizes.
+
+ @param[in] devtreeNode Device tree node.
+ @param[in] flags Flags to configure memory. Overloaded as the property
+ index if regionName is NULL.
+ @param[in] regionName Identifies the property to use for mapping; should
+ resemble a region.
+ @param[in] regionIdx Index of the range to use for the mapping.
+ @param[out] vaddr Return pointer for the virtual region address.
+ @param[out] size Return pointer for the virtual region size.
+
+ @return
+ Result code indicating success or failure. \n
+*/
+int qurt_dt_mapping_create(fdt_node_handle *devtreeNode, int flags, char *regionName, int regionIdx,
+                           unsigned long long *vaddr, unsigned long long *size);
+
+/**@ingroup func_qurt_dt_mapping_create2
+
+ Creates a memory mapping from the specified property of the specified device
+ tree node.
+
+ Returns virtual addresses and sizes according to the architecture (that is, either 32-bit or 64-bit).
+
+ @param[in] devtreeNode Device tree node.
+
+ @param[in] dt_map_flags Flags that configure the memory mapping; reserved for future use.
+ (0) - Default value; assumes the details from the DT node are a physical address and size.
+ QURT_DT_FLAG_ISLAND
+
+ NOTE: The PA needs to be added to the corresponding island spec to create an island mapping.
+
+ @param[in] regionName NULL, or the name of the index in the range to return; should
+ resemble a region. Ex: reg-names = "base", "rx", "tx";
+
+ @param[in] regionIdx Index of the range to return. Ex: reg = <0x1000 0x20>, <0x10000 0x100>, <0x18000 0x100>;
+
+ NOTE: If the client specifies both regionName and regionIdx, the
+ region name takes precedence and the region index is ignored.
+
+ @param[in] dt_map_perm Mapping access permissions (R/W):
+ QURT_PERM_READ
+ QURT_PERM_WRITE
+
+ @param[in] cache_attr QuRT cache mode types:
+ QURT_MEM_CACHE_DEVICE
+ QURT_MEM_CACHE_WRITEBACK
+ Other required cache type enums in qurt_types.h can also be passed.
+
+ NOTE: There is no default value for the cache and permission attributes.
+ The client must always pass one of the defined flags.
+
+ @param[out] vaddr Return pointer to the variable that holds the virtual address.
+ @param[out] size Return pointer for the virtual region size.
+
+ @return
+ #QURT_EOK Success; the mapping was created properly.
+ #QURT_DT_MAPPING_FAILED Failed to create the mapping.
+ #QURT_EINVALID Mismatch in the architecture.
+
+ else an FdtLib or third-party error code.
+
+*/
+int qurt_dt_mapping_create2(fdt_node_handle *devtreeNode, unsigned int dt_map_flags,
+        char *regionName, int regionIdx, unsigned int dt_map_perm, int cache_attr, void **vaddr, size_t *size);
+
+/**@ingroup func_qurt_dt_isr_register
+ Device tree aware registration of an interrupt service routine (ISR) to an ISR thread.
+ The interrupt defined in the specified device tree node is enabled when this function returns success.
+
+ @datatypes
+ #qurt_thread_t \n
+ #fdt_node_handle
+
+ @param[in] dt_node Device tree node that specifies the interrupt property.
+ @param[in] dt_int_index Index of the specific interrupt to use within the device tree node structure.
+ Specify either this or dt_int_name; use -1 if the name is used.
+ @param[in] dt_int_name Name of the specific interrupt to use within the device tree node structure.
+ Specify either this or dt_int_index; use NULL if the index is used.
+ @param[in] isr_thread_id ISR thread ID, returned from qurt_isr_create(), defined by qurt_isr_register2().
+ @param[in] prio Priority of the ISR, defined by qurt_isr_register2().
+ @param[in] flags Defines the ACK type. Values: \n
+ #QURT_INT_NON_DELAYED_ACK - ISR is acknowledged by the interrupt handle routine
+ in the kernel.
+ #QURT_INT_DELAYED_ACK - Client chooses to acknowledge.
+ Defined by qurt_isr_register2().
+ @param[in] isr ISR with the prototype void isr (void *arg, int int_num), defined by qurt_isr_register2().
+ @param[in] arg First argument of the ISR when it is called to service the interrupt, defined by qurt_isr_register2().
+
+ @return
+ #QURT_EOK -- Successfully registered the ISR for the interrupt \n
+ #QURT_EINT -- Interrupt not configured \n
+ #QURT_EINVALID -- Invalid thread ID \n
+ #QURT_EDISABLED -- The feature is disabled \n
+ #QURT_EDUPLICATE -- Interrupt is already registered
+
+ @dependencies
+ Create the thread ID with qurt_isr_create().
+ Complete the ISR registration with qurt_isr_register2().
+ */
+int qurt_dt_isr_register(fdt_node_handle *dt_node, int dt_int_index, char * dt_int_name, qurt_thread_t isr_thread_id,
+                         unsigned short prio, unsigned short flags, void (*isr) (void *, int), void *arg);
+
+/**@ingroup func_qurt_dt_blob_id_get
+ Returns the blob ID for the blob type passed.
+ The value returned from this API can be passed as the blob ID parameter to DTBExtnLib APIs.
+
+ @param[in] blob_type Blob type to look up.
+ @return Blob ID for the passed blob type.
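+
+ @note1cont Illustrative sketch (not part of the API) of feeding the returned ID to a
+ DTBExtnLib lookup; what is done with the ID after the check is left abstract:
+ @code
+ int blob_id = qurt_dt_blob_id_get(QURT_DT_BLOB_TYPE_LOCAL);
+ if (blob_id != INVALID_BLOB_ID) {
+     // pass blob_id as the blob ID parameter to DTBExtnLib APIs
+     // to look up software settings under the /sw portion of the tree
+ }
+ @endcode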
+*/ +int qurt_dt_blob_id_get(unsigned int blob_type); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_ecc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_ecc.h new file mode 100755 index 0000000000000..09312684e99af --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_ecc.h @@ -0,0 +1,168 @@ +#ifndef QURT_ECC_H +#define QURT_ECC_H + + +/*===================================================================== + + @file qurt_ecc.h + @brief Prototypes of QuRT memory ECC API functions + + Copyright (c) 2018, 2020-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup exception_handling_types +@{ */ +// ECC memory definition +typedef enum { + QURT_ECC_MEM_L1_ICACHE = 0, /**< ECC memory L1 ICache. */ + QURT_ECC_MEM_L1_DCACHE = 1, /**< ECC memory L1 DCache.*/ + QURT_ECC_MEM_L2_CACHE = 2, /**< ECC memory L2 Cache.*/ + QURT_ECC_MEM_VTCM = 3 /**< ECC memory VTCM.*/ +} qurt_ecc_memory_t; +/** @} */ /* end_addtogroup exception_handling_types */ + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/** @addtogroup exception_handling_macros +@{ */ + +#define QURT_ECC_ERR_DETECTED_STATUS 0 /**< ECC error detected. */ +#define QURT_ECC_ERR_TYPE 1 /**< ECC error type.*/ +// ECC status type + +#define QURT_ECC_CORRECTABLE_COUNT (1<<0) /**< ECC correctable count.*/ +#define QURT_ECC_UNCORRECTABLE_COUNT (1<<1) /**< ECC uncorrectable count.*/ +#define QURT_ECC_REGION_LOGGING (1<<2) /**< ECC region logging.*/ +// ECC enable/disable definition + +#define QURT_ECC_PROTECTION_DISABLE (0<<0) /**< Bit 0. */ +#define QURT_ECC_PROTECTION_ENABLE (1<<0) /**< Bit 0. */ +/** @} */ /* end_addtogroup exception_handling_macros */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + + +/**@ingroup func_qurt_ecc_enable + Enables or disables ECC protection on a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values: + - #QURT_ECC_MEM_L1_ICACHE + - #QURT_ECC_MEM_L1_DCACHE + - #QURT_ECC_MEM_L2_CACHE + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] enable Set to one of the following values: + - #QURT_ECC_PROTECTION_ENABLE + - #QURT_ECC_PROTECTION_DISABLE @tablebulletend + + @return + - #QURT_EOK -- ECC enabling or disabling setup is performed successfully + - Others -- Failure + + @dependencies + None. + */ +int qurt_ecc_enable( qurt_ecc_memory_t memory, unsigned int enable ); + + +/**@ingroup func_qurt_ecc_get_error_status + Gets ECC error status for a specified memory. 
+ + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following: + - #QURT_ECC_MEM_L1_ICACHE + - #QURT_ECC_MEM_L1_DCACHE + - #QURT_ECC_MEM_L2_CACHE + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] type Set to one of the following: + - #QURT_ECC_ERR_DETECTED_STATUS + - #QURT_ECC_ERR_TYPE @tablebulletend + + @return + Returns the following when the type is #QURT_ECC_ERR_DETECTED_STATUS: + - 0 -- No error detected \n + - 1 -- At least one error detected \n + Returns the following when the type is #QURT_ECC_ERR_TYPE: \n + - 0 through 1 -- Correctable error \n + - 2 -- Uncorrectable error + + @dependencies + None. + */ +int qurt_ecc_get_error_status( qurt_ecc_memory_t memory, unsigned int type ); + + +/**@ingroup func_qurt_ecc_get_error_count + Gets the ECC error count for a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values:\n + - #QURT_ECC_MEM_L1_ICACHE \n + - #QURT_ECC_MEM_L1_DCACHE \n + - #QURT_ECC_MEM_L2_CACHE \n + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] type Set to one of the following values: \n + - #QURT_ECC_CORRECTABLE_COUNT \n + - #QURT_ECC_UNCORRECTABLE_COUNT @tablebulletend + + @return + Error count for the specified error type. + + @dependencies + None. + */ +int qurt_ecc_get_error_count( qurt_ecc_memory_t memory, unsigned int type ); + + +/**@ingroup func_qurt_ecc_clear_error_count + Clears ECC error count or region logging for a specified memory. + + @datatypes + #qurt_ecc_memory_t + + @param[in] memory Set to one of the following values: \n + - #QURT_ECC_MEM_L1_ICACHE \n + - #QURT_ECC_MEM_L1_DCACHE \n + - #QURT_ECC_MEM_L2_CACHE \n + - #QURT_ECC_MEM_VTCM @tablebulletend + + @param[in] type Set to one or multiple OR'ed of the following values: \n + - #QURT_ECC_CORRECTABLE_COUNT \n + - #QURT_ECC_UNCORRECTABLE_COUNT \n + - #QURT_ECC_REGION_LOGGING @tablebulletend + + @return + #QURT_EOK -- Error count successfully cleared \n + Others -- Failure at clearing the error count + + @dependencies + None. + */ +int qurt_ecc_clear_error_count( qurt_ecc_memory_t memory, unsigned int type ); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_ECC_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_error.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_error.h new file mode 100755 index 0000000000000..f4666b396c378 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_error.h @@ -0,0 +1,149 @@ +#ifndef QURT_ERROR_H +#define QURT_ERROR_H + +/** + @file qurt_error.h + Error results- QURT defines a set of standard symbols for the error result values. This file lists the + symbols and their corresponding values. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021-2022 , 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc.. + ======================================================================*/ +#include "qurt_except.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup chapter_error +@{ */ + +/*===================================================================== +Constants and macros +======================================================================*/ +#define QURT_EOK 0 /**< Operation successfully performed. */ +#define QURT_EVAL 1 /**< Wrong values for the parameters. The specified page does not exist. 
*/ +#define QURT_EMEM 2 /**< Not enough memory to perform the operation.*/ + +#define QURT_EINVALID 4 /**< Invalid argument value; invalid key. */ +/** @cond */ +#define QURT_EUNKNOWN 6 /**< Defined but never used in QuRT. */ +#define QURT_ENOMSGS 7 /**< Message queue is empty. */ +#define QURT_EBADF 9 /**< Bad message queue descriptor. */ +/** @endcond */ +#define QURT_EFAILED 12 /**< Operation failed. */ + +#define QURT_ENOTALLOWED 13 /**< Operation not allowed. */ + +/** @cond */ +#define QURT_EDUPCLSID 14 /*< Duplicate class ID. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_ENOREGISTERED 20 /**< No registered interrupts.*/ +/** @endcond */ + + +/** @cond */ +#define QURT_EISDB 21 /*< Power collapse failed due to ISDB being enabled. */ +#define QURT_ESTM 22 /*< Power collapse failed in a Single-threaded mode check. */ +/** @endcond */ + + +/** @cond rest_reg_dist */ +#define QURT_ETLSAVAIL 23 /**< No free TLS key is available. */ +#define QURT_ETLSENTRY 24 /**< TLS key is not already free. */ +/** @endcond */ + +#define QURT_EINT 26 /**< Invalid interrupt number (not registered). */ +/** @cond rest_reg_dist */ +#define QURT_ESIG 27 /**< Invalid signal bitmask (cannot set more than one signal at a time). */ +/** @endcond */ + +/** @cond */ +#define QURT_EHEAP 28 /**< No heap space is available. */ +#define QURT_ENOSPC 28 /**< No space to create another queue in the system. */ +#define QURT_EMEMMAP 29 /**< Physical address layout is not supported by the kernel. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_ENOTHREAD 30 /**< Thread no longer exists. */ +/** @endcond */ +/** @cond */ +#define QURT_EL2CACHE 31 /**< L2cachable is not supported in kernel invalidate/cleaninv. */ +/** @endcond */ +/** @cond rest_reg_dist */ +#define QURT_EALIGN 32 /**< Not aligned. */ +#define QURT_EDEREGISTERED 33 /**< Interrupt is already deregistered.*/ +/** @endcond */ + +/** @cond internal_only */ + +#define QURT_ETLBCREATESIZE 34 /**< TLB create error -- Incorrect size.*/ +#define QURT_ETLBCREATEUNALIGNED 35 /**< TLB create error -- Unaligned address.*/ +/** @endcond */ +/** @cond rest_reg_dist*/ +#define QURT_EEXISTS 35 /**< File or message queue already exists. */ +#define QURT_ENAMETOOLONG 36 /**< Name too long for message queue creation. */ +#define QURT_EPRIVILEGE 36 /**< Caller does not have privilege for this operation.*/ + +#define QURT_ECANCEL 37 /**< A cancellable request was canceled because the associated process was asked to exit.*/ +/** @endcond */ + +/** @cond */ +#define QURT_EISLANDTRAP 38 /*< Unsupported TRAP is called in Island mode.*/ + +#define QURT_ERMUTEXUNLOCKNONHOLDER 39 /*< Rmutex unlock by a non-holder.*/ +#define QURT_ERMUTEXUNLOCKFATAL 40 /*< Rmutex unlock error, all except the non-holder error.*/ +#define QURT_EMUTEXUNLOCKNONHOLDER 41 /*< Mutex unlock by a non-holder.*/ +#define QURT_EMUTEXUNLOCKFATAL 42 /*< Mutex unlock error, all except the non-holder error.*/ +#define QURT_EINVALIDPOWERCOLLAPSE 43 /*< Invalid power collapse mode requested. */ +/** @endcond */ +#define QURT_EISLANDUSEREXIT 44 /**< User call has resulted in island exit.*/ +#define QURT_ENOISLANDENTRY 45 /**< Island mode had not yet been entered.*/ +#define QURT_EISLANDINVALIDINT 46 /**< Exited Island mode due to an invalid island interrupt.*/ +/** @cond rest_reg_dist */ +#define QURT_ETIMEDOUT 47 /**< Operation timed-out. */ +#define QURT_EALREADY 48 /**< Operation already in progress. */ +/** @endcond */ + +#define QURT_ERETRY 49 /*< Retry the operation. 
*/
+#define QURT_EDISABLED         50 /*< Resource disabled. */
+#define QURT_EDUPLICATE        51 /*< Duplicate resource. */
+#define QURT_EBADR             53 /*< Invalid request descriptor. */
+#define QURT_ETLB              54 /*< Exceeded maximum allowed TLBs. */
+#define QURT_ENOTSUPPORTED     55 /*< Operation not supported. */
+/** @cond rest_reg_dist */
+#define QURT_ENORESOURCE       56 /**< No resource. */
+/** @endcond */
+
+#define QURT_EDTINIT           57 /**< Problem with device tree initialization. */
+#define QURT_EBUFLOCK          58 /*< Buffer lock failed because it was already locked many times. */
+#define QURT_ELOCKED           59 /**< Current operation failed as the buffer is locked. */
+#define QURT_EMSGSIZE          90 /*< Message queue msg_len is greater than the mq_msgsize attribute of the message queue. */
+
+
+#define QURT_ENOTCONFIGURED    91 /*< Interrupt is NOT configured. */
+
+#define QURT_EBANDWIDTHLIMIT   92 /*< Message queue send exceeds the bandwidth limit. */
+
+#define QURT_ECFIVIOLATION     93 /*< CFI violation detected. */
+
+#define QURT_EDESTROY          94 /**< A destroy request was made to waiting threads.*/
+
+#define QURT_EHMXNOTAVAIL      95 /**< HMX is not available to the target thread.*/
+#define QURT_EHMXNOTDETACHABLE 96 /**< HMX is not detachable from the target thread.*/
+
+#define QURT_EFATAL            -1 /**< Fatal error. */
+
+/** @} */ /* end_addtogroup chapter_error */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ERROR_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_event.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_event.h
new file mode 100755
index 0000000000000..987f0fe79f227
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_event.h
@@ -0,0 +1,452 @@
+#ifndef QURT_EVENT_H
+#define QURT_EVENT_H
+/**
+ @file qurt_event.h
+ @brief Prototypes of kernel event API functions.
+
+ EXTERNALIZED FUNCTIONS
+ none
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ none
+
+ Copyright (c) 2018-2021, 2023 Qualcomm Technologies, Inc. All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#include "qurt_consts.h"
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * System environment object types.
+ */
+/**@addtogroup sys_env_types
+@{ */
+/** QuRT swap pool information type. */
+typedef struct qurt_sysenv_swap_pools {
+    /** @cond */
+    unsigned int spoolsize; /* Swap pool size. */
+    unsigned int spooladdr; /* Swap pool start address. */
+    /** @endcond */
+}qurt_sysenv_swap_pools_t;
+
+/** QuRT application heap information type. */
+typedef struct qurt_sysenv_app_heap {
+    /** @cond */
+    unsigned int heap_base;  /* Heap base address. */
+    unsigned int heap_limit; /* Heap end address. */
+    /** @endcond */
+} qurt_sysenv_app_heap_t ;
+
+/** QuRT architecture version information type. */
+typedef struct qurt_sysenv_arch_version {
+    /** @cond */
+    unsigned int arch_version; /* Architecture version. */
+    /** @endcond */
+}qurt_arch_version_t;
+
+/** QuRT maximum hardware threads information type. */
+typedef struct qurt_sysenv_max_hthreads {
+    /** @cond */
+    unsigned int max_hthreads; /* Maximum number of hardware threads. */
+    /** @endcond */
+}qurt_sysenv_max_hthreads_t;
+
+/** QuRT active hardware threads information type.
*/ +typedef struct qurt_sysenv_hthreads { + /** @cond */ + unsigned int hthreads; /*Maximum number of hardware threads.*/ + /** @endcond */ +}qurt_sysenv_hthreads_t; + +/** QuRT maximum pi priority information type. */ +typedef struct qurt_sysenv_max_pi_prio { + /** @cond */ + unsigned int max_pi_prio; /*Maximum pi priority.*/ + /** @endcond */ +}qurt_sysenv_max_pi_prio_t; + +/** QuRT process name information type. */ +typedef struct qurt_sysenv_procname { + /** @cond */ + union { + unsigned int asid; /*Address space ID.*/ + unsigned int pid; /*Process ID.*/ + }; + char name[QURT_MAX_NAME_LEN]; /* Process name.*/ + /** @endcond */ +}qurt_sysenv_procname_t; + +/** QuRT stack profile count information type. */ +typedef struct qurt_sysenv_stack_profile_count { + /** @cond */ + unsigned int count; /*Stack profile count for usage.*/ + unsigned int count_watermark; /*Stack profile count for watermark.*/ + /** @endcond */ +}qurt_sysenv_stack_profile_count_t; + +/** + QuRT system error event type. + */ +typedef struct _qurt_sysevent_error_t +{ + unsigned int thread_id; /**< Thread ID. */ + unsigned int fault_pc; /**< Fault PC. */ + unsigned int sp; /**< Stack pointer. */ + unsigned int badva; /**< Virtual data address where the exception occurred. */ + unsigned int cause; /**< QuRT error result. */ + unsigned int ssr; /**< Supervisor status register. */ + unsigned int fp; /**< Frame pointer. */ + unsigned int lr; /**< Link register. */ + unsigned int pid; /**< PID of the process to which this thread belongs.*/ + } qurt_sysevent_error_t ; + +typedef struct _qurt_sysevent_error_1_t +{ + unsigned int thread_id; /**< Thread ID. */ + unsigned int fault_pc; /**< Fault PC. */ + unsigned int sp; /**< Stack pointer. */ + unsigned int badva; /**< Virtual data address where the exception occurred. */ + unsigned int cause; /**< QuRT error result. */ + unsigned int ssr; /**< Supervisor status register. */ + unsigned int fp; /**< Frame pointer. */ + unsigned int lr; /**< Link register. */ + unsigned int pid; /**< PID of the process to which this thread belongs.*/ + unsigned int fkey; /**< Framekey.*/ + unsigned int reserved1; /**< Reserved.*/ + unsigned int reserved2; /**< Reserved.*/ + unsigned int reserved3; /**< Reserved.*/ + } qurt_sysevent_error_1_t ; + +/** QuRT page fault error event information type. */ +typedef struct qurt_sysevent_pagefault { + qurt_thread_t thread_id; /**< Thread ID of the page fault thread. */ + unsigned int fault_addr; /**< Accessed address that caused the page fault. */ + unsigned int ssr_cause; /**< SSR cause code for the page fault. */ +} qurt_sysevent_pagefault_t ; +/** @} */ /* @endaddtogroup sys_env_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/*======================================================================*/ +/** + Gets the environment swap pool 0 information from the kernel. + + @datatypes + #qurt_sysenv_swap_pools_t + + @param[out] pools Pointer to the pools information. + + @return + #QURT_EOK -- Success. + + @dependencies + None. +*/ +int qurt_sysenv_get_swap_spool0 (qurt_sysenv_swap_pools_t *pools ); + +/* + Gets the environment swap pool 1 information from the kernel. + + @datatypes + #qurt_sysenv_swap_pools_t + + @param[out] pools Pointer to the pools information. + + @return + #QURT_EOK -- Success. + + @dependencies + None. 
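+
+   An illustrative call (a sketch; the field names are those of #qurt_sysenv_swap_pools_t above):
+
+     qurt_sysenv_swap_pools_t pools;
+     if (qurt_sysenv_get_swap_spool1(&pools) == QURT_EOK) {
+         // pools.spooladdr and pools.spoolsize describe swap pool 1
+     }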
+*/ +int qurt_sysenv_get_swap_spool1(qurt_sysenv_swap_pools_t *pools ); + +/**@ingroup func_qurt_sysenv_get_app_heap + Gets information on the program heap from the kernel. + + @datatypes + #qurt_sysenv_app_heap_t + + @param[out] aheap Pointer to information on the program heap. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_app_heap(qurt_sysenv_app_heap_t *aheap ); + +/**@ingroup func_qurt_sysenv_get_arch_version + Gets the Hexagon processor architecture version from the kernel. + + @datatypes + #qurt_arch_version_t + + @param[out] vers Pointer to the Hexagon processor architecture version. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter + + @dependencies + None. +*/ +int qurt_sysenv_get_arch_version(qurt_arch_version_t *vers); + +/**@ingroup func_qurt_sysenv_get_max_hw_threads + Gets the maximum number of hardware threads supported in the Hexagon processor. + The API includes the disabled hardware threads to reflect the maximum + hardware thread count. + For example, if the image is configured for four hardware threads and hthread_mask is set to 0x5 in + cust_config.xml, only HW0 and HW2 are initialized by QuRT. + HW1 and HW3 are not used at all. Under such a scenario, + qurt_sysenv_get_max_hw_threads() still returns four. + + @datatypes + #qurt_sysenv_max_hthreads_t + + @param[out] mhwt Pointer to the maximum number of hardware threads supported in the Hexagon processor. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_max_hw_threads(qurt_sysenv_max_hthreads_t *mhwt ); + +/**@ingroup func_qurt_sysenv_get_hw_threads + Gets the number of hardware threads initialized by QuRT in Hexagon processor. + For example, if the image is configured for four hardware threads and hthread_mask is set to 0x5 in + cust_config.xml, QuRT only initializes HW0 and HW2. + HW1 and HW3 are not used. In this scenario, qurt_sysenv_get_hw_threads() returns 2. + + @datatypes + #qurt_sysenv_hthreads_t + + @param[out] mhwt Pointer to the number of hardware threads active in the Hexagon processor. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_hw_threads(qurt_sysenv_hthreads_t *mhwt ); + +/**@ingroup func_qurt_sysenv_get_max_pi_prio + Gets the maximum priority inheritance mutex priority from the kernel. + + @datatypes + #qurt_sysenv_max_pi_prio_t + + @param[out] mpip Pointer to the maximum priority inheritance mutex priority. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_max_pi_prio(qurt_sysenv_max_pi_prio_t *mpip ); + +/**@ingroup func_qurt_sysenv_get_process_name2 + Gets information on the system environment process names based on the client_handle argument. + + @datatypes + #qurt_sysenv_procname_t + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[out] pname Pointer to information on the process names in the system. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_process_name2(int client_handle, qurt_sysenv_procname_t *pname ); + +/**@ingroup func_qurt_sysenv_get_process_name + Gets information on the system environment process names from the kernel. 
+ + @datatypes + #qurt_sysenv_procname_t + + @param[out] pname Pointer to information on the process names in the system. + + @return + #QURT_EOK -- Success. \n + #QURT_EVAL -- Invalid parameter. + + @dependencies + None. +*/ +int qurt_sysenv_get_process_name(qurt_sysenv_procname_t *pname ); + +/**@ingroup func_qurt_sysenv_get_stack_profile_count + Gets information on the stack profile count from the kernel. + + @datatypes + #qurt_sysenv_stack_profile_count_t + + @param[out] count Pointer to information on the stack profile count. + + @return + #QURT_EOK -- Success. + + @dependencies + None. +*/ +int qurt_sysenv_get_stack_profile_count(qurt_sysenv_stack_profile_count_t *count ); + +/**@ingroup func_qurt_exception_wait + Registers the program exception handler. + This function assigns the current thread as the QuRT program exception handler and suspends the + thread until a program exception occurs. + + When a program exception occurs, the thread is awakened with error information + assigned to the parameters of this operation. + + @note1hang If no program exception handler is registered, or if the registered handler + calls exit, QuRT raises a kernel exception. + If a thread runs in Supervisor mode, any errors are treated as kernel + exceptions. + + @param[out] ip Pointer to the instruction memory address where the exception occurred. + @param[out] sp Stack pointer. + @param[out] badva Pointer to the virtual data address where the exception occurred. + @param[out] cause Pointer to the QuRT error result code. + + @return + Registry status: \n + Thread identifier -- Handler successfully registered. \n + #QURT_EFATAL -- Registration failed. + + @dependencies + None. +*/ +unsigned int qurt_exception_wait (unsigned int *ip, unsigned int *sp, + unsigned int *badva, unsigned int *cause); + +unsigned int qurt_exception_wait_ext (qurt_sysevent_error_t * sys_err); + +/**@ingroup func_qurt_exception_wait3 + Registers the current thread as the QuRT program exception handler, and suspends the thread until a + program exception occurs. + When a program exception occurs, the thread is awakened with error information assigned to the specified + error event record. + If a program exception is raised when no handler is registered (or when a handler is registered, but it calls + exit), the exception is treated as fatal.\n + @note1hang If a thread runs in Monitor mode, all exceptions are treated as kernel exceptions.\n + @note1cont This function differs from qurt_exception_wait() by returning the error information in a data + structure rather than as individual variables. It also returns additional information (for example, SSR, FP, and LR). + + @param[out] sys_err Pointer to the qurt_sysevent_error_1_t type structure. + @param[in] sys_err_size Size of the qurt_sysevent_error_1_t structure. + + @return + Registry status: \n + - #QURT_EFATAL -- Failure. \n + - Thread ID -- Success. + + @dependencies + None. +*/ + +unsigned int qurt_exception_wait3(void * sys_err, unsigned int sys_err_size); + +/**@ingroup func_qurt_exception_raise_nonfatal + Raises a nonfatal program exception in the QuRT program system. + + For more information on program exceptions, see Section @xref{dox:exception_handling}. + + This operation never returns -- the program exception handler is assumed to perform all + exception handling before terminating or reloading the QuRT program system. + + @note1hang The C library function abort() calls this operation to indicate software + errors. 
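+
+ @note1cont Illustrative sketch (not part of the API); resource_init() is a
+ hypothetical function whose failure the program treats as a software error:
+ @code
+ if (resource_init() != QURT_EOK) {
+     qurt_exception_raise_nonfatal(QURT_EFAILED);   // does not return
+ }
+ @endcode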
+
+ @param[in] error QuRT error result code (Section @xref{dox:error_results}).
+
+ @return
+ Integer -- Unused.
+
+ @dependencies
+ None.
+*/
+int qurt_exception_raise_nonfatal (int error) __attribute__((noreturn));
+
+
+/**@ingroup func_qurt_exception_raise_fatal
+ Raises a fatal program exception in the QuRT system.
+
+ Fatal program exceptions terminate the execution of the QuRT system without invoking
+ the program exception handler.
+
+ For more information on fatal program exceptions, see Section @xref{dox:exception_handling}.
+
+ This operation always returns, so the calling program can perform the necessary shutdown
+ operations (data logging, and so on).
+
+ @note1hang Context switches do not work after this operation has been called.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_exception_raise_fatal (void);
+
+unsigned int qurt_enable_floating_point_exception(unsigned int mask);
+
+/**@ingroup func_qurt_exception_enable_fp_exceptions
+ Enables the specified floating point exceptions as QuRT program exceptions.
+
+ The exceptions are enabled by setting the corresponding bits in the Hexagon
+ control user status register (USR).
+
+ The mask argument specifies a mask value identifying the individual floating
+ point exceptions to set. The exceptions are represented as defined symbols
+ that map into bits 0 through 31 of the 32-bit flag value.
+ Multiple floating point exceptions are specified by OR'ing together the individual
+ exception symbols.\n
+ @note1hang This function must be called before performing any floating point operations.
+
+ @param[in] mask Floating point exception types. Values: \n
+ - #QURT_FP_EXCEPTION_ALL \n
+ - #QURT_FP_EXCEPTION_INEXACT \n
+ - #QURT_FP_EXCEPTION_UNDERFLOW \n
+ - #QURT_FP_EXCEPTION_OVERFLOW \n
+ - #QURT_FP_EXCEPTION_DIVIDE0 \n
+ - #QURT_FP_EXCEPTION_INVALID @tablebulletend
+
+ @return
+ Updated contents of the USR.
+
+ @dependencies
+ None.
+*/
+
+static inline unsigned int qurt_exception_enable_fp_exceptions(unsigned int mask)
+{
+    return qurt_enable_floating_point_exception(mask);
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_EVENT_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_except.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_except.h
new file mode 100755
index 0000000000000..e1684c80e3d50
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_except.h
@@ -0,0 +1,185 @@
+#ifndef QURT_EXCEPT_H
+#define QURT_EXCEPT_H
+
+/**
+ @file qurt_except.h
+ @brief Defines Cause and Cause2 codes for error handling.
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+ Copyright (c) 2021-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ QuRT supports error handling to handle CPU-detected exceptions and software errors.
+ QuRT treats all errors as either fatal errors or nonfatal errors.
+
+ @section sec1 Fatal errors
+ All supervisor mode exceptions are treated as fatal errors.
+ If a registered exception handler calls qurt_exit(), it is treated as a fatal error.
+ Fatal errors result in saving the context of the primary hardware thread to QURT_error_info and the rest of the thread contexts to the corresponding TCBs.
+   All hardware threads are eventually stopped and the cache is flushed.
+   An NMI exception is treated a little differently from other fatal errors: QuRT saves the contexts of all the hardware threads into QURT_error_info.\n
+
+   @subsection subsection1 Debugging fatal errors
+   - QURT_error_info.status.status -- Indicates that an error occurred.
+   - QURT_error_info.status.cause -- Cause code for the fatal error; Cause and Cause2 details are listed below.
+   - QURT_error_info.status.cause2 -- Cause2 code for the fatal error; Cause and Cause2 details are listed below.
+   - QURT_error_info.status.fatal -- Indicates whether a fatal error occurred. A user error can result in a fatal error if the exception handler is not registered.
+   - QURT_error_info.status.hw_tnum -- Indicates the index of QURT_error_info.locregs[], where the context is saved when the error is a fatal error.
+   - QURT_error_info.global_regs -- Contains the values of the global registers of Q6.
+   - QURT_error_info.local_regs[QURT_error_info.status.hw_tnum] -- Provides the CPU context when the error is a supervisor error.
+
+   @subsection subsection2 Debugging nonfatal errors
+   - QURT_error_info.user_errors -- All user errors are logged here.
+   - QURT_error_info.user_errors.counter -- Index to the last logged error.
+   - QURT_error_info.user_errors.entry[0...counter] -- Structure for the logged error.
+   - QURT_error_info.user_errors.entry[0...counter].error_tcb -- TCB for the user error.
+   - QURT_error_info.user_errors.entry[0...counter].error_tcb.error -- Information about the error: Cause, Cause2, Badva, and hardware thread ID.
+   - QURT_error_info.user_errors.entry[0...counter].error_code -- ((cause2 << 8) | (cause)); Cause and Cause2 details are listed below.
+   - QURT_error_info.user_errors.entry[0...counter].hw_thread -- Hardware thread ID for the error.
+   - QURT_error_info.user_errors.entry[0...counter].pcycle -- Pcycle for the error.
+
+@note
+   Important usage note:
+   Cause and Cause2 are error codes used to distinguish multiple errors.
+   SSR and BADVA are inconclusive without the vector number.
+   Both cause and cause2 can range from 1 to 255, and every cause can have 1 to 255 error codes.
+   Hence the system can have up to 255 * 255 unique error codes.
+   The combination is represented as ((cause2 << 8) | (cause)), a bitwise OR.
+   Some Cause2 codes are statically defined, whereas others are obtained from the SSR[7:0] cause codes, depending on the cause code.
+   SSR cause codes are defined in the Hexagon reference manual.
+   All possible combinations are listed below.
+*/
+/** @addtogroup chapter_error
+@{ */
+/* cause - error type - 8-bits*/
+#define QURT_EXCEPT_PRECISE             0x01U  /**< Precise exception occurred. For this cause code, Cause2 is SSR[7:0].*/
+#define QURT_EXCEPT_NMI                 0x02U  /**< NMI occurred; Cause2 is not defined. */
+#define QURT_EXCEPT_TLBMISS             0x03U  /**< TLBMISS RW occurred; for this cause code, Cause2 is SSR[7:0]. */
+#define QURT_EXCEPT_RSVD_VECTOR         0x04U  /**< Interrupt raised on a reserved vector, which must never occur. Cause2 is not defined. */
+#define QURT_EXCEPT_ASSERT              0x05U  /**< Kernel assert. Cause2 QURT_ABORT_* are listed below. */
+#define QURT_EXCEPT_BADTRAP             0x06U  /**< trap0(num) called with unsupported num. Cause2 is 0. */
+#define QURT_EXCEPT_UNDEF_TRAP1         0x07U  /**< Trap1 is not supported. Using Trap1 causes this error. Cause2 is not defined. */
+#define QURT_EXCEPT_EXIT                0x08U  /**< Application called qurt_exit() or qurt_exception_raise_nonfatal(). Can be called from the C library.
Cause2 is "[Argument passed to qurt_exception_raise_nonfatal() & 0xFF]". */
+#define QURT_EXCEPT_TLBMISS_X           0x0AU  /**< TLBMISS X (execution) occurred. Cause2 is not defined. */
+#define QURT_EXCEPT_STOPPED             0x0BU  /**< Running thread stopped due to a fatal error on another hardware thread. Cause2 is not defined. */
+#define QURT_EXCEPT_FATAL_EXIT          0x0CU  /**< Application called qurt_fatal_exit(). Cause2 is not defined. */
+#define QURT_EXCEPT_INVALID_INT         0x0DU  /**< Kernel received an invalid L1 interrupt. Cause2 is not defined. */
+#define QURT_EXCEPT_FLOATING_POINT      0x0EU  /**< Kernel received a floating-point error. Cause2 is not defined. */
+#define QURT_EXCEPT_DBG_SINGLE_STEP     0x0FU  /**< Cause2 is not defined. */
+#define QURT_EXCEPT_TLBMISS_RW_ISLAND   0x10U  /**< Read/write miss in Island mode. Cause2 QURT_TLB_MISS_RW_MEM* are listed below. */
+#define QURT_EXCEPT_TLBMISS_X_ISLAND    0x11U  /**< Execute miss in Island mode. For this cause code, Cause2 is SSR[7:0]. */
+#define QURT_EXCEPT_SYNTHETIC_FAULT     0x12U  /**< Synthetic fault with user request that the kernel detected. Cause2 QURT_SYNTH_* are listed below. */
+#define QURT_EXCEPT_INVALID_ISLAND_TRAP 0x13U  /**< Invalid trap in Island mode. Cause2 is the trap number. */
+#define QURT_EXCEPT_UNDEF_TRAP0         0x14U  /**< trap0(num) was called with unsupported num. Cause2 is the trap number. */
+#define QURT_EXCEPT_PRECISE_DMA_ERROR   0x28U  /**< Precise DMA error. Cause2 is DM4[15:8]. Badva is the DM5 register. */
+
+#define QURT_ECODE_UPPER_LIBC          (0U << 16)  /**< Upper 16 bits is 0 for libc. */
+#define QURT_ECODE_UPPER_QURT          (0U << 16)  /**< Upper 16 bits is 0 for QuRT. */
+#define QURT_ECODE_UPPER_ERR_SERVICES  (2U << 16)  /**< Upper 16 bits is 2 for the error service. */
+/** @cond */
+#define QURT_ECODE_ISLAND_INVALID_QDI   3U  /**< Passing an invalid QDI method in island. */
+/** @endcond */
+
+/* Cause2 for QURT_EXCEPT_SYNTHETIC_FAULT cause - 8 bits */
+#define QURT_SYNTH_ERR                   0x01U  /**< */
+#define QURT_SYNTH_INVALID_OP            0x02U  /**< */
+#define QURT_SYNTH_DATA_ALIGNMENT_FAULT  0x03U  /**< */
+#define QURT_SYNTH_FUTEX_INUSE           0x04U  /**< */
+#define QURT_SYNTH_FUTEX_BOGUS           0x05U  /**< */
+#define QURT_SYNTH_FUTEX_ISLAND          0x06U  /**< */
+#define QURT_SYNTH_FUTEX_DESTROYED       0x07U  /**< */
+#define QURT_SYNTH_PRIVILEGE_ERR         0x08U  /**< */
+
+/* Cause2 - Abort cause reason - 8 bits */
+/* ERR_ASSERT cause */
+#define QURT_ABORT_FUTEX_WAKE_MULTIPLE          0x01U  /**< Abort cause - futex wake multiple. */
+#define QURT_ABORT_WAIT_WAKEUP_SINGLE_MODE      0x02U  /**< Abort cause - thread waiting to wake up in Single Threaded mode. */
+#define QURT_ABORT_TCXO_SHUTDOWN_NOEXIT         0x03U  /**< Abort cause - call TCXO shutdown without exit. */
+#define QURT_ABORT_FUTEX_ALLOC_QUEUE_FAIL       0x04U  /**< Abort cause - futex allocation queue failure - QURTK_futexhash_lifo empty. */
+#define QURT_ABORT_INVALID_CALL_QURTK_WARM_INIT 0x05U  /**< Abort cause - invalid call to QURTK_warm_init() in non-CONFIG_POWER_MGMT mode. */
+#define QURT_ABORT_THREAD_SCHEDULE_SANITY       0x06U  /**< Abort cause - sanity check failed; the scheduled thread is not supposed to run on the current hardware thread. */
+#define QURT_ABORT_REMAP                        0x07U  /**< Remap in the page table; the correct behavior is to remove the mapping first if necessary. */
+#define QURT_ABORT_NOMAP                        0x08U  /**< No mapping in the page table when removing a user mapping. */
+#define QURT_ABORT_OUT_OF_SPACES                0x09U
+#define QURT_ABORT_INVALID_MEM_MAPPING_TYPE     0x0AU  /**< Invalid memory mapping type when creating qmemory. */
+#define QURT_ABORT_NOPOOL                       0x0BU  /**< No pool available to attach.
*/ +#define QURT_ABORT_LIFO_REMOVE_NON_EXIST_ITEM 0x0CU /**< Cannot allocate more futex waiting queue. */ +#define QURT_ABORT_ARG_ERROR 0x0DU +#define QURT_ABORT_ASSERT 0x0EU /**< Assert abort. */ +#define QURT_ABORT_FATAL 0x0FU /**< Fatal error; must never occur. */ +#define QURT_ABORT_FUTEX_RESUME_INVALID_QUEUE 0x10U /**< Abort cause - invalid queue ID in futex resume. */ +#define QURT_ABORT_FUTEX_WAIT_INVALID_QUEUE 0x11U /**< Abort cause - invalid queue ID in futex wait. */ +#define QURT_ABORT_FUTEX_RESUME_INVALID_FUTEX 0x12U /**< Abort cause - invalid futex object in hashtable. */ +#define QURT_ABORT_NO_ERHNDLR 0x13U /**< No registered error handler. */ +#define QURT_ABORT_ERR_REAPER 0x14U /**< Exception in the reaper thread. */ +#define QURT_ABORT_FREEZE_UNKNOWN_CAUSE 0x15U /**< Abort in thread freeze operation. */ +#define QURT_ABORT_FUTEX_WAIT_WRITE_FAILURE 0x16U /**< During futex wait processing, could not perform a necessary write operation to userland data; most likely due to a DLPager eviction. */ +#define QURT_ABORT_ERR_ISLAND_EXP_HANDLER 0x17U /**< Exception in Island exception handler task. */ +#define QURT_ABORT_L2_TAG_DATA_CHECK_FAIL 0x18U /**< Detected error in L2 tag/data during warm boot. The L2 tag/data check is done when CONFIG_DEBUG_L2_POWER_COLLAPSE is enabled. */ +#define QURT_ABORT_ERR_SECURE_PROCESS 0x19U /**< Abort error in secure process. */ +#define QURT_ABORT_ERR_EXP_HANDLER 0x20U /**< No exception handler, or the handler caused an exception. */ +#define QURT_ABORT_ERR_NO_PCB 0x21U /**< PCB of the thread context failed initialization, PCB was NULL. */ +#define QURT_ABORT_NO_PHYS_ADDR 0x22U /**< Unable to find the physical address for the virtual address. */ +#define QURT_ABORT_OUT_OF_FASTINT_CONTEXTS 0x23U /**< Fast interrupt contexts exhausted. */ +#define QURT_ABORT_CLADE_ERR 0x24U /**< Fatal error seen with CLADE interrupt. */ +#define QURT_ABORT_ETM_ERR 0x25U /**< Fatal error seen with ETM interrupt. */ +#define QURT_ABORT_ECC_DED_ASSERT 0x26U /**< ECC two-bit DED error. */ +#define QURT_ABORT_VTLB_ERR 0x27U /**< Fatal error in the VTLB layer. */ +#define QURT_ABORT_TLB_ENCODE_DECODE_FAILURE 0x28U /**< Failure during the TLB encode or decode operation. */ +#define QURT_ABORT_VTLB_WALKOBJS_BOUND_FAILURE 0x29U /**< Failure to lookup entry in the page table. */ +#define QURT_ABORT_PHY_MEMORY_OWNERSHIP_FAILURE 0x30U /**< Failure to claim phy memory ownership. */ +#define QURT_ABORT_JTLB_SIZE_CHECK_FAIL 0x31U /**< JTLB size configured is more than actual size in hardware */ +#define QURT_ABORT_AUTOSTACK_ASSERT 0x32U /**< Error while handling stack flimit exception. */ + +/* Cause2 - TLB-miss_X - 8bits */ +#define QURT_TLB_MISS_X_FETCH_PC_PAGE 0x60U /**< */ +#define QURT_TLB_MISS_X_2ND_PAGE 0x61U /**< */ +#define QURT_TLB_MISS_X_ICINVA 0x62U /**< */ + +/* Cause2 - TLB-miss_RW - 8bits */ +#define QURT_TLB_MISS_RW_MEM_READ 0x70U /**< */ +#define QURT_TLB_MISS_RW_MEM_WRITE 0x71U /**< */ + +/** @cond rest_reg_dist */ +/* Cause2 - Floating point exception - 8 bits */ +#define QURT_FLOATING_POINT_EXEC_ERR 0xBFU /**< Execute floating-point. 
*/
+/** @endcond */
+
+/** Cause2 - autostackv2 - 8 bits */
+#define QURT_AUTOSTACKV2_CANARY_NOT_MATCH    0xC1U
+#define QURT_AUTOSTACKV2_POOL_IDX_OFF_RANGE  0xC2U
+
+/** Cause2 - CFI violation - 8 bits */
+#define QURT_CFI_VIOLATION  0xC3U
+
+/** @cond rest_reg_dist*/
+/* Enable floating point exceptions */
+#define QURT_FP_EXCEPTION_ALL        (0x1FU << 25)  /**< */
+#define QURT_FP_EXCEPTION_INEXACT    (0x1U << 29)   /**< */
+#define QURT_FP_EXCEPTION_UNDERFLOW  (0x1U << 28)   /**< */
+#define QURT_FP_EXCEPTION_OVERFLOW   (0x1U << 27)   /**< */
+#define QURT_FP_EXCEPTION_DIVIDE0    (0x1U << 26)   /**< */
+#define QURT_FP_EXCEPTION_INVALID    (0x1U << 25)   /**< */
+
+/** @endcond */
+/** @} */ /* end_addtogroup chapter_error */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_EXCEPT_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_fastint.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_fastint.h
new file mode 100755
index 0000000000000..ea65dc0917fc0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_fastint.h
@@ -0,0 +1,71 @@
+#ifndef QURT_FASTINT_H
+#define QURT_FASTINT_H
+
+/**
+  @file qurt_fastint.h
+  @brief QuRT fast interrupt functions
+
+  Copyright (c) 2013-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+  ======================================================================*/
+
+/*======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_fastint_register
+  Registers the fast interrupt callback function.
+
+  A fast interrupt callback should be designed to perform the minimal necessary
+  actions for the interrupt, and/or perform some operations, such as signaling
+  another regular software thread to start any additional processing.
+  The callback should be a fast and short function. While a fast interrupt callback
+  is running, the corresponding interrupt cannot be re-enabled until the callback
+  returns.
+
+  The fast interrupt callback must not use any system blocking calls, such as
+  mutex lock or signal wait; otherwise, errors result.
+
+  The fast interrupt callback function takes a single integer argument and does
+  not return a value. The argument value passed in is the interrupt
+  number, so a single callback function can handle
+  multiple fast interrupts.
+
+  @param[in] intno  Interrupt number to register.
+  @param[in] fn     Interrupt callback function.
+
+  @return
+  #QURT_EOK -- Fast interrupt registration is successful. \n
+  #QURT_EINVALID -- Interrupt is already registered. \n
+  #QURT_EINT -- Invalid interrupt number.
+*/
+/* ======================================================================*/
+unsigned int qurt_fastint_register(int intno, void (*fn)(int));
+
+
+/*======================================================================*/
+/**@ingroup func_qurt_fastint_deregister
+  Deregisters the fast interrupt callback function.
+
+  @param[in] intno  Level-one interrupt number to deregister. Valid range is 1 and 10 through 31
+                    (simulator only).
+
+  @return
+  #QURT_EOK -- Interrupt deregistration is successful. \n
+  #QURT_EINT -- Invalid interrupt number (not registered). \n
+  #QURT_EINVALID -- Invalid interrupt number (already deregistered).
+
+  @dependencies
+  None.
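+
+  A minimal sketch of the register/deregister pair (the interrupt number 23
+  and the callback are hypothetical; valid numbers are target-specific):
+  @code
+  // Fast interrupt handler: must be short and must not block.
+  static void my_fastint_cb(int intno)
+  {
+      (void)intno;   // for example, signal a worker thread here
+  }
+
+  void fastint_example(void)
+  {
+      if (qurt_fastint_register(23, my_fastint_cb) == QURT_EOK) {
+          // ... interrupt is live ...
+          (void)qurt_fastint_deregister(23);
+      }
+  }
+  @endcode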
+*/
+/* ======================================================================*/
+unsigned int qurt_fastint_deregister(int intno);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FASTINT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_fs_hub.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_fs_hub.h
new file mode 100755
index 0000000000000..aaa050a6c838b
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_fs_hub.h
@@ -0,0 +1,58 @@
+#ifndef QURT_FS_HUB_H
+#define QURT_FS_HUB_H
+
+/**
+  @file qurt_fs_hub.h
+  @brief Definitions, macros, and prototypes used when writing a
+  QDI driver that provides file-system functionality.
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+  ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+  This structure tracks a file designator for an FS-hub QDI driver.
+  A file system's QDI interface should use this object to encapsulate the
+  true file descriptor and return a QDI handle. This QDI handle
+  is used as the file descriptor by the file-system hub.
+ */
+
+typedef struct qurt_qdi_fs_obj
+{
+    qurt_qdi_obj_t qdi_obj;
+    int client_handle;
+    int fd;
+} qurt_qdi_fs_obj_t;
+
+
+/**@ingroup fs_hub_support_functions
+  This function allows a file system to register its QDI interface with the file-system hub.
+  Once registered, all file open operations for any filenames containing the mountpoint are
+  forwarded to the QDI interface.
+
+  The mountpoint string must be enclosed in forward slashes, for example, "/mountpoint/".
+
+  @param mtpoint  Mount point for the file system being registered.
+  @param opener   Opener structure for the QDI driver interface.
+
+  @return
+  QURT_EOK -- Successfully registered the QDI driver with the file-system hub. \n
+  Negative error code -- Failed to register with the file-system hub.
+ */
+int qurt_fs_hub_mtpoint_register(const char *mtpoint, qurt_qdi_obj_t *opener);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_futex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_futex.h
new file mode 100755
index 0000000000000..1fdcc79a43f01
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_futex.h
@@ -0,0 +1,82 @@
+#ifndef QURT_FUTEX_H
+#define QURT_FUTEX_H
+/**
+  @file qurt_futex.h
+
+  @brief Prototypes of QuRT futex API functions
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2020-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+  ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+
+/**@ingroup func_qurt_futex_wait
+  Moves the caller thread into the waiting state when a memory object address
+  contains a value that is the same as the specified value.
+
+  @param[in] lock  Pointer to the object memory.
+  @param[in] val   Value to check against the object content.
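+
+  A hedged wait/wake sketch (the shared word and the values are illustrative;
+  qurt_futex_wake() is documented below in this file):
+  @code
+  static int flag = 0;   // shared object word
+
+  void waiter(void)
+  {
+      // Block only while the word still holds the expected value.
+      while (flag == 0) {
+          (void)qurt_futex_wait(&flag, 0);
+      }
+  }
+
+  void waker(void)
+  {
+      flag = 1;                         // change the value first
+      (void)qurt_futex_wake(&flag, 1);  // then wake one waiting thread
+  }
+  @endcode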
+
+  @return
+  #QURT_EOK -- Success \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wait(void *lock, int val);
+
+
+/**@ingroup func_qurt_futex_wait_cancellable
+  If a memory object address contains a value that is the same as the specified
+  value, moves the caller thread into the waiting state.
+  The kernel can cancel the waiting state when necessary.
+
+  @param[in] lock  Pointer to the object memory.
+  @param[in] val   Value to check against the object content.
+
+  @return
+  #QURT_EOK -- Success \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wait_cancellable(void *lock, int val);
+
+
+/**@ingroup func_qurt_futex_wake
+  Wakes up a specified number of threads that have been waiting
+  for the object change with qurt_futex_wait().
+
+  @param[in] lock       Pointer to the object memory.
+  @param[in] n_to_wake  Maximum number of threads to wake up.
+
+  @return
+  Number of threads woken up by this function.
+
+  @dependencies
+  None.
+ */
+int qurt_futex_wake(void *lock, int n_to_wake);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_FUTEX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_hmx.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_hmx.h
new file mode 100755
index 0000000000000..e4037dbeae514
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_hmx.h
@@ -0,0 +1,226 @@
+#ifndef QURT_HMX_H
+#define QURT_HMX_H
+/**
+  @file qurt_hmx.h
+  @brief Prototypes of QuRT HMX API.
+
+Copyright (c) 2019-2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ TYPEDEFS
+=============================================================================*/
+
+
+/** @addtogroup hmx_types
+@{ */
+/* HMX locking type */
+#define QURT_HMX_NON_SHARED_LOCK    0U  /**< HMX locking type.*/
+#define QURT_HMX_SHARED_LOCK        1U  /**< HMX locking type.*/
+
+/* HMX unlocking type */
+#define QURT_HMX_NON_SHARED_UNLOCK  0U  /**< HMX unlocking type.*/
+#define QURT_HMX_SHARED_UNLOCK      1U  /**< HMX unlocking type.*/
+
+/* HMX hardware context */
+#define QURT_HMX_UNIT_0             0U  /**< HMX hardware context #0 */
+#define QURT_HMX_UNIT_1             1U  /**< HMX hardware context #1 */
+/** @} */ /* end_addtogroup hmx_types */
+
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+
+/**@ingroup func_qurt_hmx_lock2
+  Locks an HMX unit with the specified locking type.
+
+  #QURT_HMX_NON_SHARED_LOCK:
+  - If an HMX unit is available, lock the unit and return #QURT_EOK.
+  - If the HMX unit is already locked by another thread, the caller thread is suspended
+    until the HMX is available and gets locked by this function.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  #QURT_HMX_SHARED_LOCK:
+  - If an HMX unit is available, enable HMX access for the caller thread, and return
+    #QURT_EOK.
+  - If the HMX is already enabled on the caller thread, return #QURT_EFAILED.
+  - If the HMX is locked by another thread in the same user process as the caller
+    thread with locking type #QURT_HMX_SHARED_LOCK, enable HMX access for the caller
+    thread, and return #QURT_EOK.
+  - If the HMX is locked by another thread in the same user process as the caller
+    thread with locking type #QURT_HMX_NON_SHARED_LOCK, return #QURT_EFAILED.
+  - If the HMX is locked by a thread from a user process different from the
+    user process of the caller thread, return #QURT_EFAILED.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  @param[in] type  Locking type.
+
+  @return
+  #QURT_EOK -- HMX lock successful.\n
+  #QURT_EFAILED -- Failure due to wrong locking condition.\n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+
+ */
+int qurt_hmx_lock2(unsigned int type);
+
+
+/**@ingroup func_qurt_hmx_unlock2
+  Unlocks an HMX unit with the specified unlocking type.
+
+  #QURT_HMX_NON_SHARED_UNLOCK:
+  - If there is an HMX unit locked by the caller thread, unlock the HMX unit and clear the
+    HMX accumulators (assuming a fixed-point type).
+  - If there is no HMX unit locked by the caller thread, return #QURT_EFAILED.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  #QURT_HMX_SHARED_UNLOCK:
+  - If the caller thread has locked HMX with type #QURT_HMX_SHARED_LOCK, disable the
+    HMX access on the caller thread, and return #QURT_EOK.
+    Note: If the caller thread is the last thread that unlocks for #QURT_HMX_SHARED_LOCK
+    in its user process, the unlock function clears the HMX accumulators.
+  - If the caller thread has locked HMX with type #QURT_HMX_NON_SHARED_LOCK, return
+    #QURT_EFAILED.
+  - If the caller thread has not locked HMX, return #QURT_EFAILED.
+  - If there is no HMX hardware supported, return #QURT_EVAL.
+
+  @param[in] type  Unlocking type.
+
+  @return
+  #QURT_EOK -- HMX is unlocked successfully. \n
+  #QURT_EFAILED -- Failure due to wrong unlocking condition. \n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+
+ */
+int qurt_hmx_unlock2(unsigned int type);
+
+
+/**@ingroup func_qurt_hmx_lock
+  Locks an HMX unit.
+  If an HMX unit is available, this function locks the unit and returns right away.
+  If there is no HMX unit available, the caller is blocked until an HMX unit is available
+  and is locked by the function.
+
+  @return
+  #QURT_EOK -- HMX lock successful. \n
+  #QURT_EFAILED -- Failure due to wrong locking condition. \n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_lock(void);
+
+
+/**@ingroup func_qurt_hmx_unlock
+  Unlocks an HMX unit.
+  If an HMX unit is locked by the caller thread, unlocks the HMX unit and clears its
+  accumulators (assuming a fixed-point type).
+  If there is no HMX unit locked by the caller thread, returns failure.
+
+  @return
+  #QURT_EOK -- HMX unlock successful. \n
+  #QURT_EFAILED -- Failure due to wrong unlocking condition. \n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_unlock(void);
+
+
+/**@ingroup func_qurt_hmx_try_lock
+  Tries to lock an HMX unit.
+  If an HMX unit is available, this function locks the unit and returns right away;
+  if there is no HMX unit available, the function returns failure without blocking the caller.
+
+  @return
+  #QURT_EOK -- HMX lock successful \n
+  #QURT_EFAILED -- Failure due to wrong locking condition.\n
+  #QURT_EVAL -- Failure because no HMX hardware is supported.
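+
+  A minimal non-blocking sketch (the fallback path is illustrative):
+  @code
+  if (qurt_hmx_try_lock() == QURT_EOK) {
+      // ... issue HMX matrix operations here ...
+      (void)qurt_hmx_unlock();
+  } else {
+      // HMX busy or absent; use a non-HMX fallback path.
+  }
+  @endcode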
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_try_lock(void);
+
+
+/**@ingroup func_qurt_hmx_assign
+  Assigns an HMX unit to a target thread specified by its thread identifier.
+  The HMX unit (HMX hardware context) is specified by hmx_unit.
+  The caller of this function is limited to the SRM process.
+  If the requested hmx_unit is already assigned to another thread with QURT_HMX_NON_SHARED_LOCK,
+  the kernel detaches it from that thread and re-assigns it to the target thread.
+  If the target thread has HVX enabled, it cannot have HMX enabled.
+
+  Locking type
+  #QURT_HMX_NON_SHARED_LOCK:
+  - If the HMX unit is available, lock the HMX unit and return #QURT_EOK.
+  - If the HMX unit is already enabled on the target thread, return #QURT_EOK.
+  - If the HMX unit is already locked by another thread, detach the HMX from that thread,
+    re-assign the HMX unit to the target thread, and return #QURT_EOK.
+
+  @param[in] thread_id  Thread identifier.
+  @param[in] type       Locking type.
+                        #QURT_HMX_NON_SHARED_LOCK -- non-shared lock
+  @param[in] hmx_unit   HMX hardware context number.
+                        #QURT_HMX_UNIT_0
+                        #QURT_HMX_UNIT_1
+
+  @return
+  #QURT_EOK -- The HMX is assigned successfully. This includes the case in which \n
+               the target thread already has HMX assigned. \n
+  #QURT_EFAILED -- Failure due to wrong assigning conditions. \n
+  #QURT_EINVALID -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_assign ( unsigned int thread_id, unsigned int type, unsigned int hmx_unit );
+
+
+/**@ingroup func_qurt_hmx_release
+  Releases an HMX unit from a target thread specified by its thread identifier.
+  The HMX unit (HMX hardware context) is specified by hmx_unit.
+  The caller of this function is limited to the SRM process.
+
+  QuRT detaches the specified HMX unit from the target thread and returns
+  #QURT_EOK. If the HMX unit is already released from the target thread, it returns #QURT_EOK.
+
+  @param[in] thread_id  Thread identifier.
+  @param[in] hmx_unit   HMX hardware context number.
+                        #QURT_HMX_UNIT_0
+                        #QURT_HMX_UNIT_1
+
+  @return
+  #QURT_EOK -- The HMX is released successfully. This includes the case in which \n
+               the target thread already has the HMX released. \n
+  #QURT_EFAILED -- Failure due to wrong assigning condition. \n
+  #QURT_EINVALID -- Failure because no HMX hardware is supported.
+
+  @dependencies
+  None.
+ */
+int qurt_hmx_release ( unsigned int thread_id, unsigned int hmx_unit );
+
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_HMX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_hvx.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_hvx.h
new file mode 100755
index 0000000000000..13c213d49ac84
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_hvx.h
@@ -0,0 +1,421 @@
+#ifndef QURT_HVX_H
+#define QURT_HVX_H
+/**
+  @file qurt_hvx.h
+  @brief Prototypes of QuRT HVX API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2021-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ TYPEDEFS
+=============================================================================*/
+/** @cond */
+
+typedef enum {
+    QURT_HVX_MODE_64B  = 0,  /**< HVX mode of 64 bytes */
+    QURT_HVX_MODE_128B = 1   /**< HVX mode of 128 bytes */
+} qurt_hvx_mode_t;
+/** @endcond */
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+/** @cond internal_only*/
+/** @addtogroup hvx_macros
+@{ */
+#define QURT_HVX_HW_UNITS_2X128B_4X64B  0x00000204  /**< Bits 15 through 8 are for the number of 128B units. */
+                                                    /**< Bits 7 through 0 are for the number of 64B units. */
+#define QURT_HVX_HW_UNITS_4X128B_0X64B  0x00000400
+#define QURT_HVX_HW_UNITS_6X128B_0X64B  0x00000600
+
+/* HVX locking status */
+
+#define QURT_HVX_UNLOCKED   (0)   /* Has not locked an HVX unit */
+#define QURT_HVX_LOCKED     (1)   /* Has locked an HVX unit */
+#define QURT_HVX_ERROR      (-1)  /* Error, no HVX support */
+
+/* Input value for HVX reservation */
+
+#define QURT_HVX_RESERVE_ALL            (4)    /* All the HVX units in terms of 64B_MODE are requested to be reserved */
+#define QURT_HVX_RESERVE_ALL_AVAILABLE  (0xff) /* All remaining unlocked HVX units in terms of 64B_MODE are requested to be reserved */
+
+/* Return values for HVX reservation */
+
+#define QURT_HVX_RESERVE_NOT_SUPPORTED   (-1)  /* There is no HVX hardware, or the hardware has fewer units than requested */
+#define QURT_HVX_RESERVE_NOT_SUCCESSFUL  (-2)  /* Some HVX units are already locked/reserved by another PD, so not enough units are left for the reservation. */
+#define QURT_HVX_RESERVE_ALREADY_MADE    (-3)  /* An HVX reservation has already been made. */
+#define QURT_HVX_RESERVE_CANCEL_ERR      (-4)  /* Canceling the reservation fails because this protection domain has made no prior reservation. */
+
+// HVX set requests
+
+#define QURT_HVX_64B              0  /**< */
+#define QURT_HVX_128B             1  /**< */
+#define QURT_HVX_NO_USE           2  /**< */
+#define QURT_HVX_RELEASE_CONTEXT  3  /**< */
+#define QURT_HVX_IMMEDIATE_USE    4  /**< */
+
+// HVX set masks
+
+#define QURT_HVX_64B_PREFERRED    (1<<(QURT_HVX_64B + 8))   /**< */
+#define QURT_HVX_128B_PREFERRED   (1<<(QURT_HVX_128B + 8))  /**< */
+#define QURT_HVX_64B_ACCEPTABLE   (1<<(QURT_HVX_64B + 12))  /**< */
+#define QURT_HVX_128B_ACCEPTABLE  (1<<(QURT_HVX_128B + 12)) /**< */
+
+// HVX set return "result"
+
+#define QURT_EOK            0     /**< */
+#define QURT_HVX_SET_ERROR  0xFF  /**< */
+
+// hvx_mode_assigned for QURT_HVX_IMMEDIATE_USE
+#define QURT_HVX_64B_ASSIGNED   (1<<(QURT_HVX_64B + 8))   /**< */
+#define QURT_HVX_128B_ASSIGNED  (1<<(QURT_HVX_128B + 8))  /**< */
+
+// Sizes of HVX dump buffer
+
+#define QURT_HVX_V65_64B_VSIZE   2084U  /**< 64 x 32 + 8 x 4 + 4 (version). */
+#define QURT_HVX_V65_128B_VSIZE  4164U  /**< 128 x 32 + 16 x 4 + 4 (version). */
+#define QURT_HVX_V66_128B_VSIZE  4420U  /**< 128 x (32 +2) + 16 x 4 + 4 (version). */
+#define QURT_HVX_V68_128B_VSIZE  4164U  /**< 128 x 32 + 16 x 4 + 4 (version). */
+#define QURT_HVX_V79_128B_VSIZE  4740U  /**< 128 x (32+4+1) + 4 (version).
*/
+#define QURT_HVX_VREG_BUF_SIZE   QURT_HVX_V79_128B_VSIZE  /**< */
+
+// HVX dump versions
+
+#define QURT_HVX_DUMP_V65_64B   1U  /**< */
+#define QURT_HVX_DUMP_V65_128B  2U  /**< */
+#define QURT_HVX_DUMP_V66_128B  3U  /**< */
+#define QURT_HVX_DUMP_V68_128B  4U  /**< */
+#define QURT_HVX_DUMP_V79_128B  5U  /**< */
+/** @} */ /* end_addtogroup hvx_macros */
+/** @endcond */
+/** @cond */
+// QuRT data struct for hvx_set input
+typedef struct qurt_hvx_set_struct_ {
+    unsigned char set_req;   // LSB
+    struct {
+        unsigned char preferred_mask:4;
+        unsigned char acceptable_mask:4;
+    };
+    unsigned short resvd;    // MSB
+} qurt_hvx_set_struct_t;     // 4 bytes
+
+
+// QuRT data struct for hvx_set return
+typedef struct qurt_hvx_set_return_str_ {
+    unsigned char result;    // LSB
+    unsigned char hvx_mode_assigned;
+    unsigned short resvd;    // MSB
+} qurt_hvx_set_return_struct_t;  // 4 bytes
+/** @endcond */
+
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_hvx_lock
+  Locks one HVX unit specified by the HVX mode.
+
+  @note1hang The input variable can be 128B_MODE or 64B_MODE. If an HVX unit in this mode
+             is available, this function locks the unit and returns right away.
+             If the current HVX mode is different from the requested mode, the current
+             thread is blocked. When all HVX units become idle, QuRT changes
+             the mode, locks the HVX unit, and returns.
+
+             Starting from Q6v65 with HVX context switch support, qurt_hvx_lock() is
+             mapped as qurt_hvx_set(64_BYTE or 128_BYTE).
+
+  @datatypes
+  #qurt_hvx_mode_t
+
+  @param[in] lock_mode #QURT_HVX_MODE_64B or #QURT_HVX_MODE_128B.
+
+  @return
+  #QURT_EOK -- Success \n
+  Other value -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_lock(qurt_hvx_mode_t lock_mode);
+
+/**@ingroup func_qurt_hvx_unlock
+  Unlocks the HVX unit held by this software thread.
+
+  @note1hang Starting from Q6v65 with HVX context switch support, qurt_hvx_unlock()
+             maps as qurt_hvx_set(QURT_HVX_RELEASE_CONTEXT).
+
+  @return
+  #QURT_EOK -- Successful return \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_unlock(void);
+
+/**@ingroup func_qurt_hvx_try_lock
+  Tries to lock one HVX unit specified by the HVX mode.
+
+  @note1hang The input variable can be 128B_MODE or 64B_MODE. If an HVX unit in this mode
+             is available, this function locks the unit and returns #QURT_EOK; otherwise,
+             the function returns a failure, but does not block the current software
+             thread to wait for the HVX unit.
+             Starting from Q6v65 with HVX context switch support, qurt_hvx_try_lock()
+             maps to qurt_hvx_set(FOR_IMMEDIATE_USE | preferred_mask | acceptable_mask);
+
+  @datatypes
+  #qurt_hvx_mode_t
+
+  @param[in] lock_mode #QURT_HVX_MODE_64B or #QURT_HVX_MODE_128B.
+
+  @return
+  #QURT_EOK -- Successful return \n
+  Other values -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_try_lock(qurt_hvx_mode_t lock_mode);
+
+/**@ingroup func_qurt_hvx_get_mode
+  Gets the current HVX mode configured by QuRT.
+
+  @note1hang Returns #QURT_HVX_MODE_128B or #QURT_HVX_MODE_64B, based on
+             the current HVX configuration.
+
+  @return
+  #QURT_HVX_MODE_128B \n
+  #QURT_HVX_MODE_64B \n
+  -1 -- Not available.
+
+  @dependencies
+  None.
+ */
+int qurt_hvx_get_mode(void);
+
+
+/**@ingroup func_qurt_hvx_get_units
+  Gets the HVX hardware configuration that the chipset supports.
+
+  @note1hang The function returns the HVX hardware configuration supported by the chipset.
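+
+  An illustrative decoding of the return value, assuming the bit layout
+  described for #QURT_HVX_HW_UNITS_2X128B_4X64B above:
+  @code
+  int units = qurt_hvx_get_units();
+  if (units > 0) {
+      unsigned num_128b = ((unsigned)units >> 8) & 0xFFU;  // 128-byte contexts
+      unsigned num_64b  = (unsigned)units & 0xFFU;         // 64-byte contexts
+  }
+  @endcode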
+
+  @return
+  Bitmask of the units: 1X64, 2X64, 4X64, 1X128, 2X128, and so on.\n
+  - QURT_HVX_HW_UNITS_2X128B_4X64B -- V60, V62, or V65 HVX \n
+  - QURT_HVX_HW_UNITS_4X128B_0X64B -- V66 CDSP or newer \n
+  - 0 -- Not available
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_get_units(void);
+
+
+/**@ingroup func_qurt_hvx_reserve
+  Reserves HVX units in terms of 64-byte mode for the protection domain (PD) of the caller.
+
+  @note1hang Only one HVX reservation in the system is supported.
+             If one HVX unit is already locked by the application in the same PD, the unit is
+             added to the returned count as one reserved unit for the PD.
+             Starting from Q6v65 with HVX context switch support, qurt_hvx_reserve()
+             only does basic sanity checks on HVX units.
+
+  @datatypes
+  None.
+
+  @param[in] num_units  Number of HVX units in terms of 64B_MODE to reserve for the PD.
+                        QURT_HVX_RESERVE_ALL to reserve all the HVX units.
+                        QURT_HVX_RESERVE_ALL_AVAILABLE to reserve the remaining unlocked units.
+
+  @return
+  Number of units successfully reserved, including the units already locked in the same PD. \n
+  #QURT_HVX_RESERVE_NOT_SUPPORTED \n
+  #QURT_HVX_RESERVE_NOT_SUCCESSFUL \n
+  #QURT_HVX_RESERVE_ALREADY_MADE
+
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_reserve(int num_units);
+
+
+/**@ingroup func_qurt_hvx_cancel_reserve
+  Cancels the HVX reservation in the protection domain (PD) of the caller.
+
+  @note1hang Only one HVX reservation in the system is supported.
+
+  @return
+  0 -- Success \n
+  #QURT_HVX_RESERVE_CANCEL_ERR -- Failure
+
+  @dependencies
+  None.
+
+ */
+int qurt_hvx_cancel_reserve(void);
+
+
+/**@ingroup func_qurt_hvx_get_lock_val
+  Gets the HVX locking status value of the caller thread.
+
+  @note1hang Returns the status of whether the caller thread has already locked an HVX unit.
+
+  @datatypes
+  None.
+
+  @return
+  #QURT_HVX_UNLOCKED \n
+  #QURT_HVX_LOCKED \n
+  #QURT_HVX_ERROR
+
+  @dependencies
+  None.
+ */
+int qurt_hvx_get_lock_val(void);
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_hvx_set
+  Sets the HVX configuration for the software thread of the caller.
+
+  @datatypes
+  None.
+
+  @param[in] input_arg  Composed of set_request | hvx_preferred_mode_mask
+                        | hvx_acceptable_mode_mask where set_request can be set to: \n
+                        - #QURT_HVX_64B \n
+                        - #QURT_HVX_128B \n
+                        - #QURT_HVX_NO_USE \n
+                        - #QURT_HVX_RELEASE_CONTEXT \n
+                        - #QURT_HVX_IMMEDIATE_USE \n
+                        When set_request is QURT_HVX_IMMEDIATE_USE,
+                        hvx_preferred_mode_mask can be set to: \n
+                        - #QURT_HVX_64B_PREFERRED \n
+                        - #QURT_HVX_128B_PREFERRED
+                        When set_request is QURT_HVX_IMMEDIATE_USE,
+                        hvx_acceptable_mode_mask can be set to: \n
+                        - #QURT_HVX_64B_ACCEPTABLE \n
+                        - #QURT_HVX_128B_ACCEPTABLE @tablebulletend
+
+  @return
+  Result of the HVX setting in the least significant 8 bits of the returned data. \n
+  #QURT_EOK -- 0 \n
+  #QURT_HVX_SET_ERROR -- 0xFF \n
+  When #QURT_HVX_IMMEDIATE_USE has a result of #QURT_EOK,
+  bit 8 to bit 15 of the returned data contain hvx_mode_assigned:\n
+  - #QURT_HVX_64B_ASSIGNED \n
+  - #QURT_HVX_128B_ASSIGNED
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_hvx_set(unsigned int input_arg);
+
+
+/**@ingroup func_qurt_system_hvx_regs_get_maxsize
+  Returns the maximum buffer size for saving HVX registers.
+
+  @datatypes
+  None.
+
+  @return
+  0 -- No HVX supported in the target. \n
+  #QURT_HVX_VREG_BUF_SIZE -- Maximum buffer size for saving HVX registers.
+
+  @dependencies
+  None.
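+
+  A hedged sizing sketch (static buffer; the 256-byte slack and the alignment
+  handling follow the qurt_system_hvx_regs_get() description below):
+  @code
+  static unsigned char vbuf[QURT_HVX_VREG_BUF_SIZE + 256];
+
+  void size_check_example(void)
+  {
+      unsigned int max = qurt_system_hvx_regs_get_maxsize();
+      if ((max != 0U) && (max <= sizeof(vbuf))) {
+          // vbuf can hold any HVX register dump on this target.
+      }
+  }
+  @endcode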
+ */
+unsigned int qurt_system_hvx_regs_get_maxsize(void);
+
+
+/**@ingroup func_qurt_system_hvx_regs_get_size
+  Returns the buffer size for saving HVX registers for a specified thread.
+
+  @param[in] thread_id  Thread ID of the target thread.
+
+  @return
+  0 -- No HVX assigned to the thread. \n
+  size -- Size of the buffer in bytes for saving HVX registers for the specified thread: \n
+          - #QURT_HVX_V65_64B_VSIZE -- 64 x 32 + 8 x 4 + 4 (version) \n
+          - #QURT_HVX_V65_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+          - #QURT_HVX_V66_128B_VSIZE -- 128 x (32 +2) + 16 x 4 + 4 (version) \n
+          - #QURT_HVX_V68_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+          - #QURT_HVX_V79_128B_VSIZE -- 128 x (32+4+1) + 4 (version)
+
+
+  @dependencies
+  None.
+
+ */
+unsigned int qurt_system_hvx_regs_get_size(unsigned int thread_id);
+
+
+
+/**@ingroup func_qurt_system_hvx_regs_get
+  Saves the HVX registers into the specified buffer.
+  Returns the size of the data saved into the buffer.
+  After calling this function for the first time on a specified thread_id, the QuRT kernel removes the internal HVX saving buffer
+  from the specified thread. When calling the function on the same thread_id for the second time, this function returns 0.
+
+  @param[in] thread_id  Thread ID of the target thread.
+  @param[in] pBuf  Pointer to the buffer for HVX register saving.
+                   The first four bytes of the buffer are for saving the HVX version. HVX registers are saved from
+                   the fifth byte of the buffer. The address of the fifth byte should be 256-byte aligned.
+                   For example, a buffer can be declared at first as: \n
+                   unsigned char vbuf[QURT_HVX_VREG_BUF_SIZE+256];\n
+                   unsigned char *pBuf; \n
+                   then align the buffer pointer to: \n
+                   pBuf = vbuf; \n
+                   pBuf += (256 - 4 - (unsigned)pBuf%256);
+  @param[in] size  Size of the buffer provided, which is pointed to by *pBuf. The buffer size must not be smaller than that
+                   returned from qurt_system_hvx_regs_get_size(), and pBuf must be aligned as described above.
+  @param[out] pBuf Buffer returned with the saved HVX registers (unsigned char hvx_regs[];), which are saved from the fifth
+                   byte of the buffer, and the HVX version (unsigned int hvx_version;) in the first four bytes,
+                   containing one of the HVX dump versions:\n
+                   - #QURT_HVX_DUMP_V65_64B \n
+                   - #QURT_HVX_DUMP_V65_128B \n
+                   - #QURT_HVX_DUMP_V66_128B \n
+                   - #QURT_HVX_DUMP_V68_128B \n
+                   - #QURT_HVX_DUMP_V79_128B \n
+                   @tablebulletend
+
+  @return
+  Total bytes of the data saved in the provided buffer. \n
+  0 -- No HVX assigned to the thread \n
+  #QURT_HVX_V65_64B_VSIZE -- 64 x 32 + 8 x 4 + 4 (version) \n
+  #QURT_HVX_V65_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V66_128B_VSIZE -- 128 x (32 +2) + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V68_128B_VSIZE -- 128 x 32 + 16 x 4 + 4 (version) \n
+  #QURT_HVX_V79_128B_VSIZE -- 128 x (32+4+1) + 4 (version)
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_system_hvx_regs_get(unsigned int thread_id, void *pBuf, size_t size);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_HVX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_int.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_int.h
new file mode 100755
index 0000000000000..386aeda1051eb
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_int.h
@@ -0,0 +1,509 @@
+#ifndef QURT_INT_H
+#define QURT_INT_H
+/**
+  @file qurt_int.h
+  @brief QuRT interrupt functions.
+ + + + Copyright (c) 2013-2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ + + +/** @cond rest_reg_dist */ +/** @addtogroup interrupts_constants +@{ */ +#define SIG_INT_ABORT 0x80000000 /**< */ +#define QURT_INT_NON_DELAYED_ACK 0 +#define QURT_INT_DELAYED_ACK 1 +#define QURT_INT_ACK_DEFAULT QURT_INT_NON_DELAYED_ACK +#define QURT_INT_DRV_DEFAULT 0 +#define QURT_INT_PRIORITY_DEFAULT 0xFF + +/** QuRT interrupt property. */ +#define QURT_INT_CONFIGID_POLARITY 0x1U /**< */ +#define QURT_INT_CONFIGID_LOCK 0x2U /**< */ + +/** QuRT interrupt lock.*/ +#define QURT_INT_LOCK_DEFAULT 0x0 /**< Default. */ +#define QURT_INT_LOCK_DISABLE 0x0 /**< Interrupt can be enabled or disabled or deregistered. */ +#define QURT_INT_LOCK_ENABLE 0x1 /**< Interrupt is locked and cannot be enabled, disabled, or deregistered.*/ +/** @} */ /* end_addtogroup interrupts_constants */ + +/** @addtogroup Qurt_interrupt_type +@{ */ +/** Trigger type bit fields for a PDC interrupt:\n + @verbatim + Polarity Edge Output\n + 0 00 Level sensitive active low + 0 01 Rising edge sensitive + 0 10 Falling edge sensitive + 0 11 Dual edge sensitive + 1 00 Level sensitive active high + 1 01 Falling edge sensitive + 1 10 Rising edge sensitive + 1 11 Dual edge sensitive + @endverbatim +*/ +#define QURT_INT_TRIGGER_TYPE_SET(pol, edge) ((((pol) & 0x01U) << 2) | ((edge) & 0x03U)) /**< */ + +#define QURT_INT_TRIGGER_LEVEL_LOW QURT_INT_TRIGGER_TYPE_SET(0U, 0x00U) /**< */ +#define QURT_INT_TRIGGER_LEVEL_HIGH QURT_INT_TRIGGER_TYPE_SET(1U, 0x00U) /**< */ +#define QURT_INT_TRIGGER_RISING_EDGE QURT_INT_TRIGGER_TYPE_SET(1U, 0x02U) /**< */ +#define QURT_INT_TRIGGER_FALLING_EDGE QURT_INT_TRIGGER_TYPE_SET(0U, 0x02U) /**< */ +#define QURT_INT_TRIGGER_DUAL_EDGE QURT_INT_TRIGGER_TYPE_SET(0U, 0x03U) /**< */ +#define QURT_INT_TRIGGER_USE_DEFAULT 0xffU /**< */ +/** @} */ /* end_addtogroup Qurt_interrupt_type */ + +/*===================================================================== + Functions +======================================================================*/ + +/**@ingroup func_qurt_interrupt_register + @xreflabel{sec:interrupt_register} + Registers the interrupt.\n + Enables the specified interrupt and associates it with the specified QuRT signal object and + signal mask. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be waited on, and 0 indicates not to wait. + + When the interrupt occurs, the signal specified in the signal mask is set in the signal + object. An IST conventionally waits on that signal to + handle the interrupt. The thread that registers the interrupt is set as the IST. + + Up to 31 separate interrupts can be registered to a single signal object, as determined by + the number of individual signals the object can store. QuRT reserves signal 31. Thus a + single IST can handle several different interrupts. + + QuRT reserves some interrupts for internal use -- the remainder are available for use by + applications, and thus are valid interrupt numbers. If the specified interrupt number is + outside the valid range, the register operation returns the status value QURT_EINT. 
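+
+  An illustrative IST sketch under these rules (a hedged example: interrupt
+  number 45, the signal bit, and the processing step are hypothetical, and
+  qurt_anysignal_init(), qurt_anysignal_wait(), and qurt_anysignal_clear()
+  are assumed from the QuRT any-signal API):
+  @code
+  static qurt_anysignal_t isr_sig;       // signal object owned by the IST
+  #define MY_INT_SIGMASK 0x1U            // one signal bit for this interrupt
+
+  void ist_loop(void)
+  {
+      qurt_anysignal_init(&isr_sig);
+      if (qurt_interrupt_register(45, &isr_sig, MY_INT_SIGMASK) != QURT_EOK) {
+          return;
+      }
+      for (;;) {
+          unsigned int sigs = qurt_anysignal_wait(&isr_sig,
+                                                  MY_INT_SIGMASK | SIG_INT_ABORT);
+          if (sigs & SIG_INT_ABORT) {
+              break;                     // no interrupts registered anymore
+          }
+          qurt_anysignal_clear(&isr_sig, MY_INT_SIGMASK);
+          // ... process the interrupt ...
+          (void)qurt_interrupt_acknowledge(45);
+      }
+  }
+  @endcode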
+
+  Only one thread can be registered at a time to a specific interrupt. Attempting to register
+  an already-registered interrupt returns the status value QURT_EVAL.
+
+  Only one signal bit in a signal object can be registered at a time to a specific interrupt.
+  Attempting to register multiple signal bits to an interrupt returns the status value
+  QURT_ESIG.
+
+  When a signal object is registered for an interrupt, only QuRT can set its signal bits,
+  which it does when receiving the interrupt. The QuRT signal API called from another
+  software thread cannot set the signal, even for unused signal bits.
+
+  @note1hang The valid range for an interrupt number can differ on target execution
+             environments other than the simulator. For more information, see the
+             appropriate hardware document.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] int_num      L2VIC interrupt to register; valid range is 0 to 1023.
+  @param[in] int_signal   Any-signal object to wait on (Section @xref{dox:any_signals}).
+  @param[in] signal_mask  Signal mask value indicating the signal to receive the interrupt.
+
+  @return
+  #QURT_EOK -- Interrupt successfully registered.\n
+  #QURT_EINT -- Invalid interrupt number. \n
+  #QURT_ESIG -- Invalid signal bitmask (cannot set more than one
+                signal at a time). \n
+  #QURT_EVAL -- Interrupt already registered.
+
+  @dependencies
+  None.
+*/
+ unsigned int qurt_interrupt_register(int int_num, qurt_anysignal_t *int_signal, int signal_mask);
+
+/**@ingroup func_qurt_interrupt_register2
+  @xreflabel{sec:interrupt_register2}
+  Registers the interrupt.\n
+  Enables the specified interrupt, associates it with the specified QuRT signal object and
+  signal mask, and sets the interrupt flags.
+
+  Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1
+  indicates that a signal must be waited on, and 0 indicates not to wait.
+
+  When the interrupt occurs, the signal specified in the signal mask is set in the signal
+  object. An IST conventionally waits on that signal to
+  handle the interrupt. The thread that registers the interrupt is set as the IST.
+
+  Up to 31 separate interrupts can be registered to a single signal object, as determined by
+  the number of individual signals that the object can store. QuRT reserves signal 31. Thus a
+  single IST can handle several different interrupts.
+
+  QuRT reserves some interrupts for internal use -- the remainder are available for use by
+  applications, and thus are valid interrupt numbers. If the specified interrupt number is
+  outside the valid range, the register operation returns the status value #QURT_EINT.
+
+  Only one thread can be registered at a time to a specific interrupt. Attempting to register
+  an already-registered interrupt returns the status value #QURT_EVAL.
+
+  Only one signal bit in a signal object can be registered at a time to a specific interrupt.
+  Attempting to register multiple signal bits to an interrupt returns the status value
+  #QURT_ESIG.
+
+  When a signal object is registered for an interrupt, only QuRT can set its signal bits,
+  which it does when receiving the interrupt. The QuRT signal API called from another
+  software thread cannot set the signal, even for unused signal bits.
+
+  @note1hang The valid range for an interrupt number can differ on target execution
+             environments other than the simulator. For more information, see the
+             appropriate hardware document.
+
+  @datatypes
+  #qurt_anysignal_t
+
+  @param[in] int_num      L2VIC interrupt to register; valid range is 0 to 1023.
+  @param[in] int_signal   Any-signal object to wait on (Section @xref{dox:any_signals}).
+  @param[in] signal_mask  Signal mask value indicating the signal to receive the interrupt.
+  @param[in] flags        Defines an interrupt property; the supported property is interrupt lock enable/disable.
+                          Possible values for flags: \n
+                          - #QURT_INT_LOCK_ENABLE
+                          - #QURT_INT_LOCK_DISABLE @tablebulletend
+
+  @return
+  #QURT_EOK -- Interrupt successfully registered.\n
+  #QURT_EINT -- Invalid interrupt number. \n
+  #QURT_ESIG -- Invalid signal bitmask (cannot set more than one
+                signal at a time). \n
+  #QURT_EVAL -- Interrupt already registered.
+
+  @dependencies
+  None.
+*/
+ unsigned int qurt_interrupt_register2(int int_num, qurt_anysignal_t *int_signal, int signal_mask, unsigned int flags);
+/*
+ * Waits for a registered interrupt signal
+
+ * Suspends the current thread until one of its registered interrupts occurs. The second input, mask,
+ * contains the interrupt signals that the IST expects to receive. The interrupt signals are registered
+ * with interrupts via the qurt_interrupt_register API.
+ *
+ * The signals returned in the signal variable indicate which interrupts occurred. Use the function
+ * qurt_anysignal_get to read the signals. The IST must locally maintain a table that maps a signal to
+ * a specific interrupt. The IST also checks whether signal #SIG_INT_ABORT is received. If so, the IST
+ * must quit the interrupt-receiving loop.
+ *
+ * For detailed information on this API, see QuRT User Manual Section 4.2.5
+ *
+ * Prototype
+ *
+ * unsigned int qurt_anysignal_wait(qurt_anysignal_t *int_signal, unsigned int mask)
+ */
+
+/**@ingroup func_qurt_interrupt_acknowledge
+  Acknowledges an interrupt after it has been processed.\n
+  Re-enables an interrupt and clears its pending status. This is done after an interrupt is
+  processed by an IST.
+
+  Interrupts are automatically disabled after they occur. To re-enable an interrupt, an IST
+  performs the acknowledge operation after it has finished processing the interrupt and
+  just before suspending itself (such as by waiting on the interrupt signal).
+
+  @note1hang To prevent losing or reprocessing subsequent occurrences of the interrupt,
+             an IST must clear the interrupt signal (Section @xref{sec:anysignal_clear}) before
+             acknowledging the interrupt.
+
+  @param[in] int_num  Interrupt that is being re-enabled.
+
+  @return
+  #QURT_EOK -- Interrupt acknowledge was successful. \n
+  #QURT_EDEREGISTERED -- Interrupt is already de-registered.
+
+  @dependencies
+  None.
+*/
+int qurt_interrupt_acknowledge(int int_num);
+
+/**@ingroup func_qurt_interrupt_deregister
+  Disables the specified interrupt and disassociates it from a QuRT signal object.
+  If the specified interrupt was never registered (Section @xref{sec:interrupt_register}), the deregister operation
+  returns the status value #QURT_EINT.
+
+  @note1hang If an interrupt is deregistered while an IST waits
+             to receive it, the IST might wait indefinitely for the interrupt to occur. To avoid
+             this problem, the QuRT kernel sends the signal #SIG_INT_ABORT to awaken an
+             IST after determining that it has no interrupts registered.
+
+  @param[in] int_num  L2VIC interrupt to deregister; valid range is 0 to 1023.
+
+  @return
+  #QURT_EOK -- Success.\n
+  #QURT_EINT -- Invalid interrupt number (not registered).
+
+  @dependencies
+  None.
+
+*/
+unsigned int qurt_interrupt_deregister(int int_num);
+/** @endcond */
+
+/**@ingroup func_qurt_interrupt_disable
+  Disables an interrupt with its interrupt number.\n
+  The interrupt must be registered prior to calling this function.
+  After qurt_interrupt_disable() returns, the Hexagon subsystem
+  can no longer send the corresponding interrupt to the Hexagon
+  core, until qurt_interrupt_enable() is called
+  for the same interrupt.
+
+  Avoid calling qurt_interrupt_disable() and qurt_interrupt_enable() frequently within
+  a short period of time.\n
+  - A pending interrupt can already be in the Hexagon core when qurt_interrupt_disable()
+    is called. Therefore, some time later, the pending interrupt is received on a Hexagon
+    hardware thread.\n
+  - After the Hexagon subsystem sends an interrupt to the Hexagon core, the Hexagon
+    hardware automatically disables the interrupt until kernel software re-enables the interrupt
+    at the interrupt acknowledgement stage. If qurt_interrupt_enable() is called from a certain
+    thread at an earlier time, the interrupt is re-enabled earlier and can trigger
+    sending a new interrupt to the Hexagon core while kernel software is still processing
+    the previous interrupt.
+
+  @param[in] int_num  Interrupt number.
+
+  @return
+  #QURT_EOK -- Interrupt successfully disabled.\n
+  #QURT_EINT -- Invalid interrupt number.\n
+  #QURT_ENOTALLOWED -- Interrupt is locked. \n
+  #QURT_EVAL -- Interrupt is not registered.
+
+  @dependencies
+  None.
+*/
+ unsigned int qurt_interrupt_disable(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_enable
+  Enables an interrupt with its interrupt number.\n
+  The interrupt must be registered prior to calling this function.
+
+  @param[in] int_num  Interrupt number.
+
+  @return
+  #QURT_EOK -- Interrupt successfully enabled.\n
+  #QURT_EINT -- Invalid interrupt number.\n
+  #QURT_ENOTALLOWED -- Interrupt is locked. \n
+  #QURT_EVAL -- Interrupt is not registered.
+
+  @dependencies
+  None.
+
+*/
+ unsigned int qurt_interrupt_enable(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_status
+  Returns a value that indicates the pending status of the specified interrupt.
+
+  @param[in] int_num  Interrupt number that is being checked.
+  @param[out] status  Interrupt status; 1 indicates that an interrupt is
+                      pending, 0 indicates that an interrupt is not pending.
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINT -- Failure; invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_status(int int_num, int *status);
+
+
+/**@ingroup func_qurt_interrupt_get_status
+  Gets the status of the specified interrupt in L2VIC.
+
+  @param[in] int_num      Interrupt number that is being checked.
+  @param[in] status_type  0 -- interrupt pending status \n
+                          1 -- interrupt enable status
+  @param[out] status      0 -- OFF \n
+                          1 -- ON
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_EINT -- Failure; invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_get_status(int int_num, int status_type, int *status);
+
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_interrupt_clear
+  Clears the pending status of the specified interrupt.
+
+  @note1hang This operation is intended for system-level use, and must be used with care.
+
+  @param[in] int_num  Interrupt that is being cleared.
+
+  @return
+  #QURT_EOK -- Success.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_clear(int int_num);
+
+
+/**@ingroup func_qurt_interrupt_get_config
+  Gets the L2VIC interrupt configuration. \n
+  This function returns the type and polarity of the specified L2VIC interrupt.
+
+  @param[in] int_num        L2VIC interrupt that is being queried.
+  @param[out] int_type      Pointer to the interrupt type. \n
+                            0 -- Level-triggered interrupt \n
+                            1 -- Edge-triggered interrupt
+  @param[out] int_polarity  Pointer to the interrupt polarity.\n
+                            0 -- Active-high interrupt \n
+                            1 -- Active-low interrupt
+
+  @return
+  #QURT_EOK -- Configuration successfully returned.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_get_config(unsigned int int_num, unsigned int *int_type, unsigned int *int_polarity);
+
+/**@ingroup func_qurt_interrupt_set_config
+  Sets the type and polarity of the specified L2VIC interrupt.
+
+  @note1hang Deregister L2VIC interrupts before reconfiguring them.
+
+  @param[in] int_num       L2VIC interrupt that is being configured.
+  @param[in] int_type      Interrupt type. \n
+                           0 -- Level-triggered interrupt\n
+                           1 -- Edge-triggered interrupt
+  @param[in] int_polarity  Interrupt polarity. \n
+                           0 -- Active-high interrupt \n
+                           1 -- Active-low interrupt
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Not allowed; the interrupt is still registered.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_set_config(unsigned int int_num, unsigned int int_type, unsigned int int_polarity);
+
+/**@ingroup func_qurt_interrupt_set_config2
+  Sets the type and polarity of the specified L2VIC interrupt.
+
+  @note1hang L2VIC interrupts must be deregistered before they can be reconfigured.
+
+  @param[in] int_num   L2VIC interrupt that is being configured.
+  @param[in] int_type  Notified to the hardware configuration callback function and used to
+                       modify the L2VIC type. Possible values: \n
+                       - #QURT_INT_TRIGGER_USE_DEFAULT \n
+                       - #QURT_INT_TRIGGER_LEVEL_HIGH \n
+                       - #QURT_INT_TRIGGER_LEVEL_LOW \n
+                       - #QURT_INT_TRIGGER_RISING_EDGE \n
+                       - #QURT_INT_TRIGGER_FALLING_EDGE \n
+                       - #QURT_INT_TRIGGER_DUAL_EDGE @tablebulletend
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Not allowed; the interrupt is still registered.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_interrupt_set_config2(unsigned int int_num, unsigned int int_type);
+
+/**@ingroup func_qurt_interrupt_set_config3
+  Sets the specified configuration value for the specified property of the specified L2VIC interrupt.
+
+  @note1hang L2VIC interrupts must be deregistered before they can be reconfigured for polarity.
+
+  @param[in] int_num     L2VIC interrupt that is being configured.
+  @param[in] config_id   Property to configure: \n
+                         - #QURT_INT_CONFIGID_POLARITY \n
+                         - #QURT_INT_CONFIGID_LOCK @tablebulletend
+  @param[in] config_val  Value to set; depends on the second argument, config_id. \n
+                         Values for #QURT_INT_CONFIGID_POLARITY: \n
+                         - #QURT_INT_TRIGGER_USE_DEFAULT \n
+                         - #QURT_INT_TRIGGER_LEVEL_HIGH \n
+                         - #QURT_INT_TRIGGER_LEVEL_LOW \n
+                         - #QURT_INT_TRIGGER_RISING_EDGE \n
+                         - #QURT_INT_TRIGGER_FALLING_EDGE \n
+                         - #QURT_INT_TRIGGER_DUAL_EDGE \n
+
+                         Values for #QURT_INT_CONFIGID_LOCK: \n
+                         - #QURT_INT_LOCK_ENABLE\n
+                         - #QURT_INT_LOCK_DISABLE @tablebulletend
+
+  @return
+  #QURT_EOK -- Success. \n
+  #QURT_ENOTALLOWED -- Not allowed; the interrupt is still registered or is locked for enable/disable.\n
+  #QURT_EINT -- Invalid interrupt number.
+
+  @dependencies
+  None.
+*/
+unsigned int qurt_interrupt_set_config3(unsigned int int_num, unsigned int config_id, unsigned int config_val);
+
+
+/**@ingroup func_qurt_interrupt_raise
+  Raises the interrupt. \n
+  This function triggers a level-triggered L2VIC
+  interrupt, and accepts interrupt numbers in the range of 0 to 1023.
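+
+  An illustrative sketch (interrupt number 100 is hypothetical and must be
+  a level-triggered L2VIC interrupt on the target):
+  @code
+  if (qurt_interrupt_raise(100U) == QURT_EOK) {
+      // The IST registered for interrupt 100 will see its signal set.
+  }
+  @endcode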
+
+
+/**@ingroup func_qurt_interrupt_raise
+ Raises the interrupt. \n
+ This function triggers a level-triggered L2VIC
+ interrupt, and accepts interrupt numbers in the range of 0 to 1023.
+
+ @param[in] interrupt_num Interrupt number.
+
+ @return
+ #QURT_EOK -- Success. \n
+ -1 -- Failure; the interrupt is not supported.
+
+ @dependencies
+ None.
+ */
+int qurt_interrupt_raise(unsigned int interrupt_num);
+
+/**@ingroup func_qurt_interrupt_raise2
+ Raises the interrupt and returns the current pcycle value.
+
+ @param[in] interrupt_num Interrupt number.
+
+ @return
+ 0xFFFFFFFFFFFFFFFF -- Failure; the interrupt is not supported.\n
+ Other value -- pcycle count at the time the interrupt is raised.
+
+ @dependencies
+ None.
+ */
+unsigned long long qurt_interrupt_raise2(unsigned int interrupt_num);
+/** @endcond */
+
+/** @cond internal_only */
+/**@ingroup func_qurt_isr_subcall
+ Indicates whether the current function is called from a callback procedure (either short or long).
+
+ @return
+ #QURT_EOK -- TRUE \n
+ #QURT_EVAL -- FALSE.
+
+ @dependencies
+ None.
+ */
+int qurt_isr_subcall(void);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_INT_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_island.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_island.h
new file mode 100755
index 0000000000000..f0c8ee27cf8b0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_island.h
@@ -0,0 +1,122 @@
+#ifndef QURT_ISLAND_H
+#define QURT_ISLAND_H
+
+/**
+ @file qurt_island.h
+ @brief Prototypes of the power API.
+ The APIs allow entering and exiting island mode, where memory
+ accesses are limited to local memory.
+
+ EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2018-2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+
+=============================================================================*/
+
+#include
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_island_get_status
+ Gets the Island mode status.
+
+ Returns a value that indicates whether the QuRT system executes in Island mode.
+
+ @return
+ 0 -- Normal mode. \n
+ 1 -- Island mode.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_island_get_status (void);
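+
+/*
+ Usage sketch (illustrative only): branch on the current island state
+ before touching memory outside island-resident RAM.
+
+ if (qurt_island_get_status() == 1U) {
+ // Island mode: restrict accesses to island-resident memory.
+ } else {
+ // Normal mode: full memory access is available.
+ }
+*/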
+
+/**@ingroup func_qurt_island_get_status2
+ Gets the Island mode status, differentiating between island partial exit and complete exit.
+
+ Returns a value that indicates the current state.
+
+ @note1hang The transition from NORMAL mode to ISLAND mode happens in
+ single-threaded mode, whereas transitions from ISLAND mode to other
+ modes happen in multi-threaded mode. A thread that reads the island
+ mode status as NORMAL can therefore assume the same status for as long
+ as it continues to run. A thread that reads the status as ISLAND must
+ assume that the status can change to EXITING or NORMAL while it
+ runs. A thread that reads the status as EXITING must assume that the
+ status can change to NORMAL while it runs. If a thread goes into a
+ wait state after reading the status, it must read the island mode
+ status again and not assume the previous state.
+ @note2hang This API returns more intrinsic states than qurt_island_get_status();
+ when qurt_island_get_status() returns 0, this API can return
+ #QURT_ISLAND_MODE_EXITING or #QURT_ISLAND_MODE_ISLAND.
+
+ @param[in,out] data Reserved for future use. If a NULL pointer is passed,
+ the field is ignored. If a valid pointer is passed,
+ QuRT returns a bitmask that is interpreted as follows: \n
+ data[31] -- Valid bit. Set to 1 to indicate data[30:0] are valid;
+ otherwise set to 0. \n
+ data[30:0] -- Reserved for future definition.
+
+ @return
+ #QURT_ISLAND_MODE_NORMAL -- Normal mode \n
+ #QURT_ISLAND_MODE_ISLAND -- Island mode \n
+ #QURT_ISLAND_MODE_EXITING -- Exiting Island mode \n
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_island_get_status2 (unsigned int *data);
+
+
+
+/**@ingroup func_qurt_island_get_exit_status
+ Gets the reason for the last Island mode exit.
+
+ @param[out] cause_code Pointer that returns the cause code of the last
+ island exit reason. \n
+ - #QURT_EISLANDUSEREXIT -- Island exit due to a user call for island exit.\n
+ - #QURT_ENOISLANDENTRY -- API called before exiting island. \n
+ - #QURT_EISLANDINVALIDINT -- Island exit due to an invalid interrupt in Island mode. @tablebulletend
+
+ @param[out] int_num Pointer that holds the invalid interrupt number that caused
+ island exit when the cause code is #QURT_EISLANDINVALIDINT.
+ For other cases, it is -1.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_island_get_exit_status(unsigned int *cause_code, int *int_num);
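+
+/*
+ Usage sketch (illustrative only): after an unexpected island exit, query
+ the cause and capture the offending interrupt, if any.
+
+ unsigned int cause;
+ int bad_int;
+ qurt_island_get_exit_status(&cause, &bad_int);
+ if (cause == QURT_EISLANDINVALIDINT) {
+ // bad_int holds the invalid interrupt number; otherwise it is -1.
+ }
+*/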
+
+/**@ingroup func_qurt_island_get_enter_timestamp
+ Gets the most recent timestamp recorded when the system exited STM during island entry.
+
+ @param[out] island_enter_timestamp Returns a pointer to the most recent timestamp
+ recorded after the system exits STM during island entry. If the system has never
+ attempted to enter island mode, the island_enter_timestamp return pointer holds a value
+ of zero.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_island_get_enter_timestamp(unsigned long long *island_enter_timestamp);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ISLAND_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_isr.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_isr.h
new file mode 100755
index 0000000000000..db29ea2f265d7
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_isr.h
@@ -0,0 +1,177 @@
+#ifndef QURT_ISR_H
+#define QURT_ISR_H
+
+/*=====================================================================
+
+ @file qurt_isr.h
+
+ @brief Prototypes of the QuRT ISR API functions
+
+ EXTERNALIZED FUNCTIONS
+ none
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ none
+
+ Copyright (c) 2017, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ Functions
+=============================================================================*/
+
+
+/**@ingroup func_qurt_isr_set_hw_config_callback
+ Sets the callback function for interrupt hardware configuration.
+ In a process, the callback function can be set only once.
+
+ @param[in] cb_addr Address of the callback function.
+
+ @return
+ #QURT_EOK -- The callback function is set successfully. \n
+ #QURT_EFAILED -- Failure; the callback function was already set.
+
+ @dependencies
+ None.
+ */
+int qurt_isr_set_hw_config_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_set_hw_enable_callback
+ Sets the callback function for enabling the interrupt hardware configuration.
+ In a process, the callback function can be set only once.
+
+ @param[in] cb_addr Address of the callback function.
+
+ @return
+ #QURT_EOK -- The callback function is set successfully. \n
+ #QURT_EFAILED -- Failure; the callback function was already set.
+
+ @dependencies
+ None.
+ */
+int qurt_isr_set_hw_enable_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_set_hw_disable_callback
+ Sets the callback function for disabling the interrupt hardware configuration.
+ In a process, the callback function can be set only once.
+
+ @param[in] cb_addr Address of the callback function.
+
+ @return
+ #QURT_EOK -- The callback function is set successfully. \n
+ #QURT_EFAILED -- Failure; the callback function was already set.
+
+ @dependencies
+ None.
+ */
+int qurt_isr_set_hw_disable_callback(unsigned int cb_addr);
+
+
+/**@ingroup func_qurt_isr_create
+ Creates an ISR thread with the specified attributes, and makes it executable.
+
+ @datatypes
+ #qurt_thread_t \n
+ #qurt_thread_attr_t
+
+ @param[out] thread_id Returns a pointer to the thread identifier if the thread was
+ successfully created.
+ @param[in] pAttr Pointer to the initialized thread attribute structure that specifies
+ the attributes of the created thread.
+
+ @return
+ #QURT_EVAL -- Invalid arguments. \n
+ #QURT_EOK -- Thread created. \n
+ #QURT_EFAILED -- Thread not created.
+
+ @dependencies
+ None.
+ */
+int qurt_isr_create (qurt_thread_t *thread_id, qurt_thread_attr_t *pAttr);
+
+/**@ingroup func_qurt_isr_register2
+ Registers an interrupt service routine (ISR) with an ISR thread, using the specified attributes.
+ The interrupt is enabled when this function returns success.
+
+ @datatypes
+ #qurt_thread_t
+
+ @param[in] isr_thread_id ISR thread ID, returned from qurt_isr_create().
+ @param[in] int_num The interrupt number.
+ @param[in] prio Priority of the ISR.
+ @param[in] flags Defines the ACK type. Values: \n
+ - #QURT_INT_NON_DELAYED_ACK -- The ISR is acknowledged by the interrupt handling routine
+ in the kernel. \n
+ - #QURT_INT_DELAYED_ACK -- The client chooses when to acknowledge. @tablebulletend
+ @param[in] int_type Interrupt type, notified to the registered hardware configuration
+ callback function. Values: \n
+ - #QURT_INT_TRIGGER_USE_DEFAULT \n
+ - #QURT_INT_TRIGGER_LEVEL_HIGH \n
+ - #QURT_INT_TRIGGER_LEVEL_LOW \n
+ - #QURT_INT_TRIGGER_RISING_EDGE \n
+ - #QURT_INT_TRIGGER_FALLING_EDGE \n
+ - #QURT_INT_TRIGGER_DUAL_EDGE @tablebulletend
+ @param[in] isr Interrupt service routine with prototype void isr(void *arg, int int_num).
+ @param[in] arg First argument passed to the ISR when it is called to service the interrupt.
+
+ @return
+ #QURT_EOK -- Successfully registered the ISR for the interrupt. \n
+ #QURT_EINT -- Interrupt not configured. \n
+ #QURT_EINVALID -- Invalid thread ID. \n
+ #QURT_EDISABLED -- The feature is disabled. \n
+ #QURT_EDUPLICATE -- Interrupt is already registered.
+
+ @dependencies
+ The thread ID must be created using qurt_isr_create().
+ */
+int qurt_isr_register2 (qurt_thread_t isr_thread_id, int int_num, unsigned short prio, unsigned short flags, unsigned int int_type, void (*isr) (void *, int), void *arg);
+
+/**@ingroup func_qurt_isr_deregister2
+ Deregisters the ISR for the specified interrupt.
+ The interrupt is disabled when this function returns success.
+
+ @param[in] int_num The interrupt number.
+
+ @return
+ #QURT_EOK -- ISR deregistered successfully. \n
+ #QURT_ENOREGISTERED -- No ISR is registered for the interrupt int_num.
+
+ @dependencies
+ None.
+ */
+int qurt_isr_deregister2 (int int_num);
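+
+/*
+ Usage sketch (illustrative only): create an ISR thread, register a
+ handler for interrupt 23, and later tear everything down. The handler,
+ priority, and interrupt number are assumptions for the example.
+
+ void my_isr(void *arg, int int_num); // services the device
+
+ qurt_thread_t tid;
+ qurt_thread_attr_t attr;
+ qurt_thread_attr_init(&attr);
+ if (qurt_isr_create(&tid, &attr) == QURT_EOK &&
+ qurt_isr_register2(tid, 23, 100, QURT_INT_NON_DELAYED_ACK,
+ QURT_INT_TRIGGER_USE_DEFAULT, my_isr, NULL) == QURT_EOK) {
+ // ... interrupt 23 is now serviced on the ISR thread ...
+ (void)qurt_isr_deregister2(23);
+ (void)qurt_isr_delete(tid);
+ }
+*/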
+
+/**@ingroup func_qurt_isr_delete
+ Causes the ISR thread to exit and releases its kernel resources.
+
+ @note1hang The ISR thread must not be actively processing interrupts;
+ otherwise the call fails and returns an error.
+
+ @param[in] isr_tid Thread ID of the ISR thread to delete.
+
+ @return
+ #QURT_ENOTALLOWED -- ISR thread is processing an interrupt. \n
+ #QURT_EINVALID -- Invalid ISR thread ID. \n
+ #QURT_EOK -- Success.
+
+ @dependencies
+ The thread ID must be created using qurt_isr_create().
+ */
+int qurt_isr_delete (qurt_thread_t isr_tid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_ISR_H */
+
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_l2cfg.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_l2cfg.h
new file mode 100755
index 0000000000000..7e26b30a580d9
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_l2cfg.h
@@ -0,0 +1,98 @@
+#ifndef QURT_L2CFG_H
+#define QURT_L2CFG_H
+/**
+ @file qurt_l2cfg.h
+ @brief QuRT APIs for L2 configuration and system configuration
+
+EXTERNAL FUNCTIONS
+ qurt_l2cfg_set
+ qurt_l2cfg_get
+ qurt_system_config_get
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+
+/* Definitions for system configuration */
+/** @addtogroup l2cfg_macros
+@{ */
+#define QURT_CORE_CFG_HMX_INT8_SPATIAL 0x78 /**< HMX fixed-point spatial size */
+#define QURT_CORE_CFG_HMX_INT8_DEPTH 0x7C /**< HMX fixed-point output depth */
+/** @} */ /* end_addtogroup l2cfg_macros */
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_l2cfg_set
+ Sets the value of an L2 configuration register. A register can be set only if its
+ initial value is configured.
+
+ @param[in] offset Offset of the L2 configuration register; must be a multiple of 4.
+ @param[in] value Value to set the register to.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EFAILED -- Internal mapping that covers the L2CFG register file is absent; likely
+ a configuration problem. \n
+ #QURT_EINVALID -- Argument error. \n
+ #QURT_ENOTALLOWED -- Setting this register is prohibited.
+
+ @dependencies
+ None.
+ */
+int qurt_l2cfg_set (unsigned short offset, unsigned int value);
+
+/**@ingroup func_qurt_l2cfg_get
+ Gets the value of an L2 configuration register.
+
+ @param[in] offset Offset of the L2 configuration register; must be a multiple of 4.
+ @param[out] value Pointer to the value of the register.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EFAILED -- Internal mapping that covers the L2CFG register file is absent;
+ likely a configuration problem. \n
+ #QURT_EINVALID -- Argument error.
+
+ @dependencies
+ None.
+
+ */
+int qurt_l2cfg_get (unsigned short offset, unsigned int * value);
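+
+/*
+ Usage sketch (illustrative only): read an L2 configuration register and
+ write it back with one bit set. The offset 0x10 and the bit are
+ assumptions for the example; only registers whose initial value is
+ configured can be written.
+
+ unsigned int v;
+ if (qurt_l2cfg_get(0x10, &v) == QURT_EOK) {
+ (void)qurt_l2cfg_set(0x10, v | 0x1u);
+ }
+*/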
+
+
+/**@ingroup func_qurt_system_config_get
+ Gets the system configuration information.
+
+ @param[in] index Index to the system configuration. Values:\n
+ - #QURT_CORE_CFG_HMX_INT8_SPATIAL \n
+ - #QURT_CORE_CFG_HMX_INT8_DEPTH @tablebulletend
+
+ @param[out] data Pointer to a word for the returned data.
+
+ @return
+ #QURT_EOK -- Configuration data successfully returned. \n
+ Other values -- Failure (no such configuration available).
+
+ @dependencies
+ None.
+
+ */
+int qurt_system_config_get(int index, unsigned int *data);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_L2CFG_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_lifo.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_lifo.h
new file mode 100755
index 0000000000000..dc399fccc5f0f
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_lifo.h
@@ -0,0 +1,71 @@
+#ifndef QURT_LIFO_H
+#define QURT_LIFO_H
+/**
+ @file qurt_lifo.h
+
+ @brief
+ Provides a lock-free last-in-first-out (LIFO) algorithm, which can be used in a
+ variety of situations to allocate and free fixed-size buffers.
+ This implementation touches the first word of a FREED buffer. Even
+ though it does not matter how the buffer is used while it is allocated, be
+ careful not to put a MAGIC number in the first field, because the field
+ does not hold the magic value while the buffer is freed.
+
+ EXTERNALIZED FUNCTIONS
+ None
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None
+
+ Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /*=====================================================================
+ Functions
+ ======================================================================*/
+
+/*======================================================================*/
+/**
+ Pops an element out of the LIFO.
+
+ @param[in] freelist Pointer to the head of your list.
+
+ @return
+ Top object from the list.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+void * qurt_lifo_pop(void *freelist);
+
+
+/*======================================================================*/
+/**
+ Pushes an element into the LIFO.
+
+ @param[in] freelist Pointer to the head of your list.
+ @param[in] buf Pointer to your buffer to push into the list.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+void qurt_lifo_push(void *freelist, void *buf);
+
+/*======================================================================*/
+/**
+ Removes the specified element from the LIFO.
+
+ @param[in] freelist Pointer to the head of your list.
+ @param[in] buf Pointer to the buffer to remove from the list.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+void qurt_lifo_remove(void *freelist, void *buf);
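+
+/*
+ Usage sketch (illustrative only): keep a freelist of fixed-size buffers.
+ The head variable and buffer are assumptions for the example; the first
+ word of each freed buffer is used internally as the link.
+
+ void *head = NULL; // freelist head, initially empty
+ static unsigned buf_mem[16]; // one fixed-size buffer
+ qurt_lifo_push(&head, buf_mem); // free: push the buffer onto the list
+ void *buf = qurt_lifo_pop(&head); // allocate: pop the most recent buffer
+*/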
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_LIFO_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mailbox.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mailbox.h
new file mode 100755
index 0000000000000..a6cd91c611782
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mailbox.h
@@ -0,0 +1,176 @@
+#ifndef QURT_MAILBOX_H
+#define QURT_MAILBOX_H
+
+/**
+ @file qurt_mailbox.h
+ @brief Definitions, macros, and prototypes used for the QuRT mailbox
+
+ EXTERNALIZED FUNCTIONS
+ none
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ none
+
+ Copyright (c) 2015, 2021-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+/* Definitions of typedefs and return values */
+
+#define QURT_MAILBOX_ID_NULL 0
+#define QURT_MAILBOX_ERROR -1
+#define QURT_MAILBOX_ID_ERROR -2
+#define QURT_MAILBOX_NON_VALID_DATA -3
+#define QURT_MAILBOX_FULL -4
+#define QURT_MAILBOX_DELETED -5
+#define QURT_MAILBOX_RECEIVE_HALTED -6
+#define QURT_MAILBOX_BANDWIDTH_LIMIT -7
+
+
+/*=============================================================================
+ FORWARD DECLARATIONS & TYPEDEFS
+=============================================================================*/
+
+#define QURT_MAILBOX_AT_QURTOS 0U // Receiver is QuRT OS
+#define QURT_MAILBOX_AT_ROOTPD 1U // Receiver is RootPD (ASID=0)
+#define QURT_MAILBOX_AT_USERPD 2U // Receiver is User PD (ASID!=0)
+#define QURT_MAILBOX_AT_SECUREPD 3U // Receiver is Secure PD
+
+typedef unsigned char qurt_mailbox_receiver_cfg_t;
+
+#define QURT_MAILBOX_SEND_OVERWRITE 0U // When there is already valid content, overwrite it
+#define QURT_MAILBOX_SEND_NON_OVERWRITE 1U // When there is already valid content, return failure
+
+typedef unsigned char qurt_mailbox_send_option_t;
+
+
+#define QURT_MAILBOX_RECV_WAITING 0U // When there is no valid content, wait for it
+#define QURT_MAILBOX_RECV_NON_WAITING 1U // When there is no valid content, return failure immediately
+#define QURT_MAILBOX_RECV_PEEK_NON_WAITING 2U // Read the content, but do not remove it from the mailbox. No waiting.
+
+typedef unsigned char qurt_mailbox_recv_option_t;
+
+
+/*=============================================================================
+ EXTERNS & FUNCTIONS
+=============================================================================*/
+/* Function prototypes */
+
+/**@ingroup qurt_mailbox_create
+ Creates a QuRT mailbox.
+
+ @param name Mailbox name, up to 8 characters.
+ @param recv_opt Configuration of the receiver process.
+
+ @return
+ Mailbox ID -- Mailbox identifier. \n
+ #QURT_MAILBOX_ID_NULL -- NULL; failure creating the mailbox.
+
+ @dependencies
+ None.
+*/
+unsigned long long qurt_mailbox_create(char *name, qurt_mailbox_receiver_cfg_t recv_opt);
+
+
+/**@ingroup qurt_mailbox_get_id
+ Gets a QuRT mailbox identifier.
+
+ @param name Mailbox name, up to 8 characters.
+
+ @return
+ Mailbox ID -- Mailbox identifier. \n
+ #QURT_MAILBOX_ID_NULL -- NULL; failure getting the mailbox ID.
+
+ @dependencies
+ None.
+*/
+unsigned long long qurt_mailbox_get_id(char *name);
+
+
+/**@ingroup qurt_mailbox_send
+ Sends data to a QuRT mailbox.
+
+ @param mailbox_id Mailbox identifier.
+ @param send_opt Option for mailbox send.
+ @param data Data to send.
+
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_MAILBOX_ID_ERROR -- Mailbox ID error.\n
+ #QURT_MAILBOX_ERROR -- Other errors.\n
+ #QURT_MAILBOX_FULL -- Valid data already exists, non-overwriting.\n
+ #QURT_MAILBOX_BANDWIDTH_LIMIT -- Reached the bandwidth limitation.
+
+ @dependencies
+ None.
+*/
+int qurt_mailbox_send(unsigned long long mailbox_id, qurt_mailbox_send_option_t send_opt, unsigned long long data);
+
+
+/**@ingroup qurt_mailbox_receive
+ Receives data from a QuRT mailbox.
+
+ @param mailbox_id Mailbox identifier.
+ @param recv_opt Option for mailbox receive.
+ @param data Pointer to the data buffer for receiving.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_MAILBOX_ID_ERROR -- Mailbox ID error. \n
+ #QURT_MAILBOX_ERROR -- Other errors. \n
+ #QURT_MAILBOX_NON_VALID_DATA -- No valid data currently; the previous content is placed in the buffer. \n
+ #QURT_MAILBOX_RECEIVE_HALTED -- Receive halted; the waiting thread is woken up. \n
+ #QURT_MAILBOX_DELETED -- Mailbox is deleted; the waiting thread is woken up.
+
+ @dependencies
+ None.
+*/
+int qurt_mailbox_receive(unsigned long long mailbox_id, qurt_mailbox_recv_option_t recv_opt, unsigned long long *data);
+
+
+/**@ingroup qurt_mailbox_delete
+ Deletes a QuRT mailbox.
+
+ A mailbox can only be deleted from the process that created the mailbox.
+
+ @param mailbox_id Mailbox identifier.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_MAILBOX_ID_ERROR -- Mailbox ID error. \n
+ #QURT_MAILBOX_ERROR -- Other errors.
+
+ @dependencies
+ None.
+*/
+int qurt_mailbox_delete(unsigned long long mailbox_id);
+
+
+/**@ingroup qurt_mailbox_receive_halt
+ Halts receiving on a QuRT mailbox and wakes up waiting threads.
+
+ @param mailbox_id Mailbox identifier.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_MAILBOX_ID_ERROR -- Mailbox ID error.\n
+ #QURT_MAILBOX_ERROR -- Other errors.
+
+ @dependencies
+ None.
+*/
+int qurt_mailbox_receive_halt(unsigned long long mailbox_id);
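+
+/*
+ Usage sketch (illustrative only): producer/consumer exchange of one
+ 64-bit word. The mailbox name "mbox0" is an assumption for the example.
+
+ unsigned long long id = qurt_mailbox_create("mbox0", QURT_MAILBOX_AT_USERPD);
+ if (id != QURT_MAILBOX_ID_NULL) {
+ (void)qurt_mailbox_send(id, QURT_MAILBOX_SEND_OVERWRITE, 0x1234ULL);
+ unsigned long long msg;
+ if (qurt_mailbox_receive(id, QURT_MAILBOX_RECV_WAITING, &msg) == QURT_EOK) {
+ // msg now holds the value
+ }
+ }
+*/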
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif // QURT_MAILBOX_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_memory.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_memory.h
new file mode 100755
index 0000000000000..90ce2586fec50
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_memory.h
@@ -0,0 +1,1487 @@
+#ifndef QURT_MEMORY_H
+#define QURT_MEMORY_H
+/**
+ @file qurt_memory.h
+ @brief Prototypes of the kernel memory API functions.
+
+ EXTERNALIZED FUNCTIONS
+ none
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ none
+
+ Copyright (c) Qualcomm Technologies, Inc.
+ All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+
+#include
+#include
+//#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup memory_management_macros
+@{ */
+#define QURT_SYSTEM_ALLOC_VIRTUAL 1 /**< Allocates available virtual memory in the address space of all
+ processes.*/
+/** @} */ /* end_addtogroup memory_management_macros */
+/**@cond rest_reg_dist */
+/** @addtogroup memory_management_types
+@{ */
+/** @xreflabel{hdr:qurt_mem_default_pool} */
+extern qurt_mem_pool_t qurt_mem_default_pool __attribute__((section(".data"))); /**< Memory pool object.*/
+/** @} */ /* end_addtogroup memory_management_types */
+
+/** @cond rest_reg_dist */
+/** Mapping attribute information. */
+typedef struct{
+ qurt_paddr_64_t paddr;
+ qurt_size_t size;
+ qurt_mem_cache_mode_t cache_mode;
+ qurt_perm_t perms;
+}qurt_mapping_attr_t;
+/** @endcond */
+/** @} */ /* end_addtogroup mapping_attribute_types*/
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_mem_cache_clean
+ Performs a cache clean operation on the data stored in the specified memory area.
+ Performs a syncht on all the data cache operations when the Hexagon processor version is V60 or greater.
+
+ @note1hang Perform the flush all operation only on the data cache.
+
+ @note1cont This operation flushes and invalidates the contents of all cache lines from the start address
+ to the end address (start address + size). The contents of an adjoining buffer can be
+ flushed and invalidated if they fall in any of these cache lines.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_size_t \n
+ #qurt_mem_cache_op_t \n
+ #qurt_mem_cache_type_t
+
+ @param[in] addr Address of data to flush.
+ @param[in] size Size (in bytes) of data to flush.
+ @param[in] opcode Type of cache clean operation. Values:
+ - #QURT_MEM_CACHE_FLUSH
+ - #QURT_MEM_CACHE_INVALIDATE
+ - #QURT_MEM_CACHE_FLUSH_INVALIDATE
+ - #QURT_MEM_CACHE_FLUSH_ALL\n
+ @note1 #QURT_MEM_CACHE_FLUSH_ALL is valid only when the type is #QURT_MEM_DCACHE @tablebulletend
+ @param[in] type Cache type. Values:
+ - #QURT_MEM_ICACHE
+ - #QURT_MEM_DCACHE @tablebulletend
+
+ @return
+ #QURT_EOK -- Cache operation performed successfully.\n
+ #QURT_EVAL -- Invalid cache type.\n
+
+ @dependencies
+ None.
+*/
+int qurt_mem_cache_clean(qurt_addr_t addr, qurt_size_t size, qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type);
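+
+/*
+ Usage sketch (illustrative only): flush a buffer from the data cache
+ after the CPU writes it and before a DMA engine reads it. The buffer is
+ an assumption for the example.
+
+ static unsigned char dma_buf[256];
+ // ... CPU fills dma_buf ...
+ (void)qurt_mem_cache_clean((qurt_addr_t)dma_buf, sizeof(dma_buf),
+ QURT_MEM_CACHE_FLUSH, QURT_MEM_DCACHE);
+*/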
+
+/**@ingroup func_qurt_mem_cache_clean2
+ Performs a data cache clean operation on the data stored in the specified memory area.
+
+ This API only performs the following data cache operations:\n
+ - #QURT_MEM_CACHE_FLUSH\n
+ - #QURT_MEM_CACHE_INVALIDATE\n
+ - #QURT_MEM_CACHE_FLUSH_INVALIDATE -- flushes/invalidates the contents of all cache lines from the start address
+ to the end address (start address + size). The contents of an adjoining buffer can be
+ flushed/invalidated if they fall in any of these cache lines.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_size_t \n
+ #qurt_mem_cache_op_t \n
+ #qurt_mem_cache_type_t
+
+ @param[in] addr Address of data to flush.
+ @param[in] size Size (in bytes) of data to flush.
+ @param[in] opcode Type of cache clean operation. Values:\n #QURT_MEM_CACHE_FLUSH\n #QURT_MEM_CACHE_INVALIDATE\n
+ #QURT_MEM_CACHE_FLUSH_INVALIDATE
+ @param[in] type Cache type. Values: \n #QURT_MEM_DCACHE
+
+ @return
+ #QURT_EOK -- Cache operation performed successfully.\n
+ #QURT_EVAL -- Invalid cache type.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_cache_clean2(qurt_addr_t addr, qurt_size_t size, qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type);
+
+/**@ingroup func_qurt_mem_cache_phys_clean
+ Performs a cache clean operation on the data stored in the specified memory area, based on an address match and mask.
+ The operation acts on a cache line when (LINE.PhysicalPageNumber & mask) == addrmatch.
+
+ @note1hang The addrmatch value should be the upper 24-bit physical address to match against.
+
+ @datatypes
+ #qurt_mem_cache_op_t \n
+
+ @param[in] mask 24-bit address mask.
+ @param[in] addrmatch Physical page number (24 bits) of memory to use as an address match.
+ @param[in] opcode Type of cache clean operation. Values:
+ - #QURT_MEM_CACHE_FLUSH
+ - #QURT_MEM_CACHE_INVALIDATE @tablebulletend
+
+ @return
+ #QURT_EOK -- Cache operation performed successfully.\n
+ #QURT_EVAL -- Invalid operation.
+
+ @dependencies
+ None.
+*/
+
+int qurt_mem_cache_phys_clean(unsigned int mask, unsigned int addrmatch, qurt_mem_cache_op_t opcode);
+
+/**@ingroup func_qurt_mem_l2cache_line_lock
+ Performs an L2 cache line locking operation. This function locks selective lines in the L2 cache memory.
+
+ @note1hang Perform the line lock operation only on a 32-byte aligned size and address.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_size_t
+
+ @param[in] addr Address of the L2 cache memory line to lock; the address must be 32-byte aligned.
+ @param[in] size Size (in bytes) of L2 cache memory to line lock; the size must be a multiple of 32 bytes.
+
+ @return
+ #QURT_EOK -- Success.\n
+ #QURT_EALIGN -- Data alignment or address failure. \n
+ #QURT_EINVALID -- Improper addr and size passed (for example, integer overflow due to addr + size). \n
+ #QURT_EFAILED -- Failed to lock the cache line because all the ways were locked for the corresponding set of an address
+ in the range addr to addr+size, or the address range is not L2 cacheable.
+ @dependencies
+ None.
+*/
+int qurt_mem_l2cache_line_lock(qurt_addr_t addr, qurt_size_t size);
+
+/**@ingroup func_qurt_mem_l2cache_line_unlock
+ Performs an L2 cache line unlocking operation. This function unlocks selective lines in the L2 cache memory.
+
+ @note1hang Perform the line unlock operation only on a 32-byte aligned size and address.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_size_t
+
+ @param[in] addr Address of the L2 cache memory line to unlock; the address must be 32-byte aligned.
+ @param[in] size Size (in bytes) of the L2 cache memory line to unlock; the size must be a multiple of 32 bytes.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EALIGN -- Aligning data or address failure. \n
+ #QURT_EFAILED -- Operation failed; cannot find the matching tag.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_l2cache_line_unlock(qurt_addr_t addr, qurt_size_t size);
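+
+/*
+ Usage sketch (illustrative only): lock a 32-byte aligned buffer into the
+ L2 cache for latency-critical access, then unlock it. The buffer is an
+ assumption for the example.
+
+ static unsigned char hot_buf[64] __attribute__((aligned(32)));
+ if (qurt_mem_l2cache_line_lock((qurt_addr_t)hot_buf, sizeof(hot_buf)) == QURT_EOK) {
+ // ... latency-critical accesses to hot_buf ...
+ (void)qurt_mem_l2cache_line_unlock((qurt_addr_t)hot_buf, sizeof(hot_buf));
+ }
+*/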
+
+/**@ingroup func_qurt_mem_region_attr_init
+ @xreflabel{sec:qurt_mem_region_attr_init}
+ Initializes the specified memory region attribute structure with default attribute values: \n
+ - Mapping -- #QURT_MEM_MAPPING_VIRTUAL \n
+ - Cache mode -- #QURT_MEM_CACHE_WRITEBACK \n
+ - Physical address -- -1 \n
+ - Virtual address -- -1 \n
+ - Memory type -- #QURT_MEM_REGION_LOCAL \n
+ - Size -- -1
+
+ @note1hang The memory physical address attribute must be explicitly set by calling the
+ qurt_mem_region_attr_set_physaddr() function. The size and pool attributes are set directly
+ as parameters in the memory region create operation.
+
+ @datatypes
+ #qurt_mem_region_attr_t
+
+ @param[in,out] attr Pointer to the destination structure for the memory region attributes.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_mem_region_attr_init(qurt_mem_region_attr_t *attr);
+
+/**@ingroup func_qurt_mem_pool_attach
+ Initializes a memory pool object to attach to a pool predefined in the system
+ configuration file.
+
+ Memory pool objects assign memory regions to physical memory in different
+ Hexagon memory units. They are specified in memory region create operations
+ (Section @xref{sec:mem_region_create}).
+
+ @note1hang QuRT predefines the memory pool object #qurt_mem_default_pool
+ (Section @xref{dox:mem_management}) for allocating memory regions in SMI memory. The pool attach
+ operation is necessary only when allocating memory regions in nonstandard
+ memory units such as TCM.
+
+ @datatypes
+ #qurt_mem_pool_t
+
+ @param[in] name Pointer to the memory pool name.
+ @param[out] pool Pointer to the memory pool object.
+
+ @return
+ #QURT_EOK -- Attach operation successful.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_pool_attach(char *name, qurt_mem_pool_t *pool);
+
+/**@ingroup func_qurt_mem_pool_attach2
+ Gets the identifier that corresponds to a pool object created specifically for a client, for example, HLOS_PHYSPOOL.
+ The client_handle is used to look up the client-specific pool.
+
+ Memory pool objects assign memory regions to physical memory in different
+ Hexagon memory units. Memory pool objects are specified during mapping creation operations
+ (qurt_mem_mmap() and qurt_mem_region_create()).
+
+ @note1hang QuRT predefines the memory pool object #qurt_mem_default_pool
+ (Section @xref{dox:mem_management}) for allocating memory regions in SMI memory. The pool_attach2
+ operation is necessary only when allocating memory regions in memory units specific to the client.
+
+ @datatypes
+ #qurt_mem_pool_t
+
+ @param[in] client_handle Client identifier used by the OS to look up the identifier
+ of the client-specific pool.
+ @param[in] name Pointer to the memory pool name.
+ @param[out] pool Pointer to the memory pool object.
+
+ @return
+ #QURT_EOK -- Attach operation successful.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_pool_attach2(int client_handle, char *name, qurt_mem_pool_t *pool);
+
+/**@ingroup func_qurt_mem_pool_create
+ @xreflabel{hdr:qurt_mem_pool_create}
+ Dynamically creates a memory pool object from a physical address range.
+
+ The pool is assigned a single memory region with the specified base address and size.
+
+ The base address and size values passed to this function must be aligned to 4K byte
+ boundaries, and must be expressed as the actual base address and size values divided by 4K.
+
+ For example, the function call:
+ @code
+ qurt_mem_pool_create ("TCM_PHYSPOOL", 0xd8020, 0x20, &pool)
+ @endcode
+ ... is equivalent to the following static pool definition in the QuRT system configuration file:
+ @code
+ <physical_pool name="TCM_PHYSPOOL">
+ <region base="0xd8020000" size="0x20000" />
+ </physical_pool>
+ @endcode
+
+ @cond rest_dist For more information on the system configuration file, see @xhyperref{80VB41979,80-VB419-79}. @endcond
+
+ @note1hang Dynamically created pools are not identical to static pools. In particular,
+ qurt_mem_pool_attr_get() is not valid with dynamically created pools.
+
+ @note1cont Dynamic pool creation permanently consumes system resources, and cannot be undone.
+
+ @datatypes
+ #qurt_mem_pool_t
+
+ @param[in] name Pointer to the memory pool name.
+ @param[in] base Base address of the memory region (divided by 4K).
+ @param[in] size Size (in bytes) of the memory region (divided by 4K).
+ @param[out] pool Pointer to the memory pool object.
+
+ @return
+ #QURT_EOK -- Success.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_pool_create(char *name, unsigned base, unsigned size, qurt_mem_pool_t *pool);
+
+/**@ingroup func_qurt_mem_pool_add_pages
+ Adds a physical address range to the specified memory pool object.\n
+
+ @note1hang Call this operation only with root privileges (guest-OS mode).
+
+ @datatypes
+ #qurt_mem_pool_t
+
+ @param[in] pool Memory pool object.
+ @param[in] first_pageno First page number of the physical address range (equivalent to address >> 12).
+ @param[in] size_in_pages Number of pages in the physical address range (equivalent to size >> 12).
+
+ @return
+ #QURT_EOK -- Pages successfully added.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_pool_add_pages(qurt_mem_pool_t pool,
+ unsigned first_pageno,
+ unsigned size_in_pages);
+
+/**@ingroup func_qurt_mem_pool_remove_pages
+ Removes a physical address range from the specified memory pool object.
+
+ If any part of the address range is in use, this operation returns an
+ error without changing the state.
+
+ @note1hang Call this operation only with root privileges (guest-OS mode).
+
+ @note1cont In the future, this operation will support (via the flags parameter) the
+ removal of a physical address range when part of the range is in use.
+
+ @datatypes
+ #qurt_mem_pool_t
+
+ @param[in] pool Memory pool object.
+ @param[in] first_pageno First page number of the physical address range (equivalent to address >> 12).
+ @param[in] size_in_pages Number of pages in the physical address range (equivalent to size >> 12).
+ @param[in] flags Remove options. Values: \n
+ - 0 -- Skip holes in the range that are not part of the pool (default) \n
+ - #QURT_POOL_REMOVE_ALL_OR_NONE -- Pages are removed only if the specified
+ physical address range is entirely contained (with no holes) in the
+ pool free space. @tablebulletend
+ @param[in] callback Callback procedure called when pages are successfully removed.
+ Not called if the operation fails. Passing 0 as the parameter
+ value causes the callback not to be called.
+ @param[in] arg Value passed as an argument to the callback procedure.
+
+ @return
+ #QURT_EOK -- Pages successfully removed.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_pool_remove_pages(qurt_mem_pool_t pool,
+ unsigned first_pageno,
+ unsigned size_in_pages,
+ unsigned flags,
+ void (*callback)(void *),
+ void *arg);
+/**@ingroup memory_management_types*/
+#define QURT_POOL_REMOVE_ALL_OR_NONE 1 /**< Remove pages only if the entire range is free. */
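+
+/*
+ Usage sketch (illustrative only, guest-OS privilege assumed): donate a
+ 64 KB physical range to a pool and take it back later. The pool object
+ and the physical base address 0xAB000000 are assumptions for the
+ example; both page arguments are expressed in 4K pages.
+
+ unsigned first_page = 0xAB000000u >> 12;
+ unsigned num_pages = 0x10000u >> 12;
+ if (qurt_mem_pool_add_pages(pool, first_page, num_pages) == QURT_EOK) {
+ // ... allocate regions from the enlarged pool ...
+ (void)qurt_mem_pool_remove_pages(pool, first_page, num_pages,
+ QURT_POOL_REMOVE_ALL_OR_NONE, 0, NULL);
+ }
+*/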
+
+/**@ingroup func_qurt_mem_pool_attr_get
+ Gets the memory pool attributes. \n
+ Retrieves pool configurations based on the pool handle, and fills in
+ the attribute structure with configuration values.
+
+ @datatypes
+ #qurt_mem_pool_t \n
+ #qurt_mem_pool_attr_t
+
+ @param[in] pool Pool handle obtained from qurt_mem_pool_attach().
+ @param[out] attr Pointer to the memory region attribute structure.
+
+ @return
+ 0 -- Success. \n
+ #QURT_EINVALID -- Corrupt handle; pool handle is invalid.
+*/
+int qurt_mem_pool_attr_get (qurt_mem_pool_t pool, qurt_mem_pool_attr_t *attr);
+
+/**@ingroup func_qurt_mem_pool_attr_get_size
+ Gets the size of the specified memory pool range.
+
+ @datatypes
+ #qurt_mem_pool_attr_t \n
+ #qurt_size_t
+
+ @param[in] attr Pointer to the memory pool attribute structure.
+ @param[in] range_id Memory pool range key.
+ @param[out] size Pointer to the destination variable for the range size.
+
+ @return
+ 0 -- Success. \n
+ #QURT_EINVALID -- Range is invalid.
+
+ @dependencies
+ None.
+*/
+static inline int qurt_mem_pool_attr_get_size (qurt_mem_pool_attr_t *attr, int range_id, qurt_size_t *size){
+ if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+ (*size) = 0;
+ return QURT_EINVALID;
+ }
+ else {
+ (*size) = attr->ranges[range_id].size;
+ }
+ return QURT_EOK;
+}
+
+/**@ingroup func_qurt_mem_pool_attr_get_addr
+ Gets the start address of the specified memory pool range.
+
+ @datatypes
+ #qurt_mem_pool_attr_t \n
+ #qurt_addr_t
+
+ @param[in] attr Pointer to the memory pool attribute structure.
+ @param[in] range_id Memory pool range key.
+ @param[out] addr Pointer to the destination variable for the range start address.
+
+ @return
+ 0 -- Success. \n
+ #QURT_EINVALID -- Range is invalid.
+
+ @dependencies
+ None.
+*/
+static inline int qurt_mem_pool_attr_get_addr (qurt_mem_pool_attr_t *attr, int range_id, qurt_addr_t *addr){
+ if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+ (*addr) = 0;
+ return QURT_EINVALID;
+ }
+ else {
+ (*addr) = (attr->ranges[range_id].start)<<12;
+ }
+ return QURT_EOK;
+}
+
+/**@ingroup func_qurt_mem_pool_attr_get_addr_64
+ Gets the 64-bit start address of the specified memory pool range.
+
+ @datatypes
+ #qurt_mem_pool_attr_t \n
+ #qurt_addr_64_t
+
+ @param[in] attr Pointer to the memory pool attribute structure.
+ @param[in] range_id Memory pool range key.
+ @param[out] addr Pointer to the destination variable for the range start address.
+
+ @return
+ 0 -- Success. \n
+ #QURT_EINVALID -- Range is invalid.
+
+ @dependencies
+ None.
+*/
+static inline int qurt_mem_pool_attr_get_addr_64 (qurt_mem_pool_attr_t *attr, int range_id, qurt_addr_64_t *addr){
+ if ((range_id >= MAX_POOL_RANGES) || (range_id < 0)){
+ (*addr) = 0;
+ return QURT_EINVALID;
+ }
+ else {
+ (*addr) = ((qurt_addr_64_t)attr->ranges[range_id].start)<<12;
+ }
+ return QURT_EOK;
+}
+
+
+/**@ingroup func_qurt_mem_pool_status_get
+ Gets the memory pool status. \n
+ Based on the pool handle, retrieves the largest contiguous free memory,
+ total free memory, and total memory declared for the pool, in bytes. Fills in
+ the memory status structure with the values.
+
+ @datatypes
+ #qurt_mem_pool_t \n
+ #qurt_mem_pool_status_t
+
+ @param[in] pool Pool handle.
+ @param[out] status Pointer to the memory pool status structure.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EINVALID -- Corrupt handle; pool handle is invalid.
+*/
+int qurt_mem_pool_status_get (qurt_mem_pool_t pool, qurt_mem_pool_status_t *status);
+
+
+/**@ingroup func_qurt_mem_pool_is_available
+ Checks whether the number of pages that the page_count argument indicates
+ can be allocated from the specified pool.
+
+ @datatypes
+ #qurt_mem_pool_t \n
+ #qurt_mem_mapping_t \n
+
+ @param[in] pool Pool handle obtained from qurt_mem_pool_attach().
+ @param[in] page_count Number of 4K pages.
+ @param[in] mapping_type Variable of type qurt_mem_mapping_t.
+
+ @return
+ 0 -- Success. \n
+ #QURT_EINVALID -- mapping_type is invalid. \n
+ #QURT_EMEM -- Specified pages cannot be allocated from the pool.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_pool_is_available(qurt_mem_pool_t pool, int page_count, qurt_mem_mapping_t mapping_type);
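+
+/*
+ Usage sketch (illustrative only): attach to a named pool and read the
+ size of its first range. The pool name is an assumption for the example.
+
+ qurt_mem_pool_t pool;
+ qurt_mem_pool_attr_t pattr;
+ qurt_size_t range_size;
+ if (qurt_mem_pool_attach("DEFAULT_PHYSPOOL", &pool) == QURT_EOK &&
+ qurt_mem_pool_attr_get(pool, &pattr) == QURT_EOK &&
+ qurt_mem_pool_attr_get_size(&pattr, 0, &range_size) == QURT_EOK) {
+ // range_size holds the size in bytes of range 0
+ }
+*/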
+
+
+/**@ingroup func_qurt_mem_region_create
+ @xreflabel{sec:mem_region_create}
+ Creates a memory region with the specified attributes.
+
+ The application initializes the memory region attribute structure with
+ qurt_mem_region_attr_init() and qurt_mem_region_attr_set_bus_attr().
+
+ If the virtual address attribute is set to its default value
+ (Section @xref{sec:qurt_mem_region_attr_init}), the virtual address of the memory region is
+ automatically assigned any available virtual address value.
+
+ If the memory mapping attribute is set to virtual mapping, the physical address of the memory region
+ is also automatically assigned.\n
+
+ @note1hang The physical address attribute is explicitly set in the attribute structure only
+ for memory regions with physical-contiguous-mapped mapping.
+
+ Memory regions are always assigned to memory pools. The pool value specifies the memory pool
+ that the memory region is assigned to.
+
+ @note1hang If attr is specified as NULL, the memory region is created with default
+ attribute values (Section @xref{sec:qurt_mem_region_attr_init}).
+ QuRT predefines the memory pool object #qurt_mem_default_pool
+ (Section @xref{dox:mem_management}), which allocates memory regions in SMI memory.
+
+ @datatypes
+ #qurt_mem_region_t \n
+ #qurt_size_t \n
+ #qurt_mem_pool_t \n
+ #qurt_mem_region_attr_t
+
+ @param[out] region Pointer to the memory region object.
+ @param[in] size Memory region size (in bytes). If size is not an integral multiple of 4K,
+ it is rounded up to a 4K boundary.
+ @param[in] pool Memory pool of the region.
+ @param[in] attr Pointer to the memory region attribute structure.
+
+ @return
+ #QURT_EOK -- Memory region successfully created.\n
+ #QURT_EMEM -- Not enough memory to create the region. \n
+ #QURT_EINVALID -- Invalid cache attributes / permissions provided in the attribute structure.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_region_create(qurt_mem_region_t *region, qurt_size_t size, qurt_mem_pool_t pool, qurt_mem_region_attr_t *attr);
+
+/**@ingroup func_qurt_mem_region_delete
+ Deletes the specified memory region.
+
+ If the caller application created the memory region, it is removed and the system reclaims its
+ assigned memory.
+
+ If a different application created the memory region (and shared it with the caller
+ application), only the local memory mapping to the region is removed; the system does
+ not reclaim the memory.
+
+ @datatypes
+ #qurt_mem_region_t
+
+ @param[in] region Memory region object.
+
+ @return
+ #QURT_EOK -- Region successfully deleted. \n
+ #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_region_delete(qurt_mem_region_t region);
+
+
+/**@ingroup func_qurt_mem_region_attr_get
+ @xreflabel{sec:mem_region_attr_get}
+ Gets the memory attributes of the specified memory region.
+ After a memory region is created, its attributes cannot be changed.
+
+ @datatypes
+ #qurt_mem_region_t \n
+ #qurt_mem_region_attr_t
+
+ @param[in] region Memory region object.
+ @param[out] attr Pointer to the destination structure for memory region attributes.
+
+ @return
+ #QURT_EOK -- Operation successfully performed. \n
+ Error code -- Failure.
+
+ @dependencies
+ None.
+*/
+int qurt_mem_region_attr_get(qurt_mem_region_t region, qurt_mem_region_attr_t *attr);
+
+
+/**@ingroup func_qurt_mem_region_attr_set_type
+ Sets the memory type in the specified memory region attribute structure.
+
+ The type indicates whether the memory region is local to an application or shared between
+ applications.
+ @cond rest_dist For more information, see @xhyperref{80VB41992,80-VB419-92}. @endcond
+
+ @datatypes
+ #qurt_mem_region_attr_t \n
+ #qurt_mem_region_type_t
+
+ @param[in,out] attr Pointer to the memory region attribute structure.
+ @param[in] type Memory type.
Values: \n + - #QURT_MEM_REGION_LOCAL \n + - #QURT_MEM_REGION_SHARED @tablebulletend + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_type(qurt_mem_region_attr_t *attr, qurt_mem_region_type_t type){ + attr->type = type; +} + +/**@ingroup func_qurt_mem_region_attr_get_size + Gets the memory region size from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_size_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] size Pointer to the destination variable for memory region size. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_size(qurt_mem_region_attr_t *attr, qurt_size_t *size){ + (*size) = attr->size; +} + +/**@ingroup func_qurt_mem_region_attr_get_type + Gets the memory type from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_region_type_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] type Pointer to the destination variable for the memory type. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_type(qurt_mem_region_attr_t *attr, qurt_mem_region_type_t *type){ + (*type) = attr->type; +} + +/**@ingroup func_qurt_mem_region_attr_set_physaddr + Sets the memory region 32-bit physical address in the specified memory attribute structure. + + @note1hang The physical address attribute is explicitly set only for memory regions with + physical contiguous mapping. Otherwise QuRT automatically sets it + when the memory region is created. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_paddr_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr Memory region physical address. + + @return + None. + */ +static inline void qurt_mem_region_attr_set_physaddr(qurt_mem_region_attr_t *attr, qurt_paddr_t addr){ + attr->ppn = (unsigned)(((unsigned)(addr))>>12); +} + +/**@ingroup func_qurt_mem_region_attr_get_physaddr + Gets the memory region physical address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr Pointer to the destination variable for memory region physical address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_physaddr(qurt_mem_region_attr_t *attr, unsigned int *addr){ + (*addr) = (unsigned)(((unsigned) (attr->ppn))<<12); +} + +/**@ingroup func_qurt_mem_region_attr_set_virtaddr + Sets the memory region virtual address in the specified memory attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_addr_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] addr Memory region virtual address. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_virtaddr(qurt_mem_region_attr_t *attr, qurt_addr_t addr){ + attr->virtaddr = addr; +} + +/**@ingroup func_qurt_mem_region_attr_get_virtaddr + Gets the memory region virtual address from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] addr Pointer to the destination variable for the memory region virtual address. + + @return + None. + + @dependencies + None. 
+ */ +static inline void qurt_mem_region_attr_get_virtaddr(qurt_mem_region_attr_t *attr, unsigned int *addr){ + (*addr) = (unsigned int)(attr->virtaddr); +} + +/**@ingroup func_qurt_mem_region_attr_set_mapping + Sets the memory mapping in the specified memory region attribute structure. + + The mapping value indicates how the memory region is mapped in virtual memory. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_mapping_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] mapping Mapping. Values: + - #QURT_MEM_MAPPING_VIRTUAL + - #QURT_MEM_MAPPING_PHYS_CONTIGUOUS + - #QURT_MEM_MAPPING_IDEMPOTENT + - #QURT_MEM_MAPPING_VIRTUAL_FIXED + - #QURT_MEM_MAPPING_NONE + - #QURT_MEM_MAPPING_VIRTUAL_RANDOM + - #QURT_MEM_MAPPING_INVALID @tablebulletend + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_mapping(qurt_mem_region_attr_t *attr, qurt_mem_mapping_t mapping){ + attr->mapping_type = mapping; +} + +/**@ingroup func_qurt_mem_region_attr_get_mapping + Gets the memory mapping from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_mapping_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] mapping Pointer to the destination variable for memory mapping. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_mapping(qurt_mem_region_attr_t *attr, qurt_mem_mapping_t *mapping){ + (*mapping) = attr->mapping_type; +} + +/**@ingroup func_qurt_mem_region_attr_set_cache_mode + Sets the cache operation mode in the specified memory region attribute structure. + + @cond rest_dist For more information on the cache, see @xhyperref{80VB41992,80-VB419-92}.@endcond + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_cache_mode_t + + @param[in,out] attr Pointer to the memory region attribute structure. + @param[in] mode Cache mode. Values: \n + - #QURT_MEM_CACHE_WRITEBACK \n + - #QURT_MEM_CACHE_WRITETHROUGH\n + - #QURT_MEM_CACHE_WRITEBACK_NONL2CACHEABLE\n + - #QURT_MEM_CACHE_WRITETHROUGH_NONL2CACHEABLE\n + - #QURT_MEM_CACHE_WRITEBACK_L2CACHEABLE\n + - #QURT_MEM_CACHE_WRITETHROUGH_L2CACHEABLE\n + - #QURT_MEM_CACHE_NONE @tablebulletend + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_cache_mode(qurt_mem_region_attr_t *attr, qurt_mem_cache_mode_t mode){ + QURT_PGATTR_C_SET(attr->pga, (unsigned)mode); +} + +/**@ingroup func_qurt_mem_region_attr_get_cache_mode + Gets the cache operation mode from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t \n + #qurt_mem_cache_mode_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] mode Pointer to the destination variable for cache mode. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_cache_mode(qurt_mem_region_attr_t *attr, qurt_mem_cache_mode_t *mode){ + unsigned int mode_temp = QURT_PGATTR_C_GET(attr->pga); + (*mode) = (qurt_mem_cache_mode_t)mode_temp; +} + +/**@ingroup func_qurt_mem_region_attr_set_bus_attr + Sets the (A1, A0) bus attribute bits in the specified memory region attribute structure. + + @cond rest_dist For more information on the bus attribute bits, see the @xhyperref{80VB41992,80-VB419-92}. @endcond + + @datatypes + #qurt_mem_region_attr_t + + @param[in,out] attr Pointer to the memory region attribute structure. 
+ @param[in] abits The (A1, A0) bits to use with the memory region, expressed as a 2-bit binary number. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_set_bus_attr(qurt_mem_region_attr_t *attr, unsigned abits){ + QURT_PGATTR_A_SET(attr->pga, abits); +} + +/**@ingroup func_qurt_mem_region_attr_get_bus_attr + Gets the (A1, A0) bus attribute bits from the specified memory region attribute structure. + + @datatypes + #qurt_mem_region_attr_t + + @param[in] attr Pointer to the memory region attribute structure. + @param[out] pbits Pointer to an unsigned integer that is filled in with + the (A1, A0) bits from the memory region attribute structure, expressed as a 2-bit binary number. + + @return + None. + + @dependencies + None. + */ +static inline void qurt_mem_region_attr_get_bus_attr(qurt_mem_region_attr_t *attr, unsigned *pbits){ + (*pbits) = QURT_PGATTR_A_GET(attr->pga); +} + +void qurt_mem_region_attr_set_owner(qurt_mem_region_attr_t *attr, int handle); +void qurt_mem_region_attr_get_owner(qurt_mem_region_attr_t *attr, int *p_handle); +void qurt_mem_region_attr_set_perms(qurt_mem_region_attr_t *attr, unsigned perms); +void qurt_mem_region_attr_get_perms(qurt_mem_region_attr_t *attr, unsigned *p_perms); + +/**@ingroup func_qurt_mem_map_static_query + Determines whether a memory page is statically mapped. + Pages are specified by the following attributes: physical address, page size, cache mode, + and memory permissions. \n + - If the specified page is statically mapped, vaddr returns the virtual + address of the page. \n + - If the page is not statically mapped (or if it does not exist as specified), vaddr + returns -1 as the virtual address value.\n + The system configuration file defines QuRT memory maps. + + @datatypes + #qurt_addr_t \n + #qurt_mem_cache_mode_t \n + #qurt_perm_t + + @param[out] vaddr Virtual address corresponding to paddr. + @param[in] paddr Physical address. + @param[in] page_size Size of the mapped memory page. + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perm Access permissions. + + @return + #QURT_EOK -- Specified page is statically mapped, vaddr returns the virtual address. \n + #QURT_EMEM -- Specified page is not statically mapped, vaddr returns -1. \n + #QURT_EVAL -- Specified page does not exist. + + @dependencies + None. + */ +int qurt_mem_map_static_query(qurt_addr_t *vaddr, qurt_addr_t paddr, unsigned int page_size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm); + + +/**@ingroup func_qurt_mem_region_query + Queries a memory region. \n + This function determines whether a dynamically-created memory region (Section @xref{sec:mem_region_create}) exists for the + specified virtual or physical address. + When a memory region has been determined to exist, its attributes are + accessible (Section @xref{sec:mem_region_attr_get}). + + @note1hang This function returns #QURT_EFATAL if #QURT_EINVALID is passed to both + vaddr and paddr (or to neither). + + @datatypes + #qurt_mem_region_t \n + #qurt_paddr_t + + @param[out] region_handle Pointer to the memory region object (if it exists). + @param[in] vaddr Virtual address to query; if vaddr is specified, paddr must be set to + the value #QURT_EINVALID. + @param[in] paddr Physical address to query; if paddr is specified, vaddr must be set to + the value #QURT_EINVALID. + + @return + #QURT_EOK -- Query successfully performed. \n + #QURT_EMEM -- Region not found for the specified address. \n + #QURT_EFATAL -- Invalid input parameters. 
+
+ @dependencies
+ None.
+ */
+int qurt_mem_region_query(qurt_mem_region_t *region_handle, qurt_addr_t vaddr, qurt_paddr_t paddr);
+
+
+/**@ingroup func_qurt_mapping_create
+ @xreflabel{hdr:qurt_mapping_create}
+ Creates a memory mapping in the page table.
+ Not supported if called from a user process; it always returns #QURT_EMEM in that case.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_size_t \n
+ #qurt_mem_cache_mode_t \n
+ #qurt_perm_t
+
+ @param[in] vaddr Virtual address.
+ @param[in] paddr Physical address.
+ @param[in] size Size (4K-aligned) of the mapped memory page.
+ @param[in] cache_attribs Cache mode (writeback, and so on).
+ @param[in] perm Access permissions.
+
+ @return
+ #QURT_EOK -- Mapping created. \n
+ #QURT_EMEM -- Failed to create the mapping. \n
+ #QURT_EINVALID -- Invalid cache attributes / permissions provided.
+
+ @dependencies
+ None.
+*/
+int qurt_mapping_create(qurt_addr_t vaddr, qurt_addr_t paddr, qurt_size_t size,
+ qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm);
+
+/**@ingroup func_qurt_mapping_remove
+ @xreflabel{hdr:qurt_mapping_remove}
+ Deletes the specified memory mapping from the page table.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_size_t
+
+ @param[in] vaddr Virtual address.
+ @param[in] paddr Physical address.
+ @param[in] size Size of the mapped memory page (4K-aligned).
+
+ @return
+ #QURT_EOK -- Mapping removed. \n
+ #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+
+ @dependencies
+ None.
+
+ */
+int qurt_mapping_remove(qurt_addr_t vaddr, qurt_addr_t paddr, qurt_size_t size);
+
+/**@ingroup func_qurt_lookup_physaddr
+ Translates a virtual memory address to the physical memory address to which it maps. \n
+ The lookup occurs in the process of the caller. Use qurt_lookup_physaddr2() to look up the
+ physical address of another process.
+
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_paddr_t
+
+ @param[in] vaddr Virtual address.
+
+ @return
+ Nonzero -- Physical address to which the virtual address is mapped.\n
+ 0 -- Virtual address not mapped.
+
+ @dependencies
+ None.
+*/
+qurt_paddr_t qurt_lookup_physaddr (qurt_addr_t vaddr);
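+
+/*
+ Usage sketch (illustrative only, guest-OS privilege assumed): map a
+ device page uncached, use it, then unmap it. The addresses are
+ assumptions for the example.
+
+ qurt_addr_t va = 0x20000000;
+ qurt_addr_t pa = 0xAB000000;
+ if (qurt_mapping_create(va, pa, 0x1000, QURT_MEM_CACHE_NONE,
+ QURT_PERM_READ | QURT_PERM_WRITE) == QURT_EOK) {
+ // ... access the device page through va ...
+ (void)qurt_mapping_remove(va, pa, 0x1000);
+ }
+*/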
+ */
+static inline void qurt_mem_region_attr_get_physaddr_64(qurt_mem_region_attr_t *attr, qurt_paddr_64_t *addr_64){
+ (*addr_64) = (unsigned long long)(((unsigned long long)(attr->ppn))<<12);
+}
+
+/**@ingroup func_qurt_mem_map_static_query_64
+ Determines whether a memory page is statically mapped.
+ Pages are specified by the following attributes: 64-bit physical address, page size, cache mode,
+ and memory permissions. \n
+ If the specified page is statically mapped, vaddr returns the virtual
+ address of the page.
+ If the page is not statically mapped (or if it does not exist as specified), vaddr
+ returns -1 as the virtual address value.\n
+ QuRT memory maps are defined in the system configuration file.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_paddr_64_t \n
+ #qurt_mem_cache_mode_t \n
+ #qurt_perm_t
+
+ @param[out] vaddr Virtual address corresponding to paddr.
+ @param[in] paddr_64 64-bit physical address.
+ @param[in] page_size Size of the mapped memory page.
+ @param[in] cache_attribs Cache mode (writeback, and so on).
+ @param[in] perm Access permissions.
+
+ @return
+ #QURT_EOK -- Specified page is statically mapped; a virtual address is returned in vaddr. \n
+ #QURT_EMEM -- Specified page is not statically mapped; -1 is returned in vaddr. \n
+ #QURT_EVAL -- Specified page does not exist.
+
+ @dependencies
+ None.
+ */
+int qurt_mem_map_static_query_64(qurt_addr_t *vaddr, qurt_paddr_64_t paddr_64, unsigned int page_size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm);
+
+/**@ingroup func_qurt_mem_region_query_64
+ Determines whether a dynamically created memory region (Section @xref{sec:mem_region_create}) exists for the
+ specified virtual or physical address. When a memory region has been determined to exist, its attributes are
+ accessible (Section @xref{sec:mem_region_attr_get}).
+
+ @note1hang This function returns #QURT_EFATAL if #QURT_EINVALID is passed to both
+ vaddr and paddr (or to neither).
+
+ @datatypes
+ #qurt_mem_region_t \n
+ #qurt_addr_t \n
+ #qurt_paddr_64_t
+
+ @param[out] region_handle Pointer to the memory region object (if it exists).
+ @param[in] vaddr Virtual address to query; if vaddr is specified, paddr must be set to
+ the value #QURT_EINVALID.
+ @param[in] paddr_64 64-bit physical address to query; if paddr is specified, vaddr must be set to
+ the value #QURT_EINVALID.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EMEM -- Region not found for the specified address. \n
+ #QURT_EFATAL -- Invalid input parameters.
+
+ @dependencies
+ None.
+ */
+int qurt_mem_region_query_64(qurt_mem_region_t *region_handle, qurt_addr_t vaddr, qurt_paddr_64_t paddr_64);
+
+/**@ingroup func_qurt_mapping_create_64
+ @xreflabel{hdr:qurt_mapping_create_64}
+ Creates a memory mapping in the page table.
+ Not supported if called from a user process; in that case it always returns #QURT_EMEM.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_paddr_64_t \n
+ #qurt_size_t \n
+ #qurt_mem_cache_mode_t \n
+ #qurt_perm_t
+
+ @param[in] vaddr Virtual address.
+ @param[in] paddr_64 64-bit physical address.
+ @param[in] size Size (4K-aligned) of the mapped memory page.
+ @param[in] cache_attribs Cache mode (writeback, and so on).
+ @param[in] perm Access permissions.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EMEM -- Failure. \n
+ #QURT_EINVALID -- Invalid cache attributes or permissions provided.
+
+ @dependencies
+ None.
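+
+ A minimal create/remove sketch (addresses are hypothetical; QURT_MEM_CACHE_WRITEBACK
+ is assumed to be defined in qurt_types.h, and the call succeeds only from contexts
+ where mapping creation is permitted):
+ @code
+ qurt_addr_t va = 0x20000000u;          // 4K-aligned virtual address
+ qurt_paddr_64_t pa64 = 0x180000000ULL; // 4K-aligned 64-bit physical address
+ if (qurt_mapping_create_64(va, pa64, 0x1000u, QURT_MEM_CACHE_WRITEBACK,
+                            QURT_PERM_READ | QURT_PERM_WRITE) == QURT_EOK) {
+     // ... use the mapping ...
+     (void)qurt_mapping_remove_64(va, pa64, 0x1000u);
+ }
+ @endcode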
+*/
+int qurt_mapping_create_64(qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size,
+ qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perm);
+
+/**@ingroup func_qurt_mapping_remove_64
+ @xreflabel{hdr:qurt_mapping_remove_64}
+ Deletes the specified memory mapping from the page table.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_paddr_64_t \n
+ #qurt_size_t
+
+ @param[in] vaddr Virtual address.
+ @param[in] paddr_64 64-bit physical address.
+ @param[in] size Size of the mapped memory page (4K-aligned).
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+
+ @dependencies
+ None.
+
+ */
+int qurt_mapping_remove_64(qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size);
+
+/**@ingroup func_qurt_lookup_physaddr_64
+ Translates a virtual memory address to the 64-bit physical memory address it is mapped to. \n
+ The lookup occurs in the process of the caller. Use qurt_lookup_physaddr2() to look up the
+ physical address of another process.
+
+ @datatypes
+ #qurt_paddr_64_t \n
+ #qurt_addr_t
+
+ @param[in] vaddr Virtual address.
+
+ @return
+ Nonzero -- 64-bit physical address to which the virtual address is mapped. \n
+ 0 -- Virtual address has not been mapped.
+
+ @dependencies
+ None.
+*/
+qurt_paddr_64_t qurt_lookup_physaddr_64 (qurt_addr_t vaddr);
+/** @endcond */
+
+/** @cond internal_only */
+/**@ingroup func_qurt_mapping_reclaim
+ Deallocates all QuRT resources associated with the specified virtual
+ memory area, making it available for user memory management:\n
+ - The associated physical memory areas are freed and added to the
+ specified physical pool.\n
+ - The associated TLB entries are deleted and made available for TLB
+ management.\n
+ - The virtual memory area is not freed -- it is left in
+ place as allocated, but unmapped virtual memory. Access to this
+ memory area generates an exception.\n
+
+ The virtual memory area must be statically allocated.
+ If no pool is specified, the freed physical memory is not added to any pool.
+
+ @note1hang The virtual memory area is restricted to being filled with locked
+ TLB entries that are contiguous within the memory area, and contained by it.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_size_t \n
+ #qurt_mem_pool_t
+
+ @param[in] vaddr Virtual address of the memory area to free.
+ @param[in] vsize Size (in bytes) of the memory area to free.
+ @param[in] pool Handle to the physical pool where freed physical memory is added.
+ If set to 0, freed physical memory is not added to any pool.
+
+ @return
+ 0 -- Success. \n
+ Nonzero -- Failure, indicating either a partial success or a malformed request.
+
+ @note1hang The expected behavior is that QuRT logs messages related to the failure,
+ and callers are free to ignore the return value.
+
+ @dependencies
+ None.
+*/
+int qurt_mapping_reclaim(qurt_addr_t vaddr, qurt_size_t vsize, qurt_mem_pool_t pool);
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_mem_configure_cache_partition
+ Configures the Hexagon cache partition at the system level.
+
+ A partition size value of #SEVEN_EIGHTHS_SIZE is applicable only to the L2 cache.
+
+ The L1 cache partition is not supported on Hexagon processor version V60 or greater.
+
+ @note1hang Call this operation only with QuRT OS privilege.
+
+ @datatypes
+ #qurt_cache_type_t \n
+ #qurt_cache_partition_size_t
+
+ @param[in] cache_type Cache type for partition configuration. Values: \n
+ - #HEXAGON_L1_I_CACHE \n
+ - #HEXAGON_L1_D_CACHE \n
+ - #HEXAGON_L2_CACHE @tablebulletend
+
+ @param[in] partition_size Cache partition size. Values: \n
+ - #FULL_SIZE \n
+ - #HALF_SIZE \n
+ - #THREE_QUARTER_SIZE \n
+ - #SEVEN_EIGHTHS_SIZE @tablebulletend
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EVAL -- Error.
+
+ @dependencies
+ None.
+ */
+int qurt_mem_configure_cache_partition(qurt_cache_type_t cache_type, qurt_cache_partition_size_t partition_size);
+
+
+/**@ingroup func_qurt_mem_syncht
+ @xreflabel{hdr:qurt_mem_syncht}
+ Performs heavy-weight synchronization of memory transactions.
+
+ This operation does not return until all previous memory transactions (cached and uncached load/store,
+ mem_locked, and so on) that originated from the current thread are complete and globally observable.
+
+ @note1hang This operation is implemented as a wrapper for the Hexagon syncht instruction.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+static inline void qurt_mem_syncht(void){
+ #ifdef __HEXAGON_ARCH__
+ __asm__ __volatile__ (" SYNCHT \n");
+ #endif
+}
+
+/**@ingroup func_qurt_mem_barrier
+ @xreflabel{hdr:qurt_mem_barrier}
+ Creates a barrier for memory transactions.
+
+ This operation ensures that all previous memory transactions are globally observable before any
+ future memory transactions are globally observable.
+
+ @note1hang This operation is implemented as a wrapper for the Hexagon barrier instruction.
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+static inline void qurt_mem_barrier(void){
+ #ifdef __HEXAGON_ARCH__
+ __asm__ __volatile__ (" BARRIER \n");
+ #endif
+}
+/** @endcond */
+
+/** @cond internal_only */
+/**@ingroup func_qurt_system_mem_alloc
+ Requests that the kernel allocate memory from the kernel-owned pool.
+
+ @param[in] size Size in bytes (aligned to 4K) to allocate.
+ @param[in] align Any alignment that must be considered for the allocation.
+ @param[in] flags Supports the #QURT_SYSTEM_ALLOC_VIRTUAL flag; allocates
+ available virtual memory in the address space of all processes.
+
+ @return
+ #QURT_EFATAL -- Allocation failed. \n
+ Otherwise -- Start address of the successful allocation.
+
+ @dependencies
+ None.
+*/
+unsigned qurt_system_mem_alloc(unsigned size, unsigned align, unsigned flags);
+/** @endcond */
+/** @cond rest_reg_dist*/
+/**@ingroup func_qurt_lookup_physaddr2
+ Translates the virtual memory address of the specified process to the 64-bit
+ physical memory address to which it is mapped.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_paddr_64_t
+
+ @param[in] vaddr Virtual address.
+ @param[in] pid PID.
+
+ @return
+ Nonzero -- 64-bit physical address to which the virtual address is mapped. \n
+ 0 -- Virtual address is not mapped.
+
+ @dependencies
+ None.
+*/
+qurt_paddr_64_t qurt_lookup_physaddr2(qurt_addr_t vaddr, unsigned int pid);
+/** @endcond */
+
+/**@ingroup func_qurt_mapping_attr_get
+ Gets the mapping attributes for a given virtual address and PID.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_mapping_attr_t
+
+ @param[in] vaddr Virtual address for which the attributes are required.
+ @param[in] pid Process ID of the target process.
+ @param[out] attr Pointer to the mapping attribute structure.
+
+ @return
+ 0 -- Success. \n
+ #QURT_EINVALID -- Incorrect virtual address or PID.
+*/
+int qurt_mapping_attr_get(qurt_addr_t vaddr, unsigned int pid, qurt_mapping_attr_t *attr);
+
+
+/**@ingroup func_qurt_mapping_attr_get_cache_mode
+ Gets the cache operation mode in the specified memory mapping attribute structure.
+
+
+ @datatypes
+ #qurt_mapping_attr_t \n
+ #qurt_mem_cache_mode_t
+
+ @param[in] attr Pointer to the memory mapping attribute structure.
+ @param[out] cache_mode Pointer to the destination variable for the cache mode.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+static inline void qurt_mapping_attr_get_cache_mode(qurt_mapping_attr_t *attr, qurt_mem_cache_mode_t *cache_mode)
+{
+ (*cache_mode) = attr->cache_mode;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_physaddr
+ Gets the physical memory address in the specified memory mapping attribute structure.
+
+
+ @datatypes
+ #qurt_mapping_attr_t \n
+ #qurt_paddr_64_t
+
+ @param[in] attr Pointer to the memory mapping attribute structure.
+ @param[out] physaddr Pointer to the destination variable for the physical address.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+static inline void qurt_mapping_attr_get_physaddr(qurt_mapping_attr_t *attr, qurt_paddr_64_t *physaddr)
+{
+ (*physaddr) = attr->paddr;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_perms
+ Gets the permissions in the specified memory mapping attribute structure.
+
+
+ @datatypes
+ #qurt_mapping_attr_t \n
+ #qurt_perm_t
+
+ @param[in] attr Pointer to the memory mapping attribute structure.
+ @param[out] perms Pointer to the destination variable for the permissions.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+static inline void qurt_mapping_attr_get_perms(qurt_mapping_attr_t *attr, qurt_perm_t *perms)
+{
+ (*perms) = attr->perms;
+}
+
+/**@ingroup func_qurt_mapping_attr_get_size
+ Gets the size in the specified memory mapping attribute structure. This represents the size of the
+ TLB entry that covers the virtual address.
+
+
+ @datatypes
+ #qurt_mapping_attr_t \n
+ #unsigned int
+
+ @param[in] attr Pointer to the memory mapping attribute structure.
+ @param[out] size Pointer to the destination variable for the size.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_mapping_attr_get_size(qurt_mapping_attr_t *attr, unsigned int *size)
+{
+ (*size) = attr->size;
+}
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_MEMORY_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mmap.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mmap.h
new file mode 100755
index 0000000000000..c3bd875910af7
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mmap.h
@@ -0,0 +1,359 @@
+#ifndef QURT_MMAP_H
+#define QURT_MMAP_H
+/**
+ @file qurt_mmap.h
+ @brief Prototypes of memory mapping/unmapping APIs.
+ The APIs allow the user to map, unmap, and change permissions
+ on memory regions.
+
+ EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2018-2021, 2022, 2023 Qualcomm Technologies, Inc.
+All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_mem_mmap
+ Creates a memory mapping with the specified attributes.
+ This API allows a root process caller to create a mapping on behalf of a user
+ process. If the client_handle belongs to a valid user process, the resulting
+ mapping is created for that process.
+ If -1 is passed in place of client_handle, the API creates the mapping
+ for the calling process itself.
+
+ @note1hang If the specified attributes are not valid, an error result is returned.
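+
+ A minimal anonymous-allocation sketch (illustrative only; the pool handle value,
+ flag choice, and size are hypothetical):
+ @code
+ qurt_mem_pool_t pool = (qurt_mem_pool_t)0; // hypothetical: default pool handle
+ void *va = qurt_mem_mmap(-1, pool, NULL, NULL, 0x1000u,
+                          QURT_PROT_READ | QURT_PROT_WRITE,
+                          QURT_MAP_ANON, -1, 0ULL);
+ if (va != QURT_MAP_FAILED) {
+     // Mapping was created for the caller's own process (client_handle == -1).
+ }
+ @endcode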
+
+ @param[in] client_handle Client handle to use for this mapping (optional).
+ @param[in] pool Optional argument that specifies a pool handle
+ if the user wants to allocate memory from a specific pool.
+ The default value for this argument is NULL.
+ @param[in] pRegion Map region. This argument is unused, and the default value is NULL.
+ @param[in] addr Virtual memory address.
+ @param[in] length Size of mapping in bytes.
+ @param[in] prot Mapping access permissions (R/W/X).
+ @param[in] flags Mapping modes.\n
+ - #QURT_MAP_NAMED_MEMSECTION
+ - #QURT_MAP_FIXED \n
+ - #QURT_MAP_NONPROCESS_VPOOL \n
+ - #QURT_MAP_TRYFIXED \n
+ - #QURT_MAP_ANON \n
+ - #QURT_MAP_PHYSADDR \n
+ - #QURT_MAP_VA_ONLY @tablebulletend
+ @param[in] fd File designator.
+ @param[in] offset Offset in file.
+
+ @return
+ Valid virtual address -- Success.\n
+ #QURT_MAP_FAILED -- Mapping creation failed.
+ */
+void *qurt_mem_mmap(int client_handle,
+ qurt_mem_pool_t pool,
+ qurt_mem_region_t *pRegion,
+ void *addr,
+ size_t length,
+ int prot,
+ int flags,
+ int fd,
+ unsigned long long offset);
+
+/**@ingroup func_qurt_mem_mmap2
+ Creates a memory mapping with the specified attributes, and returns a more descriptive
+ error code in case of failure.
+ This API allows a root process caller to create a mapping on behalf of a user
+ process. If the client_handle belongs to a valid user process, the resulting
+ mapping is created for that process.
+ If -1 is passed in place of client_handle, the API creates the mapping
+ for the calling process itself.
+
+ @note1hang If the specified attributes are not valid, an error result is returned.
+
+ @param[in] client_handle Client handle to use for this mapping (optional).
+ @param[in] pool Optional argument that allows the user to specify a pool handle
+ when the user wants to allocate memory from a specific pool.
+ The default value for this argument is NULL.
+ @param[in] pRegion Map region (unused argument); the default value is NULL.
+ @param[in] addr Virtual memory address.
+ @param[in] length Size of mapping in bytes.
+ @param[in] prot Mapping access permissions (R/W/X),
+ cache attributes, bus attributes, and user mode.
+ @param[in] flags Mapping modes:
+ shared, private, or anonymous.
+ @param[in] fd File designator.
+ @param[in] offset Offset in file.
+
+ @return
+ Valid virtual address -- Success.\n
+ #QURT_EMEM -- Physical address is not available. \n
+ #QURT_EFAILED -- VA is not available or mapping failed.\n
+ #QURT_EINVALID -- Invalid argument was passed (for example, an unaligned VA/PA).
+ */
+void *qurt_mem_mmap2(int client_handle,
+ qurt_mem_pool_t pool,
+ qurt_mem_region_t *pRegion,
+ void *addr,
+ size_t length,
+ int prot,
+ int flags,
+ int fd,
+ unsigned long long offset);
+
+/**@ingroup func_qurt_mem_mmap_by_name
+ Creates a memory mapping for a named memsection using the specified attributes.
+ The named memsection must be specified in cust_config.xml.
+
+ @note1hang If the specified attributes are not valid or the named memsection is not found,
+ an error result is returned.
+
+ @param[in] name Name of the memsection in cust_config.xml that specifies
+ this mapping. Must be less than 25 characters.
+ @param[in] addr Virtual memory address.
+ @param[in] length Size of mapping in bytes.
+ @param[in] prot Mapping access permissions (R/W/X),
+ cache attributes, bus attributes, and user mode.
+ @param[in] flags Mapping modes, such as
+ shared, private, or anonymous.
+ @param[in] offset Offset relative to the physical address range specified in the memsection.
+ If offset + length exceeds the size of the memsection, failure is
+ returned.
+ @return
+ Valid virtual address -- Success.\n
+ #QURT_MAP_FAILED -- Mapping creation failed.
+ */
+void *qurt_mem_mmap_by_name(const char* name,
+ void *addr,
+ size_t length,
+ int prot,
+ int flags,
+ unsigned long long offset);
+
+/**@ingroup func_qurt_mem_mprotect2
+ Changes access permissions and attributes on an existing mapping, based on the client_handle argument.
+
+ @note1hang If the specified virtual address is not found or invalid attributes are passed,
+ an error code is returned.
+
+ @note2 When an error is returned, the attributes/permissions might have been changed for part of the
+ mapping while remaining unchanged for the rest. Clients must not continue to use such mappings.
+
+ @param[in] client_handle Obtained from the current invocation function (Section 3.4.1).
+ @param[in] addr Virtual memory address.
+ @param[in] length Size of mapping in bytes.
+ @param[in] prot Mapping access permissions (R/W/X),
+ cache attributes, bus attributes, and user mode.
+ @return
+ #QURT_EOK -- Successfully changed permissions on the mapping.\n
+ #QURT_EFATAL -- Failed to change permissions on the mapping. \n
+ #QURT_EINVALID -- Requested attributes or permissions are invalid.
+ */
+int qurt_mem_mprotect2(int client_handle, const void *addr,
+ size_t length,
+ int prot);
+
+/**@ingroup func_qurt_mem_mprotect
+ Changes access permissions and attributes on an existing mapping.
+
+ @note1hang If the specified virtual address is not found or invalid attributes are passed,
+ an error code is returned.\n
+
+ @note2 When an error is returned, the attributes/permissions might have been changed for part of the
+ mapping while remaining unchanged for the rest. Clients must not continue to use such mappings.
+
+ @param[in] addr Virtual memory address.
+ @param[in] length Size of mapping in bytes.
+ @param[in] prot Mapping access permissions (R/W/X),
+ cache attributes, bus attributes, and user mode.
+ @return
+ #QURT_EOK -- Successfully changed permissions on the mapping. \n
+ #QURT_EFATAL -- Failed to change permissions on the mapping. \n
+ #QURT_EINVALID -- Requested attributes or permissions are invalid.
+ */
+int qurt_mem_mprotect(const void *addr,
+ size_t length,
+ int prot);
+
+/**@ingroup func_qurt_mem_munmap
+ Removes an existing mapping.
+
+ @note1hang If the specified mapping is not found in the context of the caller process
+ or invalid attributes are passed, an error code is returned.
+
+ @param[in] addr Virtual memory address.
+ @param[in] length Size of mapping in bytes.
+
+ @return
+ #QURT_EOK -- Successfully removed the mapping. \n
+ #QURT_EFATAL -- Failed to remove the mapping. \n
+ #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+ */
+int qurt_mem_munmap(void *addr,
+ size_t length);
+
+/**@ingroup func_qurt_mem_munmap2
+ Removes an existing mapping for a specified process.
+
+ @note1hang This API allows a root process entity, such as a driver, to remove a mapping
+ that was created for a user process. If the specified mapping is not found in the context
+ of the client handle or invalid attributes are passed, an error code is returned.
+
+ @param[in] client_handle Client handle of the user process that owns this mapping.
+ @param[in] addr Virtual memory address.
+ @param[in] length Size of mapping in bytes.
+
+ @return
+ #QURT_EOK -- Successfully removed the mapping. \n
+ #QURT_EFATAL -- Failed to remove the mapping. \n
+ #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+ */
+int qurt_mem_munmap2(int client_handle,
+ void *addr,
+ size_t length);
+
+/**@ingroup func_qurt_mem_munmap3
+ Removes an existing mapping or reservation for a specified process.
+
+ @param[in] client_handle Client handle of the user process that owns this mapping.
+ @param[in] addr Pointer to a virtual memory address.
+ @param[in] length Size of mapping in bytes.
+ @param[in] flags Specifies the flags.
+
+ @return
+ #QURT_EOK -- Successfully removed the mapping or reservation. \n
+ #QURT_EFATAL -- Failed to remove the mapping. \n
+ #QURT_ELOCKED -- Buffer is locked; mapping delete failed.
+ */
+int qurt_mem_munmap3(int client_handle,
+ void *addr,
+ size_t length,
+ int flags);
+
+/*
+|| The macros here follow the style of the standard mmap() macros, but with
+|| QURT_ prepended to avoid name conflicts, and to avoid having a dependency
+|| on sys/mman.h.
+||
+|| Wherever possible, any values here that are also present in sys/mman.h
+|| should have the same value in both places so that we can accept "mmap"
+|| calls without having to remap parameters to new values.
+||
+|| In the future, it would be desirable to have a regression test that
+|| checks, for instance, that these macros match. Example:
+||
+|| assert(QURT_MAP_FAILED == MAP_FAILED);
+|| ... repeat as needed ...
+*/
+
+/** @addtogroup memory_mapping_macros
+@{ */
+/** @cond */
+#define QURT_PROT_NONE 0x00U /**< */
+#define QURT_PROT_READ 0x01U /**< */
+#define QURT_PROT_WRITE 0x02U /**< */
+#define QURT_PROT_EXEC 0x04U /**< */
+#define QURT_PROT_NODUMP 0x08U /**< Skip dumping the mapping. During PD dump, some mappings
+ on host memory must be skipped to avoid a race condition
+ where the memory is removed from the host and the DSP process
+ crashes before the mapping is removed.*/
+#define QURT_PROT_ISLAND 0x10U /**< Island mapping. */
+
+#define QURT_MAP_SHARED 0x0001U /**< Shared. */
+#define QURT_MAP_PRIVATE 0x0002U /**< Private. */
+/** @endcond */
+#define QURT_MAP_NAMED_MEMSECTION 0x0004U /**< Named memsection. */
+#define QURT_MAP_FIXED 0x0010U /**< Fixed virtual address. */
+#define QURT_MAP_RENAME 0x0020U /**< Rename. */
+#define QURT_MAP_NORESERVE 0x0040U /**< No reserve. */
+#define QURT_MAP_INHERIT 0x0080U /**< Inherit. */
+#define QURT_MAP_NONPROCESS_VPOOL 0x0100U /**< Use a virtual address outside of the default range of the
+ processes. This option is only supported in the root process,
+ and only when the virtual memory split is enabled in the XML.
+ The root process can use this flag to create a mapping for a
+ user process; for example, if the virtual address space is configured
+ for a 3G/1G split, the root process can use this flag to create a
+ mapping in the top 1 GB area for the user process or the
+ lower 3 GB area for the root process. This is useful for
+ shared buffer use cases. */
+#define QURT_MAP_HASSEMAPHORE 0x0200U /**< Has semaphore. */
+#define QURT_MAP_TRYFIXED 0x0400U /**< Try to create a mapping at the virtual address that was passed.
+ If the passed virtual address fails, use a random virtual address. */
+#define QURT_MAP_WIRED 0x0800U /**< Wired. */
+#define QURT_MAP_FILE 0x0000U /**< File. */
+#define QURT_MAP_ANON 0x1000U /**< Allocate physical memory from the pool that was passed.
+ By default, memory is allocated from the default physpool. */
+#define QURT_MAP_VA_ONLY 0x2000U /**< Reserve a virtual address without
+ mapping it.
*/ + +/** @cond */ +#define QURT_MAP_ALIGNED(n) ((n) << QURT_MAP_ALIGNMENT_SHIFT) +#define QURT_MAP_ALIGNMENT_SHIFT 24 + + +#define QURT_MAP_ALIGNMENT_MASK QURT_MAP_ALIGNED(0xff) /**< */ +#define QURT_MAP_ALIGNMENT_64KB QURT_MAP_ALIGNED(16) /**< */ +#define QURT_MAP_ALIGNMENT_16MB QURT_MAP_ALIGNED(24) /**< */ +#define QURT_MAP_ALIGNMENT_4GB QURT_MAP_ALIGNED(32) /**< */ +#define QURT_MAP_ALIGNMENT_1TB QURT_MAP_ALIGNED(40) /**< */ +#define QURT_MAP_ALIGNMENT_256TB QURT_MAP_ALIGNED(48) /**< */ +#define QURT_MAP_ALIGNMENT_64PB QURT_MAP_ALIGNED(56) /**< */ +/** @endcond */ +#define QURT_MAP_FAILED ((void *) -1) /**< Mapping creation failed. */ + +/* +|| The macros below are extensions beyond the standard mmap flags, but follow +|| the style of the mmap flags. +*/ +/** @cond */ +// Describe bitfields in (prot) +#define QURT_PROT_CACHE_BOUNDS 16U,19U,7U /**< Bits 16 through 19 are cache attribute, default is 0. */ +#define QURT_PROT_BUS_BOUNDS 20U,21U,0U /**< Bits 20 through 21 are bus attributes, default is 0. */ +#define QURT_PROT_USER_BOUNDS 22U,23U,3U /**< Bits 22 through 23 are user mode, default is 3; + default of 3 means to derive user mode setting from the + default mode of the client. */ + +// Describe bitfields in (flags) +#define QURT_MAP_PHYSADDR_BOUNDS 15U,15U,0U /**< Bits 15 through 15 are physaddr, default is 0. */ +#define QURT_MAP_TYPE_BOUNDS 16U,19U,0U /**< Bits 16 through 19 are mapping type, default is 0. */ +#define QURT_MAP_REGION_BOUNDS 20U,23U,0U /**< Bits 20 through 23 are region type, default is 0. */ +/** @endcond */ + +// These macros get OR'ed into (prot) +#define QURT_PROT_CACHE_MODE(n) QURT_MMAP_BUILD(QURT_PROT_CACHE_BOUNDS,(n)) /**< */ +#define QURT_PROT_BUS_ATTR(n) QURT_MMAP_BUILD(QURT_PROT_BUS_BOUNDS,(n)) /**< */ +#define QURT_PROT_USER_MODE(n) QURT_MMAP_BUILD(QURT_PROT_USER_BOUNDS,(n)) /**< */ +// These macros get OR'ed into (flags) + +#define QURT_MAP_PHYSADDR QURT_MMAP_BUILD(QURT_MAP_PHYSADDR_BOUNDS,1U) /**< Use the physical address that was passed in offset field. + This is allowed only for root process. */ +#define QURT_MAP_TYPE(n) QURT_MMAP_BUILD(QURT_MAP_TYPE_BOUNDS,(n)) /**< */ +#define QURT_MAP_REGION(n) QURT_MMAP_BUILD(QURT_MAP_REGION_BOUNDS,(n)) /**< */ +/** @} */ /* end_addtogroup memory_mapping_macros */ +/** @cond */ +// These macros extract fields from (prot) +#define QURT_PROT_GET_CACHE_MODE(n) QURT_MMAP_EXTRACT(QURT_PROT_CACHE_BOUNDS,(n)) /**< */ +#define QURT_PROT_GET_BUS_ATTR(n) QURT_MMAP_EXTRACT(QURT_PROT_BUS_BOUNDS,(n)) /**< */ +#define QURT_PROT_GET_USER_MODE(n) QURT_MMAP_EXTRACT(QURT_PROT_USER_BOUNDS,(n)) /**< */ + +// These macros extract fields from (flags) +#define QURT_MAP_GET_TYPE(n) QURT_MMAP_EXTRACT(QURT_MAP_TYPE_BOUNDS,(n)) /**< */ +#define QURT_MAP_GET_REGION(n) QURT_MMAP_EXTRACT(QURT_MAP_REGION_BOUNDS,(n)) /**< */ + +// Macros for bitfield insertion and extraction +#define QURT_MMAP_MASK(lo,hi) (~((~0u) << ((hi)-(lo)+1U))) /**< Mask of same size as [lo..hi]. 
*/
+#define QURT_MMAP_BUILD_(lo,hi,def,n) ((((n)^(def))&QURT_MMAP_MASK((lo),(hi)))<<(lo)) /**< */
+#define QURT_MMAP_EXTRACT_(lo,hi,def,n) ((((n)>>(lo))&QURT_MMAP_MASK((lo),(hi)))^(def)) /**< */
+#define QURT_MMAP_BUILD(a,b) QURT_MMAP_BUILD_(a,b) /**< */
+#define QURT_MMAP_EXTRACT(a,b) QURT_MMAP_EXTRACT_(a,b) /**< */
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mq.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mq.h
new file mode 100755
index 0000000000000..580c83d3de41a
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mq.h
@@ -0,0 +1,458 @@
+#ifndef QURT_MQ_H
+#define QURT_MQ_H
+/**
+ @file qurt_mq.h
+
+ @brief Prototypes of secure message queues API functions.
+
+ EXTERNALIZED FUNCTIONS
+ None
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None
+
+ Copyright (c) 2019-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+======================================================================*/
+#include
+#include
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+#define QURT_MQ_NAME_MAXLEN 16U /**< Maximum name length. */
+
+
+/*=============================================================================
+ FORWARD DECLARATIONS & TYPEDEFS
+=============================================================================*/
+/* This enum must be generated in accordance with the process class numbers.
+ For now it is made to match the generated version; do not change this unless
+ there is a corresponding change in process_class.py. Indices start from 0;
+ basically: QURT_MQ_SECURITY_SCOPE_ = (1 << QURTK_process_class_index_)
+*/
+typedef enum {
+ QURT_MQ_SECURITY_SCOPE_KERNEL = ( 1U << 0 ),
+ QURT_MQ_SECURITY_SCOPE_SRM = ( 1U << 1 ),
+ QURT_MQ_SECURITY_SCOPE_SECURE = ( 1U << 2 ),
+ QURT_MQ_SECURITY_SCOPE_CPZ = ( 1U << 3 ),
+ QURT_MQ_SECURITY_SCOPE_ROOT = ( 1U << 4 ),
+ QURT_MQ_SECURITY_SCOPE_SIGNED = ( 1U << 5 ),
+ QURT_MQ_SECURITY_SCOPE_UNSIGNED = ( 1U << 6 ),
+ QURT_MQ_SECURITY_SCOPE_SECURE_ROOT = ( 1U << 7 )
+} qurt_mq_security_scope_t;
+
+typedef enum {
+ QURT_MQ_CARDINALITY_PTP = (1U << 0),
+ QURT_MQ_CARDINALITY_MTO = (1U << 1)
+}qurt_mq_cardinality_t;
+
+typedef unsigned int qurt_mqd_t;
+
+typedef union{
+ struct {
+ unsigned int perms:2;
+ unsigned int cardinality:1;
+ unsigned int blocking:1;
+
+ qurt_mq_security_scope_t creator_scope: 8;
+ qurt_mq_security_scope_t allowed_scope: 8; // can be a bitmask in the MTO case
+ unsigned int queue_closed: 1;
+ unsigned int reserved: 11;
+ }; // anonymous struct
+ unsigned int raw;
+} qurt_mq_flags_t;
+
+
+/* permissions are from qurt_types.h , block X though */
+#if 0
+/** Memory access permission. */
+typedef enum {
+ QURT_PERM_READ=0x1U, /**< */
+ QURT_PERM_WRITE=0x2U, /**< */
+ QURT_PERM_EXECUTE=0x4U, /**< */
+ QURT_PERM_FULL=QURT_PERM_READ|QURT_PERM_WRITE|QURT_PERM_EXECUTE, /**< */
+} qurt_perm_t;
+#endif
+
+struct qurt_mq_attr {
+ unsigned flags; /**< Configured flags. Only meaningful with get_attr(); used only for qurt_mq_flags_t.perms. */
+ unsigned mq_maxmsg; /**< Maximum number of messages. Used with create() and get_attr. */
+ unsigned short mq_send_msgsize; /**< Maximum size (bytes) of a message in the receiver-facing queue,
+ from sender to receiver. */
+ unsigned short mq_recv_msgsize; /**< Maximum size (bytes) of a message in the sender-facing queue,
+ from receiver to sender. */
+ unsigned client_pid; /**< Process ID of the client that is allowed to open the message queue
+ that was created using qurt_mq_create(). */
+ qurt_mq_cardinality_t cardinality; /**< Cardinality of the message queue connection, see below. */
+ qurt_mq_security_scope_t scope; /**< Security scope of the senders to the queue. */
+};
+
+
+/*=============================================================================
+ EXTERNS & FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_mq_attr_init
+ Initializes attributes to the default values used for creating the queue.
+
+ The initialize operation sets the following default attribute values: \n
+ - flags - QURT_PERM_READ | QURT_PERM_WRITE \n
+ - maxmsg - 1 \n
+ - mq_send_msgsize - 8 \n
+ - mq_recv_msgsize - 8 \n
+ - client_pid - -1 \n
+ - cardinality - QURT_MQ_CARDINALITY_PTP \n
+ - scope - QURT_MQ_SECURITY_SCOPE_SIGNED \n
+
+ @datatypes
+ #qurt_mq_attr
+
+ @param[in,out] attr Pointer to the initialized message queue object.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_mq_attr_init(struct qurt_mq_attr * attr);
+
+/**@ingroup qurt_mq_attr_set_send_msgsize
+ Sets the maximum message size, in bytes, that the sender can send.
+ The maximum message length is configurable using the XML configuration; however, it is
+ limited to a maximum of 62 bytes.
+
+ @datatypes
+ #qurt_mq_attr
+
+ @param[in,out] attr Pointer to the message queue object.
+ @param[in] len Length of message in bytes.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_mq_attr_set_send_msgsize (struct qurt_mq_attr *attr, size_t len);
+
+/**@ingroup qurt_mq_attr_set_recv_msgsize
+ Sets the maximum message size, in bytes, that the receiver can read.
+ The maximum message length is configurable using the XML configuration; however, it is
+ limited to a maximum of 62 bytes.
+
+ @datatypes
+ #qurt_mq_attr
+
+ @param[in,out] attr Pointer to the message queue object.
+ @param[in] len Length of message in bytes.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_mq_attr_set_recv_msgsize (struct qurt_mq_attr *attr, size_t len);
+
+/**@ingroup qurt_mq_attr_set_maxmsg
+ Sets the maximum number of messages that can be queued in the message queue.
+ The message depth is configurable using the XML configuration.
+
+ @datatypes
+ #qurt_mq_attr
+
+ @param[in,out] attr Pointer to the message queue object.
+ @param[in] depth Maximum number of messages that can be queued.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_mq_attr_set_maxmsg (struct qurt_mq_attr *attr, unsigned int depth);
+
+/**@ingroup qurt_mq_attr_set_scope
+ Sets the scope of the message queue. A message queue created with a security
+ scope allows only a process class of that scope to open the message queue.
+
+ @datatypes
+ #qurt_mq_attr \n
+ #qurt_mq_security_scope_t
+
+ @param[in,out] attr Pointer to the message queue object.
+ @param[in] scope Scope of the message queue: \n
+ #QURT_MQ_SECURITY_SCOPE_KERNEL \n
+ #QURT_MQ_SECURITY_SCOPE_SRM \n
+ #QURT_MQ_SECURITY_SCOPE_SECURE \n
+ #QURT_MQ_SECURITY_SCOPE_CPZ \n
+ #QURT_MQ_SECURITY_SCOPE_ROOT \n
+ #QURT_MQ_SECURITY_SCOPE_SIGNED \n
+ #QURT_MQ_SECURITY_SCOPE_UNSIGNED
+
+ @return
+ None.
+
+ @dependencies
+ None.
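+
+ A typical attribute-setup sketch (sizes, depth, and the queue name are hypothetical):
+ @code
+ struct qurt_mq_attr attr;
+ qurt_mqd_t mqd;
+ qurt_mq_attr_init(&attr);                 // start from the defaults listed above
+ qurt_mq_attr_set_send_msgsize(&attr, 32); // <= 62 bytes
+ qurt_mq_attr_set_recv_msgsize(&attr, 32);
+ qurt_mq_attr_set_maxmsg(&attr, 4);
+ qurt_mq_attr_set_scope(&attr, QURT_MQ_SECURITY_SCOPE_SIGNED);
+ if (qurt_mq_create(&mqd, "my_queue", &attr) == QURT_EOK) {
+     // Queue created; a client can now open it by name.
+ }
+ @endcode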
+*/
+void qurt_mq_attr_set_scope (struct qurt_mq_attr *attr, qurt_mq_security_scope_t scope);
+
+
+/**@ingroup qurt_mq_attr_set_client_pid
+ Sets the client PID that can open this message queue.
+ If client_pid is set, the allowed scope is not considered when opening the message queue.
+
+ @datatypes
+ #qurt_mq_attr
+
+ @param[in,out] attr Pointer to the message queue object.
+ @param[in] client_pid Valid PID of the client process.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_mq_attr_set_client_pid (struct qurt_mq_attr *attr, unsigned client_pid);
+
+/**@ingroup qurt_mq_attr_set_flags
+ Sets the properties of the message queue.
+ The current implementation uses this only to set the permissions for the message queue through the flags attribute.
+ The default is #QURT_PERM_READ | #QURT_PERM_WRITE; explicit permissions are not implemented.
+
+ @datatypes
+ #qurt_mq_attr
+
+ @param[in,out] attr Pointer to the message queue object.
+ @param[in] flags Permissions for the message queue.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_mq_attr_set_flags (struct qurt_mq_attr *attr, unsigned int flags);
+
+/**@ingroup qurt_mq_create
+ Creates a message queue with the provided name and attributes.
+ The calling process becomes the owner of the queue.
+ The name of the message queue is limited to 16 characters, including the NULL terminator.
+
+ @datatypes
+ #qurt_mq_attr \n
+ #qurt_mqd_t
+
+ @param[out] mqd Returns a pointer to the message queue identifier if
+ the message queue was successfully created.
+ @param[in] name String identifier of the message queue.
+ @param[in] attr Pointer to the initialized message queue attribute
+ structure that specifies the attributes of the created message queue.
+
+ @return
+ #QURT_EOK -- Message queue created. \n
+ #QURT_EINVALID -- Invalid arguments. \n
+ #QURT_ENOSPC -- Maximum number of queues in the system exceeded.
+
+ @dependencies
+ None.
+*/
+int qurt_mq_create(qurt_mqd_t *mqd, const char *name, struct qurt_mq_attr * attr);
+
+/**@ingroup qurt_mq_open
+ Opens a message queue connection between a process and a created message queue.
+
+ @datatypes
+ #qurt_mq_attr \n
+ #qurt_mqd_t
+
+ @param[out] mqd Returns a pointer to the message queue
+ identifier if the message queue was successfully opened.
+ @param[in] name String identifier of the message queue.
+ @param[in] flags Flags that define the behavior of the message queue connection.
+ Permissions:\n
+ #QURT_PERM_READ \n
+ #QURT_PERM_WRITE \n
+ #QURT_PERM_READ | QURT_PERM_WRITE @tablebulletend
+ Default is QURT_PERM_READ | QURT_PERM_WRITE; explicit permissions are not implemented. \n
+ Cardinality: \n
+ #QURT_MQ_CARDINALITY_PTP (default) \n
+ #QURT_MQ_CARDINALITY_MTO (not implemented) \n
+ Blocking: suspends the thread until the message queue with the specified name is created. \n
+ Scope: security boundary to which the message queue and its users are constrained;
+ it is coupled with the process privilege level/scope.\n
+ #QURT_MQ_SECURITY_SCOPE_KERNEL \n
+ #QURT_MQ_SECURITY_SCOPE_SRM \n
+ #QURT_MQ_SECURITY_SCOPE_SECURE \n
+ #QURT_MQ_SECURITY_SCOPE_CPZ \n
+ #QURT_MQ_SECURITY_SCOPE_ROOT \n
+ #QURT_MQ_SECURITY_SCOPE_SIGNED \n
+ #QURT_MQ_SECURITY_SCOPE_UNSIGNED @tablebulletend
+
+ @return
+ #QURT_EOK -- Message queue connection successfully opened. \n
+ #QURT_EFAILED -- Message queue connection failed (for a non-blocking message queue). \n
+ #QURT_ENOTALLOWED -- Open failed due to security scope mismatch.
+
+ @dependencies
+ None.
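+
+ A client-side sketch (the queue name and payload are hypothetical; the flag
+ initialization pattern shown is illustrative only):
+ @code
+ qurt_mqd_t mqd;
+ qurt_mq_flags_t flags;
+ flags.raw = 0u;
+ flags.perms = QURT_PERM_READ | QURT_PERM_WRITE; // default permissions
+ if (qurt_mq_open(&mqd, "my_queue", flags) == QURT_EOK) {
+     const char msg[] = "ping";
+     (void)qurt_mq_send(mqd, msg, sizeof(msg));
+     qurt_mq_close(mqd);
+ }
+ @endcode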
+*/
+int qurt_mq_open (qurt_mqd_t *mqd, const char *name, qurt_mq_flags_t flags);
+
+/**@ingroup qurt_mq_send
+ Sends a message over the message queue.\n
+ - If the message queue is full, the calling thread shall be
+ suspended until space becomes available to enqueue the message. \n
+ - If there exists a thread suspended on an empty queue
+ to receive a message, qurt_mq_send shall resume that thread.
+
+ @datatypes
+ #qurt_mqd_t
+
+ @param[in] mqd Pointer to the message queue identifier.
+ @param[in] msg_ptr Pointer to the message buffer.
+ @param[in] msg_len Length of the message buffer in bytes.
+
+ @return
+ #QURT_EOK -- Message queue send was successful.\n
+ #QURT_EMSGSIZE -- Message size in the msg_len field is greater than the max_message_len specified during queue creation.\n
+ #QURT_ENOTALLOWED -- Send failed due to security scope mismatch.
+
+ @dependencies
+ None.
+*/
+int qurt_mq_send(qurt_mqd_t mqd, const char *msg_ptr, size_t msg_len);
+
+/**@ingroup qurt_mq_send_timed
+ Sends a message over the message queue.\n
+ - If the message queue is full, the calling thread shall be
+ suspended until space becomes available to enqueue the message or until the timeout is reached. \n
+ - If there exists a thread suspended on an empty queue
+ to receive a message, qurt_mq_send_timed shall resume that thread.\n
+ - If the timeout is reached, qurt_mq_send_timed shall return #QURT_ETIMEDOUT.
+
+ @datatypes
+ #qurt_mqd_t
+
+ @param[in] mqd Pointer to the message queue identifier.
+ @param[in] msg_ptr Pointer to the message buffer.
+ @param[in] duration Timeout interval (in microseconds); the duration value must be
+ between #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+ @param[in] msg_len Length of the message buffer in bytes.
+
+ @return
+ #QURT_EOK -- Message queue send was successful. \n
+ #QURT_EMSGSIZE -- Message size in the msg_len field is greater than the max_message_len specified during queue creation.\n
+ #QURT_ENOTALLOWED -- Send failed due to security scope mismatch. \n
+ #QURT_ETIMEDOUT -- Timeout.
+
+ @dependencies
+ None.
+*/
+int qurt_mq_send_timed(qurt_mqd_t mqd, const char *msg_ptr, unsigned long long int duration, size_t msg_len);
+
+ /**@ingroup qurt_mq_recv
+ Receives a message from the message queue. \n
+ - If the message queue is empty, the calling thread shall be
+ suspended until a message is enqueued in the message queue. \n
+ - If there exists a thread suspended on a full queue to
+ send a message, qurt_mq_recv shall resume that thread.
+
+ @datatypes
+ #qurt_mqd_t
+
+ @param[in] mqd Pointer to the message queue identifier.
+ @param[in] msg_ptr Pointer to the message buffer.
+ @param[in,out] msg_len Pointer to the length of the message buffer.
+
+ @return
+ #QURT_EOK -- Message successfully received.\n
+ #QURT_EINVALID -- Message pointer or msg_len pointer is NULL. \n
+ #QURT_EBADR -- Message queue descriptor (mqd) is invalid. \n
+ #QURT_EBADF -- Sender closed the message queue.
+
+ @dependencies
+ None.
+*/
+int qurt_mq_recv(qurt_mqd_t mqd, unsigned char *msg_ptr, size_t *msg_len);
+
+ /**@ingroup qurt_mq_recv_timed
+ Receives a message from the message queue. \n
+ - If the message queue is empty, the calling thread shall be
+ suspended until a message is enqueued in the message queue or until the timeout is reached.\n
+ - If there exists a thread suspended on a full queue to
+ send a message, qurt_mq_recv_timed shall resume that thread.\n
+ - If the timeout is reached, qurt_mq_recv_timed shall return #QURT_ETIMEDOUT.
+
+ @datatypes
+ #qurt_mqd_t
+
+ @param[in] mqd Pointer to the message queue identifier.
+ @param[in] msg_ptr Pointer to the message buffer.
+ @param[in] duration Timeout interval (in microseconds); the duration value must be
+ between #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+ @param[in,out] msg_len Pointer to the length of the message buffer.
+
+ @return
+ #QURT_EOK -- Message successfully received.\n
+ #QURT_EINVALID -- Message pointer or msg_len pointer is NULL. \n
+ #QURT_EBADR -- Message queue descriptor (mqd) is invalid.\n
+ #QURT_EBADF -- Sender closed the message queue. \n
+ #QURT_ETIMEDOUT -- Timeout.
+
+ @dependencies
+ None.
+*/
+int qurt_mq_recv_timed(qurt_mqd_t mqd, unsigned char *msg_ptr, unsigned long long int duration, size_t *msg_len);
+
+ /**@ingroup qurt_mq_close
+ Closes the message queue and disassociates the calling process (client) from the message queue
+ under this descriptor. Marks the queue as closed for the receiver.
+ This function is expected to be called from the client side. If called
+ from the server side, the function reduces to a no-op and returns success.
+
+ @datatypes
+ #qurt_mqd_t
+
+ @param[in] mqd Pointer to the message queue identifier.
+
+ @return
+ #QURT_EOK -- Message queue closed successfully.\n
+ #QURT_EBADR -- Invalid descriptor.\n
+ #QURT_ENOTALLOWED -- Message queue close was not called from the client side.
+
+ @dependencies
+ None.
+*/
+int qurt_mq_close(qurt_mqd_t mqd);
+
+ /**@ingroup qurt_mq_destroy
+ Destroys the message queue. This function must be
+ called from the process that called qurt_mq_create().
+
+ @datatypes
+ #qurt_mqd_t
+
+ @param[in] mqd Pointer to the message queue identifier.
+
+ @return
+ #QURT_EOK -- Message queue destroyed successfully.\n
+ #QURT_EBADR -- Invalid descriptor.\n
+ #QURT_ENOTALLOWED -- Message queue destroy was not called from the process that created the queue.
+
+ @dependencies
+ None.
+*/
+int qurt_mq_destroy(qurt_mqd_t mqd);
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+#endif //QURT_MQ_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mutex.h
new file mode 100755
index 0000000000000..4ad6b270cdde6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_mutex.h
@@ -0,0 +1,211 @@
+#ifndef QURT_MUTEX_H
+#define QURT_MUTEX_H
+/**
+ @file qurt_mutex.h
+ @brief Prototypes of the mutex API.
+ This is mostly a user-space mutex, but it calls into the
+ kernel to block if the mutex is taken.
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup mutex_types
+@{ */
+/*=============================================================================
+ TYPEDEFS
+=============================================================================*/
+
+/** QuRT mutex type.
+
+ Both non-recursive mutex lock and unlock, and recursive
+ mutex lock and unlock can be applied to this type.
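+
+ An illustrative declaration sketch (static initialization shown; dynamic
+ initialization with qurt_mutex_init() requires a matching qurt_mutex_destroy()):
+ @code
+ qurt_mutex_t my_lock = QURT_MUTEX_INIT; // QURT_MUTEX_INIT is defined below
+ @endcode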
+ */ +typedef union qurt_mutex_aligned8{ + /** @cond */ + struct { + unsigned int holder; + unsigned int count; + unsigned int queue; + unsigned int wait_count; + }; + unsigned long long int raw; + /** @endcond */ +} qurt_mutex_t; +/** @} */ /* end_addtogroup mutex_types */ +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/* @addtogroup mutex_const_macros +@{ */ +#define MUTEX_MAGIC 0xfe /**< */ +#define QURTK_FUTEX_FREE_MAGIC 0x1F // 11111 /**< */ +#define QURT_MUTEX_INIT {{MUTEX_MAGIC, 0, QURTK_FUTEX_FREE_MAGIC,0}} /**< Suitable as an initializer for a + variable of type qurt_mutex_t. */ +/* @} */ /* end_addtogroup mutex_const_macros */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_mutex_init + Initializes a mutex object. + The mutex is initially unlocked. + + @note1hang Each mutex-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_mutex_destroy() + when this object is not used anymore + @datatypes + #qurt_mutex_t + + @param[out] lock Pointer to the mutex object. Returns the initialized object. + + @return + None. + + @dependencies + None. + + */ +void qurt_mutex_init(qurt_mutex_t *lock); + +/**@ingroup func_qurt_mutex_destroy + Destroys the specified mutex. + + @note1hang Mutexes must be destroyed when they are no longer in use. Failure to do this + causes resource leaks in the QuRT kernel.\n + @note1cont Mutexes must not be destroyed while they are still in use. If this occurs, the + behavior of QuRT is undefined. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object to destroy. + + @return + None. + + @dependencies + None. + + */ +void qurt_mutex_destroy(qurt_mutex_t *lock); + +/**@ingroup func_qurt_mutex_lock + Locks the specified mutex. + If a thread performs a lock operation on a mutex that is not in use, the thread gains + access to the shared resource that is protected by the mutex, and continues executing. + + If a thread performs a lock operation on a mutex that is already in use by another + thread, the thread is suspended. When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared + resource. + + @note1hang A thread is suspended indefinitely if it locks a mutex that it has already + locked. Avoid this by using recursive mutexes (Section @xref{dox:recursive_mutexes}). + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the mutex object. Specifies the mutex to lock. + + @return + None. + + @dependencies + None. + */ +void qurt_mutex_lock(qurt_mutex_t *lock); /* blocking */ + +/**@ingroup func_qurt_mutex_lock_timed + Locks the specified mutex. + When a thread performs a lock operation on a mutex that is not in use, the thread gains + access to the shared resource that is protected by the mutex, and continues executing. + + When a thread performs a lock operation on a mutex that is already in use by another + thread, the thread is suspended. When the mutex becomes available again (because the + other thread has unlocked it), the thread is awakened and given access to the shared + resource. If the duration of suspension exceeds the timeout duration, wait is + terminated and no access to mutex is granted. 
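+
+ A minimal sketch (the 1000-microsecond timeout and the my_lock variable,
+ assumed initialized elsewhere, are hypothetical):
+ @code
+ if (qurt_mutex_lock_timed(&my_lock, 1000uLL) == QURT_EOK) {
+     // ... critical section ...
+     qurt_mutex_unlock(&my_lock);
+ }
+ @endcode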
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the mutex object; specifies the mutex to lock.
+ @param[in] duration Timeout interval (in microseconds); the duration value must be between
+ #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_ETIMEDOUT -- Timeout.
+
+ @dependencies
+ None.
+ */
+int qurt_mutex_lock_timed (qurt_mutex_t * lock, unsigned long long int duration);
+
+/**@ingroup func_qurt_mutex_unlock
+ Unlocks the specified mutex. \n
+ More than one thread can be suspended on a mutex. When the mutex is unlocked, only the
+ highest-priority thread waiting on the mutex is awakened. If the awakened thread has
+ higher priority than the current thread, a context switch occurs.
+
+ @note1hang The behavior of QuRT is undefined if a thread unlocks a mutex it did not first
+ lock.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the mutex object. Specifies the mutex to unlock.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_mutex_unlock(qurt_mutex_t *lock); /* unlock */
+
+/**@ingroup func_qurt_mutex_try_lock
+ @xreflabel{hdr:qurt_mutex_try_lock}
+ Attempts to lock the specified mutex.
+ If a thread performs a try_lock operation on a mutex that is not in use, the thread gains
+ access to the shared resource that is protected by the mutex, and continues executing.
+
+ @note1hang If a thread performs a try_lock operation on a mutex that it has already locked
+ or that is in use by another thread, qurt_mutex_try_lock immediately returns with a
+ nonzero result value.
+
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the mutex object. Specifies the mutex to lock.
+
+ @return
+ 0 -- Success. \n
+ Nonzero -- Failure.
+
+ @dependencies
+ None.
+ */
+int qurt_mutex_try_lock(qurt_mutex_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_MUTEX_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_os_services.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_os_services.h
new file mode 100755
index 0000000000000..cbc4c239e9620
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_os_services.h
@@ -0,0 +1,24 @@
+/*=============================================================================
+
+ qurt_os_services.h
+
+GENERAL DESCRIPTION
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+ Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+=============================================================================*/
+
+#define QURT_OS_SERVICE_THREAD "/os/thread" /**< Thread service */
+#define QURT_OS_SERVICE_FS_HUB "/os/fs_hub" /**< File-system hub */
+#define QURT_OS_SERVICE_CALLBACK "/os/callback" /**< QDI callback service */
+#define QURT_OS_SERVICE_INTERRUPTS "/os/interrupt" /**< Interrupt service */
+#define QURT_OS_SERVICE_PROXY "/os/proxy" /**< QDI proxy service */
+#define QURT_OS_SERVICE_MEMORY "/os/memory" /**< Memory management service */
+#define QURT_OS_SERVICE_MEMPOOL "/os/mempool" /**< Pool management service */
+#define QURT_OS_SERVICE_PROCESS "/os/process" /**< Process management service */
+#define QURT_OS_SERVICE_MMAP "/os/mem_mapper" /**< Memory mapper service */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pimutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pimutex.h
new file mode 100755
index 0000000000000..61aee5cba7ce8
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pimutex.h
@@ -0,0 +1,200 @@
+#ifndef QURT_PIMUTEX_H
+#define QURT_PIMUTEX_H 1
+/**
+ @file qurt_pimutex.h
+ @brief Prototypes of the qurt_pimutex API.
+
+EXTERNAL FUNCTIONS
+ None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_pimutex_init
+ Initializes a priority inheritance mutex object.
+ The priority inheritance mutex is initially unlocked.
+
+ This function works the same as qurt_mutex_init().
+
+ @note1hang Each pimutex-based object has one or more kernel resources associated with it;
+ to prevent resource leaks, call qurt_pimutex_destroy()
+ when the object is no longer in use.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[out] lock Pointer to the priority inheritance mutex object.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_pimutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_destroy
+ Destroys the specified priority inheritance mutex.
+
+ @note1hang Priority inheritance mutexes must be destroyed when they are no longer in
+ use. Failure to do this causes resource leaks in the QuRT kernel.\n
+ @note1cont Priority inheritance mutexes must not be destroyed while they are still in use.
+ If this occurs, the behavior of QuRT is undefined.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the priority inheritance mutex object to destroy.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_pimutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_lock
+ Requests access to a shared resource. If a thread performs a lock operation on a mutex
+ that is not in use, the thread gains access to the shared resource that the mutex protects,
+ and continues executing.
+
+ If a thread performs a lock operation on a mutex that is already in use by another
+ thread, the thread is suspended. When the mutex becomes available again (because the
+ other thread has unlocked it), the thread is awakened and given access to the shared resource.
+
+ If a thread is suspended on a priority inheritance mutex, and the priority of the suspended
+ thread is higher than the priority of the thread that has locked the mutex, the thread
+ that holds the mutex acquires the higher priority of the suspended thread. The requesting
+ thread remains blocked until the lock becomes available.
+
+ @note1hang A thread is not suspended if it locks a priority inheritance mutex that it has
+ already locked. However, the mutex does not become available to other
+ threads until the thread performs a balanced number of unlocks on the mutex.\n
+ @note1cont When multiple threads compete for a mutex, the lock operation for a priority
+ inheritance mutex is slower than it is for a recursive mutex.
+ In particular, it is about 10 times slower when the mutex is available for locking,
+ and slower (with greatly varying times) when the mutex is already locked.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the priority inheritance mutex object to lock.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_pimutex_lock(qurt_mutex_t *lock);
+
+
+/**@ingroup func_qurt_pimutex_lock_timed
+ Locks a priority inheritance mutex with a timeout.
+
+ A thread can lock a priority inheritance mutex multiple times. The mutex is not
+ available to other threads until the thread performs the same number of mutex unlock
+ operations.
+
+ If a thread performs a lock operation on a mutex that is already locked by another thread,
+ the thread is moved to the waiting state. When the mutex becomes available again (because the
+ other thread has unlocked the mutex), the thread is awakened and tries to lock the mutex.
+
+ If a thread is waiting on a priority inheritance mutex, and the priority of the waiting thread
+ is higher than the priority of the thread that has locked the mutex, the priority of the thread
+ that has locked the mutex is raised to the priority of the waiting thread.
+
+ If the duration of waiting exceeds the timeout duration, the waiting is terminated, and
+ the function returns #QURT_ETIMEDOUT to indicate that the mutex lock failed.
+
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the mutex object to lock.
+ @param[in] duration Duration (in microseconds) to wait. The duration value must be between
+ #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_ETIMEDOUT -- Timeout. \n
+ #QURT_EINVALID -- Duration is out of range.
+
+ @dependencies
+ None.
+
+ */
+int qurt_pimutex_lock_timed(qurt_mutex_t *lock, unsigned long long int duration);
+
+
+/**@ingroup func_qurt_pimutex_unlock
+ Releases access to a shared resource; unlocks the specified priority inheritance mutex. \n
+ More than one thread can be suspended on a priority inheritance mutex. When the mutex
+ is unlocked, only the highest-priority thread waiting on the mutex is awakened. If the
+ awakened thread has higher priority than the current thread, a context switch occurs.
+
+ When a thread unlocks a priority inheritance mutex, its thread priority is restored to its
+ original value from any higher priority value that it acquired from another thread
+ suspended on the mutex.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the priority inheritance mutex object to unlock.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_pimutex_unlock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_pimutex_try_lock
+ Requests access to a shared resource without suspending.
Attempts to lock the specified priority inheritance mutex.\n + If a thread performs a try_lock operation on a priority inheritance mutex that is not in + use, the thread gains access to the shared resource that is protected by the mutex, and + continues executing. + If a thread performs a try_lock operation on a priority inheritance mutex that is already + in use by another thread, qurt_pimutex_try_lock immediately returns with a + nonzero result value. + + @datatypes + #qurt_mutex_t + + @param[in] lock Pointer to the priority inheritance mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + @dependencies + None. + */ +int qurt_pimutex_try_lock(qurt_mutex_t *lock); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_PIMUTEX_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pimutex2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pimutex2.h new file mode 100755 index 0000000000000..b809f163cbfd2 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pimutex2.h @@ -0,0 +1,162 @@ +#ifndef QURT_PIMUTEX2_H +#define QURT_PIMUTEX2_H +/** + @file qurt_pimutex2.h + @brief Prototypes of pimutex2 API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +#include +#include +#include + +/*============================================================================= + FUNCTIONS +=============================================================================*/ +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_pimutex2_init + Initializes a recursive mutex object. + + @deprecated use #qurt_pimutex_init instead. + + The recursive mutex is initially unlocked. + + Objects of type pimutex2 solve a potential race condition between + unlock() and destroy() operations. + + @datatypes + #qurt_rmutex2_t + + @param[out] lock Pointer to the recursive mutex object. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex2_init(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_pimutex2_destroy + + @deprecated use #qurt_pimutex_destroy instead. + + Destroys the specified recursive mutex. \n + @note1cont Recursive mutexes must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + @note1cont In general, application code should destroy an pimutex2 object prior to + deallocating it; calling qurt_pimutex2_destroy() before deallocating it ensures + that all qurt_pimutex2_unlock() calls complete. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to destroy. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex2_destroy(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_pimutex2_lock + + @deprecated use #qurt_pimutex_lock instead. + + Locks the specified recursive mutex. \n + + If a thread performs a lock operation on a recursive mutex that is not being used, the + thread gains access to the shared resource that is protected by the mutex, and continues + executing. + + If a thread performs a lock operation on a recursive mutex that is already being used by + another thread, the thread is suspended. 
When the mutex becomes available again + (because the other thread has unlocked it), the thread is awakened and given access to the + shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked, but the mutex does not become available until the thread performs a + balanced number of unlocks on the mutex. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex2_lock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_pimutex2_unlock + + @deprecated use #qurt_pimutex_unlock instead. + + Unlocks the specified recursive mutex. \n + More than one thread can be suspended on a recursive mutex. When the mutex is + unlocked, only the highest-priority thread waiting on the mutex is awakened. If the + awakened thread has higher priority than the current thread, a context switch occurs. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to unlock. + + @return + None. + + @dependencies + None. + + */ +void qurt_pimutex2_unlock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_try_lock + + @deprecated use #qurt_pimutex_try_lock instead. + + Attempts to lock the specified recursive mutex.\n + + Non-blocking version of qurt_pimutex2_lock(). If a call to qurt_pimutex2_lock() would + succeed immediately, this function behaves similarly, and returns 0 for success. + If a call to qurt_pimutex2_lock() would not succeed immediately, this function has + no effect and returns non-zero for failure. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + */ +int qurt_pimutex2_try_lock(qurt_rmutex2_t *lock); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_PIMUTEX2_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pipe.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pipe.h new file mode 100755 index 0000000000000..6bdaa044f8640 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pipe.h @@ -0,0 +1,479 @@ +#ifndef QURT_PIPE_H +#define QURT_PIPE_H +/** + @file qurt_pipe.h + @brief Prototypes of the pipe interface API + This is a pipe or message queue + It blocks when too full (send) or empty (receive). + Unless using a nonblocking option, all datagrams are 64 bits. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021,2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup pipe_types +@{ */ +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define QURT_PIPE_MAGIC 0xF1FEF1FE /**< Magic. */ +#define QURT_PIPE_ATTR_MEM_PARTITION_RAM 0 /**< RAM. */ +#define QURT_PIPE_ATTR_MEM_PARTITION_TCM 1 /**< TCM. */ + +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** QuRT pipe data values type. 
*/ +typedef unsigned long long int qurt_pipe_data_t; + +/** QuRT pipe type.*/ +typedef struct { + /** @cond */ + qurt_mutex_t pipe_lock; + qurt_sem_t senders; + qurt_sem_t receiver; + unsigned int size; + unsigned int sendidx; + unsigned int recvidx; + void (*lock_func)(qurt_mutex_t *); + void (*unlock_func)(qurt_mutex_t *); + int (*try_lock_func)(qurt_mutex_t *); + void (*destroy_lock_func)(qurt_mutex_t *); + unsigned int magic; + qurt_pipe_data_t *data; + /** @endcond */ +} qurt_pipe_t; + +/** QuRT pipe attributes type. */ +typedef struct { + /** @cond */ + qurt_pipe_data_t *buffer; + unsigned int elements; + unsigned char mem_partition; + /** @endcond */ +} qurt_pipe_attr_t; + +/** @} */ /* end_addtogroup pipe_types */ +/*============================================================================= + FUNCTIONS +=============================================================================*/ +/**@ingroup func_qurt_pipe_attr_init + @xreflabel{hdr:qurt_pipe_attr_init} + Initializes the structure that sets the pipe attributes when a pipe is created. + + After an attribute structure is initialized, the individual attributes in the structure are + explicitly set using the pipe attribute operations. + + The attribute structure is assigned the following default values: \n + - buffer -- 0 \n + - elements -- 0 \n + - mem_partition -- #QURT_PIPE_ATTR_MEM_PARTITION_RAM + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_init(qurt_pipe_attr_t *attr) +{ + attr->buffer = NULL; + attr->elements = 0; + attr->mem_partition = QURT_PIPE_ATTR_MEM_PARTITION_RAM; +} + +/**@ingroup func_qurt_pipe_attr_set_buffer + @xreflabel{sec:qurt_pipe_attr_set_buffer} + Sets the pipe buffer address attribute.\n + Specifies the base address of the memory area to use for the data buffer of a pipe. + + The base address and size (Section @xref{sec:qurt_pipe_attr_set_elements}) specify the + memory area used as a pipe data buffer. The user is responsible for allocating the + memory area used for the buffer. + + @datatypes + #qurt_pipe_attr_t \n + #qurt_pipe_data_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] buffer Pointer to the buffer base address. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_buffer(qurt_pipe_attr_t *attr, qurt_pipe_data_t *buffer) +{ + attr->buffer = buffer; +} + +/**@ingroup func_qurt_pipe_attr_set_elements + @xreflabel{sec:qurt_pipe_attr_set_elements} + Specifies the length of the memory area to use for the data buffer of a pipe. + + The length is expressed in terms of the number of 64-bit data elements that + can be stored in the buffer. + + The base address (Section @xref{sec:qurt_pipe_attr_set_buffer}) and size specify + the memory area used as a pipe data buffer. The user is responsible for + allocating the memory area used for the buffer. + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] elements Pipe length (64-bit elements). + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_elements(qurt_pipe_attr_t *attr, unsigned int elements) +{ + attr->elements = elements; +} + +/**@ingroup func_qurt_pipe_attr_set_buffer_partition + @xreflabel{sec:qurt_pipe_attr_set_buffer_partition} + Specifies the memory type where a pipe's buffer is allocated. + Allocate pipes in RAM or TCM/LPM. 
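A short sketch of the attribute setters just described; the 16-element depth is illustrative:

```c
/* Sketch: describe a 16-element pipe backed by caller-owned storage. */
#define MY_PIPE_DEPTH 16u                      /* illustrative depth */

static qurt_pipe_data_t my_pipe_buf[MY_PIPE_DEPTH];

void describe_pipe(qurt_pipe_attr_t *attr)
{
    qurt_pipe_attr_init(attr);                     /* buffer=0, elements=0, RAM */
    qurt_pipe_attr_set_buffer(attr, my_pipe_buf);  /* caller allocates storage */
    qurt_pipe_attr_set_elements(attr, MY_PIPE_DEPTH);
}
```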
+ + @note1hang If a pipe is specified as allocated in TCM/LPM, it must be created + with the qurt_pipe_init() operation. The qurt_pipe_create() operation results in an error. + + @datatypes + #qurt_pipe_attr_t + + @param[in,out] attr Pointer to the pipe attribute structure. + @param[in] mem_partition Pipe memory partition. Values: \n + - #QURT_PIPE_ATTR_MEM_PARTITION_RAM -- Pipe resides in RAM \n + - #QURT_PIPE_ATTR_MEM_PARTITION_TCM -- Pipe resides in TCM/LCM @tablebulletend + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_pipe_attr_set_buffer_partition(qurt_pipe_attr_t *attr, unsigned char mem_partition) +{ + attr->mem_partition = mem_partition; +} + +/**@ingroup func_qurt_pipe_create + Creates a pipe.\n + Allocates a pipe object and its associated data buffer, and initializes the pipe object. + + @note1hang The buffer address and size stored in the attribute structure specify how the + pipe data buffer is allocated. + + @note1cont If a pipe is specified as allocated in TCM/LPM, it must be created + using the qurt_pipe_init() operation. The qurt_pipe_create() operation results in an error. + + @datatypes + #qurt_pipe_t \n + #qurt_pipe_attr_t + + @param[out] pipe Pointer to the created pipe object. + @param[in] attr Pointer to the attribute structure used to create the pipe. + + @return + #QURT_EOK -- Pipe created. \n + #QURT_EFAILED -- Pipe not created. \n + #QURT_ENOTALLOWED -- Pipe cannot be created in TCM/LPM. + + @dependencies + None. + */ +int qurt_pipe_create(qurt_pipe_t **pipe, qurt_pipe_attr_t *attr); + +/**@ingroup func_qurt_pipe_init + Initializes a pipe object using an existing data buffer. + + @note1hang The buffer address and size stored in the attribute structure must + specify a data buffer that the user has already allocated. + + @datatypes + #qurt_pipe_t \n + #qurt_pipe_attr_t + + @param[out] pipe Pointer to the pipe object to initialize. + @param[in] attr Pointer to the pipe attribute structure used to initialize the pipe. + + @return + #QURT_EOK -- Success. \n + #QURT_EFAILED -- Failure. + + @dependencies + None. + */ +int qurt_pipe_init(qurt_pipe_t *pipe, qurt_pipe_attr_t *attr); + +/**@ingroup func_qurt_pipe_destroy + @xreflabel{sec:qurt_pipe_destroy} + Destroys the specified pipe. + + @note1hang Pipes must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel. + Pipes must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_pipe_t + + @param[in] pipe Pointer to the pipe object to destroy. + + @return + None. + + @dependencies + None. + */ +void qurt_pipe_destroy(qurt_pipe_t *pipe); + +/**@ingroup func_qurt_pipe_delete + Deletes the pipe.\n + Destroys the specified pipe (Section @xref{sec:qurt_pipe_destroy}) and deallocates the pipe object and its + associated data buffer. + + @note1hang Delete pipes only if they were created using qurt_pipe_create + (and not qurt_pipe_init). Otherwise the behavior of QuRT is undefined. \n + @note1cont Pipes must be deleted when they are no longer in use. Failure to do this + causes resource leaks in the QuRT kernel.\n + @note1cont Pipes must not be deleted while they are still in use. If this occurs, the + behavior of QuRT is undefined. + + @datatypes + #qurt_pipe_t + + @param[in] pipe Pointer to the pipe object to destroy. + + @return + None. + + @dependencies + None. 
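A hedged sketch contrasting the two construction paths described above, where qurt_pipe_create() allocates and qurt_pipe_init() reuses caller storage; error handling is abbreviated:

```c
/* Sketch: the two construction paths. Leaving the buffer NULL so that
   qurt_pipe_create() allocates it is an assumption based on the note above. */
int make_pipes(void)
{
    qurt_pipe_attr_t attr;
    qurt_pipe_t *dyn_pipe;                 /* allocated by qurt_pipe_create() */
    static qurt_pipe_t my_pipe;            /* caller-owned pipe object */
    static qurt_pipe_data_t buf[8];        /* caller-owned data buffer */

    qurt_pipe_attr_init(&attr);
    qurt_pipe_attr_set_elements(&attr, 8);
    if (qurt_pipe_create(&dyn_pipe, &attr) != QURT_EOK)  /* allocates buffer */
        return -1;

    qurt_pipe_attr_set_buffer(&attr, buf);               /* reuse our storage */
    if (qurt_pipe_init(&my_pipe, &attr) != QURT_EOK)
        return -1;

    qurt_pipe_delete(dyn_pipe);    /* created pipes: delete (frees memory) */
    qurt_pipe_destroy(&my_pipe);   /* initialized pipes: destroy only */
    return 0;
}
```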
+ */ +void qurt_pipe_delete(qurt_pipe_t *pipe); + +/**@ingroup func_qurt_pipe_send + Writes a data item to the specified pipe. \n + If a thread writes to a full pipe, it is suspended on the pipe. When another thread reads + from the pipe, the suspended thread is awakened and can then write data to the pipe. + + Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single + 64-bit data item per operation. + + @note1hang Transfer data items larger than 64 bits by reading and writing + pointers to the data, or by transferring the data in consecutive 64-bit chunks. + + @datatypes + #qurt_pipe_t \n + #qurt_pipe_data_t + + @param[in] pipe Pointer to the pipe object to write to. + @param[in] data Data item to write. + + @return + None. + + @dependencies + None. +*/ +void qurt_pipe_send(qurt_pipe_t *pipe, qurt_pipe_data_t data); + +/**@ingroup func_qurt_pipe_receive + Reads a data item from the specified pipe. + + If a thread reads from an empty pipe, it is suspended on the pipe. When another thread + writes to the pipe, the suspended thread is awakened and can then read data from the pipe. + Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single + 64-bit data item per operation. + + @note1hang Transfer data items larger than 64 bits by reading and writing + pointers to the data, or by transferring the data in consecutive 64-bit chunks. + + @datatypes + #qurt_pipe_t + + @param[in] pipe Pointer to the pipe object to read from. + + @return + Integer containing the 64-bit data item from pipe. + + @dependencies + None. +*/ +qurt_pipe_data_t qurt_pipe_receive(qurt_pipe_t *pipe); + +/**@ingroup func_qurt_pipe_try_send + Writes a data item to the specified pipe (without suspending the thread if the pipe is full).\n + + If a thread writes to a full pipe, the operation returns immediately with success set to -1. + Otherwise, success is always set to 0 to indicate a successful write operation. + + Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single + 64-bit data item per operation. + + @note1hang Transfer data items larger than 64 bits by reading and writing + pointers to the data, or by transferring the data in consecutive 64-bit chunks. + + @datatypes + #qurt_pipe_t \n + #qurt_pipe_data_t + + @param[in] pipe Pointer to the pipe object to write to. + @param[in] data Data item to write. + + @return + 0 -- Success. \n + -1 -- Failure (pipe full). + + @dependencies + None. +*/ +int qurt_pipe_try_send(qurt_pipe_t *pipe, qurt_pipe_data_t data); + +/**@ingroup func_qurt_pipe_try_receive + Reads a data item from the specified pipe (without suspending the thread if the pipe is + empty).\n + If a thread reads from an empty pipe, the operation returns immediately with success set + to -1. Otherwise, success is always set to 0 to indicate a successful read operation.\n + + Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single + 64-bit data item per operation. + + @note1hang Transfer data items larger than 64 bits by reading and writing + pointers to the data, or by transferring the data in consecutive 64-bit chunks. + + @datatypes + #qurt_pipe_t + + @param[in] pipe Pointer to the pipe object to read from. + @param[out] success Pointer to the operation status result. + + @return + Integer containing a 64-bit data item from pipe. + + @dependencies + None. 
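Since datagrams are fixed at 64 bits, larger payloads are usually passed by pointer, as the notes above suggest; a sketch (struct msg is a placeholder type):

```c
#include <stdint.h>

struct msg { int payload; };   /* placeholder payload type */

/* blocks while the pipe is full */
void producer(qurt_pipe_t *p, struct msg *m)
{
    qurt_pipe_send(p, (qurt_pipe_data_t)(uintptr_t)m);
}

/* blocks while the pipe is empty */
struct msg *consumer(qurt_pipe_t *p)
{
    return (struct msg *)(uintptr_t)qurt_pipe_receive(p);
}
```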
+*/
+qurt_pipe_data_t qurt_pipe_try_receive(qurt_pipe_t *pipe, int *success);
+
+/**@ingroup func_qurt_pipe_receive_cancellable
+  Reads a data item from the specified pipe (with suspend), cancellable.
+
+  If a thread reads from an empty pipe, it is suspended on the pipe. When another thread
+  writes to the pipe, the suspended thread is awakened and can then read data from the pipe.
+  The operation is canceled if the user process of the calling thread is killed,
+  or if the calling thread must finish its current QDI invocation and return to user space.
+  A root PD thread can use this API to wait on the pipe for receiving; it is resumed with
+  QURT_EDESTROY if the pipe is destroyed.
+  Pipe data items are defined as 64-bit values. Pipe reads are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+  pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_data_t
+
+  @param[in] pipe Pointer to the pipe object to read from.
+  @param[out] result Pointer to the integer containing the 64-bit data item from the pipe.
+
+  @return
+  #QURT_EOK -- Receive completed. \n
+  #QURT_ECANCEL -- Receive canceled. \n
+  #QURT_EDESTROY -- Pipe destroyed. \n
+  #QURT_ENOTALLOWED -- Pipe is not initialized.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+int qurt_pipe_receive_cancellable(qurt_pipe_t *pipe, qurt_pipe_data_t *result);
+
+/**@ingroup func_qurt_pipe_send_cancellable
+  @xreflabel{hdr:qurt_pipe_send_cancellable}
+  Writes a data item to the specified pipe (with suspend), cancellable. \n
+  If a thread writes to a full pipe, it is suspended on the pipe. When another thread reads
+  from the pipe, the suspended thread is awakened and can then write data to the pipe.
+  The operation is canceled if the user process of the calling thread is killed, or if the
+  calling thread must finish its current QDI invocation and return to user space.
+  A root PD thread can use this API to wait on the pipe for sending; it is resumed with
+  QURT_EDESTROY if the pipe is destroyed.
+
+  Pipe data items are defined as 64-bit values. Pipe writes are limited to transferring a single
+  64-bit data item per operation.
+
+  @note1hang Transfer data items larger than 64 bits by reading and writing
+  pointers to the data, or by transferring the data in consecutive 64-bit chunks.
+
+  @datatypes
+  #qurt_pipe_t \n
+  #qurt_pipe_data_t
+
+  @param[in] pipe Pointer to the pipe object to write to.
+  @param[in] data Data item to write.
+
+  @return
+  #QURT_EOK -- Send completed. \n
+  #QURT_ECANCEL -- Send canceled. \n
+  #QURT_EDESTROY -- Pipe destroyed. \n
+  #QURT_ENOTALLOWED -- Pipe is not initialized.
+
+  @dependencies
+  None.
+*/
+/* ======================================================================*/
+int qurt_pipe_send_cancellable(qurt_pipe_t *pipe, qurt_pipe_data_t data);
+
+/**@ingroup func_qurt_pipe_is_empty
+  Returns a value indicating whether the specified pipe contains any data.
+
+  @datatypes
+  #qurt_pipe_t
+
+  @param[in] pipe Pointer to the pipe object to read from.
+
+  @return
+  1 -- Pipe contains no data. \n
+  0 -- Pipe contains data.
+
+  @dependencies
+  None.
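A sketch of a non-blocking drain loop built on the try_receive status convention documented above (0 on success, -1 when the pipe is empty):

```c
/* Sketch: drain a pipe without ever suspending the caller. */
int drain(qurt_pipe_t *p)
{
    int ok;
    int n = 0;
    for (;;) {
        qurt_pipe_data_t d = qurt_pipe_try_receive(p, &ok);
        if (ok != 0)
            break;          /* -1: pipe is empty, nothing more to read */
        (void)d;            /* ... process the 64-bit datagram here ... */
        n++;
    }
    return n;               /* number of items drained */
}
```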
+*/
+int qurt_pipe_is_empty(qurt_pipe_t *pipe);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PIPE_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pmem_manager.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pmem_manager.h
new file mode 100755
index 0000000000000..8c8da985228b9
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pmem_manager.h
@@ -0,0 +1,82 @@
+#ifndef QURT_PMEM_MANAGER_H
+#define QURT_PMEM_MANAGER_H
+/**
+  @file qurt_pmem_manager.h
+  Prototypes of kernel physical memory manager APIs
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=====================================================================
+ Constants and macros
+ ======================================================================*/
+
+/* Physical memory API return error codes */
+#define QURT_PMEM_SUCCESS 0
+#define QURT_PMEM_NO_PRIV 1
+#define QURT_PMEM_RETRY 2
+#define QURT_PMEM_OVERLAP 3
+#define QURT_PMEM_NOT_EXIST 4
+#define QURT_PMEM_INIT_FAILURE 5
+#define QURT_PMEM_OUTSTANDING_MAPPING 6
+#define QURT_PMEM_GENERIC_FAILURE 7
+#define QURT_PMEM_ENTRY_FOUND 8
+#define QURT_PMEM_REACH_END 9
+#define QURT_PMEM_UNCLAIMED 10
+#define QURT_PMEM_ALREADY_CLAIMED 11
+
+/*=====================================================================
+ Functions
+======================================================================*/
+
+/**@ingroup func_qurt_pmem_acquire
+  Acquires ownership of a specific physical memory region.
+
+  @note1hang The caller becomes the owner.
+
+  @param[in] ppage Starting physical page number
+  @param[in] pnum Number of physical pages
+
+  @return
+  #QURT_PMEM_NO_PRIV -- No privilege to claim ownership. \n
+  #QURT_PMEM_OVERLAP -- All or part of the range is already owned. \n
+  #QURT_PMEM_SUCCESS -- Ownership successfully claimed.
+
+  @dependencies
+  None.
+*/
+int qurt_pmem_acquire(unsigned int ppage, unsigned int pnum);
+
+/**@ingroup func_qurt_pmem_release
+  Releases ownership of a specific physical memory region.
+
+  @param[in] ppage Starting physical page number
+  @param[in] pnum Number of physical pages
+
+  @return
+  #QURT_PMEM_NO_PRIV -- No privilege to release ownership. \n
+  #QURT_PMEM_NOT_EXIST -- The physical memory range is not usable. \n
+  #QURT_PMEM_OUTSTANDING_MAPPING -- There are outstanding mappings in this range. \n
+  #QURT_PMEM_SUCCESS -- Ownership successfully released.
+
+  @dependencies
+  None.
+ */
+int qurt_pmem_release(unsigned int ppage, unsigned int pnum);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PMEM_MANAGER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pmu.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pmu.h
new file mode 100755
index 0000000000000..73ea8eba04abf
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_pmu.h
@@ -0,0 +1,121 @@
+#ifndef QURT_PMU_H
+#define QURT_PMU_H
+/**
+  @file qurt_pmu.h
+  Prototypes of the PMU (performance monitoring unit) interface API.
+
+ EXTERNAL FUNCTIONS
+ None.
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+ Copyright (c) 2021 Qualcomm Technologies, Inc.
+ All rights reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                                    FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_pmu_set
+  Sets the value of the specified PMU register.
+
+  @note1hang Setting PMUEVTCFG automatically clears the PMU registers PMUCNT0
+  through PMUCNT3.
+
+  @param[in] reg_id PMU register. Values:
+  - #QURT_PMUCNT0
+  - #QURT_PMUCNT1
+  - #QURT_PMUCNT2
+  - #QURT_PMUCNT3
+  - #QURT_PMUCFG
+  - #QURT_PMUEVTCFG
+  - #QURT_PMUCNT4
+  - #QURT_PMUCNT5
+  - #QURT_PMUCNT6
+  - #QURT_PMUCNT7
+  - #QURT_PMUEVTCFG1 @tablebulletend
+
+  @param[in] reg_value Register value.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_pmu_set (int reg_id, unsigned int reg_value);
+
+/**@ingroup func_qurt_pmu_get
+  Gets the PMU register.\n
+  Returns the current value of the specified PMU register.
+
+  @param[in] reg_id PMU register. Values:
+  - #QURT_PMUCNT0
+  - #QURT_PMUCNT1
+  - #QURT_PMUCNT2
+  - #QURT_PMUCNT3
+  - #QURT_PMUCFG
+  - #QURT_PMUEVTCFG
+  - #QURT_PMUCNT4
+  - #QURT_PMUCNT5
+  - #QURT_PMUCNT6
+  - #QURT_PMUCNT7
+  - #QURT_PMUEVTCFG1 @tablebulletend
+
+  @return
+  Integer -- Current value of the specified PMU register.
+
+  @dependencies
+  None.
+ */
+unsigned int qurt_pmu_get (int reg_id);
+
+/**@ingroup func_qurt_pmu_enable
+  Enables or disables the Hexagon processor PMU.
+  Profiling is disabled by default.
+
+  @note1hang Enabling profiling does not automatically reset the count registers -- this must
+  be done explicitly before starting event counting.
+
+  @param[in] enable Performance monitor. Values: \n
+  - 0 -- Disable performance monitor \n
+  - 1 -- Enable performance monitor @tablebulletend
+
+  @return
+  None.
+
+  @dependencies
+  None.
+ */
+void qurt_pmu_enable (int enable);
+
+/**@ingroup func_qurt_pmu_get_pmucnt
+  Reads PMU counters in a single trap.
+
+  @param[out] buf Pointer to a buffer to save values read from PMU counters.
+  Buffer size must be at least 32 bytes to read all eight PMU counters.
+
+  @return
+  #QURT_EOK -- Successful read.\n
+  #QURT_EFATAL -- Failure.
+
+  @dependencies
+  None.
+ */
+int qurt_pmu_get_pmucnt (void * buf);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_PMU_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_power.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_power.h
new file mode 100755
index 0000000000000..2ee4d29a73976
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_power.h
@@ -0,0 +1,140 @@
+#ifndef QURT_POWER_H
+#define QURT_POWER_H
+/**
+  @file qurt_power.h
+  @brief Prototypes of power API
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2018-2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
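Tying the four PMU calls above together, a hedged profiling sketch; the event code is a placeholder for a value from the Hexagon PMU documentation:

```c
/* Sketch: program one event, count it on PMUCNT0, then snapshot everything.
   my_event is a placeholder; real event IDs come from the PMU manual. */
static unsigned int snapshot[8];   /* 32 bytes, per the note above */

void pmu_profile_start(unsigned int my_event)
{
    qurt_pmu_set(QURT_PMUEVTCFG, my_event); /* also clears PMUCNT0..PMUCNT3 */
    qurt_pmu_enable(1);                     /* profiling is off by default */
}

unsigned int pmu_profile_stop(void)
{
    unsigned int count = qurt_pmu_get(QURT_PMUCNT0);
    qurt_pmu_enable(0);
    (void)qurt_pmu_get_pmucnt(snapshot);    /* all 8 counters in one trap */
    return count;
}
```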
+ +=============================================================================*/ + +/*============================================================================= + + EDIT HISTORY FOR MODULE + + This section contains comments describing changes made to the module. + Notice that changes are listed in reverse chronological order. + + +when who what, where, why +-------- --- ------------------------------------------------------------ +03/03/11 op Add header file +12/12/12 cm (Tech Pubs) Edited/added Doxygen comments and markup. +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @cond */ +/**@ingroup func_qurt_power_shutdown_fail_exit + Returns from Power Collapse mode when power collapse cannot proceed. + + This function unmasks the global interrupt. This operation is used only when the thread is + recovering from a failed power collapse operation (Section @xref{sec:powerShutdownEnter}). + + @return + #QURT_EOK -- Operation was successfully performed. + + @dependencies + None. + */ +#define qurt_power_shutdown_fail_exit qurt_power_exit + +/**@ingroup func_qurt_power_shutdown_exit + Undoes state changes made preparing for power collapse.\n + This function unmasks the global interrupts. + + @return + #QURT_EOK --Operation was successfully performed. + + @dependencies + None. + */ +#define qurt_power_shutdown_exit qurt_power_exit +/**@endcond */ + +/**@ingroup func_qurt_system_ipend_get + Gets the IPEND register.\n + + @note1hang Returns the current value of the Hexagon processor IPEND register. The return value + is a mask value that identifies the individual interrupts that are pending. \n + + @note1hang The bit order of the mask value is identical to the order defined for the IPEND register. A + mask bit value of 1 indicates that the corresponding interrupt is pending, and 0 indicates that the + corresponding interrupt is not pending. \n + + @return + Return the IPEND register value. + + @dependencies + None. + */ +unsigned int qurt_system_ipend_get (void); + + +/**@ingroup func_qurt_system_vid_get + Gets the VID register. \n + + @note1hang Returns the current value of the Hexagon processor VID register. The return value is + the vector number of a second-level interrupt that has been accepted by the Hexagon + processor core.\n + + @return + Return the VID register value that is the L2 VIC interrupt number accepted by the processor. + Valid range is 0 to 1023. + + @dependencies + None. + */ +unsigned int qurt_system_vid_get(void); + +/**@ingroup func_qurt_power_shutdown_get_pcycles + Gets the number of power collapses and processor cycles for entering and exiting most recent + power collapse. + + @note1hang If no power collapse has occured yet, processor cycle numbers are zero. + + @param[out] enter_pcycles Number of processor cycles for entering most + recent power collapse. + @param[out] exit_pcycles Number of processor cycles for exiting most + recent power collapse. + @return + Zero -- No power collapses have occurred. \n + Nonzero -- Number of power collapses that have occurred since + the processor was reset. + + @dependencies + None. + */ +int qurt_power_shutdown_get_pcycles( unsigned long long *enter_pcycles, unsigned long long *exit_pcycles ); + +/**@ingroup func_qurt_system_tcm_set_size + Set size of TCM to save during full power collapse. + + @note1hang The size aligns to 32 bytes. If size passed is greater than the maximum size defined in + XML, the size is truncated to the size defined in XML. 
+ + @param[in] new_size Size of TCM to save. + + @return + Zero -- Size successfully set \n + -1 -- Size of 0 passed + + @dependencies + None. + */ +int qurt_system_tcm_set_size(unsigned int new_size); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_POWER_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_printf.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_printf.h new file mode 100755 index 0000000000000..a775d8a815918 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_printf.h @@ -0,0 +1,44 @@ +#ifndef QURT_PRINTF_H +#define QURT_PRINTF_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + @file qurt_printf.h + Prototypes of printf API. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +/** @addtogroup chapter_function_tracing +@{ */ + +int qurt_printf(const char* format, ...); + +int qurt_vprintf(const char* format, va_list args); + +/** @} */ /* end_addtogroup chapter_function_tracing */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_PRINTF_H */ + diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_process.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_process.h new file mode 100755 index 0000000000000..0df9ddc2d4a70 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_process.h @@ -0,0 +1,995 @@ +#ifndef QURT_PROCESS_H +#define QURT_PROCESS_H +/** + @file qurt_process.h + @brief Prototypes of QuRT process control APIs. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2009-2013, 2021-2023 Qualcomm Technologies, Inc. + All rights reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ +#include "qurt_callback.h" +#include "qurt_consts.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup process_types +@{ */ +#define QURT_PROCESS_ATTR_NAME_MAXLEN QURT_MAX_NAME_LEN /**< Maximum length of the process name. */ +#define QURT_PROCESS_ATTR_BIN_PATH_MAXLEN 128 /**< Maximum length of the path of binary/ELF for this process. */ +#define QURT_PROCESS_ATTR_CAP_MAXLEN 128 /**< Maximum length for a resource name. */ + +/** QuRT process capability wildcard strings */ +#define QURT_PROCESS_ATTR_CAP_ALLOW_ALL "ALLOW_ALL" /**< Capability wild-card for full access */ +#define QURT_PROCESS_ATTR_CAP_ALLOW_NONE "ALLOW_NONE" /**< Capability wild-card for no access */ + +/** QuRT process capability states */ +#define QURT_PROCESS_ATTR_CAP_ENABLED 0x1 /**< Capability enabled*/ +#define QURT_PROCESS_ATTR_CAP_DISABLED 0x0 /**< Capability disabled*/ + +/* QuRT process thread attributes. */ +#define QURT_PROCESS_DEFAULT_CEILING_PRIO 0 /**< Default ceiling priority of the threads in the new process. */ +#define QURT_PROCESS_DEFAULT_MAX_THREADS -1 /**< Default number of threads in the new process. 
+ -1 indicates that the limit is set to the maximum supported by the system. */ + +/* QuRT process flags. */ +#define QURT_PROCESS_SUSPEND_ON_STARTUP (1U) /**< Suspend the new processes just before calling main(). */ +#define QURT_PROCESS_NON_SYSTEM_CRITICAL (1u << 1) /**< Starts the new process as non system-critical. */ +#define QURT_PROCESS_ISLAND_RESIDENT (1u << 2) /**< Process is island resident. */ +#define QURT_PROCESS_RESTARTABLE (1u << 3) /**< Indicates that the process is restartable */ +#define QURT_PROCESS_UNTRUSTED (1u << 7) /**< Starts the new process as unsigned process. */ + +/* QuRT process debugging session status.*/ +#define QURT_DEBUG_NOT_START 0 /**< Debug is not started. */ +#define QURT_DEBUG_START 1 /**< Debug has started. */ + +/** Process Suspend Options */ +#define QURT_PROCESS_SUSPEND_DEFAULT 0 + +/** Process Resume Options */ +#define QURT_PROCESS_RESUME_DEFAULT 0 + + +/* QuRT process types. */ +typedef enum { + QURT_PROCESS_TYPE_RESERVED, /**< Process type is reserved. \n */ + QURT_PROCESS_TYPE_KERNEL, /**< Kernel process. \n*/ + QURT_PROCESS_TYPE_SRM, /**< SRM process. \n*/ + QURT_PROCESS_TYPE_SECURE, /**< Secure process. \n*/ + QURT_PROCESS_TYPE_ROOT, /**< Root process. \n*/ + QURT_PROCESS_TYPE_USER, /**< User process. */ +}qurt_process_type_t; + +/** QuRT process callback types. */ +typedef enum { + QURT_PROCESS_DUMP_CB_ROOT, /**< Register the callback that executes in the + root process context. \n */ + QURT_PROCESS_DUMP_CB_ERROR, /**< Register the user process callback that is + called after threads in the process are frozen. \n */ + QURT_PROCESS_DUMP_CB_PRESTM, /**< Register the user process callback that is + called before threads in the process are frozen. \n*/ + QURT_PROCESS_DUMP_CB_MAX /**< Reserved for error checking. */ +}qurt_process_dump_cb_type_t; + +/** QuRT process dump attributes. */ +typedef struct _qurt_pd_dump_attr{ + /** @cond */ + unsigned int enabled; /**< Process dump is enabled. */ + const char *path; /**< Process dump path. */ + unsigned int path_len; /**< Length of process dump path. */ + /** @endcond */ +}qurt_pd_dump_attr_t; + +/** QuRT process capability resource type */ +enum qurt_process_cap_type_t { + QURT_PROCESS_CAP_TYPE_NUM_ENTRIES=0, /**< Number of entries in the capability structure*/ + QURT_PROCESS_CAP_TYPE_DRIVER=1, /**< Driver resource */ + QURT_PROCESS_CAP_TYPE_MAX /**< Maximum identifier */ +}; + +/** QuRT process capability structure */ +typedef struct _qurt_capability { + enum qurt_process_cap_type_t type; /**< Resource type */ + char name[QURT_PROCESS_ATTR_CAP_MAXLEN]; /**< Resource name*/ + unsigned long long cap; /**< Capabilities allowed for this resource */ +}qurt_capability_t; + +/** QuRT process attributes. */ +typedef struct _qurt_process_attr { + /** @cond */ + char name[QURT_PROCESS_ATTR_NAME_MAXLEN]; /**< Name of the new process. */ + char path[QURT_PROCESS_ATTR_BIN_PATH_MAXLEN]; /**< Path of the binary for the new process. */ + char dtb_path[QURT_PROCESS_ATTR_BIN_PATH_MAXLEN]; /**< Path of the DTB ELF for the new process. */ + int flags; /**< Flags as indicated by QuRT process flags. */ + unsigned int sw_id; /**< Software ID of the process be load. */ + unsigned sid; /**< Stream ID of the process being spawned. */ + unsigned max_threads; /**< Maximum number of threads that the new process can create. */ + unsigned short ceiling_prio; /**< Maximum priority at which threads can be + created by new process. */ + qurt_process_type_t type; /**< Process type as indicated by + #qurt_process_type_t. 
 */
+  qurt_pd_dump_attr_t dump_attr; /**< Process dump attributes for the new process
+                                      as indicated by #qurt_pd_dump_attr_t. */
+  qurt_capability_t *capabilities; /**< Pointer to an array of structures of type
+                                        #qurt_capability_t. */
+  /** @endcond */
+} qurt_process_attr_t;
+
+/** @} */ /* end_addtogroup process_types */
+
+/*=============================================================================
+FUNCTIONS
+=============================================================================*/
+ /** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_create
+  Creates a process with the specified attributes, and starts the process.
+
+  The process executes the code in the specified executable ELF file.
+
+  @datatypes
+  #qurt_process_attr_t
+
+  @param[in] attr Pointer to an initialized process attribute structure, which specifies
+                  the attributes of the created process.
+
+  @return
+  A positive return value indicates the process ID.
+  A negative return value indicates one of the following errors: \n
+  #-QURT_EPRIVILEGE -- Caller does not have privilege for this operation \n
+  #-QURT_EMEM -- Not enough memory to perform the operation \n
+  #-QURT_EFAILED -- Operation failed \n
+  #-QURT_ENOTALLOWED -- Operation not allowed \n
+  #-QURT_ENOREGISTERED -- Not registered \n
+  #-QURT_ENORESOURCE -- Resource exhaustion \n
+  #-QURT_EINVALID -- Invalid argument value \n
+  #QURT_EFATAL -- attr is NULL
+
+  @dependencies
+  None.
+*/
+int qurt_process_create (qurt_process_attr_t *attr);
+
+/**@ingroup func_qurt_process_get_id
+  Returns the process identifier for the current thread.
+
+  @return
+  Process identifier for the current thread.
+
+  @dependencies
+  None.
+*/
+int qurt_process_get_id (void);
+/** @endcond */
+
+/** @cond internal_only*/
+/**@ingroup func_qurt_process_get_uid
+  Returns the user identifier for the current thread.
+
+  @return
+  User identifier for the current thread.
+
+  @dependencies
+  None.
+*/
+int qurt_process_get_uid (void);
+/** @endcond */
+/** @cond rest_reg_dist */
+/**@ingroup func_qurt_process_attr_init
+  Initializes the structure that sets the process attributes when a process is created.
+
+  After an attribute structure is initialized, the individual attributes in the structure can
+  be explicitly set using the process attribute operations.
+
+  Table @xref{tbl:processAttrDefaults} lists the default attribute values set by the initialize
+  operation.
+
+  @inputov{table_process_attribute_defaults}
+
+  @datatypes
+  #qurt_process_attr_t
+
+  @param[out] attr Pointer to the structure to initialize.
+
+  @return
+  None.
+
+  @dependencies
+  None.
+*/
+static inline void qurt_process_attr_init (qurt_process_attr_t *attr)
+{
+    attr->name[0] = '\0';
+    attr->path[0] = '\0';
+    attr->dtb_path[0] = '\0';
+    attr->flags = 0;
+    attr->sw_id = 0;
+    attr->sid = 0;
+    attr->max_threads = (unsigned)QURT_PROCESS_DEFAULT_MAX_THREADS;
+    attr->ceiling_prio = QURT_PROCESS_DEFAULT_CEILING_PRIO;
+    attr->type = QURT_PROCESS_TYPE_RESERVED;
+    attr->dump_attr.enabled = 0;
+    attr->dump_attr.path = NULL;
+    attr->dump_attr.path_len = 0;
+    attr->capabilities = NULL;
+}
+
+/**@ingroup func_qurt_process_attr_set_executable
+  Sets the process name in the specified process attribute structure.
+
+  Process names identify process objects that are already
+  loaded in memory as part of the QuRT system.
+
+  @note1hang Process objects are incorporated into the QuRT system at build time.
+
+  @note1hang Maximum length of name string is limited to QURT_PROCESS_ATTR_NAME_MAXLEN - 1.
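A hedged sketch of the create flow documented above; the ELF name and thread cap are illustrative:

```c
/* Sketch: spawn a user process from an ELF already known to the system. */
int spawn_myapp(void)
{
    qurt_process_attr_t attr;
    int pid;

    qurt_process_attr_init(&attr);                 /* defaults per the table */
    qurt_process_attr_set_executable(&attr, "myapp.elf");  /* illustrative name */
    qurt_process_attr_set_max_threads(&attr, 16);          /* illustrative cap */

    pid = qurt_process_create(&attr);
    if (pid < 0)
        return -1;       /* negative values are -QURT_E* error codes */
    return pid;
}
```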
+ + @datatypes + #qurt_process_attr_t + + @param[in] attr Pointer to the process attribute structure. + @param[in] name Pointer to the process name. + + @return + None. + + @dependencies + None. +*/ +void qurt_process_attr_set_executable (qurt_process_attr_t *attr, const char *name); + +/**@ingroup func_qurt_process_attr_set_binary_path + Sets the binary path for the process loading in the specified process attribute structure. + + Path specifies the binary to load for this process. + + @note1hang Max length of path string is limited to QURT_PROCESS_ATTR_BIN_PATH_MAXLEN-1. + + @datatypes + #qurt_process_attr_t + + @param[in] attr Pointer to the process attribute structure. + @param[in] path Pointer to the binary path. + + @return + None. + + @dependencies + None. +*/ +void qurt_process_attr_set_binary_path(qurt_process_attr_t *attr, char *path); + +/**@ingroup func_qurt_process_attr_set_dtb_path + Sets the DTB binary path for the process loading in the specified process attribute structure. + + Path specifies the DTB binary to load for this process. + + @note1hang Max length of path string is limited to QURT_PROCESS_ATTR_BIN_PATH_MAXLEN-1. + + @datatypes + #qurt_process_attr_t + + @param[in] attr Pointer to the process attribute structure. + @param[in] path Pointer to the binary path. + + @return + None. + + @dependencies + None. +*/ +void qurt_process_attr_set_dtb_path(qurt_process_attr_t *attr, char *path); + +/**@ingroup func_qurt_process_attr_set_flags +Sets the process properties in the specified process attribute structure. +Process properties are represented as defined symbols that map into bits +0 through 31 of the 32-bit flag value. Multiple properties are specified by OR'ing +together the individual property symbols. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] flags QURT_PROCESS_NON_SYSTEM_CRITICAL Process is considered as non system-critical. + This attribute will be used by error services, + to decide whether to kill user pd or whole subsystem. + QURT_PROCESS_ISLAND_RESIDENT Process will be marked as island resident. + QURT_PROCESS_RESTARTABLE Process will be marked as restartable. + QURT_PROCESS_UNTRUSTED Process will be marked as unsigned process. +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_flags (qurt_process_attr_t *attr, int flags) +{ + attr->flags = flags; +} +/** @endcond */ +/** @cond internal_only*/ +/**@ingroup func_qurt_process_attr_set_sid +Sets the process streamID in the specified process attribute structure. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] sid streamID to set for this process. + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_sid (qurt_process_attr_t *attr, unsigned sid) +{ + attr->sid = sid; +} +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_process_attr_set_max_threads +Sets the maximum number of threads allowed in the specified process attribute structure. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] max_threads Maximum number of threads allowed for this process. + +@return +None. + +@dependencies +None. 
+*/ +static inline void qurt_process_attr_set_max_threads (qurt_process_attr_t *attr, unsigned max_threads) +{ + attr->max_threads = max_threads; +} + +/**@ingroup func_qurt_process_attr_set_sw_id +Sets the software ID of the process to load in the specified process attribute structure. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] sw_id Software ID of the process, used in authentication. + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_sw_id(qurt_process_attr_t *attr, unsigned int sw_id) +{ + attr->sw_id = sw_id; +} + +/**@ingroup func_qurt_process_attr_set_ceiling_prio +Sets the highest thread priority allowed in the specified process attribute structure. +Refer qurt_thread.h for priority ranges. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] prio Priority. + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_ceiling_prio (qurt_process_attr_t *attr, unsigned short prio) +{ + attr->ceiling_prio = prio; +} +/** @endcond */ + +/** @cond internal_only*/ +/**@ingroup func_qurt_process_attr_set_dump_status +Sets the process domain dump-enabled field in the process domain dump attributes. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] enabled 1 -- Process domain dump is collected \n + 0 -- Process domain dump is not collected + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_dump_status(qurt_process_attr_t *attr, unsigned int enabled) +{ + attr->dump_attr.enabled = enabled; +} + +/**@ingroup func_qurt_process_attr_set_dump_path +Sets the process domain dump path and type. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] path Path where the process domain dumps must be saved. +@param[in] path_len Length of the path string. + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_dump_path(qurt_process_attr_t *attr, const char *path, int path_len) +{ + attr->dump_attr.path = path; + attr->dump_attr.path_len = (unsigned int)path_len; +} + +/**@ingroup func_qurt_process_attr_set_capabilities +Sets list of capabilities available to this process. + +@datatypes +#qurt_process_attr_t + +@param[in] attr Pointer to the process attribute structure. +@param[in] capabilities Pointer to array of structures of type qurt_capability_t defining + resources and capabilites + +@return +None. + +@dependencies +None. +*/ +static inline void qurt_process_attr_set_capabilities(qurt_process_attr_t *attr, qurt_capability_t *capabilities) +{ + attr->capabilities = capabilities; +} + +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_process_cmdline_get +Gets the command line string associated with the current process. +The Hexagon simulator command line arguments are retrieved using +this function as long as the call is made +in the process of the QuRT installation, and with the +requirement that the program runs in a simulation environment. + +If the function modifies the provided buffer, it zero-terminates +the string. It is possible that the function does not modify the +provided buffer, so the caller must set buf[0] to a NULL +byte before making the call. A truncated command line is returned when +the command line is longer than the provided buffer. 
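A sketch honoring the buf[0] pre-termination requirement described above; the buffer size is illustrative:

```c
/* Sketch: fetch the simulator command line safely. */
static char cmdline[256];          /* size is illustrative */

void show_cmdline(void)
{
    cmdline[0] = '\0';             /* required: the call may leave buf untouched */
    qurt_process_cmdline_get(cmdline, sizeof cmdline);
    qurt_printf("cmdline: %s\n", cmdline);
}
```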
+ +@param[in] buf Pointer to a character buffer that must be filled in. +@param[in] buf_siz Size (in bytes) of the buffer pointed to by the buf argument. + +@return +None. + +@dependencies +None. +*/ +void qurt_process_cmdline_get(char *buf, unsigned buf_siz); + +/**@ingroup func_qurt_process_get_thread_count +Gets the number of threads present in the process indicated by the PID. + +@param[in] pid PID of the process for which the information is required. + +@return +Number of threads in the process indicated by PID, if positive value is obtained +Negative error code if failed include: + QURT_EFATAL - Invalid PID + -QURT_ENOTALLOWED - Current process doesnt have access to target process indicated by PID + +@dependencies +None. +*/ +int qurt_process_get_thread_count(unsigned int pid); + +/**@ingroup func_qurt_process_get_thread_ids +Gets the thread IDs for a process indicated by PID. + +@param[in] pid PID of the process for which the information is required. +@param[in] ptr Pointer to a user passed buffer that must be filled in with thread IDs. +@param[in] thread_num Number of thread IDs requested. + +@return +#QURT_EOK - Success +#QURT_EFATAL - Failed, ptr is NULL + +@dependencies +None. + */ +int qurt_process_get_thread_ids(unsigned int pid, unsigned int *ptr, unsigned thread_num); +/** @endcond */ +/** @cond internal_only*/ +/**@ingroup func_qurt_process_dump_get_mem_mappings_count +Gets the number of mappings present in the process indicated by the PID. + +@param[in] pid PID of the process for which the information is required. + +@return +Number of mappings for the process indicated by the PID. + +@dependencies +None. +*/ +int qurt_process_dump_get_mem_mappings_count(unsigned int pid); + +/**@ingroup func_qurt_process_dump_get_mappings +Gets the mappings for a specified PID. + +@note1hang This API skips device type mappings or mappings created by setting the #QURT_PERM_NODUMP attribute. + +@param[in] pid PID of the process for which the information is required. +@param[in] ptr Pointer to a buffer that must be filled in with mappings. +@param[in] count Count of mappings requested. + +@return +Number of mappings filled in the buffer passed by the user. + +@dependencies +None. +*/ +int qurt_process_dump_get_mappings(unsigned int pid, unsigned int *ptr, unsigned count); +/** @endcond */ +/** @cond rest_reg_dist */ +/**@ingroup func_qurt_process_attr_get +Gets the attributes of the process with which it was created. + +@datatypes +#qurt_process_attr_t + +@param[in] pid PID of the process for which the information is required. +@param[in,out] attr Pointer to the user allocated attribute structure. + +@return +#QURT_EOK - Success +#QURT_INVALID - Invalid PID +#QURT_EFATAL - attr is NULL + +@dependencies +None. +*/ +int qurt_process_attr_get(unsigned int pid, qurt_process_attr_t *attr); + +/**@ingroup func_qurt_process_dump_register_cb +Registers the process domain dump callback. + +@datatypes +#qurt_cb_data_t \n +#qurt_process_dump_cb_type_t + +@param[in] cb_data Pointer to the callback information. +@param[in] type Callback type; these callbacks are called in the context of the user process domain: \n + #QURT_PROCESS_DUMP_CB_PRESTM -- Before threads of the exiting process are frozen. \n + #QURT_PROCESS_DUMP_CB_ERROR -- After threads are frozen and captured. \n + #QURT_PROCESS_DUMP_CB_ROOT -- After threads are frozen and captured, and CB_ERROR type of callbacks + are called. +@param[in] priority Priority. 
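A sketch combining the two thread-enumeration calls documented above; the fixed 64-entry array is an assumption for brevity:

```c
/* Sketch: print the thread IDs of a process. */
int list_threads(unsigned int pid)
{
    int i;
    int n = qurt_process_get_thread_count(pid);
    if (n <= 0)
        return n;                          /* negative values are error codes */

    unsigned int ids[64];                  /* assume n <= 64 for this sketch */
    if (n > 64)
        n = 64;
    if (qurt_process_get_thread_ids(pid, ids, (unsigned)n) != QURT_EOK)
        return -1;

    for (i = 0; i < n; i++)
        qurt_printf("thread[%d] = 0x%x\n", i, ids[i]);
    return n;
}
```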
+ +@return +#QURT_EOK -- Success \n +Other values -- Failure + QURT_EFATAL if cb_data is NULL + QURT_EINVALID If invalid cb_type + QURT_EFAILED If invalid cb_data + +@dependencies +None. +*/ +int qurt_process_dump_register_cb(qurt_cb_data_t *cb_data, qurt_process_dump_cb_type_t type, unsigned short priority); + +/**@ingroup func_qurt_process_dump_deregister_cb +Deregisters the process domain dump callback. + +@datatypes +#qurt_cb_data_t \n +#qurt_process_dump_cb_type_t + +@param[in] cb_data Pointer to the callback information to deregister. +@param[in] type Callback type. + +@return +#QURT_EOK -- Success.\n +Other values -- Failure. + QURT_EFATAL if cb_data is NULL + QURT_EINVALID If invalid cb_type + QURT_EFAILED If invalid cb_data + +@dependencies +None. +*/ +int qurt_process_dump_deregister_cb(qurt_cb_data_t *cb_data,qurt_process_dump_cb_type_t type); + +/** @endcond */ +/** @cond internal_only*/ +/**@ingroup func_qurt_process_set_rtld_debug +Sets rtld_debug for a process. + +@param[in] pid PID of the process for which rtld_debug must be set. +@param[in] address rtld_debug address. + +@return +#QURT_EOK - Success +#QURT_EINVALID - Invalid PID +#QURT_EFATAL - Invalid address + +@dependencies +None. +*/ +int qurt_process_set_rtld_debug(unsigned int pid,unsigned int address); + +/**@ingroup func_qurt_process_get_rtld_debug +Gets rtld_debug for a process. + +@param[in] pid PID of the process for which rtld_debug must be set. +@param[in,out] address Pointer to the user passed address in which the rtld_debug address must be returned. + +@return +#QURT_EOK - Success +#QURT_EINVALID - Invalid PID +#QURT_EFATAL - Invalid address + +@dependencies +None. +*/ +int qurt_process_get_rtld_debug(unsigned int pid,unsigned int *address); +/** @endcond */ +/**@ingroup func_qurt_process_exit +Exits the current user process with an exit code. + +@param[in] exitcode Exit code. + +@return +#QURT_EFATAL -- No client found with the specified PID value \n +#QURT_EINVALID -- Invalid client \n +#QURT_ENOTALLOWED -- User does not have permission to perform this operation \n +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_process_exit(int exitcode); + +/**@ingroup func_qurt_process_kill +Kills the process represented by the PID with the exit code. + +@param[in] pid PID of the process to kill. +@param[in] exitcode Exit code. + +@return +#QURT_EFATAL -- No client found with the specified PID value \n +#QURT_EINVALID -- Invalid client \n +#QURT_ENOTALLOWED -- User does not have permission to perform this operation \n +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_process_kill(int pid, int exitcode); + + +/**@ingroup func_qurt_debugger_register_process +Registers the process indicated by the PID with the debug monitor. + +@param[in] pid PID of the process. +@param[in] adr Address. + +@return +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_debugger_register_process(int pid, unsigned int adr); + + +/**@ingroup func_qurt_debugger_deregister_process +Deregister the process indicated by the PID with the debug monitor. + +@param[in] pid PID of the process. + +@return +#QURT_EOK -- Success + +@dependencies +None. +*/ +int qurt_debugger_deregister_process(int pid); + +/**@ingroup func_qurt_process_exec_callback +Executes callbacks in the user process as indicated by the client_handle argument. + +@param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1). +@param[in] callback_fn Callback function to execute. 
+@param[in] stack_base Stack address to use.
+@param[in] stack_size Stack size.
+
+@return
+#QURT_EOK -- Success
+
+@dependencies
+None.
+*/
+int qurt_process_exec_callback(int client_handle,
+                               unsigned callback_fn,
+                               unsigned stack_base,
+                               unsigned stack_size);
+
+/**@ingroup func_qurt_process_get_pid
+Gets the process ID of the process that the client_handle argument represents.
+
+@note1hang This API is not supported for unsigned PDs; for an unsigned PD, use qurt_process_get_id().
+
+@param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1).
+@param[in] pid Pointer to the address to store the PID.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EFATAL -- pid pointer passed as NULL
+
+@dependencies
+None.
+*/
+int qurt_process_get_pid(int client_handle, int * pid);
+
+/**@ingroup func_qurt_process_get_dm_status
+Gets the debugging session status of the process represented by the pid argument.
+
+@param[in] pid Process ID.
+@param[in,out] status Address to store the status: \n
+               #QURT_DEBUG_NOT_START \n
+               #QURT_DEBUG_START
+
+@return
+#QURT_EOK - Success \n
+#QURT_EINVALID - Error
+
+@dependencies
+None.
+*/
+int qurt_process_get_dm_status( unsigned int pid, unsigned int *status);
+
+
+/**@ingroup func_qurt_process_suspend_threads
+  Suspends user threads in a user process identified by its process identifier.
+  The target user process can be a signed user process or an unsigned user process.
+  The caller is from a thread in the guest OS/root process.
+  After the user threads in the target user process are suspended, they cannot be scheduled to run by the kernel
+  until they resume later.
+
+  This function has one optional argument with one default option:
+  #QURT_PROCESS_SUSPEND_DEFAULT suspends user threads in the target user process.
+
+  This function call is a synchronous call; the function returns after the relevant threads are
+  completely suspended.
+
+  If some user threads in the target user process are set as non-suspendable, this function call does
+  not suspend these threads.
+
+  If the target user process is already suspended, this function call returns success as
+  confirmation that the user process is suspended.
+
+  QuRT debugger monitor threads in the target user process are non-suspendable; this function call does
+  not suspend those threads.
+
+  If the target user process is a secure user process or a CPZ process, this function call returns an error
+  without suspending the target user process.
+
+  If a user thread in the target user process runs in the guest OS/root process via a QDI call, this function call
+  does not suspend the thread in the guest OS, but instead marks the thread as pending-suspend. The thread is suspended
+  when it exits the guest OS, before executing the first instruction in the user process.
+  In this case, the function returns success while the user thread can be running in the guest OS, and is suspended
+  when exiting the guest OS.
+
+  @param[in] process_id Process identifier.
+  @param[in] option Default option #QURT_PROCESS_SUSPEND_DEFAULT suspends user threads in the target user process.
+
+  @return
+  #QURT_EOK -- Success \n
+  #QURT_EINVALID -- Failure because of invalid process_id input \n
+  #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, on a secure process/CPZ process.
+
+  @dependencies
+  None.
+ */
+int qurt_process_suspend_threads (unsigned int process_id, unsigned int option);
+
+
+/**@ingroup func_qurt_process_resume_threads
+  Resumes a user process identified by its process identifier.
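A hedged freeze/inspect/thaw sketch using the suspend call above and the resume call documented next:

```c
/* Sketch: suspend a user process from the root PD, inspect it, resume it. */
int freeze_and_thaw(unsigned int pid)
{
    int rc = qurt_process_suspend_threads(pid, QURT_PROCESS_SUSPEND_DEFAULT);
    if (rc != QURT_EOK)
        return rc;          /* e.g. QURT_ENOTALLOWED for secure/CPZ processes */

    /* ... inspect the suspended process here ... */

    return qurt_process_resume_threads(pid, QURT_PROCESS_RESUME_DEFAULT);
}
```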
+/**@ingroup func_qurt_process_resume_threads
+ Resumes the user process identified by the process identifier.
+ The target user process can be a signed user process or an unsigned user process.
+ The caller must be a thread in the guest OS/root process.
+ After the user threads in the target user process resume, the kernel scheduler
+ can schedule the user threads to run based on their thread priorities.
+
+ This function has an optional argument, #QURT_PROCESS_RESUME_DEFAULT, which
+ resumes user threads in the target user process.
+
+ This is an asynchronous function; it returns after the kernel moves the user threads from
+ the suspended state to the runnable state. The threads are scheduled to run based on their thread priorities.
+
+ This function call does not resume threads in the target user process that have been set as non-resumable.
+
+ If the target user process has already resumed, this function call confirms that the user process is resumed
+ by returning success.
+
+ If the target user process is a secure user process or a CPZ process, this function call returns an error without
+ resuming operation.
+
+ If user threads in the target user process run in the guest OS/root process via a QDI call, this function
+ call clears the suspend-pending mark on these threads, so that the threads are not suspended when they exit
+ the guest OS.
+
+ @param[in] process_id Process identifier.
+ @param[in] option Default option #QURT_PROCESS_RESUME_DEFAULT resumes user threads in the target user process.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of invalid process_id input. \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, on a secure process/CPZ process.
+
+ @dependencies
+ None.
+ */
+int qurt_process_resume_threads (unsigned int process_id, unsigned int option);
+
+/**@ingroup func_qurt_process_vtcm_window_set
+ Sets a VTCM access window for a process.
+ The caller thread must be in the SRM process.
+
+ This is a synchronous function; it ensures that all running threads of the process have the requested
+ window in effect. The requested window takes effect for all non-running threads when they are
+ scheduled.
+
+ @param[in] pid Process identifier.
+ @param[in] enable QURT_VTCM_WINDOW_ENABLE enforces the VTCM access window defined by the high and low offsets.
+ QURT_VTCM_WINDOW_DISABLE ignores the high and low offsets and fully
+ disables VTCM access for the process.
+ @param[in] high_offset Specifies the high window offset, in 4K increments, from the base address of the VTCM.
+ QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT restores the high offset to its reset value.
+ @param[in] low_offset Specifies the low window offset, in 4K increments, from the base address of the VTCM.
+ QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT restores the low offset to its reset value.
+
+ @note1hang
+ When high_offset is set to QURT_VTCM_WINDOW_HI_OFFSET_DEFAULT and low_offset is set to
+ QURT_VTCM_WINDOW_LO_OFFSET_DEFAULT, the full VTCM range is accessible. Access to VTCM is controlled
+ via the MMU mapping for the process.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+ #QURT_ENOTSUPPORTED -- Failure because the operation is not supported due to limited hardware capabilities.
+
+ @dependencies
+ None.
+ */
+int qurt_process_vtcm_window_set(int pid, unsigned int enable, unsigned int high_offset, unsigned int low_offset);
+
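+/*
+ A minimal sketch (illustrative; assumes a guest OS/root-process caller):
+ a synchronous suspend followed by an asynchronous resume, as described above.
+*/
+#if 0   /* example only -- not compiled */
+static void example_freeze_and_thaw(unsigned int pid)
+{
+    if (qurt_process_suspend_threads(pid, QURT_PROCESS_SUSPEND_DEFAULT) == QURT_EOK) {
+        /* ... inspect or checkpoint the process while its threads are parked ... */
+        (void)qurt_process_resume_threads(pid, QURT_PROCESS_RESUME_DEFAULT);
+    }
+}
+#endif
+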
+/**@ingroup func_qurt_process_vtcm_window_get
+ Gets the VTCM window for a process.
+ The caller thread must be in the SRM process.
+
+ @param[in] pid Process identifier.
+ @param[out] enable Address in which to store the enable status.
+ @param[out] high_offset Address in which to return the high window offset, in 4K increments, from the base address of the VTCM.
+ @param[out] low_offset Address in which to return the low window offset, in 4K increments, from the base address of the VTCM.
+
+ @note1hang
+ The user must first check the returned enable value before checking the high and low offsets.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+ #QURT_ENOTSUPPORTED -- Failure because the operation is not supported due to limited hardware capabilities.
+
+ @dependencies
+ None.
+ */
+int qurt_process_vtcm_window_get(int pid, unsigned int *enable, unsigned int *high_offset, unsigned int *low_offset);
+
+/**@ingroup func_qurt_process_set_group_config
+ Enables thread groups in the process, with the specified ceiling priorities.
+
+ @param[in] process_id Process identifier.
+ @param[in] group_bitmask 64-bit mask of active thread groups.
+ @param[in] ceiling_priorities Array of ceiling priorities, one per thread group.
+
+ @note1hang
+ This API can be called only from the root PD, and only once for each process; otherwise it is
+ rejected. Group 0 must be enabled in group_bitmask, otherwise QuRT returns an error. After this call, all
+ existing threads are moved to group 0, and any thread whose priority is higher than the ceiling
+ priority of group 0 is lowered to the ceiling value. See the sketch after qurt_process_stid_set() below.
+ Example 1:
+ group_bitmask = 0xD7; //'b11010111
+ ceiling_priorities[] = {100, 128, 200, 0, 196, 0, 240, 20}; // 0 -- don't care
+ Example 2:
+ group_bitmask = 0x5; //'b101
+ ceiling_priorities[] = {240, 0, 20}; // 0 -- don't care
+
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_ENOTALLOWED -- The group has been configured already.
+
+ @dependencies
+ None.
+ */
+int qurt_process_set_group_config(unsigned int process_id, unsigned long long group_bitmask,
+ unsigned char *ceiling_priorities);
+
+
+/**@ingroup func_qurt_process_stid_set
+ Sets the specified stid for a process or for a thread group within a process.
+
+ @param[in] pid Process identifier.
+ @param[in] group_id Group identifier.
+ @param[in] stid stid value to set.
+
+ @note1hang
+ The user can pass the default group_id (QURT_THREAD_DEFAULT_GROUP_ID) if the stid must be set at the process level.
+ All threads within the process that have the default stid (QURT_STID_DEFAULT) inherit the stid set for the process.
+ When a non-default group_id is specified, the stid is set only for that thread group.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EFATAL -- Invalid PID \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+
+ @dependencies
+ None.
+ */
+int qurt_process_stid_set(unsigned int pid, unsigned int group_id, unsigned int stid);
+
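+/*
+ A minimal sketch (illustrative; assumes a root-PD caller, and the stid value 7
+ is hypothetical): enable groups 0 and 2 as in Example 2 above, then tag
+ group 2 with an stid.
+*/
+#if 0   /* example only -- not compiled */
+static int example_configure_groups(unsigned int pid)
+{
+    unsigned char ceilings[3] = {240, 0, 20};   /* index 1 is unused ("don't care") */
+    int err;
+
+    err = qurt_process_set_group_config(pid, 0x5ULL, ceilings);  /* groups 0 and 2 */
+    if (err != QURT_EOK) {
+        return err;
+    }
+    return qurt_process_stid_set(pid, 2U, 7U);   /* stid 7 for thread group 2 */
+}
+#endif
+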
+/**@ingroup func_qurt_process_stid_get
+ Gets the stid for a process or for a thread group within a process.
+
+ @param[in] pid Process identifier.
+ @param[in] group_id Group identifier.
+ @param[out] stid Pointer to a variable in which to return the stid.
+
+ @note1hang
+ The user can pass the default group_id (QURT_THREAD_DEFAULT_GROUP_ID) to return the process-level stid.
+ When a non-default group_id is specified, the stid is returned only for that thread group.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EFATAL -- Invalid PID \n
+ #QURT_EVAL -- Failure because of invalid inputs. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation.
+
+ @dependencies
+ None.
+ */
+int qurt_process_stid_get(unsigned int pid, unsigned int group_id, unsigned int *stid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_profile.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_profile.h
new file mode 100755
index 0000000000000..2a50c461440f6
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_profile.h
@@ -0,0 +1,98 @@
+#ifndef QURT_PROFILE_H
+#define QURT_PROFILE_H
+/**
+  @file qurt_profile.h
+  QuRT profiling support.
+
+EXTERNAL FUNCTIONS
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+==============================================================================*/
+#include "qurt_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup profiling_macros
+@{ */
+#define QURT_PROFILE_DISABLE 0 /**< Disable profiling. */
+#define QURT_PROFILE_ENABLE 1 /**< Enable profiling. */
+
+typedef unsigned int qurt_profile_param_t;
+
+#define QURT_PROFILE_PARAM_THREAD_READY_TIME 0U /**< Profile thread ready time. */
+
+/** @} */ /* end_addtogroup profiling_macros */
+
+/** @addtogroup profiling_types
+ @{ */
+/** Profiling results. */
+typedef union
+{
+    /** Result associated with #QURT_PROFILE_PARAM_THREAD_READY_TIME. */
+    struct
+    {
+        unsigned int ticks; /**< Cumulative ticks the thread was ready. */
+    } thread_ready_time;
+
+} qurt_profile_result_t;
+/** @} */ /* end_addtogroup profiling_types */
+
+/**@ingroup func_qurt_profile_enable2
+ * Starts profiling of a specific parameter on a specific thread (as applicable).
+ *
+ * @param[in] param     Profiling parameter.
+ * @param[in] thread_id ID of the thread (if applicable) for which the specified
+ *                      parameter must be profiled.
+ * @param[in] enable    #QURT_PROFILE_DISABLE -- disable \n #QURT_PROFILE_ENABLE --
+ *                      enable
+ *
+ * @return
+ * #QURT_EOK -- Success \n
+ * #QURT_EALREADY -- Measurement already in progress or already stopped \n
+ * #QURT_ENOTHREAD -- Thread does not exist \n
+ * #QURT_EINVALID -- Invalid profiling parameter \n
+ *
+ * @dependencies
+ * None.
+ */
+extern int qurt_profile_enable2 (
+    qurt_profile_param_t param,
+    qurt_thread_t thread_id,
+    int enable
+);
+
+/**@ingroup func_qurt_profile_get
+ * Gets the value of the profiling parameter that was previously enabled.
+ *
+ * @param[in] param      Profiling parameter.
+ * @param[in] thread_id  ID of the thread (if applicable) for which the specified
+ *                       profiling parameter must be retrieved.
+ * @param[out] result    Profiling result associated with the parameter for the specified
+ *                       thread (if applicable).
+ *
+ * @return
+ * #QURT_EOK -- Success \n
+ * #QURT_EFAILED -- Operation failed; profiling was not enabled \n
+ * #QURT_ENOTHREAD -- Thread does not exist \n
+ * #QURT_EINVALID -- Invalid profiling parameter \n
+ *
+ * @dependencies
+ * None.
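+ *
+ * A minimal usage sketch (illustrative; assumes the calling thread profiles
+ * itself via qurt_thread_get_id()):
+ * @code
+ * qurt_profile_result_t result;
+ * qurt_thread_t tid = qurt_thread_get_id();
+ *
+ * qurt_profile_enable2(QURT_PROFILE_PARAM_THREAD_READY_TIME, tid, QURT_PROFILE_ENABLE);
+ * // ... run the workload under measurement ...
+ * qurt_profile_enable2(QURT_PROFILE_PARAM_THREAD_READY_TIME, tid, QURT_PROFILE_DISABLE);
+ *
+ * if (qurt_profile_get(QURT_PROFILE_PARAM_THREAD_READY_TIME, tid, &result) == QURT_EOK) {
+ *     // result.thread_ready_time.ticks = cumulative ticks the thread was ready
+ * }
+ * @endcode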
+ */ +extern int qurt_profile_get ( + qurt_profile_param_t param, + qurt_thread_t thread_id, + qurt_profile_result_t * result +); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_ptrace.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_ptrace.h new file mode 100755 index 0000000000000..622304dd92865 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_ptrace.h @@ -0,0 +1,37 @@ +/*============================================================================= + + qurt_ptrace.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2013 by Qualcomm Technologies, Inc. All Rights Reserved. +=============================================================================*/ +#ifndef __SYS_PTRACE_H__ +#define __SYS_PTRACE_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +enum __ptrace_request +{ + /** + Indicates that the process making this request is requesting to be traced. + */ + PTRACE_TRACEME = 0, + PTRACE_EXT_IS_DEBUG_PERMITTED = 500 +}; + +long ptrace(enum __ptrace_request request, unsigned int pid, void*addr, void *data); + +#ifdef __cplusplus +} +#endif + +#endif //__SYS_PTRACE_H__ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi.h new file mode 100755 index 0000000000000..705408e5cfc6f --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi.h @@ -0,0 +1,185 @@ +#ifndef QDI_H +#define QDI_H + +/** + @file qurt_qdi.h + @brief Prototypes of QuRT Driver Invocation API functions + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + + +#include "qurt_qdi_constants.h" +#include "qurt_qdi_imacros.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup func_qurt_qdi_open + Opens the specified driver for subsequent operations. + qurt_qdi_open() is the primary mechanism by which a driver user can + obtain a QDI handle. The user provides the name of the driver to the + qurt_qdi_open call, and gets back a handle referencing + the named driver. \n + @note1hang For reasons related to the Hexagon standard for varargs functions, the + qurt_qdi_open function prototype is not actually defined as a varargs. + + + @param[in] p Driver name. + @param[in] ... Up to nine additional device-specific arguments can be passed as parameters, + and should follow the POSIX open() convention. \n + - flags -- Optional second parameter (POSIX flags), the handle + access requested (read-only, write-only, or read-write, + for instance) and other flags such as whether the call + should create a new device or only open an existing + device. \n + - mode -- Optional third parameter (POSIX mode); permissions to + configure when a new device is created. @tablebulletend + + @return + Negative value -- Error. \n + Non-negative value -- Success, this result value serves as a handle to the + opened driver. + @dependencies + None. + */ +// int qurt_qdi_open(); +#define qurt_qdi_open(p,...) 
\ + qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,QDI_OPEN,(p),##__VA_ARGS__) + +#define qurt_qdi_open_dt(p,q,...) \ + qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC,QDI_OPEN_FROM_DT,(p),(q),##__VA_ARGS__) + +/**@ingroup func_qurt_qdi_handle_invoke + Performs a generic driver operation, which (depending on the specified operation) can be + either be one of the predefined operations listed in @xhyperref{tbl:functionMapping,QDI function mapping} + or a driver-specific operation. + The user provides a QDI handle and an integer + method number, along with 0 to 8 optional 32-bit arguments. + The device driver invocation function is invoked with the + same method number and 0 to 8 optional arguments. The + return value from the invocation function is passed back to + the user as the return value of qurt_qdi_handle_invoke. + + @note1hang For reasons related to the Hexagon standard for varargs functions, the + qurt_qdi_handle_invoke() function prototype is not actually defined as a + varargs function (and would break if it were defined this way). + + @param[in] h Driver handle. + @param[in] m Integer number for the operation to perform. + @param[in] ... Up to eight optional arguments can be passed to the device driver as operation-specific parameters: \n + arg1 -- First parameter \n + arg2 -- Second parameter \n + arg3 -- Third parameter \n + arg4 -- Fourth parameter \n + arg5 -- Fifth parameter \n + arg6 -- Sixth parameter \n + arg7 -- Seventh parameter \n + arg8 -- Eighth parameter + + @return + Integer value defined by the device driver. \n + -1 -- Error. + + @dependencies + None. + */ +// int qurt_qdi_handle_invoke(); +#define qurt_qdi_handle_invoke(h,m,...) \ + _QDMPASTE(_QDMHI,_QDMCNT(QDI_HANDLE_LOCAL_CLIENT,h,m,##__VA_ARGS__))(QDI_HANDLE_LOCAL_CLIENT,h,m,##__VA_ARGS__) +#define _QDMHI3(a,b,c) qurt_qdi_qhi3(0,b,c) +#define _QDMHI4(a,b,c,d) qurt_qdi_qhi4(0,b,c,(int)(d)) +#define _QDMHI5(a,b,c,d,e) qurt_qdi_qhi5(0,b,c,(int)(d),(int)(e)) +#define _QDMHI6(a,b,c,d,e,f) qurt_qdi_qhi6(0,b,c,(int)(d),(int)(e),(int)(f)) +#define _QDMHI7(a,b,c,d,e,f,g) qurt_qdi_qhi7(8,b,c,(int)(d),(int)(e),(int)(f),(int)(g)) +#define _QDMHI8(a,b,c,d,e,f,g,h) qurt_qdi_qhi8(8,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h)) +#define _QDMHI9(a,b,c,d,e,f,g,h,i) qurt_qdi_qhi9(16,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i)) +#define _QDMHI10(a,b,c,d,e,f,g,h,i,j) qurt_qdi_qhi10(16,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j)) +#define _QDMHI11(a,b,c,d,e,f,g,h,i,j,k) qurt_qdi_qhi11(24,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k)) +#define _QDMHI12(a,b,c,d,e,f,g,h,i,j,k,l) qurt_qdi_qhi12(24,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k),(int)(l)) +int qurt_qdi_qhi3(int,int,int); +int qurt_qdi_qhi4(int,int,int,int); +int qurt_qdi_qhi5(int,int,int,int,int); +int qurt_qdi_qhi6(int,int,int,int,int,int); +int qurt_qdi_qhi7(int,int,int,int,int,int,int); +int qurt_qdi_qhi8(int,int,int,int,int,int,int,int); +int qurt_qdi_qhi9(int,int,int,int,int,int,int,int,int); +int qurt_qdi_qhi10(int,int,int,int,int,int,int,int,int,int); +int qurt_qdi_qhi11(int,int,int,int,int,int,int,int,int,int,int); +int qurt_qdi_qhi12(int,int,int,int,int,int,int,int,int,int,int,int); + +/**@ingroup func_qurt_qdi_write + Writes data to the specified driver. + A predefined invocation routine for drivers that + support a POSIX-like write functionality. 
+ qurt_qdi_write(handle, buf, len) is equivalent to:
+ qurt_qdi_handle_invoke(handle, QDI_WRITE, handle, buf, len);
+
+ @param[in] handle Driver handle.
+ @param[in] buf    Pointer to the memory address where the data to write is stored.
+ @param[in] len    Number of bytes of data to write.
+
+ @return
+ Non-negative integer -- Number of bytes written. \n
+ Negative error code -- Write could not take place.
+
+ @dependencies
+ None.
+ */
+int qurt_qdi_write(int handle, const void *buf, unsigned len);
+
+/**@ingroup func_qurt_qdi_read
+ User-visible API to read data from a QDI handle.
+ A predefined invocation routine for drivers that
+ support a POSIX-like read functionality.
+ qurt_qdi_read(handle, buf, len) is equivalent to:
+ qurt_qdi_handle_invoke(handle, QDI_READ, handle, buf, len);
+
+ @param[in] handle Driver handle.
+ @param[in] buf    Pointer to the memory address where the data read is stored.
+ @param[in] len    Number of bytes of data to read.
+
+ @return
+ Non-negative integer -- Number of bytes read. \n
+ Negative error code -- Read could not take place.
+
+ @dependencies
+ None.
+ */
+int qurt_qdi_read(int handle, void *buf, unsigned len);
+
+/**@ingroup func_qurt_qdi_close
+ Closes the specified driver, releasing any resources associated with the open driver.
+ User-visible API to close a QDI handle.
+
+ This API should be called when the user is done using a
+ QDI-based handle. When this function is called, the driver can release
+ any resources held and perform other necessary cleanup
+ operations. qurt_qdi_close(handle) is equivalent to:
+ qurt_qdi_handle_invoke(handle, QDI_CLOSE, handle);
+
+ @param[in] handle Driver handle.
+
+ @return
+ 0 -- Success.\n
+ Negative error code -- Failure.
+
+ @dependencies
+ None.
+ */
+int qurt_qdi_close(int handle);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_constants.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_constants.h
new file mode 100755
index 0000000000000..4866fada067f0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_constants.h
@@ -0,0 +1,193 @@
+#ifndef QDI_CONSTANTS_H
+#define QDI_CONSTANTS_H
+
+/**
+  @file qurt_qdi_constants.h
+  @brief Predefined invocation methods for drivers.
+
+  EXTERNALIZED FUNCTIONS
+  None
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None
+
+  Copyright (c) 2013-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+|| Method numbers used for QDI.
+||
+|| Intended grouping of method numbers for QDI
+|| including future usage:
+||
+|| Method 0 should always be unused and not responded to by
+|| any driver.
+|| Methods 1 and 2 are reserved for name registration and
+|| name lookup.
+|| Methods 3 through 31 are reserved for POSIX-type operations
+|| on open handles.
+|| Methods 32 through 127 are reserved for the QDI infrastructure
+|| and may be extended in the future to provide standard
+|| driver debug services, management services, and system
+|| notifications.
+|| Methods 128 through 255 are reserved for the use of automatically
+|| generated methods such as might be generated by an IDL (interface
+|| definition language).
The infrastructure may be extended to +|| perform services on these methods based on information provided +|| by the IDL, such as automatic buffer validation, etc. These +|| method numbers should not be used for any "ad hoc" methods. +|| Methods with number >= 256 are "private" method numbers that are +|| outside the scope of the QDI infrastructure. Drivers that want +|| to generate and consume their own "ad hoc" methods are free to +|| use these method numbers as they wish. The infrastructure does +|| not generate these method numbers or respond to them, but +|| passes them on unmolested. +|| +|| All driver implementations *should* return a value of +|| -1 when called with an unsupported method. The standard error +|| return value for POSIX APIs is -1, so we emulate that behavior +|| here. +*/ +/** @cond */ +#define QDI_UNUSED 0 +#define QDI_DEVNAME_REGISTER 1 +#define QDI_OPEN 2 +#define QDI_CLOSE 3 +#define QDI_READ 4 +#define QDI_WRITE 5 +#define QDI_IOCTL 6 +#define QDI_MMAP 7 +#define QDI_OS_FILEOPEN 8 +#define QDI_FLEN 9 +#define QDI_UNLINK 10 +#define QDI_FTELL 22 +#define QDI_SEEK 23 +#define QDI_FSTAT 24 + +#define QDI_FSNAME_REGISTER 150 +#define QDI_FS_OPEN 151 +#define QDI_MMAP2 153 +#define QDI_MPROTECT2 154 +#define QDI_MUNMAP2 155 + +#define QDI_CLIENT_HANDLE_OBJREF_GET 10 + +#define QDI_OS_PROCESS_LOAD 12 +#define QDI_OS_PROCESS_CHOOSE_ASID 13 + +#define QDI_OS_SET_GP 26 +#define QDI_CLIENT_HANDLE_CALLBACK 27 + +#define QDI_CLIENT_HANDLE_ISLAND_HANDLE_CREATE_FROM_OBJ_T 19 //reused +#define QDI_CLIENT_HANDLE_HANDLE_CREATE_FROM_OBJ_T 80 +#define QDI_CLIENT_HANDLE_HANDLE_RELEASE 81 +#define QDI_CLIENT_HANDLE_COPY_FROM_USER 82 +#define QDI_CLIENT_HANDLE_COPY_TO_USER 83 +#define QDI_CLIENT_HANDLE_SIGNAL_GROUP_CREATE 86 +#define QDI_CLIENT_HANDLE_SAFE_CACHE_OPS 87 + +#define QDI_CLIENT_HANDLE_BUFFER_LOCK 41 +#define QDI_CLIENT_HLOSPOOL_INFO_GET 90 +#define QDI_CLIENT_HLOSPOOL2_INFO_GET 96 + +#define QDI_CLIENT_PID 44 +#define QDI_CLIENT_ASID QDI_CLIENT_PID + +#define QDI_OS_CLIENT_INFO_GET 48 + +#define QDI_OS_MEM_LOOKUP_PHYSADDR 57 + +#define QDI_OS_THREAD_ITERATOR_CREATE 68 +#define QDI_OS_THREAD_ITERATOR_NEXT 69 + +#define QDI_OS_SYSENV 78 + +#define QDI_REGION_USERMALLOC_INIT 180 // This method is for generic handle + + +#define QDI_CLIENT_HANDLE_USER_MALLOC 84 +#define QDI_CLIENT_HANDLE_USER_FREE 85 + +#define QDI_SIGNAL_GROUP_SIGNAL_CREATE 96 +#define QDI_SIGNAL_GROUP_WAIT 98 +#define QDI_SIGNAL_GROUP_POLL 99 +#define QDI_SIGNAL_SET 96 +#define QDI_SIGNAL_CLEAR 97 +#define QDI_SIGNAL_WAIT 98 +#define QDI_SIGNAL_POLL 99 + +#define QDI_OS_WAIT_FOR_MAIN_REAPER 104 + +#define QDI_CLIENT_HANDLE_REFPROXY_INSTALL 105 +#define QDI_CLIENT_HANDLE_REFPROXY_ADD 106 +#define QDI_CLIENT_HANDLE_REFPROXY_REMOVE 107 + +#define QDI_CLIENT_HANDLE_DETACH 116 + +#define QDI_OS_RESERVED1 139 + +#define QDI_CLIENT_HANDLE_BUFFER_LOCK2 142 + +#define QDI_DT_REGISTER 158 +#define QDI_OPEN_DEVICE 159 +#define QDI_OPEN_FROM_DT 160 + +#define QDI_PRIVATE 256 /* Method numbers beginning at 256 + are private method numbers, which + are device-specific and available + for use by device implementors. */ +/* +|| Permission bitmasks for use with qurt_qdi_lock_buffer(). +|| +|| Make sure these match with permission values from qurt_perm_t. +*/ +/** @endcond */ + +/** @addtogroup driver_support_constants +@{ */ +#define QDI_PERM_W 2 /**< Write access. */ +#define QDI_PERM_R 1 /**< Read access. */ +#define QDI_PERM_RW (QDI_PERM_R | QDI_PERM_W) /**< Read/write access. 
*/ + +#define QDI_HANDLE_LOCAL_CLIENT 3 /**< Local client. */ +#define QDI_HANDLE_GENERIC 4 /**< Generic. */ + +#define QDI_REFCNT_BASE 0x510000 /**< */ +#define QDI_REFCNT_MAXED 0x51FFFD /**< */ +#define QDI_REFCNT_INIT 0x51FFFE /**< Driver object is temporary and is eventually deleted.*/ +#define QDI_REFCNT_PERM 0x51FFFF /**< Driver object is permanent and is never deleted. */ +/** @} */ /* end_addtogroup driver_support_constants */ + +/** @cond */ +/* +|| Flags used by process loaders. +*/ + +#define QDI_OS_PROCESS_FLAGS_ISLAND_RESIDENT 0x1 /* Set this flag to request the loaded process + to have island residency. */ +#define QDI_OS_PROCESS_FLAGS_ROOT_RESIDENT 0x2 /* Set this flag to request the loaded process + to have root residency, for example, DL Pager. */ +/* +|| Constants used for qurt_event register API, type field. +*/ + +#define QURT_PROCESS_EXIT 1 + +/* +|| Constants used by QDI extensions. +*/ + +#define QURT_QDI_SINGLETON_TYPE_TRUE 0 +#define QURT_QDI_SINGLETON_TYPE_FALSE 1 +#define QURT_QDI_SINGLETON_TYPE_PER_PROCESS 2 +/** @endcond */ +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QDI_CONSTANTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_driver.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_driver.h new file mode 100755 index 0000000000000..e044e25f1bb72 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_driver.h @@ -0,0 +1,868 @@ +#ifndef QURT_QDI_DRIVER_H +#define QURT_QDI_DRIVER_H + +/** + @file qurt_qdi_driver.h + @brief Definitions, macros, and prototypes used when writing a + QDI driver. + + EXTERNALIZED FUNCTIONS + None + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None + + Copyright (c) 2018, 2019-2021, 2023 Qualcomm Technologies, Inc. + All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#include "stddef.h" +#include "qurt_qdi.h" +#include "qurt_types.h" +#include "qurt_callback.h" +#include "qurt_qdi_constants.h" +#include "qurt_qdi_imacros.h" +#include "qurt_mutex.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| This gives the canonical form for the arguments to a QDI +|| driver invocation function. The arguments are as follows: +|| +|| int client_handle (R0) QDI handle that represents the client +|| that made this QDI request. If the +|| client is remote, this is a +|| variable handle; if the client is local +|| (same thread and process), this is +|| set to QDI_HANDLE_LOCAL_CLIENT. +|| +|| qurt_qdi_obj_t *obj (R1) Points at the qdi_object_t structure +|| on which this QDI request is being made. +|| The qdi_object_t structure is usually +|| the first element of a larger structure +|| that contains state associated with the +|| object; because it is usually the first +|| element, the object pointers can be freely +|| interchanged through casts. +|| +|| int method (R2) Integer QDI method that represents +|| the request type. +|| +|| qurt_qdi_arg_t arg1 (R3) First three general purpose arguments +|| qurt_qdi_arg_t arg2 (R4) to the invocation function are passed in +|| qurt_qdi_arg_t arg3 (R5) these slots. +|| +|| qurt_qdi_arg_t arg4 (SP+0) Arguments beyond the first three are +|| qurt_qdi_arg_t arg5 (SP+4) passed on the stack. 
+|| qurt_qdi_arg_t arg6 (SP+8) +|| qurt_qdi_arg_t arg7 (SP+12) +|| qurt_qdi_arg_t arg8 (SP+16) +|| qurt_qdi_arg_t arg9 (SP+20) +|| +|| The canonical form of the invocation function takes a +|| total of 12 arguments, but not all of them are used. In general, +|| the QDI infrastructure only passes those arguments provided by +|| the caller; if the invocation function accesses additional +|| arguments beyond those provided by the caller, the values are not +|| useful. +*/ +/** @cond */ +#define QDI_INVOKE_ARGS \ + int, struct qdiobj *, int, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t + +#define QDI_EXT_INVOKE_ARGS \ + int, qurt_qdi_man_obj_t*, int, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t, \ + qurt_qdi_arg_t, qurt_qdi_arg_t, qurt_qdi_arg_t + +#define BUFFER_LOCK 1 +#define BUFFER_UNLOCK 0 + +struct qdiobj; +/** @endcond */ +/** @addtogroup driver_support_types +@{ */ +typedef union { + void *ptr; /**< Pointer to the driver handle. */ + int num; /**< Method number. */ +} qurt_qdi_arg_t; +/** @} */ /* end_addtogroup driver_support_types */ +/** @cond */ +/** QuRT QDI driver version */ +typedef union { + int num; + struct { + short major; /** Driver major version number. */ + short minor; /** Driver minor version number. */ + }; +} qurt_qdi_version_t; + +typedef int (*qurt_qdi_pfn_invoke_t)(QDI_INVOKE_ARGS); +typedef void (*qurt_qdi_pfn_release_t)(struct qdiobj *); +/** @endcond */ +/** @addtogroup driver_support_types +@{ */ +typedef struct qdiobj { + qurt_qdi_pfn_invoke_t invoke; /**< Invocation function that implements the driver methods.*/ + int refcnt; /**< Reference count, an integer value maintained by the QDI infrastructure that tracks the number of + references to a driver instance. 
*/ + qurt_qdi_pfn_release_t release; /**< Release function that performs details associated with deleting an instance + of the driver object.*/ +} qurt_qdi_obj_t; +/** @} */ /* end_addtogroup driver_support_types */ +/** @cond */ +/** QuRT QDI managed object */ +typedef struct qurt_qdi_man_obj +{ + qurt_qdi_obj_t qdi_obj; + union + { + struct qurt_qdi_ext_driver * opener_obj; + struct qurt_qdi_ext_device * device_obj; + }; +}qurt_qdi_man_obj_t; + +typedef int (*qurt_qdi_ext_pfn_create_t)(int client_id, const char *name, qurt_qdi_version_t version, qurt_qdi_man_obj_t **qdi_obj); +typedef int (*qurt_qdi_ext_pfn_create_device_t)(int client_id, const char *name, qurt_qdi_version_t version, struct qurt_qdi_ext_device * device, qurt_qdi_man_obj_t **qdi_obj); +typedef int (*qurt_qdi_ext_pfn_invoke_t)(QDI_EXT_INVOKE_ARGS); +typedef void (*qurt_qdi_ext_pfn_destroy_t)(qurt_qdi_man_obj_t *qdi_obj); +typedef int (*qurt_qdi_ext_pfn_probe_t)(void *handle, struct qurt_qdi_ext_device **device); + +typedef struct qurt_qdi_ext_obj_info{ + qurt_qdi_man_obj_t *obj; + int qdi_client_id; + struct qurt_qdi_ext_obj_info *next; +}qurt_qdi_ext_obj_info_t; +typedef struct qurt_qdi_ext_obj_info *qurt_qdi_ext_obj_info_ptr; + +/** QuRT QDI device */ +//temporarily add this back while there are still drivers who statically define this structure +struct qurt_qdi_device { + qurt_qdi_obj_t opener_obj; + const char* name; + char island_resident; + unsigned char singleton; + qurt_qdi_ext_pfn_create_t create; + qurt_qdi_ext_pfn_invoke_t invoke; + qurt_qdi_ext_pfn_destroy_t destroy; + qurt_mutex_t qurt_qdi_ext_list_lock; + qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head; +}; +typedef struct qurt_qdi_device qurt_qdi_man_device; + +struct qurt_qdi_ext_driver { + qurt_qdi_obj_t opener_obj; + const char* name; + char island_resident; + unsigned char singleton; + qurt_qdi_ext_pfn_create_t create; + qurt_qdi_ext_pfn_invoke_t invoke; + qurt_qdi_ext_pfn_destroy_t destroy; + qurt_mutex_t qurt_qdi_ext_list_lock; + qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head; + qurt_qdi_ext_pfn_create_device_t create_device; + qurt_qdi_version_t version; + qurt_qdi_ext_pfn_probe_t probe; + const char* compatible; + struct qurt_qdi_ext_device * device_list; + //qurt_qdi_ext_device_ptr device_list; +}; +typedef struct qurt_qdi_ext_driver qurt_qdi_ext_driver_t; +//above replaces qurt_qdi_man_device + +extern int qurt_qdi_obj_ref_inc(qurt_qdi_obj_t *); +extern int qurt_qdi_obj_ref_dec(qurt_qdi_obj_t *); + +extern int qurt_qdi_ext_opener (QDI_INVOKE_ARGS); +/** @endcond */ +/**@ingroup func_qurt_qdi_method_default + Processes a method that is unrecognized or unsupported in the driver invocation function. + All arguments passed to the current invocation function (Section @xref{sec:invocationFunction}) must be forwarded + to this function. + + @note1hang Invocation functions must process all unrecognized or unsupported methods + by calling this function. + + @return + None. + + @dependencies + None. +*/ +extern int qurt_qdi_method_default(QDI_INVOKE_ARGS); + +/**@ingroup func_qurt_qdi_handle_create_from_obj_t + Allocates a new device handle for use with the specified driver object. + + @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[out] obj Pointer to the driver object. + + @return + Non-negative integer -- Success; this value is the new handle. \n + Negative value -- Error. + + @dependencies + None. 
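+
+ A minimal sketch (illustrative; my_device_t, my_invoke, and my_release are
+ hypothetical driver definitions) of returning a new handle from a driver's
+ QDI_OPEN path:
+ @code
+ typedef struct { qurt_qdi_obj_t qdiobj; } my_device_t;
+
+ // inside the driver's QDI_OPEN handling, with dev pointing at a my_device_t:
+ dev->qdiobj.invoke  = my_invoke;        // driver invocation function
+ dev->qdiobj.refcnt  = QDI_REFCNT_INIT;  // temporary object, deleted when unreferenced
+ dev->qdiobj.release = my_release;       // cleanup callback
+ return qurt_qdi_handle_create_from_obj_t(client_handle, &dev->qdiobj);
+ @endcode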
+*/ +static __inline int qurt_qdi_handle_create_from_obj_t(int client_handle, qurt_qdi_obj_t *obj) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_HANDLE_CREATE_FROM_OBJ_T, + obj); +} + +/**@ingroup func_qurt_qdi_handle_invoke + Allocates a new island device handle for use with the specified driver object. + + @param[in] client_handle Client handle obtained from the current invocation function (Section 3.4.1). + @param[in] obj Pointer. + + @return + Non-negative integer value that is the new handle -- Success. \n + Negative return value -- Error. + + @dependencies + None. +*/ +static __inline int qurt_qdi_island_handle_create_from_obj_t(int client_handle, qurt_qdi_obj_t *obj) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_ISLAND_HANDLE_CREATE_FROM_OBJ_T, + obj); +} + +/**@ingroup func_qurt_qdi_handle_release + Deallocates the specified device handle. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] handle_to_release Handle to release. + + @return + 0 -- Success. \n + Negative value -- Error. + + @dependencies + None. +*/ +static __inline int qurt_qdi_handle_release(int client_handle, int handle_to_release) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_HANDLE_RELEASE, + handle_to_release); +} + +static __inline qurt_qdi_obj_t * +qurt_qdi_objref_get_from_handle(int client_handle, int object_handle) +{ + qurt_qdi_obj_t *ret; + + ret = NULL; + + qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_OBJREF_GET, + object_handle, + &ret); + + return ret; +} + +/**@ingroup func_qurt_client_add_memory + Adds a physical address range to the HLOS physpool of the caller user PD. + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[in] phys_addr Starting address of the physical address range. + @param[in] size Size. + + @return + #QURT_EOK -- Pages successfully added. + + @dependencies + None. +*/ +int qurt_client_add_memory(int client_handle, qurt_addr_t phys_addr, qurt_size_t size); + +/**@ingroup func_qurt_client_add_memory2 + Adds a physical address range to the HLOS physpool of the caller user PD. + + @param[in] client_handle Obtained from the current invocation function (Section 3.4.1). + @param[in] phys_addr Starting 36-bit address of the physical address range. + @param[in] size Size. + + @return + #QURT_EOK -- Pages successfully added. + + @dependencies + None. +*/ +int qurt_client_add_memory2(int user_client_handle, qurt_paddr_64_t phys_addr, qurt_size_t size); + +static __inline qurt_qdi_obj_t * +qurt_qdi_objref_get_from_pointer(qurt_qdi_obj_t *objptr) +{ + qurt_qdi_obj_t * ret = NULL; + + if (qurt_qdi_obj_ref_inc(objptr) < 0) { + ret = NULL; + } else { + ret = objptr; + } + + return ret; +} + +static __inline void +qurt_qdi_objref_release(qurt_qdi_obj_t *objptr) +{ + if (qurt_qdi_obj_ref_dec(objptr) == 1) { + (*objptr->release)(objptr); + } +} + +/**@ingroup func_qurt_qdi_copy_from_user + Copies the contents of a user memory buffer into the current driver. + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] dest Base address of the driver buffer. + @param[in] src Base address of the user buffer. + @param[in] len Number of bytes to copy. 
+ + @return + Negative value -- Indicates a privilege or security violation, the copy operation + has crossed a privilege boundary. + + @dependencies + None. +*/ +static __inline int qurt_qdi_copy_from_user(int client_handle, void *dest, const void *src, unsigned len) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_COPY_FROM_USER, + dest, src, len); +} + +/**@ingroup qurt_qdi_copy_string_from_user + Copies the contents of a user memory buffer into the current driver. + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param client_handle Obtained from the current invocation function (Section 3.4.1). + @param dest Base address of the driver buffer. + @param src Base address of the user buffer. + @param len Number of bytes to copy. NOTE: This is the destination buffer length. + + @return + Negative error result -- privilege or security violation, the copy operation + has crossed a privilege boundary. + + @dependencies + None. +*/ +int qurt_qdi_copy_string_from_user(int client_handle, char *dest, const char *src, unsigned len); + +/**@ingroup func_qurt_qdi_copy_to_user + Copies the contents of a driver memory buffer to user memory. + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] dest Base address of the user buffer. + @param[in] src Base address of the driver buffer. + @param[in] len Number of bytes to copy. + + @return + Negative value -- Indicates a privilege or security violation, the copy operation has crossed a + privilege boundary + + @dependencies + None. +*/ +static __inline int qurt_qdi_copy_to_user(int client_handle, void *dest, const void *src, unsigned len) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_COPY_TO_USER, + dest, src, len); +} + +/**@ingroup func_qurt_qdi_safe_cache_ops + Do cache operations on user memory + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] addr Base address of the user memory. + @param[in] size Size of the user memory. + @param[in] opcode Cache operations (QURT_MEM_CACHE_FLUSH, QURT_MEM_CACHE_INVALIDATE...) + @param[in] type Cache type (QURT_MEM_ICACHE, QURT_MEM_DCACHE) + + @return + Negative value -- Indicates a privilege or security violation, the copy operation has crossed a + privilege boundary + + @dependencies + None. +*/ +static __inline int qurt_qdi_safe_cache_ops(int client_handle, qurt_addr_t addr, qurt_size_t size, + qurt_mem_cache_op_t opcode, qurt_mem_cache_type_t type) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_SAFE_CACHE_OPS, + addr, size, opcode, type); +} + + +/**@ingroup func_qurt_qdi_buffer_lock + Prepares for the direct manipulation of a potentially untrusted buffer provided by a QDI + client. + + This function is used to permit a trusted driver to safely access memory that is + provided by a potentially untrusted client. A driver calls this function to obtain a safe buffer + pointer for accessing the memory. + + This function performs the following security checks: \n + - Verifies that the entire buffer is accessible to the client. 
\n + - Ensures that the pointer remains valid for the remainder of the QDI driver + operation. \n + + @note1hang User buffer addresses are valid only for the duration of the current driver + invocation. + + @param[in] client_handle Obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param[in] buf Pointer to the base address of the client buffer address. + @param[in] len Buffer length (in bytes). + @param[in] perms Bitmask value that specifies the read or write access to perform on the + client buffer: \n + - #QDI_PERM_R -- Read access \n + - #QDI_PERM_W -- Write access \n + - #QDI_PERM_RW -- Read/write access @tablebulletend + @param[out] obuf Pointer to the buffer address that the driver must use to access the buffer. + + @return + Negative value -- Error; the operation crosses a privilege boundary, indicating a privilege or security violation. \n + Nonzero value -- User passed a buffer that does not fulfill the requested read/write access permission. + In this case the QDI driver call must be terminated cleanly, with an appropriate error code + returned to the client. \n + Zero -- Success; when this occurs the QDI driver must use the pointer at *obuf to access memory, and not the + pointer passed in as buf -- even if the user process changes the mapping of memory at buf, + the mapping of memory at *obuf remains valid until the driver invocation completes. + + @dependencies + None. +*/ +static __inline int qurt_qdi_buffer_lock(int client_handle, void *buf, unsigned len, + unsigned perms, void **obuf) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_BUFFER_LOCK, + buf, len, perms, obuf); +} + +/**@ingroup func_qurt_qdi_buffer_lock2 + Prepares for the direct manipulation of a possibly-untrusted buffer provided by a QDI + client. + This API permits a trusted driver to safely access memory + provided by a possibly-untrusted client. A driver calls this function to obtain a safe buffer + pointer for accessing the memory. + This function performs the following security checks: \n + -- Entire buffer is accessible to the client. \n + -- Entire buffer is mapped with permissions passed in perms field \n + -- Entire buffer is physically contiguous \n + In addition to the security checks, the API also locks the client mapping such that the client + cannot remove the mapping while the physical memory is used by the trusted + driver. \n + + @note1 Drivers are responsible for calling qurt_qdi_buffer_unlock() at appropriate time. Not + pairing qurt_qdi_buffer_unlock() with this API leads to resource leakages and + process exit failures. Drivers can keep track of which buffers are locked for + a particular client. If the client exits abruptly, the buffers can be + unlocked on driver release invocation for the exiting client. + + @note2 This API is supported in limited capacity when called from Island mode. Safe buffer + unmapping or user buffer unlock is not supported in Island mode. + + @param client_handle Obtained from the current invocation function (Section 3.4.1). + @param buf Pointer to the base address of the client buffer address. + @param len Buffer length (in bytes). + @param perms Bitmask value that specifies the read or write access to perform on the + client buffer: \n + -- #QDI_PERM_R -- Read access \n + -- #QDI_PERM_W -- Write access \n + -- #QDI_PERM_RW -- Read/write access \n + @param obuf Optional parameter that returns a pointer to the buffer address that + the driver must use to access the buffer. 
If NULL is passed, the API + only performs security checks and does not create a mapping to access the user buffer in + a safe way. + + @return + QURT_EINVALID -- Arguments passed to the API are invalid. User buffer pointer is NULL or length of the + buffer is 0. \n + QURT_EPRIVILEGE -- One of the security checks on the user buffer failed. \n + QURT_EFAILED -- Mapping cannot be created for the trusted driver. \n + QURT_EOK -- Lock operation was successful. When this occurs, the QDI driver must use the + pointer at *obuf to perform its memory accesses, and not the + pointer passed in as buf. + + @dependencies + None. +*/ +static __inline int qurt_qdi_buffer_lock2(int client_handle, void *buf, unsigned len, + unsigned perms, void **obuf) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_BUFFER_LOCK2, + BUFFER_LOCK, buf, len, perms, obuf); +} + +/**@ingroup func_qurt_qdi_buffer_unlock + This API is paired with qurt_qdi_buffer_lock2(). A temporary overlapping mapping + created for the driver is removed. Client mapping for the user buffer is + unlocked. + + @note1 Drivers are responsible for pairing this with qurt_qdi_buffer_lock(). Not + pairing qurt_qdi_buffer_lock() with this API leads to resource leakages and + process exit failures. Drivers can keep track of which buffers are locked for + a particular client, and if the client exits abruptly, all the buffers can be + unlocked on driver release invocation for the exiting client. + + @note2 This API is supported in limited capacity when called from Island mode. Actual + unmapping of driver accessible memory or unlocking of the buffer is not + supported in Island bode. + + @param client_handle Obtained from the current invocation function (Section 3.4.1). + @param buf Pointer to the base address of the client buffer address. + @param len Buffer length (in bytes). + @param obuf Safe buffer address that was returned in the obuf field after calling + qurt_qdi_buffer_lock2(). + + @return + QURT_EINVALID -- Arguments passed to the API are invalid. User buffer pointer is NULL or length of the + buffer is 0. \n + QURT_EOK -- Lock operation was successful. When this occurs, the QDI driver must use the + pointer at *obuf to perform its memory accesses, and not the + pointer passed in as buf. \n + other results -- Safe buffer unmapping failed or unlocking of user buffer failed \n. + + @dependencies + None. +*/ +static __inline int qurt_qdi_buffer_unlock(int client_handle, void *buf, unsigned len, + void *obuf) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_BUFFER_LOCK2, + BUFFER_UNLOCK, buf, len, obuf); +} + +/**@ingroup func_qurt_qdi_user_malloc + Allocates memory area in the QDI heap that is read/write accessible to both the driver and + the client. \n + @note1hang The QDI heap has a limited amount of memory available, and only the + device driver can free the allocated memory. + + @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param size Size. + + @return + Non-zero -- Success; this returned value points to the allocated memory area. \n + Zero -- Error. + + @dependencies + None. +*/ +void *qurt_qdi_user_malloc(int client_handle, unsigned size); + +/**@ingroup func_qurt_qdi_user_free + Deallocates memory area in the QDI heap. + + @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param ptr Pointer. + + @dependencies + None. 
+*/ +void qurt_qdi_user_free(int client_handle, void *ptr); + +/**@ingroup funct_qurt_qdi_client_detach + Detaches a client (a process), indicating that the client does not + participate in the qurt_wait() mechanism. This behavior + is opt-in and irrevocable. When a client is detached, it can + not be un-detached. + + @param client_handle Handle of the client to detach. + + @return + Zero -- Success. Detachable clients always return success. + Nonzero value -- client_handle did not refer to a + detachable user client. + + @dependencies + None. +*/ +static __inline int qurt_qdi_client_detach(int client_handle) +{ + return qurt_qdi_handle_invoke(client_handle, QDI_CLIENT_HANDLE_DETACH); +} + +/**@ingroup func_qurt_qdi_signal_group_create + Creates a new signal group for use in a device driver. + A QDI signal group contains up to 32 signals, which can be operated on either + individually (using the qurt_qdi_signal_* functions) or as a group (using the + qurt_qdi_signal_group_* functions). \n + @note1hang Driver implementation is responsible for using the proper signal group + handle in any given situation. \n + For more information on signals, see the Hexagon QuRT RTOS User Guide (80-VB419-78). + + @param client_handle Client handle obtained from the current invocation function (Section @xref{sec:invocationFunction}). + @param p_signal_group_handle_local Returns a handle intended for use by code that + resides in the same context and process as the created signal group + (for example, the device driver implementation that allocated the + signal group). + @param p_signal_group_handle_remote Returns a handle intended for use by code + that resides in a different context and process than the created signal group + (for example, the user-mode client of an OS driver). + + @return + Zero return value indicates success.\n + Negative return value indicates could not create signal group. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_group_create(int client_handle, + int *p_signal_group_handle_local, + int *p_signal_group_handle_remote) +{ + return qurt_qdi_handle_invoke(client_handle, + QDI_CLIENT_HANDLE_SIGNAL_GROUP_CREATE, + p_signal_group_handle_local, + p_signal_group_handle_remote); +} + +/**@ingroup func_qurt_qdi_signal_group_wait + Suspends the current thread until any of the signals are set in the specified signal group. + + If a signal is set in a signal group object, and a thread waits on the signal group object, + the thread is awakened. If the awakened thread has higher priority than the current + thread, a context switch can occur. + + @param signal_group_handle Handle of the signal group. + + @return + If the client is remote: + QURT_EOK -- Wait complete \n + QURT_ECANCEL -- Wait cancelled.\n + If the client is local, returns a 32-bit word with current signals. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_group_wait(int signal_group_handle) +{ + return qurt_qdi_handle_invoke(signal_group_handle, + QDI_SIGNAL_GROUP_WAIT); +} + +/**@ingroup func_qurt_qdi_signal_group_poll + Returns a value that indicates if any of the signals are set in the specified signal group. + + @param signal_group_handle Handle of the signal group. + + @return + 1 -- Indicates whether any of the signals are set in the signal group.\n + 0 -- Indicates that none of the signals are set. + + @dependencies + None. 
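+
+ A minimal sketch (illustrative) of creating a signal group in a driver and
+ polling it on the local handle:
+ @code
+ int grp_local, grp_remote;
+
+ if (qurt_qdi_signal_group_create(client_handle, &grp_local, &grp_remote) == 0) {
+     // the remote handle would be returned to the user-mode client
+     if (qurt_qdi_signal_group_poll(grp_local)) {
+         // at least one signal in the group is set
+     }
+ }
+ @endcode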
+*/ +static __inline int qurt_qdi_signal_group_poll(int signal_group_handle) +{ + return qurt_qdi_handle_invoke(signal_group_handle, + QDI_SIGNAL_GROUP_POLL); +} + + +/**@ingroup func_qurt_qdi_signal_create + Creates a new signal in the specified signal group. + For more information on signals, see the Hexagon QuRT RTOS User Guide (80-VB419-78). + + @note1hang Driver implementation is responsible for using the proper signal handle in + any given situation. + + @param signal_group_handle Handle of an existing signal group. + @param p_signal_handle_local Returns a handle intended for use by code that resides in + the same context and process as the created signal (for example, + the device driver implementation that allocated the signal). + @param p_signal_handle_remote Returns a handle intended for use by code that resides in + a different context and process than the created signal (for + example, the user-mode client of an OS driver). + + @return + Nonzero value -- No more signals can be created in the specified + signal group. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_create(int signal_group_handle, + int *p_signal_handle_local, + int *p_signal_handle_remote) +{ + return qurt_qdi_handle_invoke(signal_group_handle, + QDI_SIGNAL_GROUP_SIGNAL_CREATE, + p_signal_handle_local, + p_signal_handle_remote); +} + +/**@ingroup func_qurt_qdi_signal_set + Sets the signal in the specified signal object. + + @param signal_handle Handle of the signal. + + @return + Always returns 0. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_set(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_SET); +} + +/**@ingroup func_qurt_qdi_signal_clear + Clears the signal in the specified signal object. + + @param signal_handle Handle of the signal. + + @return + Always returns 0. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_clear(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_CLEAR); +} + +/**@ingroup func_qurt_qdi_signal_wait + Suspends the current thread until the specified signal is set. + If a signal is set in a signal object, and a thread waits on the signal object, the + thread is awakened. If the awakened thread has higher priority than the current thread, a + context switch may occur. + + @param signal_handle Handle of the signal. + + @return + If client is remote: + QURT_EOK -- Wait complete. \n + QURT_ECANCEL -- Wait cancelled.\n + If client is local, return a 32-bit word with current signals. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_wait(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_WAIT); +} + +/**@ingroup func_qurt_qdi_signal_poll + Returns a value that indicates if the specified signal is set. + + @param signal_handle Handle of the signal. + + @return + 1 -- Signal is set. \n + 0 -- Signal is not set. + + @dependencies + None. +*/ +static __inline int qurt_qdi_signal_poll(int signal_handle) +{ + return qurt_qdi_handle_invoke(signal_handle, + QDI_SIGNAL_POLL); +} + +/**@ingroup func_qurt_qdi_devname_register + Registers a QDI device with the generic QDI object in the + current QDI context. + + This function registers an exact name or a directory prefix with a QDI opener object. + Future invocations of qurt_qdi_open() in the context of the caller invokes the + opener object if a match is detected. + + Directory prefix names are specified by ending the name with a forward slash character. 
+ + Example of an exact name: + @code qurt_qdi_devname_register(/dev/foobar, foobar_opener);@endcode + + Example of a directory prefix: + @code qurt_qdi_devname_register(/pipedev/, pipedev_opener);@endcode + + Given the two registrations shown above, the only qurt_qdi_open() requests to + direct to the foobar_opener object are requests for the exact name + "/dev/foobar", Any request beginning with "/pipedev/" is directed to the + pipedev_opener object. + + The pipedev invocation function presumably examines the name argument to + determine exactly how to handle the request. The name is passed to the invocation + function in the a1.ptr argument (Section @xref{sec:invocationFunction}). + + @param name Device name or device name prefix. + @param opener Pointer to the opener object for the device. + + @return + 0 -- Device was successfully registered. \n + Negative error code -- Device was not registered. + + @dependencies + None. + */ +static __inline int qurt_qdi_devname_register(const char *name, + qurt_qdi_obj_t *opener) +{ + return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, + QDI_DEVNAME_REGISTER, + name, + opener); +} + +// Macros for backward compatibility with deprecated APIs +// (These will go away soon) + +#define qurt_qdi_register_devname(name, opener) \ + qurt_qdi_devname_register((name), (void *)(opener)) +#define qurt_qdi_new_handle_from_obj_t(handle, obj) \ + qurt_qdi_handle_create_from_obj_t((handle), (obj)) +#define qurt_qdi_release_handle(client_handle, handle) \ + qurt_qdi_handle_release((client_handle), (handle)) +#define qurt_qdi_lock_buffer(handle, buf, len, perms, obuf) \ + qurt_qdi_buffer_lock((handle), (buf), (len), (perms), (obuf)) +#define qurt_qdi_usermalloc(handle, size) \ + qurt_qdi_user_malloc((handle), (size)) +#define qurt_qdi_userfree(handle, ptr) \ + qurt_qdi_user_free((handle), (ptr)) + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_ext.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_ext.h new file mode 100755 index 0000000000000..383e1799a15d6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_ext.h @@ -0,0 +1,58 @@ +#ifndef QURT_QDI_EXT_H +#define QURT_QDI_EXT_H + +/** + @file qurt_qdi_driver.h + @brief Definitions, macros, and prototypes used when writing a + QDI driver + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2018, 2019-2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ +#include "qurt_qdi_driver.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct qurt_qdi_ext_device { + qurt_qdi_ext_obj_info_ptr qurt_qdi_ext_obj_info_head; + struct qurt_qdi_ext_device * next; + char * instance; + fdt_node_handle context; +}; +typedef struct qurt_qdi_ext_device *qurt_qdi_ext_device_ptr; + +/**@ingroup func_qurt_qdi_dt_register + Registers a QDI device with the generic QDI object in the current QDI context, + if and only if a compatible device node is found in the device tree. This + function serves as a device tree aware wrapper for qurt_qdi_devname_register(). + + @param name Device name or device name prefix. + @param opener Pointer to QDI ext specialized opener object for the driver. + + @return + 0 -- Device was successfully registered. 
\n + Negative error code -- Device was not registered. +*/ +static __inline int qurt_qdi_dt_register(const char *name, qurt_qdi_obj_t *opener) +{ + return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, QDI_DT_REGISTER, name, opener); +} + +static inline void qurt_qdi_ext_deviceobj_set_name (struct qurt_qdi_ext_device * device, char * name) +{ + device->instance = name; +} + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_imacros.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_imacros.h new file mode 100755 index 0000000000000..c0a8448ac87f8 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_imacros.h @@ -0,0 +1,34 @@ +#ifndef QURT_QDI_IMACROS_H +#define QURT_QDI_IMACROS_H + +/** + @file qurt_qdi_imacros.h + @brief Internal macros used for QDI. Mostly consists of tricky (and ugly) + preprocessor hacks that permit us to do varargs function invocations + where we pass optional arguments in registers and where we can do + type casting and checking automatically. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define _QDMPASTE(a,b) _QDMPASTE_(a,b) +#define _QDMPASTE_(a,b) a##b +#define _QDMCNT(...) _QDMCNT_(__VA_ARGS__,12,11,10,9,8,7,6,5,4,3,2,1,0) +#define _QDMCNT_(a,b,c,d,e,f,g,h,i,j,k,l,cnt,...) cnt + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_proxy.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_proxy.h new file mode 100755 index 0000000000000..f1d8992ea8811 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_qdi_proxy.h @@ -0,0 +1,55 @@ +/*============================================================================= + + qurt_qdi_proxy.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved. 
+=============================================================================*/
+#ifndef _QURT_QDI_PROXY_H
+#define _QURT_QDI_PROXY_H
+
+#include "qurt_qdi_driver.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* APIs that operate on the proxy object directly */
+int qurt_qdi_proxy_ref_create(void);
+
+/* APIs that operate on a proxy, given a known proxy handle
+ * 1) using the QDI handle of the object
+ *    successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_qdi_proxy_ref_add_by_handle(int proxy_handle, int qdi_handle);
+int qurt_qdi_proxy_ref_sub_by_handle(int proxy_handle, int qdi_handle);
+
+/* 2) using an object reference
+ *    successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_qdi_proxy_ref_add_by_object(int proxy_handle, qurt_qdi_obj_t *obj_ptr);
+int qurt_qdi_proxy_ref_sub_by_object(int proxy_handle, qurt_qdi_obj_t *obj_ptr);
+
+/* API that associates a proxy object with a particular client, given a client handle
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_client_proxy_ref_install (int client_handle, int proxy_handle);
+
+/* APIs that operate on a proxy object from a user client
+ * successful return: QURT_EOK, anything else -- failure
+ */
+int qurt_client_proxy_ref_add(int qdi_handle);
+int qurt_client_proxy_ref_remove(int qdi_handle);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_QDI_PROXY_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_rmutex.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_rmutex.h
new file mode 100755
index 0000000000000..a013a0bbddb1d
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_rmutex.h
@@ -0,0 +1,200 @@
+#ifndef QURT_RMUTEX_H
+#define QURT_RMUTEX_H
+/**
+  @file qurt_rmutex.h
+  Prototypes of rmutex API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2013 - 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+#include
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup func_qurt_rmutex_init
+ Initializes a recursive mutex object.
+ The recursive mutex is initialized in the unlocked state.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[out] lock Pointer to the recursive mutex object.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_rmutex_init(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_destroy
+ Destroys the specified recursive mutex. \n
+ @note1hang Recursive mutexes must not be destroyed while they are still in use. If this
+ occurs, the behavior of QuRT is undefined.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the recursive mutex object to destroy.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_rmutex_destroy(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_lock
+ Locks the specified recursive mutex. \n
+
+ If a thread performs a lock operation on a mutex that is not in use, the thread
+ gains access to the shared resource that the mutex protects, and continues executing.
+
+ If a thread performs a lock operation on a mutex that is already in use by another
+ thread, the thread is suspended.
+ When the mutex becomes available again (because the other thread has unlocked
+ it), the thread is awakened and given access to the shared resource.
+
+ @note1hang A thread is not suspended if it locks a recursive mutex that it has already
+ locked. However, the mutex does not become available to other threads until the
+ thread performs a balanced number of unlocks on the mutex.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the recursive mutex object to lock.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_rmutex_lock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_lock_timed
+ Locks the specified recursive mutex. The wait is terminated when the specified
+ timeout expires.\n
+
+ If a thread performs a lock operation on a mutex that is not in use, the thread
+ gains access to the shared resource that the mutex is protecting, and continues executing.
+
+ If a thread performs a lock operation on a mutex that is already in use by another
+ thread, the thread is suspended. When the mutex becomes available again (because the
+ other thread has unlocked it), the thread is awakened and given access to the shared resource.
+
+ @note1hang A thread is not suspended if it locks a recursive mutex that it has already
+ locked by itself. However, the mutex does not become available to other threads until the
+ thread performs a balanced number of unlocks on the mutex.
+ If the timeout expires, the wait is terminated and no access to the mutex is granted.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock     Pointer to the recursive mutex object to lock.
+ @param[in] duration Interval (in microseconds); the value must be between
+                     #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_ETIMEDOUT -- Timeout.
+
+ @dependencies
+ None.
+
+ */
+int qurt_rmutex_lock_timed(qurt_mutex_t *lock, unsigned long long int duration);
+
+/**@ingroup func_qurt_rmutex_unlock
+ Unlocks the specified recursive mutex. \n
+ More than one thread can be suspended on a mutex. When the mutex is
+ unlocked, the thread waiting on the mutex awakens. If the awakened
+ thread has higher priority than the current thread, a context switch occurs.
+
+ @note1hang When a thread unlocks a recursive mutex, the mutex is not available until
+ the balanced number of locks and unlocks has been performed on the mutex.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the recursive mutex object to unlock.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_rmutex_unlock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_try_lock
+ Attempts to lock the specified recursive mutex.\n
+
+ If a thread performs a try_lock operation on a recursive mutex that is not in use, the
+ thread gains access to the shared resource that is protected by the mutex, and continues
+ executing.\n
+ If a thread performs a try_lock operation on a recursive mutex that another thread has
+ already locked, qurt_rmutex_try_lock immediately returns with a nonzero result
+ value.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the recursive mutex object to lock.
+
+ @return
+ 0 -- Success. \n
+ Nonzero -- Failure.
+
+ */
+int qurt_rmutex_try_lock(qurt_mutex_t *lock);
+
+/**@ingroup func_qurt_rmutex_try_lock_block_once
+ Attempts to lock a mutex object recursively. If the mutex is available,
+ it locks the mutex and returns 0. If the mutex is held by the current thread,
+ it increments the internal counter and returns 0.
+ If the mutex is already locked by another thread, the caller thread is
+ suspended. When the mutex becomes available again (because the other
+ thread has unlocked it), the caller thread is awakened and tries to lock
+ the mutex once more. If it fails to acquire the mutex, this function returns
+ a nonzero value; if it succeeds, it returns zero.
+
+ @datatypes
+ #qurt_mutex_t
+
+ @param[in] lock Pointer to the qurt_mutex_t object.
+
+ @return
+ 0 -- Success. \n
+ Nonzero -- Failure.
+
+ @dependencies
+ None.
+ */
+int qurt_rmutex_try_lock_block_once(qurt_mutex_t *lock);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_RMUTEX_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_rmutex2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_rmutex2.h
new file mode 100755
index 0000000000000..a37e7e4458c4b
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_rmutex2.h
@@ -0,0 +1,183 @@
+#ifndef QURT_RMUTEX2_H
+#define QURT_RMUTEX2_H
+/**
+  @file qurt_rmutex2.h
+  @brief Prototypes of rmutex2 API
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup mutex_types
+@{ */
+/*=============================================================================
+                        TYPEDEFS
+=============================================================================*/
+
+/** QuRT rmutex2 type.
+    Mutex type used with rmutex2 APIs.
+ */
+typedef struct {
+   /** @cond */
+   unsigned int holder __attribute__((aligned(8))); /* UGP value of the mutex holder. */
+   unsigned short waiters;    /* Number of waiting threads. */
+   unsigned short refs;       /* Number of references to this mutex. */
+   unsigned int queue;        /* Kernel-maintained futex queue value. */
+   unsigned int excess_locks; /* Number of excess times the holder has locked the mutex. */
+   /** @endcond */
+} qurt_rmutex2_t;
+/** @} */ /* end_addtogroup mutex_types */
+/** @cond internal_only*/
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_rmutex2_init
+
+ @deprecated use #qurt_rmutex_init instead.
+
+ Initializes a recursive mutex object.
+
+ The recursive mutex is initially unlocked.
+
+ Objects of type rmutex2 solve a potential race condition between
+ unlock() and destroy() operations.
+
+ @datatypes
+ #qurt_rmutex2_t
+
+ @param[out] lock Pointer to the recursive mutex object.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_rmutex2_init(qurt_rmutex2_t *lock);
+
+/**@ingroup func_qurt_rmutex2_destroy
+
+ @deprecated use #qurt_rmutex_destroy instead.
+
+ Destroys the specified recursive mutex. \n
+ @note1hang Recursive mutexes must not be destroyed while they are still in use. If this
+ occurs, the behavior of QuRT is undefined.
+ @note1cont In general, application code must destroy an rmutex2 object prior to
+ deallocating it; calling qurt_rmutex2_destroy() before deallocating it ensures
+ that all qurt_rmutex2_unlock() calls complete.
+
+ @datatypes
+ #qurt_rmutex2_t
+
+ @param[in] lock Pointer to the recursive mutex object to destroy.
+ + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_destroy(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_lock + + @deprecated use #qurt_rmutex_lock instead. + + Locks the specified recursive mutex. \n + + If a thread performs a lock operation on a recursive mutex that is not in use, the + thread gains access to the shared resource that the mutex protects, and continues + to execute. + + If a thread performs a lock operation on a recursive mutex that another thread is using, + the thread is suspended. When the mutex becomes available again + (because the other thread has unlocked it), the thread is awakened and given access to the + shared resource. + + @note1hang A thread is not suspended if it locks a recursive mutex that it has already + locked, but the mutex does not become available until the thread performs a + balanced number of unlocks on the mutex. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_lock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_unlock + + @deprecated use #qurt_rmutex_unlock instead. + + Unlocks the specified recursive mutex. \n + More than one thread can be suspended on a recursive mutex. When the mutex is + unlocked, only the highest-priority thread waiting on the mutex awakens. If the + awakened thread has higher priority than the current thread, a context switch occurs. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to unlock. + + @return + None. + + @dependencies + None. + + */ +void qurt_rmutex2_unlock(qurt_rmutex2_t *lock); + +/**@ingroup func_qurt_rmutex2_try_lock + + @deprecated use #qurt_rmutex_try_lock instead. + + Attempts to lock the specified recursive mutex.\n + + Non-blocking version of qurt_rmutex2_lock(). When a call to qurt_rmutex2_lock() + succeeds immediately, this function behaves similarly, returning 0 for success. + When a call to qurt_rmutex2_lock() does not succeed immediately, this function has + no effect and returns nonzero for failure. + + @datatypes + #qurt_rmutex2_t + + @param[in] lock Pointer to the recursive mutex object to lock. + + @return + 0 -- Success. \n + Nonzero -- Failure. + + */ +int qurt_rmutex2_try_lock(qurt_rmutex2_t *lock); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_RMUTEX2_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_sclk.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_sclk.h new file mode 100755 index 0000000000000..a83cf5f1db889 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_sclk.h @@ -0,0 +1,145 @@ +#ifndef QURT_SCLK_H +#define QURT_SCLK_H +/** + @file qurt_sclk.h + @brief Header file describing the APIs supported by QuRT system SCLK + feature. + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2018-2021, 2023 Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. 
+
+=============================================================================*/
+
+
+
+
+/*=============================================================================
+
+                        INCLUDE FILES
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+
+/**
+  Conversion from microseconds to sleep ticks.
+ */
+#define QURT_SYSCLOCK_TIMETICK_FROM_US(us) ((us) * 192ULL / 10UL)
+#define qurt_sysclock_timetick_from_us(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+
+/**
+  Conversion from timer ticks to microseconds at the nominal frequency.
+*/
+#define QURT_SYSCLOCK_TIMETICK_TO_US(ticks) qurt_timer_timetick_to_us(ticks)
+
+/**
+  Maximum Qtimer duration, expressed in microseconds (1,042,499 hours).
+*/
+#define QURT_SYSCLOCK_MAX_DURATION (1042499uLL * 3600uLL * 1000uLL * 1000uLL)
+#define qurt_sysclock_max_duration() QURT_SYSCLOCK_MAX_DURATION
+/**
+  Maximum Qtimer duration, expressed in ticks (the Qtimer clock is 19.2 MHz).
+*/
+#define QURT_SYSCLOCK_MAX_DURATION_TICKS (1042499uLL * 3600uLL * 19200000uLL)
+#define qurt_sysclock_max_duration_ticks() QURT_SYSCLOCK_MAX_DURATION_TICKS
+/**
+  Sleep timer error margin for the Qtimer: 192 ticks (approximately 10 us).
+*/
+#define QURT_SYSCLOCK_ERROR_MARGIN 192U //QURT_TIMER_MIN_DURATION*timer_freq;
+#define qurt_sysclock_error_margin() QURT_SYSCLOCK_ERROR_MARGIN
+
+/*=============================================================================
+
+                        DATA DECLARATIONS
+
+=============================================================================*/
+
+/**@ingroup func_qurt_sysclock_get_hw_ticks
+ @xreflabel{sec:qurt_sysclock_get_hw_ticks}
+ Gets the hardware tick count.\n
+ Returns the current value of a 64-bit hardware counter. The value wraps around to zero
+ when it exceeds the maximum value.
+
+ @note1hang This operation must be used with care because of the wrap-around behavior.
+
+ @return
+ Integer -- Current value of the 64-bit hardware counter.
+
+ @dependencies
+ None.
+ */
+unsigned long long qurt_sysclock_get_hw_ticks (void);
+
+
+/**@ingroup func_qurt_sysclock_get_hw_ticks_32
+ @xreflabel{sec:qurt_sysclock_get_hw_ticks_32}
+ Gets the hardware tick count in 32 bits.\n
+ Returns the current value of a 32-bit hardware counter. The value wraps around to zero
+ when it exceeds the maximum value.
+
+ @note1hang This operation is implemented as an inline C function, and should be called from a C/C++ program.
+            The returned 32 bits are the lower 32 bits of the Qtimer counter.
+
+ @return
+ Integer -- Current value of the 32-bit timer counter.
+
+ @dependencies
+ None.
+ */
+static inline unsigned long qurt_sysclock_get_hw_ticks_32 (void)
+{
+    //Beginning with v61 there is a HW register that can be read directly.
+    unsigned long count;
+    __asm__ __volatile__ (" %0 = c30 " : "=r"(count));
+    return count;
+}
+
+
+/**@ingroup func_qurt_sysclock_get_hw_ticks_16
+ @xreflabel{sec:qurt_sysclock_get_hw_ticks_16}
+ Gets the hardware tick count in 16 bits.\n
+ Returns the current value of a 16-bit timer counter. The value wraps around to zero
+ when it exceeds the maximum value.
+
+ @note1hang This operation is implemented as an inline C function, and should be called from a C/C++ program.
+            The returned 16 bits are based on the value of the lower 32 bits in the Qtimer
+            counter, right shifted by 16 bits.
+
+ @return
+ Integer -- Current value of the 16-bit timer counter, calculated from the lower 32 bits in the
+ Qtimer counter, right shifted by 16 bits.
+
+ @dependencies
+ None.
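+
+ A minimal illustrative sketch (not part of the original header): timing a
+ short interval with the modulo-2^16 tick value; do_short_work() is a
+ hypothetical placeholder for the code being measured.
+
+ @code
+ void measure_short_interval(void)
+ {
+     unsigned short t0 = qurt_sysclock_get_hw_ticks_16();
+     do_short_work();
+     // Unsigned 16-bit subtraction stays correct across a single wrap-around.
+     unsigned short dt = (unsigned short)(qurt_sysclock_get_hw_ticks_16() - t0);
+     (void)dt;
+ }
+ @endcode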
+ */
+
+
+static inline unsigned short qurt_sysclock_get_hw_ticks_16 (void)
+{
+    unsigned long ticks;
+
+    //Beginning with v61 there is a HW register that can be read directly.
+    __asm__ __volatile__ (" %0 = c30 " : "=r"(ticks));
+    __asm__ __volatile__ ( "%0 = lsr(%0, #16) \n" :"+r"(ticks));
+
+    return (unsigned short)ticks;
+}
+unsigned long long qurt_timer_timetick_to_us(unsigned long long ticks);
+#define qurt_sysclock_timetick_to_us(ticks) qurt_timer_timetick_to_us(ticks)
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif /* __cplusplus */
+
+#endif /* QURT_SCLK_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_secure_proc.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_secure_proc.h
new file mode 100755
index 0000000000000..f40c7deb9bca1
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_secure_proc.h
@@ -0,0 +1,53 @@
+#ifndef QURT_SECURE_PROC_H
+#define QURT_SECURE_PROC_H
+
+/**
+  @file qurt_secure_proc.h
+  @brief Definitions, macros, and prototypes used for handling a secure process
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2015, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**@ingroup qurt_process_migrate_secure_process
+ Migrates the user process to a QuRT secure process.
+
+ @param secure_phy_address Physical starting address of secure memory.
+ @param secure_memory_size Size of secure memory.
+ @param entry              Entry function of the secure process.
+
+ @return
+ EOK -- Success. \n
+ Negative return value -- Error.
+
+ @dependencies
+ None.
+*/
+int qurt_process_migrate_secure_process(unsigned long long secure_phy_address, unsigned int secure_memory_size, void entry(unsigned));
+
+/**@ingroup qurt_process_get_migration_mem_size
+ Gets the size of all writable memory regions in a user PD, in preparation for
+ secure process migration.
+
+ @return
+ Size of all writable memory regions in a user PD.
+
+ @dependencies
+ None.
+*/
+int qurt_process_get_migration_mem_size(void);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_sem.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_sem.h
new file mode 100755
index 0000000000000..ee5ce4b2d94ab
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_sem.h
@@ -0,0 +1,252 @@
+#ifndef QURT_SEM_H
+#define QURT_SEM_H
+/**
+  @file qurt_sem.h
+  Prototypes of semaphore API.
+
+EXTERNAL FUNCTIONS
+  None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+  None.
+
+Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        TYPEDEFS
+=============================================================================*/
+/** @addtogroup semaphore_types
+@{ */
+
+/** QuRT semaphore type.
+ */
+typedef union {
+   /** @cond */
+   unsigned int raw[2] __attribute__((aligned(8)));
+   struct {
+      unsigned short val;       /**< */
+      unsigned short n_waiting; /**< */
+      unsigned int reserved1;   /**< */
+      unsigned int queue;       /**< */
+      unsigned int reserved2;   /**< */
+   }X; /** @endcond */
+} qurt_sem_t;
+/** @} */ /* end_addtogroup semaphore_types */
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_sem_add
+ Releases access to a shared resource (increments the semaphore count value by
+ the specified amount).\n
+ When a thread performs an add operation on a semaphore, the semaphore count is
+ incremented by the specified value. The result depends on the number of threads waiting
+ on the semaphore: \n
+ - When no threads are waiting, the current thread releases access to the shared resource
+   and continues executing. \n
+ - When one or more threads are waiting and the semaphore count value is nonzero,
+   the kernel repeatedly awakens the highest-priority waiting thread and decrements
+   the semaphore count value until either no waiting threads remain or the
+   semaphore count value is zero. If any of the awakened threads has higher priority
+   than the current thread, a context switch can occur.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem Pointer to the semaphore object to access.
+ @param[in] amt Amount to increment the semaphore count value.
+
+ @return
+ Unused integer value.
+
+ @dependencies
+ None.
+
+ */
+int qurt_sem_add(qurt_sem_t *sem, unsigned int amt);
+
+/**@ingroup func_qurt_sem_up
+ Releases access to a shared resource. When a thread performs an up operation on a semaphore,
+ the semaphore count value increments. The result depends on the number of threads waiting
+ on the semaphore: \n
+ - When no threads are waiting, the current thread releases access to the shared resource
+   and continues executing.\n
+ - When one or more threads are waiting and the semaphore count value is nonzero,
+   the kernel awakens the highest-priority waiting thread and decrements the
+   semaphore count value. If the awakened thread has higher priority than the current
+   thread, a context switch can occur.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem Pointer to the semaphore object to access.
+
+ @return
+ Unused integer value.
+
+ @dependencies
+ None.
+ */
+static inline int qurt_sem_up(qurt_sem_t *sem) { return qurt_sem_add(sem,1); }
+
+/**@ingroup func_qurt_sem_down
+ Requests access to a shared resource. When a thread performs a down operation on a
+ semaphore, the result depends on the semaphore count value: \n
+ - When the count value is nonzero, it is decremented, and the thread gains access to the
+   shared resource and continues executing.\n
+ - When the count value is zero, it is not decremented, and the thread is suspended on the
+   semaphore. When the count value becomes nonzero (because another thread
+   released the semaphore) it is decremented, and the suspended thread is awakened
+   and gains access to the shared resource.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem Pointer to the semaphore object to access.
+
+ @return
+ Unused integer value.
+
+ @dependencies
+ None.
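+
+ An illustrative sketch (not from the original header): a counting semaphore
+ that limits concurrent use of a pool of four hypothetical DMA channels.
+
+ @code
+ static qurt_sem_t dma_sem;
+
+ void dma_pool_init(void) { qurt_sem_init_val(&dma_sem, 4); }
+ void dma_acquire(void)   { (void)qurt_sem_down(&dma_sem); }  // blocks when all 4 are taken
+ void dma_release(void)   { (void)qurt_sem_up(&dma_sem); }
+ @endcode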
+ */
+int qurt_sem_down(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_down_timed
+ Requests access to a shared resource, waiting no longer than the specified timeout.
+ When a thread performs a down operation on a semaphore, the result depends on the
+ semaphore count value: \n
+ - When the count value is nonzero, it is decremented, and the thread gains access to the
+   shared resource and continues executing.\n
+ - When the count value is zero, it is not decremented, and the thread is suspended on the
+   semaphore. When the count value becomes nonzero (because another thread
+   released the semaphore) it is decremented, and the suspended thread is awakened
+   and gains access to the shared resource. The wait is terminated when the specified
+   timeout expires; in that case, no access to the shared resource is granted.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem      Pointer to the semaphore object to access.
+ @param[in] duration Interval (in microseconds); the value must be between
+                     #QURT_TIMER_MIN_DURATION and #QURT_TIMER_MAX_DURATION.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_ETIMEDOUT -- Timeout.
+
+ @dependencies
+ None.
+ */
+int qurt_sem_down_timed(qurt_sem_t *sem, unsigned long long int duration);
+
+/**@ingroup func_qurt_sem_try_down
+ @xreflabel{hdr:qurt_sem_try_down}
+ Requests access to a shared resource (without suspend). When a thread performs a try down
+ operation on a semaphore, the result depends on the semaphore count value: \n
+ - The count value is decremented when it is nonzero. The down operation returns 0 as
+   the function result, and the thread gains access to the shared resource and is free to
+   continue executing.\n
+ - The count value is not decremented when it is zero. The down operation returns -1
+   as the function result, and the thread does not gain access to the shared resource
+   and should not continue executing.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem Pointer to the semaphore object to access.
+
+ @return
+ 0 -- Success. \n
+ -1 -- Failure.
+
+ @dependencies
+ None.
+
+ */
+int qurt_sem_try_down(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_init
+ Initializes a semaphore object.
+ The default initial value of the semaphore count value is 1.
+
+ @param[out] sem Pointer to the initialized semaphore object.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_sem_init(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_destroy
+ Destroys the specified semaphore.\n
+ @note1hang Semaphores must be destroyed when they are no longer in use. Failure to do
+ this causes resource leaks in the QuRT kernel.\n
+ @note1cont Semaphores must not be destroyed while they are still in use. If this occurs,
+ the behavior of QuRT is undefined.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem Pointer to the semaphore object to destroy.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_sem_destroy(qurt_sem_t *sem);
+
+/**@ingroup func_qurt_sem_init_val
+ Initializes a semaphore object with the specified value.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[out] sem Pointer to the initialized semaphore object.
+ @param[in]  val Initial value of the semaphore count value.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+
+ */
+void qurt_sem_init_val(qurt_sem_t *sem, unsigned short val);
+
+/**@ingroup func_qurt_sem_get_val
+ Gets the semaphore count value.\n
+ Returns the current count value of the specified semaphore.
+
+ @datatypes
+ #qurt_sem_t
+
+ @param[in] sem Pointer to the semaphore object to access.
+
+ @return
+ Integer semaphore count value.
+
+ @dependencies
+ None.
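+
+ Illustrative only (not from the original header): the returned count is a
+ snapshot and can change as soon as it is read, so treat it as advisory,
+ for example when logging diagnostics; dma_sem is a hypothetical semaphore.
+
+ @code
+ unsigned short avail = qurt_sem_get_val(&dma_sem);
+ @endcode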
+ */
+static inline unsigned short qurt_sem_get_val(qurt_sem_t *sem) { return sem->X.val; }
+int qurt_sem_down_cancellable(qurt_sem_t *sem);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SEM_H */
+
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_shmem.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_shmem.h
new file mode 100755
index 0000000000000..980557323708a
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_shmem.h
@@ -0,0 +1,89 @@
+#ifndef QURT_SHMEM_H
+#define QURT_SHMEM_H
+
+/**
+  @file qurt_shmem.h
+
+  @brief
+  Prototypes of QuRT inter-process shared memory APIs
+
+  EXTERNALIZED FUNCTIONS
+  none
+
+  INITIALIZATION AND SEQUENCING REQUIREMENTS
+  none
+
+  Copyright (c) 2013, 2021 by Qualcomm Technologies, Inc. All Rights Reserved.
+  Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef MODE_T
+#define MODE_T
+typedef unsigned int mode_t;
+#endif //MODE_T
+
+/**
+ * The shm_open() function establishes a connection between a shared memory object and a file descriptor.
+ * The file descriptor is used by other functions, such as mmap(), to refer to that shared memory object.
+ *
+ *
+ * @param name  Pointer to a string naming the shared memory object. The name must start with "/shm/".
+ * @param oflag File status flags and file access modes of the open file description. The following
+ *              flags are defined in <fcntl.h> and supported:
+ *              O_RDONLY: Open for read access only
+ *              O_RDWR:   Open for read or write access
+ *              O_CREAT:  If the shared memory object doesn't exist, create one.
+ * @param mode  Permission flags (currently ignored)
+ *
+ * @return file descriptor (positive number) if operation successful.
+ *         negative error code if failed
+ *
+*/
+
+int shm_open(const char * name, int oflag, mode_t mode);
+
+/**
+ * The shm_mmap() function creates a shared memory mapping in the virtual address space of
+ * the calling process.
+ *
+ * @param addr   The starting address for the new mapping.
+ * @param len    Specifies the length of the shared memory region.
+ * @param prot   Describes the desired memory protection of the mapping. Same as in POSIX mmap().
+ * @param flags  Determines whether updates to the mapping are visible to other processes. Same as
+ *               in POSIX mmap().
+ * @param fd     File descriptor of the shared memory object (as returned by shm_open()).
+ * @param offset Unused.
+ *
+ * @return The starting address for the new mapping is returned.
+ *         negative error code if failed
+ *
+*/
+
+void *shm_mmap(void *addr, unsigned int len, int prot, int flags, int fd, unsigned int offset);
+
+/**
+ * The shm_close() function removes a connection between a shared memory object and a file descriptor.
+ * If no file descriptor remains connected to the shared memory object, the shared memory object
+ * is deleted automatically. A shared memory object has the same virtual address in every process;
+ * this is a restriction of the single virtual address space.
+ *
+ *
+ * @param fd File descriptor of shared memory object
+ *
+ * @return 0 if operation successful.
+ * negative error code if failed + * +*/ + + +int shm_close(int fd); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_signal.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_signal.h new file mode 100755 index 0000000000000..3a89c53394ad5 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_signal.h @@ -0,0 +1,518 @@ +#ifndef QURT_SIGNAL_H +#define QURT_SIGNAL_H + +/** + @file qurt_signal.h + @brief Prototypes of kernel signal API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @addtogroup signals_types +@{ */ +#define QURT_SIGNAL_ATTR_WAIT_ANY 0x00000000 /**< Wait any. */ +#define QURT_SIGNAL_ATTR_WAIT_ALL 0x00000001 /**< Wait all. */ + +/*===================================================================== + Typedefs + ======================================================================*/ + + +/** QuRT signal type. + */ +typedef union { + /** @cond */ + unsigned long long int raw; + struct { + unsigned int signals; + unsigned int waiting; + unsigned int queue; + unsigned int attribute; + }X; + /** @endcond */ +} qurt_signal_t; + + +/** QuRT 64-bit signal type. + */ +typedef struct { + /** @cond */ + qurt_signal_t signal_sum; + unsigned long long signals; + unsigned long long waiting; + /** @endcond */ +} qurt_signal_64_t; +/** @} */ /* end_addtogroup signals_types */ +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_signal_init + Initializes a signal object. + Signal returns the initialized object. + The signal object is initially cleared. + + @note1hang Each signal-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_signal_destroy() + when this object is not used anymore + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the initialized object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_init(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_destroy + Destroys the specified signal object. + + @note1hang Signal objects must be destroyed when they are no longer in use. Failure + to do this causes resource leaks in the QuRT kernel.\n + @note1cont Signal objects must not be destroyed while they are still in use. If this + occurs, the behavior of QuRT is undefined. + + @datatypes + #qurt_signal_t + + @param[in] *signal Pointer to the signal object to destroy. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_destroy(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_wait + @xreflabel{hdr:qurt_signal_wait} + Suspends the current thread until the specified signals are set. 
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ waiting on a signal, and 0 indicates not waiting on the signal.
+
+ If a thread is waiting on a signal object for any of the specified set of signals to be set,
+ and one or more of those signals is set in the signal object, the thread is awakened.
+
+ If a thread is waiting on a signal object for all of the specified set of signals to be set,
+ and all of those signals are set in the signal object, the thread is awakened.
+
+ The specified set of signals can be cleared when the signal is set.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in] signal    Pointer to the signal object to wait on.
+ @param[in] mask      Mask value identifying the individual signals in the signal object to
+                      wait on.
+ @param[in] attribute Indicates whether the thread waits for any of the signals to be set, or for
+                      all of them. \n
+                      @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n
+                      - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+                      - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+
+ @return
+ A 32-bit word with current signals.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+unsigned int qurt_signal_wait(qurt_signal_t *signal, unsigned int mask,
+                              unsigned int attribute);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_wait_timed
+ @xreflabel{hdr:qurt_signal_wait}
+ Suspends the current thread until the specified signals are set or until the timeout expires.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ waiting on a signal, and 0 indicates not waiting.
+
+ If a thread is waiting on a signal object for any of the specified set of signals to be set,
+ and one or more of those signals is set in the signal object, the thread is awakened.
+
+ If a thread is waiting on a signal object for all of the specified set of signals to be set,
+ and all of those signals are set in the signal object, the thread is awakened.
+
+ The specified set of signals can be cleared after the signal is set.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in]  signal    Pointer to the signal object to wait on.
+ @param[in]  mask      Mask value that identifies the individual signals in the signal object to wait on.
+ @param[in]  attribute Indicates whether the thread must wait until any of the signals are set, or until all of
+                       them are set. \n
+                       @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n
+                       - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+                       - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+ @param[out] signals   Bitmask of signals that are set.
+ @param[in]  duration  Duration (microseconds) to wait. Must be in the range
+                       [#QURT_TIMER_MIN_DURATION ... #QURT_TIMER_MAX_DURATION].
+
+ @return
+ #QURT_EOK -- Success; one or more signals were set. \n
+ #QURT_ETIMEDOUT -- Timed out. \n
+ #QURT_EINVALID -- Duration out of range.
+
+ @dependencies
+ Timed-waiting support in the kernel.
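+
+ A hedged usage sketch (not part of the original header): wait up to 10 ms
+ for either of two event bits on a previously initialized qurt_signal_t named
+ sig; the event-bit names EVT_RX and EVT_ERR are illustrative.
+
+ @code
+ #define EVT_RX  (1u << 0)
+ #define EVT_ERR (1u << 1)
+
+ unsigned int got = 0;
+ int rc = qurt_signal_wait_timed(&sig, EVT_RX | EVT_ERR,
+                                 QURT_SIGNAL_ATTR_WAIT_ANY, &got, 10000uLL);
+ if (rc == QURT_EOK) {
+     qurt_signal_clear(&sig, got);  // the wait does not clear; clear what was consumed
+ }
+ @endcode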
+*/
+/* ======================================================================*/
+int qurt_signal_wait_timed(qurt_signal_t *signal, unsigned int mask,
+                           unsigned int attribute, unsigned int *signals, unsigned long long int duration);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_wait_any
+ Suspends the current thread until any of the specified signals are set.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ to wait on a signal, and 0 indicates not to wait on it.
+
+ If a thread is waiting on a signal object for any of the specified set of signals to be set,
+ and one or more of those signals is set in the signal object, the thread is awakened.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in] signal Pointer to the signal object to wait on.
+ @param[in] mask   Mask value identifying the individual signals in the signal object to
+                   wait on.
+
+ @return
+ 32-bit word with current signals.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+static inline unsigned int qurt_signal_wait_any(qurt_signal_t *signal, unsigned int mask)
+{
+   return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_wait_all
+ Suspends the current thread until all of the specified signals are set.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ to wait on a signal, and 0 indicates not to wait on it.
+
+ If a thread is waiting on a signal object for all of the specified set of signals to be set,
+ and all of those signals are set in the signal object, the thread is awakened.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in] signal Pointer to the signal object to wait on.
+ @param[in] mask   Mask value identifying the individual signals in the signal object to
+                   wait on.
+
+ @return
+ A 32-bit word with current signals.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+static inline unsigned int qurt_signal_wait_all(qurt_signal_t *signal, unsigned int mask)
+{
+   return qurt_signal_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ALL);
+}
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_set
+ Sets signals in the specified signal object.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ to set the signal, and 0 indicates not to set it.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in] signal Pointer to the signal object to modify.
+ @param[in] mask   Mask value identifying the individual signals to set in the signal
+                   object.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+void qurt_signal_set(qurt_signal_t *signal, unsigned int mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_get
+ Gets a signal from a signal object.
+
+ Returns the current signal values of the specified signal object.
+
+ @datatypes
+ #qurt_signal_t
+
+ @param[in] *signal Pointer to the signal object to access.
+ + @return + A 32-bit word with current signals + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal_get(qurt_signal_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_clear + Clear signals in the specified signal object. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear it. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_clear(qurt_signal_t *signal, unsigned int mask); + +/**@ingroup func_qurt_signal_wait_cancellable + @xreflabel{hdr:qurt_signal_wait_cancellable} + Suspends the current thread until either the specified signals are set or the wait operation is cancelled. + The operation is cancelled if the user process of the calling thread is killed, or if the calling thread + must finish its current QDI invocation and return to user space. + + Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be waited on, and 0 indicates not to wait on it. + + If a thread is waiting on a signal object for any of the specified set of signals to be set, and one or + more of those signals is set in the signal object, the thread is awakened. + + If a thread is waiting on a signal object for all of the specified set of signals to be set, and all of + those signals are set in the signal object, the thread is awakened. + + @note1hang At most, one thread can wait on a signal object at any given time. + + @note1cont When the operation is cancelled, the caller must assume that the signal is never set. + + @datatypes + #qurt_signal_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of + them are set. Values:\n + - #QURT_SIGNAL_ATTR_WAIT_ANY \n + - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend + @param[out] return_mask Pointer to the 32-bit mask value that was originally passed to the function. + + + @return + #QURT_EOK -- Wait completed. \n + #QURT_ECANCEL -- Wait cancelled. + + + @dependencies + None. +*/ +/* ======================================================================*/ +int qurt_signal_wait_cancellable(qurt_signal_t *signal, unsigned int mask, + unsigned int attribute, + unsigned int *return_mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_init + Initializes a 64-bit signal object.\n + The signal argument returns the initialized object. + The signal object is initially cleared. + + @note1hang Each signal-based object has one or more kernel resources associated with it; + to prevent resource leaks, call qurt_signal_destroy() + when this object is not used anymore. + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the initialized object. 
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+void qurt_signal_64_init(qurt_signal_64_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_destroy
+ Destroys the specified signal object.
+
+ @note1hang 64-bit signal objects must be destroyed when they are no longer in use. Failure
+ to do this causes resource leaks in the QuRT kernel.\n
+ @note1cont Signal objects must not be destroyed while they are still in use. If this
+ occurs, the behavior of QuRT is undefined.
+
+ @datatypes
+ #qurt_signal_64_t
+
+ @param[in] signal Pointer to the signal object to destroy.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+void qurt_signal_64_destroy(qurt_signal_64_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_wait
+ Suspends the current thread until all of the specified signals are set.
+
+ Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 indicates
+ that a signal must be waited on, and 0 indicates not to wait on it.
+
+ If a thread is waiting on a signal object for all of the specified set of signals to be set,
+ and all of those signals are set in the signal object, the thread is awakened.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal_64_t
+
+ @param[in] signal    Pointer to the signal object to wait on.
+ @param[in] mask      Mask value, which identifies the individual signals in the signal object to
+                      wait on.
+ @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of
+                      them are set. \n
+                      @note1hang The wait-any and wait-all types are mutually exclusive.\n Values:\n
+                      - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+                      - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+
+ @return
+ A 64-bit word with current signals.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+unsigned long long qurt_signal_64_wait(qurt_signal_64_t *signal, unsigned long long mask,
+                                       unsigned int attribute);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_set
+ Sets signals in the specified signal object.
+
+ Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 indicates
+ that a signal must be set, and 0 indicates not to set it.
+
+ @datatypes
+ #qurt_signal_64_t
+
+ @param[in] signal Pointer to the signal object to modify.
+ @param[in] mask   Mask value identifying the individual signals to set in the signal
+                   object.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+void qurt_signal_64_set(qurt_signal_64_t *signal, unsigned long long mask);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal_64_get
+ Gets a signal from a signal object.
+
+ Returns the current signal values of the specified signal object.
+
+ @datatypes
+ #qurt_signal_64_t
+
+ @param[in] *signal Pointer to the signal object to access.
+
+ @return
+ A 64-bit double word with current signals.
+
+ @dependencies
+ None.
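+
+ An illustrative sketch (not from the original header): polling an event bit
+ above position 31, which is the main reason to use the 64-bit variant; sig64
+ is assumed to be a previously initialized qurt_signal_64_t.
+
+ @code
+ if (qurt_signal_64_get(&sig64) & (1uLL << 40)) {
+     qurt_signal_64_clear(&sig64, 1uLL << 40);  // consume the event
+ }
+ @endcode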
+*/ +/* ======================================================================*/ +unsigned long long qurt_signal_64_get(qurt_signal_64_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal_64_clear + Clears signals in the specified signal object. + + Signals are represented as bits 0 through 63 in the 64-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear it. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal_64_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. + */ +/* ======================================================================*/ +void qurt_signal_64_clear(qurt_signal_64_t *signal, unsigned long long mask); + +#ifdef __cplusplus +} +#endif + +#endif /* QURT_SIGNAL_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_signal2.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_signal2.h new file mode 100755 index 0000000000000..43975100cbf75 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_signal2.h @@ -0,0 +1,340 @@ +#ifndef QURT_SIGNAL2_H +#define QURT_SIGNAL2_H + +/** + @file qurt_signal2.h + @brief Prototypes of kernel signal2 API functions. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define QURT_SIGNAL_ATTR_WAIT_ANY 0x00000000 +#define QURT_SIGNAL_ATTR_WAIT_ALL 0x00000001 + +/*===================================================================== + Typedefs + ======================================================================*/ + +/** @addtogroup signals2_types +@{ */ +/** qurt_signal2 type. + */ +typedef union { + /** @cond */ + struct{ + unsigned int cur_mask; /* Current set of signal bits that are set. */ + unsigned int sig_state; /* Current state. */ + /* Bit 0 -- in anysignal wait. */ + /* Bit 1 -- in allsignal wait. */ + /* Bit 2 -- in interrupt wait. */ + /* Bits 31-3 -- reference count field. */ + unsigned int queue; /* Kernel-maintained futex queue value. */ + unsigned int wait_mask; /* When sig_state indicates a waiter is present, this is the wait mask. */ + }; + unsigned long long int raw; + /** @endcond */ +} qurt_signal2_t; +/* @} */ /* end_addtogroup signals2_types */ + +/*===================================================================== + Functions +======================================================================*/ + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_init + + @deprecated use #qurt_signal_init instead. + + Initializes a signal2 object. + Signal returns the initialized object. + The signal object is initially cleared. + + Objects of type signal2 solve a potential race condition between + set() and destroy() operations. + + @datatypes + #qurt_signal2_t + + @param[in] *signal Pointer to the initialized object. + + @return + None. 
+
+ @dependencies
+ Each signal-based object has one or more associated kernel resources; therefore,
+ users must call qurt_signal2_destroy() when this object is no longer in use.
+ */
+/* ======================================================================*/
+void qurt_signal2_init(qurt_signal2_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal2_destroy
+
+ @deprecated use #qurt_signal_destroy instead.
+
+ Destroys the specified signal object.
+
+ @note1cont Signal objects must not be destroyed while they are still in use. If this
+ occurs, the behavior of QuRT is undefined.
+ @note1cont Application code should destroy a signal2 object prior to deallocating it.
+ Calling qurt_signal2_destroy() before deallocating a
+ signal2 object ensures completion of all qurt_signal2_set() calls.
+
+ @datatypes
+ #qurt_signal2_t
+
+ @param[in] signal Pointer to the signal object to destroy.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+/* ======================================================================*/
+void qurt_signal2_destroy(qurt_signal2_t *signal);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal2_wait
+
+ @deprecated use #qurt_signal_wait instead.
+
+ Suspends the current thread until the specified signals are set.
+
+ Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates
+ a signal to wait on.
+
+ If a thread calls this API with QURT_SIGNAL_ATTR_WAIT_ANY, the thread will be awakened when
+ any of the signals specified in the mask are set.
+
+ If a thread calls this API with QURT_SIGNAL_ATTR_WAIT_ALL, the thread will be awakened only
+ when all the signals specified in the mask are set.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal2_t
+
+ @param[in] signal    Pointer to the signal object to wait on.
+ @param[in] mask      Mask value identifying the individual signals in the signal object to wait on.
+ @param[in] attribute Specifies whether the thread waits for any of the signals to be set, or for all of
+                      them to be set. Values:\n
+                      - QURT_SIGNAL_ATTR_WAIT_ANY \n
+                      - QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+ @return
+ A 32-bit word with current signals.
+
+ @dependencies
+ None.
+*/
+/* ======================================================================*/
+unsigned int qurt_signal2_wait(qurt_signal2_t *signal, unsigned int mask,
+                               unsigned int attribute);
+
+/*======================================================================*/
+/**@ingroup func_qurt_signal2_wait_any
+
+ @deprecated use #qurt_signal_wait_any instead.
+
+ Suspends the current thread until any of the specified signals are set.
+
+ Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates
+ a signal to wait on.
+
+ The thread will be awakened when any of the signals specified in the mask are set.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @datatypes
+ #qurt_signal2_t
+
+ @param[in] signal Pointer to the signal object to wait on.
+ @param[in] mask   Mask value identifying the individual signals in the signal object to
+                   wait on.
+
+ @return
+ 32-bit word with current signals.
+
+ @dependencies
+ None.
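+
+ Migration sketch (illustrative, not from the original header): since this API
+ is deprecated, equivalent code on the qurt_signal API looks like the following,
+ assuming a previously initialized qurt_signal_t named sig.
+
+ @code
+ unsigned int cur = qurt_signal_wait_any(&sig, 0x3u);  // preferred replacement
+ @endcode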
+*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal2_wait_any(qurt_signal2_t *signal, unsigned int mask) +{ + return qurt_signal2_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ANY); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_wait_all + + @deprecated use #qurt_signal_wait_all instead. + + Suspends the current thread until all of the specified signals are set. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + a signal to wait on. + + The thread will be awakened only when all the signals specified in the mask are set. + + @note1hang At most one thread can wait on a signal object at any given time. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to wait on. + @param[in] mask Mask value identifying the individual signals in the signal object to + wait on. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +static inline unsigned int qurt_signal2_wait_all(qurt_signal2_t *signal, unsigned int mask) +{ + return qurt_signal2_wait(signal, mask, QURT_SIGNAL_ATTR_WAIT_ALL); +} + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_set + + @deprecated use #qurt_signal_set instead. + + Sets signals in the specified signal object. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 indicates + that a signal must be set, and 0 indicates not to set the signal. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to set in the signal + object. + + @return + None. + + @dependencies + None. +*/ +/* ======================================================================*/ +void qurt_signal2_set(qurt_signal2_t *signal, unsigned int mask); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_get + + @deprecated use #qurt_signal_get instead. + + Gets a signal from a signal object. + + Returns the current signal values of the specified signal object. + + @datatypes + #qurt_signal2_t + + @param[in] *signal Pointer to the signal object to access. + + @return + 32-bit word with current signals. + + @dependencies + None. +*/ +/* ======================================================================*/ +unsigned int qurt_signal2_get(qurt_signal2_t *signal); + +/*======================================================================*/ +/**@ingroup func_qurt_signal2_clear + + @deprecated use #qurt_signal_clear instead. + + Clear signals in the specified signal object. + + Signals are represented as bits [31:0] in the 32-bit mask value. A mask bit value of 1 + indicates that a signal must be cleared, and 0 indicates not to clear the signal. + + @note1hang Signals must be explicitly cleared by a thread when it is awakened -- the wait + operations do not automatically clear them. + + @datatypes + #qurt_signal2_t + + @param[in] signal Pointer to the signal object to modify. + @param[in] mask Mask value identifying the individual signals to clear in the signal object. + + @return + None. + + @dependencies + None. 
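+
+ A short sketch of the clear-after-wake pattern described in the note above
+ (the mask value is illustrative):
+
+ @code
+ extern qurt_signal2_t sig; // initialized and set elsewhere
+ unsigned int got = qurt_signal2_wait_all(&sig, 0x3u);
+ qurt_signal2_clear(&sig, got); // explicitly clear the observed signals
+ @endcode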
+ */
+/* ======================================================================*/
+void qurt_signal2_clear(qurt_signal2_t *signal, unsigned int mask);
+
+/**@ingroup func_qurt_signal2_wait_cancellable
+
+ @deprecated use #qurt_signal_wait_cancellable instead.
+
+ Suspends the current thread until either the specified signals are set or the wait operation is cancelled.
+ The operation is cancelled if the user process of the calling thread is killed, or if the calling thread
+ must finish its current QDI invocation and return to user space.
+
+ Signals are represented as bits 0 through 31 in the 32-bit mask value. A mask bit value of 1 indicates
+ that a signal must be waited on, and 0 indicates not to wait on it.
+
+ If a thread is waiting on a signal object for any of the specified set of signals to be set, and one or
+ more of those signals is set in the signal object, the thread is awakened.
+
+ If a thread is waiting on a signal object for all of the specified set of signals to be set, and all of
+ those signals are set in the signal object, the thread is awakened.
+
+ @note1hang At most, one thread can wait on a signal object at any given time.
+
+ @note1cont When the operation is cancelled, the caller must assume that the signal is never set.
+
+ @datatypes
+ #qurt_signal2_t
+
+ @param[in] signal Pointer to the signal object to wait on.
+ @param[in] mask Mask value identifying the individual signals in the signal object to
+ wait on.
+ @param[in] attribute Indicates whether the thread must wait until any of the signals are set, or until all of
+ them are set. Values:\n
+ - #QURT_SIGNAL_ATTR_WAIT_ANY \n
+ - #QURT_SIGNAL_ATTR_WAIT_ALL @tablebulletend
+ @param[out] p_returnmask Pointer to the 32-bit mask value that was originally passed to the function.
+
+
+ @return
+ #QURT_EOK -- Wait completed. \n
+ #QURT_ECANCEL -- Wait cancelled.
+
+
+ @dependencies
+ None.
+*/
+int qurt_signal2_wait_cancellable(qurt_signal2_t *signal,
+ unsigned int mask,
+ unsigned int attribute,
+ unsigned int *p_returnmask);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SIGNAL2_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_space.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_space.h
new file mode 100755
index 0000000000000..2c3f9e4496697
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_space.h
@@ -0,0 +1,230 @@
+#ifndef QURT_SPACE_H
+#define QURT_SPACE_H
+/**
+ @file qurt_space.h
+ @brief Prototypes of QuRT process control APIs
+
+ EXTERNALIZED FUNCTIONS
+ none
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ none
+
+ Copyright (c) 2013, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+ ======================================================================*/
+
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** This flag is a request to the OS to suspend the process just before calling main().
+It is going to be obsoleted and replaced by QURT_PROCESS_SUSPEND_ON_STARTUP. */
+#define SPAWNN_FLAG_SUSPEND_ON_STARTUP QURT_PROCESS_SUSPEND_ON_STARTUP
+
+/**
+ * Creates and starts a process from an ELF of the specified name. The slash symbols
+ * "/" or "\" are ignored. Do not include the directory name in the input. This function
+ * accepts the SPAWN flags. Multiple SPAWN flags can be specified by OR'ing the flags.
+ *
+ * @param name ELF name of the executable.
Name shall not contain directories,
+ * use "dsp2.elf", instead of "/prj/qct/.../dsp2.elf"
+ *
+ * @return
+ Process ID -- Success \n
+ Negative error code -- Failure\n
+ #QURT_EPRIVILEGE -- Caller does not have enough privilege for this operation\n
+ #QURT_EMEM -- Not enough memory to perform the operation \n
+ #QURT_EFAILED -- Operation failed \n
+ #QURT_ENOTALLOWED -- Operation not allowed \n
+ #QURT_ENOREGISTERED -- Not registered \n
+ #QURT_ENORESOURCE -- Resource exhaustion \n
+ #QURT_EINVALID -- Invalid argument value
+*/
+
+int qurt_spawn_flags(const char * name, int flags);
+
+/**
+ Creates and starts a process from an ELF of the specified name. The slash symbols
+ "/" or "\" are ignored. Do not include the directory name in the input.
+
+ @param name ELF name of the executable. Name shall not contain directories,
+ use "dsp2.elf", instead of "/prj/qct/.../dsp2.elf".
+
+ @return
+ Process ID -- Success. \n
+ Negative error code -- Failure.
+
+*/
+static inline int qurt_spawn(const char *name)
+{
+ return qurt_spawn_flags(name,0);
+}
+
+/**
+ * Returns the process ID of the current process.
+ *
+ * @return
+ * Process ID
+ *
+*/
+#define qurt_getpid qurt_process_get_id
+
+/**
+ * The qurt_wait() function waits for a status change in a child process. The parent
+ * process can use it to block until any child process terminates.
+ *
+ * This API returns an error if there are no user processes, or if all user processes have been detached.
+ *
+ * @param status Pointer to the status variable. The variable receives the status value of the child process.
+ * The value comes from the exit() system call made by the child process.
+ *
+ * @return
+ Process ID of the child process that changes status -- Success \n
+ * Negative error code -- Failure
+ *
+*/
+
+int qurt_wait(int *status);
+
+
+/** @cond */
+/* APIs that allow registering callbacks on spawn of user pd */
+typedef void (*QURT_SPAWN_PFN)(int client_handle, void *data_ptr); //no return, since we won't be error checking it in spawn
+typedef int (*QURT_CB_PFN)(int client_handle, void *user_data, void *info);
+typedef union {
+ QURT_SPAWN_PFN spawn_pfn;
+ QURT_CB_PFN cb_pfn;
+} qurt_process_callback_pfn_t;
+/** @endcond */
+
+/** @cond internal_only */
+
+/**@ingroup func_qurt_event_register
+Sets the specified bits by mask in the signal passed by the caller. The signal gets set
+when the client handle indicated by value goes away (at process exit). Multiple clients can register for the signal
+to be set.
+
+@datatypes
+
+@param[in] type QURT_PROCESS_EXIT is the only event that can be registered for.
+@param[in] value Indicates the client handle of the process for which the event is registered.
+@param[in] psig Pointer to the signal object to set when the event occurs.
+@param[in] mask Mask bits to set in the signal.
+@param[out] data Pointer to the variable that receives the exit code of the exiting process.
+@param[in] data_size Size of the data variable.
+
+@return
+#QURT_EOK -- Success \n
+#QURT_EMEM -- Not enough memory to allocate resources \n
+#QURT_EVAL -- Invalid values passed to the API
+
+@dependencies
+None.
+*/
+int qurt_event_register(int type, int value, qurt_signal_t *psig, unsigned int mask, void *data, unsigned int data_size);
+
+/**@ingroup func_qurt_callback_register_onspawn
+Allows registering for a callback on spawn of any user process.
+
+@datatypes
+#QURT_SPAWN_PFN
+
+@param[in] pFn Callback function to call when any user process is spawned.
+@param[in] user_data Pointer to the argument that the callback must be called with.
+
+
+@return A positive value is a handle to use when deregistering the callback.
+ Multiple clients can register for a callback on spawn, and some clients might choose to deregister.
+
+ On failure, QURT_EFATAL is returned.
+
+@dependencies
+None.
+*/
+int qurt_callback_register_onspawn(QURT_SPAWN_PFN pFn, void *user_data);
+
+/**@ingroup func_qurt_callback_deregister_onspawn
+Allows deregistering a callback on spawn.
+
+@param[in] callback_handle Handle returned by qurt_callback_register_onspawn.
+
+@return
+#QURT_EOK -- Deregistration was successful
+
+@dependencies
+None.
+*/
+int qurt_callback_deregister_onspawn(int callback_handle);
+
+/**@ingroup func_qurt_process_callback_register
+Allows registering for a callback during or after image loading.
+Generic callback types:
+ Functions similarly to qurt_callback_register_onspawn(). The callback is called after the process is
+ loaded, before the process thread starts. The callback has no return value and has no info provided
+ from the OS.
+ pFn - QURT_SPAWN_PFN
+ type - QURT_PROCESS_CB_GENERIC
+ arg1 - not used
+ arg2 - not used
+ arg3 - not used
+Note callback types:
+ The callback is called during process loading: before segment loading (QURT_PROCESS_NOTE_CB_PRE_MAP),
+ or after segment loading (QURT_PROCESS_NOTE_CB_POST_MAP). The OS provides info to the callback. The info
+ argument in the callback is populated with a pointer to the mapped note corresponding to the callback.
+ The callback has a return value; the loader fails if the callback returns a value that is not QURT_EOK.
+ pFn - QURT_CB_PFN
+ type - QURT_PROCESS_NOTE_CB_PRE_MAP or QURT_PROCESS_NOTE_CB_POST_MAP
+ arg1 - note type (ex: NOTE_TYPE_POOL_INFO, NOTE_TYPE_SEGMENT_INFO, NOTE_TYPE_ARB_INFO)
+ arg2 - note name
+ arg3 - not used
+
+@datatypes
+
+@param[in] pFn Callback function to call
+@param[in] type Callback type
+@param[in] user_data Pointer to the argument that the callback must be called with.
+@param[in] arg1 Argument interpreted by the OS based on callback type
+@param[in] arg2 Argument interpreted by the OS based on callback type
+@param[in] arg3 Argument interpreted by the OS based on callback type (currently not used)
+
+
+@return A positive value is a handle to use when deregistering the callback.
+ Multiple clients can register for a callback on spawn, and some clients might choose to deregister.
+
+ On failure, QURT_EFATAL is returned.
+
+@dependencies
+None.
+*/
+int qurt_process_callback_register(qurt_process_callback_pfn_t pFn,
+ qurt_process_cb_type_t type,
+ void *user_data,
+ qurt_process_callback_arg_t arg1,
+ qurt_process_callback_arg_t arg2,
+ qurt_process_callback_arg_t arg3);
+
+
+
+/**@ingroup func_qurt_process_callback_deregister
+Allows deregistering a callback for image loading.
+@param[in] callback_handle Handle returned by qurt_process_callback_register.
+
+@return
+#QURT_EOK -- Deregistration was successful
+
+@dependencies
+None.
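+
+A register/use/deregister sketch (illustrative only; the callback body and
+ELF name are hypothetical):
+
+@code
+static void on_spawn(int client_handle, void *data_ptr) {
+    (void)client_handle;
+    (void)data_ptr;
+    // invoked whenever a user process is spawned
+}
+
+int cb = qurt_callback_register_onspawn(on_spawn, NULL);
+int pid = qurt_spawn("dsp2.elf");
+(void)pid;
+int status;
+(void)qurt_wait(&status); // block until a child process changes state
+(void)qurt_callback_deregister_onspawn(cb);
+@endcode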
+*/ +int qurt_process_callback_deregister(int callback_handle); +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SPACE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_srm_consts.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_srm_consts.h new file mode 100755 index 0000000000000..48a8b6a38c402 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_srm_consts.h @@ -0,0 +1,32 @@ +#ifndef QURT_SRM_CONSTS_H +#define QURT_SRM_CONSTS_H +/** + @file qurt_srm_consts.h + @brief Type definitions for srm + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2020-2021, 2022 by Qualcomm Technologies, Inc. All Rights Reserved +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** @cond */ +#define QURT_SRM_WAKEUP_REQUEST 1U << 0 /**< Value = 1: Send wakeup request to the SRM server. */ +#define QURT_SRM_SET_HANDLE 1U << 1 /**< Value = 2: Set the client handle for a new SRM client. */ +#define QURT_SRM_ALLOC_KERNEL_PAGES 1U << 2 /**< Value = 4: Allocate pages from the kernel VA space. */ +/** @endcond */ + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_SRM_CONSTS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_srm_driver.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_srm_driver.h new file mode 100755 index 0000000000000..5489e3dddbcca --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_srm_driver.h @@ -0,0 +1,140 @@ +#ifndef QURT_SRM_DRIVER_H +#define QURT_SRM_DRIVER_H +/** + @file qurt_srm_driver.h + @brief Definitions, macros, and prototypes used by SRM drivers. + + EXTERNAL FUNCTIONS + None. + + INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + + Copyright (c) 2021-2023 by Qualcomm Technologies, Inc. All Rights Reserved. + + =============================================================================*/ +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| Define qurt_srm_driver_t structure, which represents +|| the "registration" object for an SRM driver. +*/ +/** @cond internal_only */ +struct _qurt_srm_driver { + const char *name; + qurt_qdi_obj_t *obj; +}; + +typedef struct _qurt_srm_driver qurt_srm_driver_t; + +/* +|| qurt_srm_object_invoke() is an internal equivalent to qurt_qdi_handle_invoke(). +|| It behaves the same, but it takes a QDI object pointer instead of a handle. +*/ + +#define qurt_srm_object_invoke(o,m,...) 
\ + _QDMPASTE(_QDMSOI,_QDMCNT(QDI_HANDLE_LOCAL_CLIENT,o,m,##__VA_ARGS__))(QDI_HANDLE_LOCAL_CLIENT,o,m,##__VA_ARGS__) +#define _QDMSOI3(a,b,c) qurt_srm_oi3(a,b,c) +#define _QDMSOI4(a,b,c,d) qurt_srm_oi4(a,b,c,(int)(d)) +#define _QDMSOI5(a,b,c,d,e) qurt_srm_oi5(a,b,c,(int)(d),(int)(e)) +#define _QDMSOI6(a,b,c,d,e,f) qurt_srm_oi6(a,b,c,(int)(d),(int)(e),(int)(f)) +#define _QDMSOI7(a,b,c,d,e,f,g) qurt_srm_oi7(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g)) +#define _QDMSOI8(a,b,c,d,e,f,g,h) qurt_srm_oi8(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h)) +#define _QDMSOI9(a,b,c,d,e,f,g,h,i) qurt_srm_oi9(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i)) +#define _QDMSOI10(a,b,c,d,e,f,g,h,i,j) qurt_srm_oi10(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j)) +#define _QDMSOI11(a,b,c,d,e,f,g,h,i,j,k) qurt_srm_oi11(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k)) +#define _QDMSOI12(a,b,c,d,e,f,g,h,i,j,k,l) qurt_srm_oi12(a,b,c,(int)(d),(int)(e),(int)(f),(int)(g),(int)(h),(int)(i),(int)(j),(int)(k),(int)(l)) + +int qurt_srm_oi3(int, qurt_qdi_obj_t *, int); +int qurt_srm_oi4(int, qurt_qdi_obj_t *, int, int); +int qurt_srm_oi5(int, qurt_qdi_obj_t *, int, int, int); +int qurt_srm_oi6(int, qurt_qdi_obj_t *, int, int, int, int); +int qurt_srm_oi7(int, qurt_qdi_obj_t *, int, int, int, int, int); +int qurt_srm_oi8(int, qurt_qdi_obj_t *, int, int, int, int, int, int); +int qurt_srm_oi9(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int); +int qurt_srm_oi10(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int); +int qurt_srm_oi11(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int, int); +int qurt_srm_oi12(int, qurt_qdi_obj_t *, int, int, int, int, int, int, int, int, int, int); + +#define QDI_SRM_INIT 192 + +/* +|| QURT_SRM_DECLARE_DRIVER() declares an SRM driver to the SRM infrastructure. +|| +|| The three arguments are: +|| unique_id -- Unique C identifier, unused but must be a unique global symbol. +|| name -- Name of the driver by which an SRM client attempts to open it. +|| obj -- Pointer to the singleton object of the driver, which handles things such as +|| initialization and QDI_OPEN requests. +*/ + +#define QURT_SRM_DECLARE_DRIVER(unique_id, xname, xobj) \ + __attribute__((section(".srm.rodata.user.main.DECL"))) const qurt_srm_driver_t unique_id = \ + { .name = xname, .obj = xobj } + + +/*@ingroup func_qurt_srm_mapping_create + Creates a memory mapping in pagetable with specified attributes + + @param[in] client_handle Client handle representing the process for which + mapping would be created. + @param[in] pageno_virt pointer to the virtual page. NULL indicates SRM + would indicate the virtual memory. + @param[in] pageno_phys physical page to be used for the mapping + @param[in] page_count number of 4k pages to be mapped + @param[in] cache_attr cache attributes for the mapping + @param[in] perm permissions to be used for the mapping + + @return value greater than 0 indicates a handle which can be passed to + qdi_close() to remove the mapping. Negative value indicates + an error. + + @dependencies + None. +*/ +int qurt_srm_mapping_create(int client_handle, + unsigned *pageno_virt, + unsigned pageno_phys, + unsigned page_count, + qurt_mem_cache_mode_t cache_attr, + qurt_perm_t perm); + + +/**@ingroup func_qurt_srm_get_pid + Gets the PID for the client_handle that is passed. + + @param[in] client_handle Client handle for which PID is required. 
+
+
+ @return PID of the client.
+ Negative PID value '-1' is returned in case of error.
+
+ @dependencies
+ None.
+*/
+unsigned qurt_srm_get_pid(int client_handle);
+
+
+/*@ingroup func_qurt_srm_get_thread_id
+ Gets the thread ID of the client requesting a service from SRM.
+
+ @param[in] None.
+
+ @return Thread ID of the client thread.
+
+ @dependencies
+ None.
+*/
+qurt_thread_t qurt_srm_get_client_thread_id(void);
+
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_SRM_DRIVER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_stid.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_stid.h
new file mode 100755
index 0000000000000..379f46aaa4b80
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_stid.h
@@ -0,0 +1,73 @@
+#ifndef QURT_STID_H
+#define QURT_STID_H
+/**
+ @file qurt_stid.h
+ Prototypes of software thread identifier (stid) interface APIs.
+ A stid is an 8-bit identifier that can be assigned to a software thread.
+ The performance monitor logic uses the stid as a counting match criterion
+ for maskable events. The stid is also used by the hardware debugger
+ (ISDB) to match breakpoints.
+
+ EXTERNAL FUNCTIONS
+ None.
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+ Copyright (c) 2024 Qualcomm Technologies, Inc.
+ All rights reserved.
+ Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_stid_alloc
+ Allocates a unique stid.
+
+ @param[in] pid Process identifier
+ @param[out] stid Pointer to a variable to return the stid
+
+ @return
+ QURT_EOK - Allocation success
+ QURT_ENORESOURCE - No stid available for allocation
+ QURT_EINVALID - Invalid input
+
+ @dependencies
+ None.
+ */
+int qurt_stid_alloc(unsigned int pid, unsigned int *stid);
+
+/**@ingroup func_qurt_stid_release
+ Releases the stid.
+
+
+ @param[in] pid Process identifier
+ @param[in] stid STID to release
+
+ @note1hang
+ The user must reset the released stid in the process or thread(s)
+ to the default value (QURT_STID_DEFAULT) before releasing that stid.
+
+ @return
+ QURT_EOK - Release success
+ QURT_ENOTALLOWED - Operation not allowed for a pid
+ QURT_EINVALID - Invalid stid
+
+ @dependencies
+ None.
+ */
+int qurt_stid_release(unsigned int pid, unsigned int stid);
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_STID_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_thread.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_thread.h
new file mode 100755
index 0000000000000..499699e7c72e2
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_thread.h
@@ -0,0 +1,1260 @@
+#ifndef QURT_THREAD_H
+#define QURT_THREAD_H
+/**
+ @file qurt_thread.h
+ @brief Prototypes of Thread API
+
+ EXTERNAL FUNCTIONS
+ None.
+
+ INITIALIZATION AND SEQUENCING REQUIREMENTS
+ None.
+
+Copyright (c) 2018, 2020-2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+/* The following is for C code only */
+#ifndef __ASSEMBLER__
+#include
+#include "qurt_pmu.h"
+#include "qurt_api_version.h"
+#endif /* __ASSEMBLER__ */
+#include "qurt_consts.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+ CONSTANTS AND MACROS
+=============================================================================*/
+
+
+/*
+ Bitmask configuration to select DSP hardware threads.
+ To select all the hardware threads, use #QURT_THREAD_CFG_BITMASK_ALL
+ and the following: \n
+ - For QDSP6 V2/V3, all six hardware threads are selected \n
+ - For QDSP6 V3L, all four hardware threads are selected \n
+ - For QDSP6 V4, all three hardware threads are selected
+ */
+
+#define QURT_THREAD_CFG_BITMASK_HT0 0x00000001 /**< HT0. */
+#define QURT_THREAD_CFG_BITMASK_HT1 0x00000002 /**< HT1. */
+#define QURT_THREAD_CFG_BITMASK_HT2 0x00000004 /**< HT2. */
+#define QURT_THREAD_CFG_BITMASK_HT3 0x00000008 /**< HT3. */
+#define QURT_THREAD_CFG_BITMASK_HT4 0x00000010 /**< HT4. */
+#define QURT_THREAD_CFG_BITMASK_HT5 0x00000020 /**< HT5. */
+/** @cond rest_reg_dist */
+/** @addtogroup thread_macros
+@{ */
+/** @xreflabel{sec:qurt_thread_cfg} */
+
+#define QURT_THREAD_CFG_BITMASK_ALL 0x000000ffU /**< Select all the hardware threads. */
+/** @} */ /* end_addtogroup thread_macros */
+/** @endcond */
+
+#define QURT_THREAD_CFG_USE_RAM 0x00000000 /**< Use RAM. */
+#define QURT_THREAD_CFG_USE_TCM 0x00000100 /**< Use TCM. */
+/** @cond rest_reg_dist */
+/** @addtogroup thread_macros
+@{ */
+#define QURT_THREAD_BUS_PRIO_DISABLED 0 /**< Thread internal bus priority disabled. */
+#define QURT_THREAD_BUS_PRIO_ENABLED 1 /**< Thread internal bus priority enabled. */
+/** @} */ /* end_addtogroup thread_macros */
+/** @endcond */
+
+#define QURT_THREAD_AUTOSTACK_DISABLED 0 /**< Thread has autostack v2 feature disabled. */
+#define QURT_THREAD_AUTOSTACK_ENABLED 1 /**< Thread has autostack v2 feature enabled. */
+
+/*
+ Macros for QuRT thread attributes.
+ */
+#define QURT_HTHREAD_L1I_PREFETCH 0x1 /**< Enables hardware L1 instruction cache prefetching. */
+#define QURT_HTHREAD_L1D_PREFETCH 0x2 /**< Enables hardware L1 data cache prefetching. */
+#define QURT_HTHREAD_L2I_PREFETCH 0x4 /**< Enables hardware L2 instruction cache prefetching. */
+#define QURT_HTHREAD_L2D_PREFETCH 0x8 /**< Enables hardware L2 data cache prefetching. */
+#define QURT_HTHREAD_DCFETCH 0x10 /**< Enables DC fetch to the provided virtual address.
+ DC fetch indicates to the hardware that a data memory access is likely.
+ Instructions are dropped when there is high bus utilization. */
+/** @addtogroup thread_macros
+@{ */
+/** @xreflabel{hdr:partition_tcm} */
+/*
+ Below value is used to create legacy QuRT threads by default.
+ If a thread has this as the detach_state, the thread can be joined
+ on until it exits. When we are able to change default behavior of all
+ QuRT threads to JOINABLE (posix default), we can remove this legacy
+ behavior.
+*/
+#define QURT_THREAD_ATTR_CREATE_LEGACY 0U /**< Create a legacy QuRT thread by default. If a thread has this as a detach state, the thread can be joined on until it exits. */
+#define QURT_THREAD_ATTR_CREATE_JOINABLE 1U /**< Create a joinable thread. */
+#define QURT_THREAD_ATTR_CREATE_DETACHED 2U /**< Create a detached thread.
*/ +/** @} */ /* end_addtogroup thread_macros */ + + +#define QURT_THREAD_ATTR_NAME_MAXLEN 16 /**< Maximum name length. */ +#define QURT_THREAD_ATTR_TCB_PARTITION_RAM 0 /**< Creates threads in RAM/DDR. */ +#define QURT_THREAD_ATTR_TCB_PARTITION_TCM 1 /**< Creates threads in TCM. */ +/** @cond rest_reg_dist */ +/** @addtogroup thread_macros +@{ */ +#define QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT QURT_THREAD_ATTR_TCB_PARTITION_RAM /**< Backward compatibility. */ +#define QURT_THREAD_ATTR_PRIORITY_DEFAULT 254 /**< Priority.*/ +#define QURT_THREAD_ATTR_ASID_DEFAULT 0 /**< ASID. */ +#define QURT_THREAD_ATTR_AFFINITY_DEFAULT (-1) /**< Affinity. */ +#define QURT_THREAD_ATTR_BUS_PRIO_DEFAULT 255 /**< Bus priority. */ +#define QURT_THREAD_ATTR_AUTOSTACK_DEFAULT 0 /**< Default autostack v2 disabled thread. */ +#define QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT (-2) /**< Timetest ID. */ +#define QURT_THREAD_ATTR_STID_DEFAULT QURT_STID_DEFAULT /**< STID. */ +#define QURT_THREAD_ATTR_STID_ENABLE 1 /**< Indicate to allocate STID during thread creation. */ + +#define QURT_PRIORITY_FLOOR_DEFAULT 255U /**< Default floor. */ +/** @} */ /* end_addtogroup thread_macros */ + +// Option for suspending thread +#define QURT_THREAD_SUSPEND_SYNCHRONOUS 0x0U // bit#0 +#define QURT_THREAD_SUSPEND_ASYNCHRONOUS 0x1U // bit#0 +#define QURT_THREAD_SUSPEND_KEEP_HMX 0x0U // bit#1 +#define QURT_THREAD_SUSPEND_DETACH_HMX 0x2U // bit#1 + +// Option for resuming thread +#define QURT_THREAD_RESUME_DEFAULT 0x0 + +// Thread property IDs +#define QURT_THREAD_PROPERTY_SUSPENDABLE 0x0U +#define QURT_THREAD_PROPERTY_RESUMABLE 0x1 + +// Thread group +#define QURT_THREAD_DEFAULT_GROUP_ID 0x0U +#define QURT_THREAD_GROUP_ID_MASK 0x3FU + +/** @endcond*/ + + +/* The followings are for C code only */ +#ifndef __ASSEMBLER__ +/*============================================================================= + TYPEDEFS +=============================================================================*/ +/** @addtogroup thread_types +@{ */ +/** @cond rest_reg_dist */ +typedef unsigned int qurt_cache_partition_t; /**< QuRT cache partition type. */ + +#define CCCC_PARTITION 0U /**< Use the CCCC page attribute bits to determine the main or auxiliary partition. */ +#define MAIN_PARTITION 1U /**< Use the main partition. */ +#define AUX_PARTITION 2U /**< Use the auxiliary partition. */ +#define MINIMUM_PARTITION 3U /**< Use the minimum. Allocates the least amount of cache (no-allocate policy possible) for this thread. */ +/** @endcond */ + +/** Thread ID type. */ +typedef unsigned int qurt_thread_t; + +/** @cond rest_reg_dist */ +/** Thread attributes. */ +typedef struct _qurt_thread_attr { + + char name[QURT_THREAD_ATTR_NAME_MAXLEN]; /**< Thread name. */ + unsigned char tcb_partition; /**< Indicates whether the thread TCB resides in RAM or + on chip memory (TCM). */ + unsigned char stid; /**< Software thread ID used to configure the stid register + for profiling purposes. */ + unsigned short priority; /**< Thread priority. */ + unsigned char autostack:1; /**< Autostack v2 enabled thread. */ + unsigned char group_id:6; /**< Group ID. */ + unsigned char reserved:1; /**< Reserved bits. */ + unsigned char bus_priority; /**< Internal bus priority. */ + unsigned short timetest_id; /**< Timetest ID. */ + unsigned int stack_size; /**< Thread stack size. */ + void *stack_addr; /**< Pointer to the stack address base. The range of the stack is + (stack_addr, stack_addr+stack_size-1). */ + unsigned short detach_state; /**< Detach state of the thread. 
*/
+
+} qurt_thread_attr_t;
+/** @endcond */
+
+/** @cond rest_reg_dist */
+/** Dynamic TLS attributes. */
+typedef struct qurt_tls_info {
+ unsigned int module_id; /**< Module ID of the loaded dynamic linked library. */
+ unsigned int tls_start; /**< Start address of the TLS data. */
+ unsigned int tls_data_end; /**< End address of the TLS RW data. */
+ unsigned int tls_end; /**< End address of the TLS data. */
+}qurt_tls_info;
+/** @endcond */
+
+/** @} */ /* end_addtogroup thread_types */
+
+/*=============================================================================
+ FUNCTIONS
+=============================================================================*/
+/**@ingroup func_qurt_thread_attr_init
+ Initializes the structure used to set the thread attributes when a thread is created.
+ After an attribute structure is initialized, explicitly set the individual attributes in the structure
+ using the thread attribute operations.
+
+ The initialize operation sets the following default attribute values: \n
+ - Name -- NULL string \n
+ - TCB partition -- QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT \n
+ - Priority -- QURT_THREAD_ATTR_PRIORITY_DEFAULT \n
+ - Autostack -- QURT_THREAD_ATTR_AUTOSTACK_DEFAULT \n
+ - Bus priority -- QURT_THREAD_ATTR_BUS_PRIO_DEFAULT \n
+ - Timetest ID -- QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT \n
+ - stack_size -- 0 \n
+ - stack_addr -- NULL \n
+ - detach state -- #QURT_THREAD_ATTR_CREATE_LEGACY \n
+ - STID -- #QURT_THREAD_ATTR_STID_DEFAULT
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_init (qurt_thread_attr_t *attr)
+{
+
+ attr->name[0] = '\0';
+ attr->tcb_partition = QURT_THREAD_ATTR_TCB_PARTITION_DEFAULT;
+ attr->priority = QURT_THREAD_ATTR_PRIORITY_DEFAULT;
+ attr->autostack = QURT_THREAD_ATTR_AUTOSTACK_DEFAULT; /* Default attribute for autostack v2*/
+ attr->bus_priority = QURT_THREAD_ATTR_BUS_PRIO_DEFAULT;
+ attr->timetest_id = (unsigned short)QURT_THREAD_ATTR_TIMETEST_ID_DEFAULT;
+ attr->stack_size = 0;
+ attr->stack_addr = NULL;
+ attr->detach_state = QURT_THREAD_ATTR_CREATE_LEGACY;
+ attr->stid = QURT_THREAD_ATTR_STID_DEFAULT;
+ attr->group_id = QURT_THREAD_DEFAULT_GROUP_ID;
+}
+
+/**@ingroup func_qurt_thread_attr_set_name
+ Sets the thread name attribute.\n
+ This function specifies the name to be used by a thread.
+ Thread names identify a thread during debugging or profiling.
+ The maximum name length is 16 characters. \n
+ @note1hang Thread names differ from the kernel-generated thread identifiers used to
+ specify threads in the API thread operations.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] name Pointer to the character string containing the thread name.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_name (qurt_thread_attr_t *attr, const char *name)
+{
+ strlcpy (attr->name, name, QURT_THREAD_ATTR_NAME_MAXLEN);
+ attr->name[QURT_THREAD_ATTR_NAME_MAXLEN - 1] = '\0';
+}
+
+
+/**@ingroup func_qurt_thread_attr_set_tcb_partition
+ Sets the thread TCB partition attribute.
+ Specifies the memory type where a TCB of a thread is allocated.
+ Allocates TCBs in RAM or TCM/LPM.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] tcb_partition TCB partition.
Values:\n
+ - 0 -- TCB resides in RAM \n
+ - 1 -- TCB resides in TCM/LCM @tablebulletend
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_tcb_partition (qurt_thread_attr_t *attr, unsigned char tcb_partition)
+{
+ attr->tcb_partition = tcb_partition;
+}
+
+/**@ingroup func_qurt_thread_attr_set_priority
+ Sets the thread priority to assign to a thread.
+ Thread priorities are specified as numeric values in the range 1 to 254, with 1 representing
+ the highest priority.
+ Priority 0 and 255 are internally used by the kernel for special purposes.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] priority Thread priority.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_priority (qurt_thread_attr_t *attr, unsigned short priority)
+{
+ attr->priority = priority;
+}
+
+/**@ingroup func_qurt_thread_attr_set_detachstate
+ Sets the detach state with which the thread is created.
+ The thread detach state is either joinable or detached; specified by the following values:
+ - #QURT_THREAD_ATTR_CREATE_JOINABLE \n
+ - #QURT_THREAD_ATTR_CREATE_DETACHED \n
+
+ When a detached thread is created (QURT_THREAD_ATTR_CREATE_DETACHED), its thread
+ ID and other resources are reclaimed as soon as the thread exits. When a joinable thread
+ is created (QURT_THREAD_ATTR_CREATE_JOINABLE), it is assumed that some
+ thread waits to join on it using a qurt_thread_join() call.
+ By default, the detach state is QURT_THREAD_ATTR_CREATE_LEGACY.
+ If the detach state is QURT_THREAD_ATTR_CREATE_LEGACY, another
+ thread can join before the thread exits, but the exiting thread does not wait for another thread to join.
+
+ @note1hang For a joinable thread (QURT_THREAD_ATTR_CREATE_JOINABLE), it is very
+ important that some thread joins on it after it terminates, otherwise
+ the resources of that thread are not reclaimed, causing memory leaks.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] detachstate Thread detach state.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_detachstate (qurt_thread_attr_t *attr, unsigned short detachstate)
+{
+ if(detachstate == QURT_THREAD_ATTR_CREATE_JOINABLE || detachstate == QURT_THREAD_ATTR_CREATE_DETACHED){
+ attr->detach_state = detachstate;
+ }
+}
+
+
+/**@ingroup func_qurt_thread_attr_set_timetest_id
+ Sets the thread timetest attribute.\n
+ Specifies the timetest identifier to be used by a thread.
+
+ Timetest identifiers are used to identify a thread during debugging or profiling. \n
+ @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+ specify threads in the API thread operations.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in,out] attr Pointer to the thread attribute structure.
+ @param[in] timetest_id Timetest identifier value.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+static inline void qurt_thread_attr_set_timetest_id (qurt_thread_attr_t *attr, unsigned short timetest_id)
+{
+ attr->timetest_id = timetest_id;
+}
+
+/**@ingroup func_qurt_thread_attr_set_stack_size
+ @xreflabel{sec:set_stack_size}
+ Sets the thread stack size attribute.\n
+ Specifies the size of the memory area to use for a call stack of a thread.
+
+ The thread stack address (Section @xref{sec:set_stack_addr}) and stack size specify the memory area used as a
+ call stack for the thread.
The user is responsible for allocating the memory area used for + the stack. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] stack_size Size (in bytes) of the thread stack. + + @return + None. + + @dependencies + None. +*/ + +static inline void qurt_thread_attr_set_stack_size (qurt_thread_attr_t *attr, unsigned int stack_size) +{ + attr->stack_size = stack_size; +} + +/**@ingroup func_qurt_thread_attr_set_stack_size2 + @xreflabel{sec:set_stack_size} + Sets the thread stack size attribute for island threads that require a higher guest OS stack size than the stack size + defined in the configuration XML.\n + Specifies the size of the memory area to use for a call stack of an island thread in User and Guest mode. + + The thread stack address (Section @xref{sec:set_stack_addr}) and stack size specify the memory area used as a + call stack for the thread. The user is responsible for allocating the memory area used for + the stack. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] user_stack_size Size (in bytes) of the stack usage in User mode. + @param[in] root_stack_size Size (in bytes) of the stack usage in Guest mode. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_stack_size2 (qurt_thread_attr_t *attr, unsigned short user_stack_size, unsigned short root_stack_size) +{ + union qurt_thread_stack_info{ + unsigned int raw_size; + struct{ + unsigned short user_stack; + unsigned short root_stack; + }; + }user_root_stack_size; + user_root_stack_size.user_stack = user_stack_size; + user_root_stack_size.root_stack = root_stack_size; + + attr->stack_size = user_root_stack_size.raw_size; +} + +/**@ingroup func_qurt_thread_attr_set_stack_addr + @xreflabel{sec:set_stack_addr} + Sets the thread stack address attribute. \n + Specifies the base address of the memory area to use for a call stack of a thread. + + stack_addr must contain an address value that is 8-byte aligned. + + The thread stack address and stack size (Section @xref{sec:set_stack_size}) specify the memory area used as a + call stack for the thread. \n + @note1hang The user is responsible for allocating the memory area used for the thread + stack. The memory area must be large enough to contain the stack that the thread + creates. + + @datatypes + #qurt_thread_attr_t + + @param[in,out] attr Pointer to the thread attribute structure. + @param[in] stack_addr Pointer to the 8-byte aligned address of the thread stack. + + @return + None. + + @dependencies + None. +*/ +static inline void qurt_thread_attr_set_stack_addr (qurt_thread_attr_t *attr, void *stack_addr) +{ + attr->stack_addr = stack_addr; +} + +/**@ingroup func_qurt_thread_attr_set_bus_priority + Sets the internal bus priority state in the Hexagon core for this software thread attribute. + Memory requests generated by the thread with bus priority enabled are + given priority over requests generated by the thread with bus priority disabled. + The default value of bus priority is disabled. + + @note1hang Sets the internal bus priority for Hexagon processor version V60 or greater. + The priority is not propagated to the bus fabric. + + @datatypes + #qurt_thread_attr_t + + @param[in] attr Pointer to the thread attribute structure. + + @param[in] bus_priority Enabling flag. Values: \n + - #QURT_THREAD_BUS_PRIO_DISABLED \n + - #QURT_THREAD_BUS_PRIO_ENABLED @tablebulletend + + @return + None + + @dependencies + None. 
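+
+ A small sketch of enabling bus priority on a thread attribute (the attribute
+ object is illustrative):
+
+ @code
+ qurt_thread_attr_t attr;
+ qurt_thread_attr_init(&attr);
+ qurt_thread_attr_set_bus_priority(&attr, QURT_THREAD_BUS_PRIO_ENABLED);
+ @endcode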
+*/
+static inline void qurt_thread_attr_set_bus_priority ( qurt_thread_attr_t *attr, unsigned short bus_priority)
+{
+ attr->bus_priority = (unsigned char)bus_priority;
+}
+
+/**@ingroup func_qurt_thread_attr_set_autostack
+ Enables the autostack v2 feature in the thread attributes.
+
+ When autostack is enabled by the subsystem and an autostack-enabled
+ thread takes a framelimit exception, the kernel allocates more stack
+ for the thread and returns to normal execution.
+
+ If autostack is not enabled by the subsystem, or it is not enabled
+ for the thread, the framelimit exception will be fatal.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in] attr Pointer to the thread attribute structure.
+ @param[in] autostack Autostack enable or disable flag. Values: \n
+ - #QURT_THREAD_AUTOSTACK_DISABLED \n
+ - #QURT_THREAD_AUTOSTACK_ENABLED @tablebulletend
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_autostack ( qurt_thread_attr_t *attr, unsigned short autostack)
+{
+ attr->autostack = (unsigned char)autostack;
+}
+/**@ingroup qurt_thread_attr_enable_stid
+ Sets the STID in the thread attributes.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in] attr Pointer to the thread attribute structure.
+ @param[in] enable_stid STID to be set. Values: \n
+ - #QURT_THREAD_ATTR_STID_DEFAULT (0): Default STID. \n
+ - #QURT_THREAD_ATTR_STID_ENABLE (1): QuRT assigns an STID that is not already in use \n
+ - #2 through #255 : User provided STID. @tablebulletend
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_enable_stid ( qurt_thread_attr_t *attr, char enable_stid)
+{
+ if (enable_stid != '\0') {
+ attr->stid = enable_stid;
+ }
+ else
+ {
+ attr->stid = QURT_THREAD_ATTR_STID_DEFAULT;
+ }
+}
+
+/**@ingroup func_qurt_thread_attr_set_stid
+ Sets the stid thread attribute.
+ The default stid value is QURT_THREAD_ATTR_STID_DEFAULT.
+
+ @note1hang When a thread is created with a non-default stid,
+ the stid set in the thread attribute is assigned to the thread.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in] attr Pointer to the thread attribute structure.
+ @param[in] stid Stid to be set for a thread.
+
+ @return
+ None
+
+ @dependencies
+ None.
+*/
+static inline void qurt_thread_attr_set_stid( qurt_thread_attr_t *attr, unsigned int stid){
+ attr->stid = stid;
+}
+
+/**@ingroup func_qurt_thread_attr_set_group_id
+ Sets the group ID in the thread attributes.
+ The primordial/first thread has group ID 0.
+ If a new thread is created without assigning group_id, it
+ inherits the group ID from its parent thread.
+
+ @note1hang
+ 1) The group ID can only be set before creating a thread. It cannot be
+ changed after the thread is created.
+ 2) If a non-activated group_id is passed, thread creation will fail.
+ 3) Only a thread with Group ID #0 can set the Group ID for its child threads.
+ 4) If a thread with a non-zero group ID sets the group ID for its child threads,
+ QuRT will ignore this parameter and the child threads will inherit the parent
+ thread's group ID. But if the passed group ID is not activated, thread creation
+ will still fail.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in] attr Pointer to the thread attribute structure.
+ @param[in] group_id Group identifier. The valid range is 0 through 63.
+
+ @return
+ None.
+
+ @dependencies
+ None.
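+
+ A short sketch (group ID 3 is an arbitrary example and must name an
+ activated group; the caller is assumed to have group ID 0):
+
+ @code
+ qurt_thread_attr_t attr;
+ qurt_thread_attr_init(&attr);
+ qurt_thread_attr_set_group_id(&attr, 3u);
+ @endcode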
+*/
+static inline void qurt_thread_attr_set_group_id(qurt_thread_attr_t *attr, unsigned int group_id)
+{
+ attr->group_id = group_id & QURT_THREAD_GROUP_ID_MASK;
+}
+
+/**@ingroup func_qurt_thread_set_autostack
+ Sets autostack enable in the TCB.
+
+ @param[in] ugp Pointer to UGP.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+
+void qurt_thread_set_autostack(void *);
+
+
+/**@ingroup func_qurt_thread_get_name
+ Gets the thread name of the current thread.\n
+ Returns the thread name of the current thread.
+ Thread names are assigned to threads as thread attributes, see qurt_thread_attr_set_name(). Thread names
+ identify a thread during debugging or profiling.
+
+ @param[out] name Pointer to a character string, which specifies the address where the returned thread name is stored.
+ @param[in] max_len Maximum length of the character string that can be returned.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+*/
+void qurt_thread_get_name (char *name, unsigned char max_len);
+
+/**@ingroup func_qurt_thread_create
+ @xreflabel{hdr:qurt_thread_create}
+ Creates a thread with the specified attributes, and makes it executable.
+
+ @datatypes
+ #qurt_thread_t \n
+ #qurt_thread_attr_t
+
+ @param[out] thread_id Returns a pointer to the thread identifier if the thread was
+ successfully created.
+ @param[in] attr Pointer to the initialized thread attribute structure that specifies
+ the attributes of the created thread.
+ @param[in] entrypoint C function pointer, which specifies the main function of a thread.
+ @param[in] arg Pointer to a thread-specific argument structure.
+
+
+ @return
+ #QURT_EOK -- Thread created. \n
+ #QURT_EFAILED -- Thread not created.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_create (qurt_thread_t *thread_id, qurt_thread_attr_t *attr, void (*entrypoint) (void *), void *arg);
+
+/**@ingroup func_qurt_thread_stop
+ Stops the current thread, frees the kernel TCB, and yields to the next highest ready thread.
+
+ @return
+ void
+
+ @dependencies
+ None.
+ */
+void qurt_thread_stop(void);
+
+/** @cond internal_only */
+/**@ingroup func_qurt_thread_resume
+ When a demand-loading paging solution is enabled, this function
+ resumes the execution of a thread that was suspended due to
+ a page miss.
+
+ @param[in] thread_id Thread identifier.
+
+ @return
+ #QURT_EOK -- Thread successfully resumed. \n
+ #QURT_EFATAL -- Resume operation failed.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_resume(unsigned int thread_id);
+/** @endcond */
+
+/**@ingroup func_qurt_thread_get_id
+ Gets the identifier of the current thread.\n
+ Returns the thread identifier for the current thread.
+
+ @return
+ Thread identifier -- Identifier of the current thread.
+
+ @dependencies
+ None.
+ */
+qurt_thread_t qurt_thread_get_id (void);
+
+
+/**@ingroup func_qurt_thread_get_l2cache_partition
+ Returns the current value of the L2 cache partition assigned to the caller thread.\n
+
+ @return
+ Value of the #qurt_cache_partition_t data type.
+
+ @dependencies
+ None.
+ */
+qurt_cache_partition_t qurt_thread_get_l2cache_partition (void);
+
+/**@ingroup func_qurt_thread_set_timetest_id
+ Sets the timetest identifier of the current thread.
+ Timetest identifiers are used to identify a thread during debugging or profiling.\n
+ @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+ specify threads in the API thread operations.
+
+ @param[in] tid Timetest identifier.
+
+ @return
+ None.
+
+ @dependencies
+ None.
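+
+ A minimal sketch of a thread tagging itself for profiling (the entry
+ function, stack, and ID value are illustrative):
+
+ @code
+ static char stack[4096] __attribute__((aligned(8)));
+
+ static void worker(void *arg) {
+     (void)arg;
+     qurt_thread_set_timetest_id(0x1234); // tag the current thread
+     // ... work ...
+     qurt_thread_exit(0);
+ }
+
+ qurt_thread_attr_t attr;
+ qurt_thread_t tid;
+ qurt_thread_attr_init(&attr);
+ qurt_thread_attr_set_name(&attr, "worker");
+ qurt_thread_attr_set_stack_addr(&attr, stack);
+ qurt_thread_attr_set_stack_size(&attr, sizeof(stack));
+ (void)qurt_thread_create(&tid, &attr, worker, NULL);
+ @endcode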
+ */
+void qurt_thread_set_timetest_id (unsigned short tid);
+
+/**@ingroup func_qurt_thread_set_cache_partition
+ Sets the cache partition for the current thread. This function uses the qurt_cache_partition_t type
+ to select the cache partition of the current thread for the L1 Icache, L1 Dcache, and L2 cache.
+
+ @datatypes
+ #qurt_cache_partition_t
+
+ @param[in] l1_icache L1 I cache partition.
+ @param[in] l1_dcache L1 D cache partition.
+ @param[in] l2_cache L2 cache partition.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_thread_set_cache_partition(qurt_cache_partition_t l1_icache, qurt_cache_partition_t l1_dcache, qurt_cache_partition_t l2_cache);
+
+
+/**@ingroup func_qurt_thread_get_timetest_id
+ Gets the timetest identifier of the current thread.\n
+ Returns the timetest identifier of the current thread.\n
+ Timetest identifiers are used to identify a thread during debugging or profiling. \n
+ @note1hang Timetest identifiers differ from the kernel-generated thread identifiers used to
+ specify threads in the API thread operations.
+
+ @return
+ Integer -- Timetest identifier.
+
+ @dependencies
+ None.
+ */
+unsigned short qurt_thread_get_timetest_id (void);
+
+/**@ingroup func_qurt_thread_exit
+ @xreflabel{sec:qurt_thread_exit}
+ Stops the current thread, awakens threads joined to it, then destroys the stopped
+ thread.
+
+ Threads that are suspended on the current thread (by performing a thread join,
+ Section @xref{sec:thread_join}) are awakened and passed a user-defined status value
+ that indicates the status of the stopped thread.
+
+ @note1hang Exit must be called in the context of the thread to stop.
+
+ @param[in] status User-defined thread exit status value.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_thread_exit(int status);
+
+/**@ingroup func_qurt_thread_join
+ @xreflabel{sec:thread_join}
+ Waits for a specified thread to finish; the specified thread is another thread within
+ the same process.
+ The caller thread is suspended until the specified thread exits. When the specified thread
+ exits, the caller thread is awakened. \n
+ @note1hang If the specified thread has already exited, this function returns immediately
+ with the result value #QURT_ENOTHREAD. \n
+ @note1cont Two threads cannot call qurt_thread_join to wait for the same thread to finish.
+ If this occurs, QuRT generates an exception (see Section @xref{sec:exceptionHandling}).
+
+ @param[in] tid Thread identifier.
+ @param[out] status Destination variable for thread exit status. Returns an application-defined
+ value that indicates the termination status of the specified thread.
+
+ @return
+ #QURT_ENOTHREAD -- Thread has already exited. \n
+ #QURT_EOK -- Thread successfully joined with valid status value.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_join(unsigned int tid, int *status);
+
+/**@ingroup qurt_thread_detach
+ @xreflabel{sec:thread_detach}
+ Detaches a joinable thread. The specified thread is another thread within the
+ same process. Create the thread as a joinable thread; only joinable threads
+ can be detached.
+ If a joinable thread is detached, it finishes execution and exits.
+
+ @param[in] tid Thread identifier.
+
+ @return
+ #QURT_ENOTHREAD -- Thread specified by TID does not exist. \n
+ #QURT_EOK -- Thread successfully detached.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_detach(unsigned int tid);
+
+
+/**@ingroup func_qurt_thread_get_priority
+ Gets the priority of the specified thread.
\n + Returns the thread priority of the specified thread.\n + Thread priorities are specified as numeric values in a range as large as 1 through 254, with lower + values representing higher priorities. 1 represents the highest possible thread priority. \n + Priority 0 and 255 are internally used by the kernel for special purposes. + + @note1hang QuRT can be configured to have different priority ranges. + + @datatypes + #qurt_thread_t + + @param[in] threadid Thread identifier. + + @return + -1 -- Invalid thread identifier. \n + 1 through 254 -- Thread priority value. + + @dependencies + None. + */ +int qurt_thread_get_priority (qurt_thread_t threadid); + +/**@ingroup func_qurt_thread_set_priority + Sets the priority of the specified thread.\n + Thread priorities are specified as numeric values in a range as large as 1 through 254, with lower + values representing higher priorities. 1 represents the highest possible thread priority. + Priority 0 and 255 are internally used by the kernel for special purposes. + + @note1hang QuRT can be configured to have different priority ranges. For more + information, see Section @xref{sec:AppDev}. + + @datatypes + #qurt_thread_t + + @param[in] threadid Thread identifier. + @param[in] newprio New thread priority value. + + @return + 0 -- Priority successfully set. \n + -1 -- Invalid thread identifier. \n + + @dependencies + None. + */ +int qurt_thread_set_priority (qurt_thread_t threadid, unsigned short newprio); + + + +/**@ingroup func_qurt_thread_attr_get + Gets the attributes of the specified thread. + + @datatypes + #qurt_thread_t \n + #qurt_thread_attr_t + + @param[in] thread_id Thread identifier. + @param[out] attr Pointer to the destination structure for thread attributes. + + @return + #QURT_EOK -- Success. \n + #QURT_EINVALID -- Invalid argument. + + @dependencies + None. + */ +int qurt_thread_attr_get (qurt_thread_t thread_id, qurt_thread_attr_t *attr); + + + +/**@ingroup func_qurt_thread_get_tls_base + Gets the base address of thread local storage (TLS) of a dynamically loaded module + for the current thread. + + @datatypes + #qurt_tls_info + + @param[in] info Pointer to the TLS information for a module. + + @return + Pointer to the TLS object for the dynamically loaded module.\n + NULL -- TLS information is invalid. + + @dependencies + None. + */ +void * qurt_thread_get_tls_base(qurt_tls_info* info); + +/**@ingroup func_qurt_thread_pktcount_get + Gets the PKTCOUNT of a specified thread. + + @datatypes + #qurt_thread_t + + @param[in] thread_id Thread identifier. + + @return + PKTCOUNT + + @dependencies + None. + */ + +long long int qurt_thread_pktcount_get (qurt_thread_t thread_id); + +/**@ingroup func_qurt_thread_pktcount_set + Sets the PKTCOUNT for the current QuRT thread. + + @return + Value to which pktcount is set. + + @dependencies + None. + */ + +long long int qurt_thread_pktcount_set (long long int); + +/**@ingroup func_qurt_thread_stid_get + Gets the STID for a specified thread. + + @datatypes + #qurt_thread_t + + @param[in] thread_id Thread identifier. + + @return + STID + + @dependencies + None. + */ + +char qurt_thread_stid_get(qurt_thread_t thread_id); + +/**@ingroup func_qurt_thread_stid_get2 + Returns the set stid for a thread + + @param[in] thread_id thread identifier + @param[out] stid Pointer to a variable to return stid + + @return + QURT_EOK - success + QURT_ENOTALLOWED - operation not allowed for a thread + QURT_EINVALID - Invalid input + + @dependencies + None. 
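+
+ A sketch combining this call with the qurt_stid.h allocation API (the
+ source of the pid and the use of qurt_getpid from qurt_space.h are
+ illustrative):
+
+ @code
+ unsigned int stid = 0u, readback = 0u;
+ unsigned int pid = (unsigned int)qurt_getpid();
+ if (qurt_stid_alloc(pid, &stid) == QURT_EOK) {
+     (void)qurt_thread_stid_set2(qurt_thread_get_id(), stid);
+     (void)qurt_thread_stid_get2(qurt_thread_get_id(), &readback); // readback == stid
+ }
+ @endcode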
+ */
+int qurt_thread_stid_get2(unsigned int thread_id, unsigned int *stid);
+
+/**@ingroup func_qurt_thread_stid_set
+ Sets the STID for the current thread.
+
+ @datatypes
+ #qurt_thread_t
+
+ @param[in] stid STID value to set.
+
+ @return
+ #QURT_EOK -- STID successfully set. \n
+ #QURT_EFAILED -- STID not set.
+
+ @dependencies
+ None.
+ */
+
+int qurt_thread_stid_set(char stid);
+
+/**@ingroup qurt_thread_stid_set2
+ Sets the stid for a specified thread.
+
+ @datatypes
+ #qurt_thread_attr_t
+
+ @param[in] thread_id Thread identifier.
+ @param[in] stid Stid to be set for a thread.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EPRIVILEGE -- Failure because the caller does not have enough privilege for this operation. \n
+ #QURT_EVAL -- Failure because of invalid inputs.
+
+ @dependencies
+ None.
+*/
+int qurt_thread_stid_set2(unsigned int thread_id, unsigned int stid);
+
+/** @cond internal_only */
+/**@ingroup func_qurt_thread_get_running_ids
+ Returns the thread IDs of the running threads in the system; use only during fatal error handling.
+
+ @datatypes
+ #qurt_thread_t
+
+ @param[in,out] * Array of thread identifiers of size #QURT_MAX_HTHREAD_LIMIT + 1.
+
+ @return
+ #QURT_EINVALID -- Incorrect argument \n
+ #QURT_ENOTALLOWED -- API not called during error handling \n
+ #QURT_EOK -- Success, returns a NULL-terminated array of thread_id
+
+ @dependencies
+ None.
+ */
+int qurt_thread_get_running_ids(qurt_thread_t *);
+/** @endcond */
+
+
+/**@ingroup func_qurt_thread_get_thread_id
+ Gets the thread identifier of the thread with the matching name in the same process
+ of the caller.
+
+ @datatypes
+ #qurt_thread_t
+
+ @param[out] thread_id Pointer to the thread identifier.
+ @param[in] name Pointer to the name of the thread.
+
+ @return
+ #QURT_EINVALID -- No thread with matching name in the process of the caller \n
+ #QURT_EOK -- Success
+
+ @dependencies
+ None.
+ */
+int qurt_thread_get_thread_id (qurt_thread_t *thread_id, char *name);
+
+/**@ingroup func_qurt_sleep
+ Suspends the current thread for the specified amount of time.
+
+ @note1hang Because QuRT timers are deferrable, this call is guaranteed to block
+ at least for the specified amount of time. If power-collapse is
+ enabled, the maximum amount of time this call can block depends on
+ the earliest wakeup from power-collapse past the specified duration.
+
+ @param[in] duration Duration (in microseconds) for which the thread is suspended.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_sleep (unsigned long long int duration);
+
+
+/**@ingroup func_qurt_system_set_priority_floor
+ Sets a priority floor to move threads with thread priority lower than the floor out of the running state.
+ Running threads with thread priority lower than the priority floor are moved into the kernel ready queue, and they
+ are not scheduled to run when the thread priority is lower than the floor.
+ Later, the caller should reset the priority floor back to the default value of QURT_PRIORITY_FLOOR_DEFAULT.
+ Threads in the kernel ready queue are scheduled to run when the thread priority is higher than the floor.
+
+ The priority floor is set and associated to the user process of the caller. When the caller gets into QuRTOS and
+ sets a new floor, the new floor is associated to its original user process, not the QuRTOS process.
+ The floor associated to the user process is reset when the user process exits or is killed, but not at the time
+ when the user thread of the caller exits.
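+
+ A typical set/reset pairing (sketch; assumes the caller's own priority is
+ above the floor being set, and the floor value 100 is illustrative):
+
+ @code
+ if (qurt_system_set_priority_floor(100u) == QURT_EOK) {
+     // ... threads with priority lower than the floor are held in the
+     // kernel ready queue and are not scheduled to run ...
+     (void)qurt_system_set_priority_floor(QURT_PRIORITY_FLOOR_DEFAULT);
+ }
+ @endcode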
+
+ The priority floor cannot be set to a priority higher than the thread priority of the caller.
+
+ The priority floor cannot be set to a priority lower than the default #QURT_PRIORITY_FLOOR_DEFAULT system floor.
+
+ This function is not supported in Island mode.
+
+ After the system floor is set above #QURT_PRIORITY_FLOOR_DEFAULT, power collapse is skipped, and the sleep task
+ is not scheduled to run.
+
+ @param[in] priority_floor Priority floor.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_ENOTALLOWED -- Floor setting is not allowed
+
+ @dependencies
+ None.
+ */
+int qurt_system_set_priority_floor (unsigned int priority_floor);
+
+
+/**@ingroup func_qurt_thread_suspend_thread
+ Suspends a QuRT thread with its thread identifier.
+ The target thread can be in a signed user process or an unsigned user process.
+ The caller thread can be a thread from the same user process as the target thread, or from its parent process.
+ After the target thread is suspended, the kernel will not schedule it to run until it is resumed later.
+
+ If the target thread is set as non-suspendable, this function call returns an error without suspending
+ the target thread.
+
+ If the target thread is already suspended, this function call returns success to confirm
+ that the target thread is suspended.
+
+ If the target thread is in a secure user process, or CPZ process, this function call returns an error without
+ suspending the target thread.
+
+ If the target thread is running in the guest OS/root process via a QDI call, this function call does not suspend
+ the target thread in the guest OS, but marks the target thread as suspend-pending. The target thread is
+ suspended when it exits the guest OS, before executing the first instruction in the user process.
+ In this case, the function returns success even with the #QURT_THREAD_SUSPEND_SYNCHRONOUS option, while the target
+ thread can run in the guest OS, and is suspended when exiting the guest OS.
+
+ QuRT debug monitor threads that are in a user process are non-suspendable. This function does not suspend
+ those threads.
+
+ @param[in] thread_id Thread identifier.
+ @param[in] option Optional argument, multiple options can be ORed. \n
+          #QURT_THREAD_SUSPEND_SYNCHRONOUS (default) -- set to synchronous function call,
+          the function returns after the thread is completely suspended.\n
+          #QURT_THREAD_SUSPEND_ASYNCHRONOUS -- set to asynchronous function call, the function returns
+          after the kernel acts to suspend the target thread. The target thread
+          might still be running before it is completely suspended. \n
+          #QURT_THREAD_SUSPEND_KEEP_HMX (default) -- keep the HMX attachment on the target thread
+          if it locks the HMX with qurt_hmx_lock(). In this case, the HMX cannot be re-used by other threads. \n
+          #QURT_THREAD_SUSPEND_DETACH_HMX -- detach the HMX from the target thread if it locks the HMX with qurt_hmx_lock().
+          Later when the target thread resumes, the HMX is re-attached to the thread. Note that this option is only
+          supported for a caller from the same user process as the target thread, not for a caller from the parent
+          process of the target thread, or other processes. With the HMX detach option, QuRT does not save the HMX
+          context. Thus, the HMX context state is lost. It is the responsibility of the caller to manage HMX operations
+          and the saving of the HMX context state when calling qurt_thread_suspend_thread() with the HMX detach option.
+          If a thread from another process uses this detach option, #QURT_EHMXNOTDETACHABLE is returned; in this
+          case, if the caller is qualified to suspend the target thread, the target thread is moved to the suspended
+          state without the HMX detached.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of invalid thread_id input \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process. \n
+ #QURT_EHMXNOTDETACHABLE -- Failure because the HMX is not detachable from the target thread.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_suspend_thread (unsigned int thread_id, unsigned int option);
+
+
+/**@ingroup func_qurt_thread_resume_thread
+ Resumes a QuRT thread with its thread identifier.
+ The target thread can be in a signed user process or an unsigned user process.
+ The caller thread can be a thread from the same user process as the target thread, or from its parent
+ process. After the target thread resumes, the kernel scheduler can schedule the thread to run based on
+ the thread priority.
+
+ The function takes an option argument; as of now the only option is
+ #QURT_THREAD_RESUME_DEFAULT, which resumes the target thread in the default way.
+
+ By default, this is an asynchronous function. The function returns after the kernel moves the
+ target thread from the suspended state to the runnable state. The thread is scheduled to run based on its
+ thread priority.
+
+ If the target thread is set as non-resumable, this function call does not resume the target thread.
+
+ If the target thread has already resumed, this function call returns success to confirm
+ that the target thread is resumed.
+
+ If the target thread is in a secure user process or CPZ process, this function call returns an error without
+ resuming the target thread.
+
+ If the target thread runs in the guest OS/root process via a QDI call, this function call clears the
+ suspend-pending mark on the target thread, and the target thread is not suspended when it exits the
+ guest OS.
+
+ @param[in] thread_id Thread identifier.
+ @param[in] option Optional argument, #QURT_THREAD_RESUME_DEFAULT, which resumes the target thread.
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of invalid thread_id input \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process. \n
+ #QURT_EHMXNOTAVAIL -- Failure because the HMX is not available/free when resuming an HMX thread.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_resume_thread (unsigned int thread_id, unsigned int option);
+
+
+/**@ingroup func_qurt_thread_set_thread_property
+ Sets a QuRT thread property with its thread identifier.
+ The target thread can be in a signed user process or an unsigned user process.
+ The caller thread can be from the same user process as the target thread, or from its parent process.
+
+ If the target thread is in a secure user process, or CPZ process, this function call returns an error without
+ changing the property of the target thread.
+
+ @param[in] thread_id Thread identifier \n
+ @param[in] property_id Thread property identifier \n
+          #QURT_THREAD_PROPERTY_SUSPENDABLE -- thread is suspendable. Default is TRUE. \n
+          #QURT_THREAD_PROPERTY_RESUMEABLE -- thread is resumable.
Default is TRUE. \n
+ @param[in] value Property value: \n
+          TRUE(1) -- TRUE for the property \n
+          FALSE(0) -- FALSE for the property
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Failure because of invalid thread_id input \n
+ #QURT_ENOTALLOWED -- Failure because the operation is not allowed, for example, in a secure process/CPZ process.
+
+ @dependencies
+ None.
+ */
+int qurt_thread_set_thread_property( unsigned int thread_id, unsigned int property_id, unsigned int value );
+
+/**@ingroup func_qurt_thread_get_group_id
+ Gets the group ID of the thread specified by thread_id.\n
+
+ @param[in]  thread_id Thread identifier
+ @param[out] group_id  Pointer to the variable for the group identifier
+
+ @return
+ #QURT_EOK -- Success \n
+ #QURT_EINVALID -- Thread ID is invalid, or the process has no groups enabled \n
+ #QURT_ENOTALLOWED -- Operation is not allowed \n
+
+ @dependencies
+ None.
+*/
+int qurt_thread_get_group_id(qurt_thread_t thread_id, unsigned int* group_id);
+
+#endif /* __ASSEMBLER__ */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_THREAD_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_thread_context.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_thread_context.h
new file mode 100755
index 0000000000000..bab09deec8889
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_thread_context.h
@@ -0,0 +1,234 @@
+#ifndef QURT_THREAD_CONTEXT_H
+#define QURT_THREAD_CONTEXT_H
+/**
+  @file qurt_thread_context.h
+  @brief Kernel thread context structure
+
+EXTERNAL FUNCTIONS
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2018-2022 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @cond internal_only */
+
+#define THREAD_ITERATOR_END ((qurt_thread_t)(-1)) /**< Thread iterator is complete. */
+
+
+/**@ingroup func_qurt_thread_iterator_create
+Gives the caller the ability to enumerate threads in the system.
+
+@return
+Handle of the newly created iterator; it must be passed to
+subsequent operations on the iterator.
+
+@dependencies
+None.
+*/
+static inline int qurt_thread_iterator_create(void)
+{
+    return qurt_qdi_handle_invoke(QDI_HANDLE_GENERIC, QDI_OS_THREAD_ITERATOR_CREATE);
+}
+
+/**@ingroup func_qurt_thread_iterator_next
+Iterates over the list of threads in the system.
+
+@datatypes
+#qurt_thread_t
+
+@param[in] iter Iterator handle returned by qurt_thread_iterator_create().
+
+@return
+#THREAD_ITERATOR_END -- iterator has reached the end of the thread list. \n
+Other values indicate a valid thread_id.
+
+@dependencies
+None.
+*/
+static inline qurt_thread_t qurt_thread_iterator_next(int iter)
+{
+    return (qurt_thread_t)qurt_qdi_handle_invoke(iter, QDI_OS_THREAD_ITERATOR_NEXT);
+}
+
+/**@ingroup func_qurt_thread_iterator_destroy
+Cleans up thread iterator resources.
+
+@param[in] iter Iterator handle returned by qurt_thread_iterator_create().
+
+@return
+#QURT_EOK -- Successful completion of operation \n
+#QURT_EFATAL -- Invalid handle passed
+
+@dependencies
+None.
+*/
+static inline int qurt_thread_iterator_destroy(int iter)
+{
+    return qurt_qdi_close(iter);
+}
+
+/**@ingroup func_qurt_thread_context_get_tname
+Gets the name of the thread from the specified thread ID.
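+
+For example, combined with the thread iterator above (an illustrative sketch only):
+@code
+int it = qurt_thread_iterator_create();
+qurt_thread_t tid;
+while ((tid = qurt_thread_iterator_next(it)) != THREAD_ITERATOR_END) {
+    char tname[QURT_MAX_NAME_LEN];
+    if (qurt_thread_context_get_tname((unsigned int)tid, tname, sizeof(tname)) == QURT_EOK) {
+        /* tname holds the NUL-terminated name of thread tid. */
+    }
+}
+(void)qurt_thread_iterator_destroy(it);
+@endcode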
+ +@param[in] thread_id Thread for which name is returned. +@param[in,out] name Pointer to the local buffer where name is copied back. +@param[in] max_len Size of the local buffer. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_tname(unsigned int thread_id, char *name, unsigned char max_len); + +/**@ingroup func_qurt_thread_context_get_prio +Gets the priority for the specified thread. + +@param[in] thread_id Thread for which priority is returned. +@param[in,out] prio Pointer to the local variable where priority is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_prio(unsigned int thread_id, unsigned char *prio); + +/**@ingroup func_qurt_thread_context_get_pcycles +Gets pcycles for the specified thread. + +@param[in] thread_id Thread for which processor cycles are returned. +@param[in,out] pcycles Pointer to the local variable where processor cycles are written. + +@return +#QURT_EOK -- Success \n +Failure otherwise. + +@dependencies +None. +*/ +int qurt_thread_context_get_pcycles(unsigned int thread_id, unsigned long long int *pcycles); + +/**@ingroup func_qurt_thread_context_get_stack_base +Gets the stack base address for the specified thread. + +@param[in] thread_id Thread for which stack base address is returned. +@param[in,out] sbase Pointer to the local variable where stack base address is written. + +@return +QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_stack_base(unsigned int thread_id, unsigned int *sbase); + +/**@ingroup func_qurt_thread_context_get_stack_size +Gets the stack size for the specified thread. + +@param[in] thread_id Thread for which stack size is returned. +@param[in,out] ssize Pointer to the local variable where stack size is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_stack_size(unsigned int thread_id, unsigned int *ssize); + +/**@ingroup func_qurt_thread_context_get_pid +Gets the process ID for the specified thread. + +@param[in] thread_id Thread for which process ID is returned. +@param[in,out] pid Pointer to the local variable where process id is written. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_pid(unsigned int thread_id, unsigned int *pid); + +/**@ingroup func_qurt_thread_context_get_pname +Gets the process name for the specified thread. + +@param[in] thread_id Represents the thread for which process name is returned. +@param[in, out] name Pointer to the local buffer where process name is copied back. +@param[in] len Length allocated to the local buffer. + +@return +#QURT_EOK -- Success \n +Failure otherwise + +@dependencies +None. +*/ +int qurt_thread_context_get_pname(unsigned int thread_id, char *name, unsigned int len); + +/** @addtogroup thread_types +@{ */ +/** Structure that defines how TCB is interpreted to crash dump tools.*/ +/* Keys are defined in consts.h */ +struct qurt_debug_thread_info { +/** @cond */ + char name[QURT_MAX_NAME_LEN]; /**< Name of the thread. */ + struct { + unsigned key; + unsigned val; + } os_info[40]; + unsigned gen_regs[32]; /**< General mode registers. */ + unsigned user_cregs[32]; /**< User mode registers. */ + unsigned guest_cregs[32]; /**< Guest mode registers. */ + unsigned monitor_cregs[64]; /**< Monitor mode registers. 
*/
+/** @endcond */
+}; /* should add up to 1K */
+/** @} */ /* end_addtogroup thread_types */
+
+
+/**@ingroup func_qurt_system_tcb_dump_get
+Gets the contents of the debug thread information structure (TCB dump) for the specified thread.
+
+@datatypes
+#qurt_thread_t
+
+@param[in] thread_id Thread on which the operation must be performed.
+@param[in, out] ptr Pointer to the local buffer where contents are written.
+@param[in] size Size of the debug thread information structure obtained by calling
+                qurt_system_tcb_dump_get_size().
+
+@return
+#QURT_EOK -- Success \n
+Failure otherwise
+
+@dependencies
+None.
+*/
+int qurt_system_tcb_dump_get(qurt_thread_t thread_id, void *ptr, size_t size);
+/** @endcond */
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_THREAD_CONTEXT_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_timer.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_timer.h
new file mode 100755
index 0000000000000..7bdfdb8f3c3df
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_timer.h
@@ -0,0 +1,560 @@
+#ifndef QURT_TIMER_H
+#define QURT_TIMER_H
+/**
+ @file qurt_timer.h
+ @brief Prototypes of qurt_timer API
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+
+=============================================================================*/
+
+
+
+#include "qurt_anysignal.h"
+#include "qurt_signal2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================================
+                        CONSTANTS AND MACROS
+=============================================================================*/
+/**@addtogroup timer_const_macros
+@{ */
+/**
+   Default values.
+*/
+/** @xreflabel{hdr:QURT_TIMER_ONESHOT}*/
+#define QURT_TIMER_DEFAULT_TYPE QURT_TIMER_ONESHOT /**< One shot.*/
+#define QURT_TIMER_DEFAULT_DURATION 1000uL /**< Default duration. */
+#define QURT_TIMER_DEFAULT_EXPIRY 0uL /**< Default expiration. */
+
+/**
+  Conversion from microseconds to timer ticks.
+ */
+#define QURT_TIMER_TIMETICK_FROM_US(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+/**
+  Conversion from timer ticks to microseconds at the nominal frequency.
+*/
+#define QURT_TIMER_TIMETICK_TO_US(ticks) qurt_timer_timetick_to_us(ticks)
+
+/** Minimum microseconds value is 100 microseconds (sleep timer).*/
+#define QURT_TIMER_MIN_DURATION 100uL
+
+/**
+  Maximum microseconds value for Qtimer is 1,042,499 hours.
+*/
+#define QURT_TIMER_MAX_DURATION QURT_SYSCLOCK_MAX_DURATION
+
+/**
+  Timer clock for Qtimer is 19.2 MHz.
+*/
+#define QURT_TIMER_MAX_DURATION_TICKS QURT_SYSCLOCK_MAX_DURATION_TICKS
+
+/**
+  Sleep timer error margin for Qtimer is 1,000 ticks ~52 us.
+*/
+#define QURT_TIMETICK_ERROR_MARGIN QURT_SYSCLOCK_ERROR_MARGIN
+
+/*
+  qurt_timer group defines.
+*/
+#define QURT_TIMER_MAX_GROUPS 5U /**< Maximum groups.*/
+#define QURT_TIMER_DEFAULT_GROUP 0U /**< Default group. */
+/** @} */ /* end_addtogroup timer_const_macros */
+
+/** @addtogroup timer_types
+@{ */
+/**
+  QuRT timer types.
+ */
+typedef enum
+{
+    QURT_TIMER_ONESHOT = 0, /**< One shot.*/
+    /** @xreflabel{hdr:QURT_TIMER_PERIODIC}*/
+    QURT_TIMER_PERIODIC /**< Periodic.
*/
+} qurt_timer_type_t;
+
+
+/*=============================================================================
+                        TYPEDEFS
+=============================================================================*/
+
+/** QuRT timer type.*/
+typedef unsigned int qurt_timer_t;
+
+/** QuRT timer duration type. */
+typedef unsigned long long qurt_timer_duration_t;
+
+/** QuRT timer time type. */
+typedef unsigned long long qurt_timer_time_t;
+
+typedef void (*pfn_t)(void);
+/** QuRT timer attribute type. */
+typedef struct
+{
+    /** @cond */
+    unsigned int magic; /**< Magic number used to verify the qurt_timer_attr_t pointer. */
+
+    qurt_timer_duration_t duration; /**< Specifies the duration of the new timer. */
+
+    qurt_timer_time_t expiry; /**< Specifies the absolute expiry of the new timer. */
+
+    qurt_timer_duration_t remaining; /**< Specifies the remaining time of an active timer. */
+
+    qurt_timer_type_t type; /**< Specifies the timer type; only #QURT_TIMER_ONESHOT and
+                                 #QURT_TIMER_PERIODIC are supported. */
+
+    unsigned int group; /**< Group number of the timer; the criterion used to disable or enable the set
+                             of timers. */
+    pfn_t pFn; /**< Callback other than the signal set */
+    /** @endcond */
+}
+qurt_timer_attr_t;
+
+/** @} */ /* end_addtogroup timer_types */
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_timer_stop
+ @xreflabel{sec:qurt_timer_stop}
+ Stops a running timer.
+ The timer must be a one-shot timer.
+
+ @note1hang Restart stopped timers with the timer restart operation,
+            see Section @xref{sec:qurt_timer_restart}.
+
+ @datatypes
+ #qurt_timer_t
+
+ @param[in] timer Timer object.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EINVALID -- Invalid timer ID or duration value. \n
+ #QURT_ENOTALLOWED -- Timer is not a one-shot timer. \n
+ #QURT_EMEM -- Out of memory error.
+
+ @dependencies
+ None.
+ */
+int qurt_timer_stop (qurt_timer_t timer);
+
+/**@ingroup func_qurt_timer_restart
+ @xreflabel{sec:qurt_timer_restart}
+ Restarts a stopped timer with the specified duration. The timer must be a one-shot timer.
+ Timers stop after they have expired or after they are explicitly stopped with qurt_timer_stop().
+ A restarted timer expires after the specified duration, where the starting time is when the function is called.
+
+ @note1hang Timers stop after they have expired or after they are explicitly
+            stopped with the timer stop operation, see Section @xref{sec:qurt_timer_stop}.
+
+ @datatypes
+ #qurt_timer_t \n
+ #qurt_timer_duration_t
+
+ @param[in] timer    Timer object.
+ @param[in] duration Timer duration (in microseconds) before the restarted timer
+                     expires again.
+                     The valid range is #QURT_TIMER_MIN_DURATION to
+                     #QURT_TIMER_MAX_DURATION.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EINVALID -- Invalid timer ID or duration value. \n
+ #QURT_ENOTALLOWED -- Timer is not a one-shot timer. \n
+ #QURT_EMEM -- Out-of-memory error.
+
+ @dependencies
+ None.
+ */
+int qurt_timer_restart (qurt_timer_t timer, qurt_timer_duration_t duration);
+
+
+/**@ingroup func_qurt_timer_create
+ Creates a timer.\n
+ Allocates and initializes a timer object, and starts the timer.
+
+ @note1hang A timer event handler must be defined to wait on the specified signal
+            to handle the timer event.
+
+ @datatypes
+ #qurt_timer_t \n
+ #qurt_timer_attr_t \n
+ #qurt_anysignal_t
+
+ @param[out] timer  Pointer to the created timer object.
+ @param[in]  attr   Pointer to the timer attribute structure.
+ @param[in]  signal Pointer to the signal object set when the timer expires.
+ @param[in]  mask   Signal mask, which specifies the signal to set in the signal object when the
+                    timer expires.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EMEM -- Not enough memory to create the timer. \n
+ #QURT_EINVALID -- One of the arguments in the attr field is invalid. \n
+ Other error code -- Operation failed. \n
+
+ @dependencies
+ None.
+ */
+int qurt_timer_create (qurt_timer_t *timer, const qurt_timer_attr_t *attr,
+                       const qurt_anysignal_t *signal, unsigned int mask);
+
+/* Variant of qurt_timer_create() that takes a qurt_signal2_t signal object. */
+int qurt_timer_create_sig2 (qurt_timer_t *timer, const qurt_timer_attr_t *attr,
+                            const qurt_signal2_t *signal, unsigned int mask);
+
+/**@ingroup func_qurt_timer_attr_init
+ Initializes the specified timer attribute structure with default attribute values: \n
+ - Timer duration -- #QURT_TIMER_DEFAULT_DURATION (Section @xref{dox:timers}) \n
+ - Timer type -- #QURT_TIMER_ONESHOT \n
+ - Timer group -- #QURT_TIMER_DEFAULT_GROUP
+
+ @datatypes
+ #qurt_timer_attr_t
+
+ @param[in,out] attr Pointer to the destination structure for the timer attributes.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_timer_attr_init(qurt_timer_attr_t *attr);
+
+
+/*Tech Comm note: removed qurt_timer_attr_set_pfn from documentation 9/10/2020
+@ingroup func_qurt_timer_attr_set_pfn
+
+ @datatypes
+ #qurt_timer_attr_t
+
+ @param[in,out] attr Pointer to the destination structure for the timer attributes.
+ @param[in]     pFn  pFn.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_timer_attr_set_pfn(qurt_timer_attr_t *attr, pfn_t pFn);
+
+
+/**@ingroup func_qurt_timer_attr_set_duration
+ Sets the timer duration in the specified timer attribute structure.\n
+
+ The timer duration specifies the interval (in microseconds) between the creation of the
+ timer object and the generation of the corresponding timer event.
+
+ The timer duration value must be between #QURT_TIMER_MIN_DURATION and
+ #QURT_TIMER_MAX_DURATION (Section @xref{dox:timers}). Otherwise, the set operation is ignored.
+
+ @datatypes
+ #qurt_timer_attr_t \n
+ #qurt_timer_duration_t
+
+ @param[in,out] attr     Pointer to the timer attribute structure.
+ @param[in]     duration Timer duration (in microseconds).
+                         Valid range is #QURT_TIMER_MIN_DURATION to
+                         #QURT_TIMER_MAX_DURATION.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_timer_attr_set_duration(qurt_timer_attr_t *attr, qurt_timer_duration_t duration);
+
+/**@ingroup func_qurt_timer_attr_set_expiry
+ Sets the absolute expiry time in the specified timer attribute structure.\n
+ The timer expiry specifies the absolute time (in microseconds) of the generation of the
+ corresponding timer event.\n
+ Timer expiries are relative to when the system first began executing.
+
+ @datatypes
+ #qurt_timer_attr_t \n
+ #qurt_timer_time_t
+
+ @param[in,out] attr Pointer to the timer attribute structure.
+ @param[in]     time Timer expiry.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_timer_attr_set_expiry(qurt_timer_attr_t *attr, qurt_timer_time_t time);
+
+/**@ingroup func_qurt_timer_attr_get_duration
+ Gets the timer duration from the specified timer attribute structure.
+ The value returned is the duration that was originally set for the timer.
+
+ @note1hang This function does not return the remaining time of an active timer;
+            use qurt_timer_attr_get_remaining() to get the remaining time.
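+
+ For context, a typical one-shot setup that these attribute calls support
+ (an illustrative sketch only; the 1000 us duration and signal mask are arbitrary):
+ @code
+ qurt_timer_attr_t attr;
+ qurt_anysignal_t  sig;
+ qurt_timer_t      timer;
+
+ qurt_anysignal_init(&sig);
+ qurt_timer_attr_init(&attr);
+ qurt_timer_attr_set_type(&attr, QURT_TIMER_ONESHOT);
+ qurt_timer_attr_set_duration(&attr, 1000uL);
+ if (qurt_timer_create(&timer, &attr, &sig, 0x1u) == QURT_EOK) {
+     (void)qurt_anysignal_wait(&sig, 0x1u);   /* block until the timer fires */
+     (void)qurt_timer_delete(timer);
+ }
+ @endcode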
+ + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in] attr Pointer to the timer attributes object + @param[out] duration Pointer to the destination variable for timer duration. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_duration(qurt_timer_attr_t *attr, qurt_timer_duration_t *duration); + +/**@ingroup func_qurt_timer_attr_get_remaining + Gets the timer remaining duration from the specified timer attribute structure. \n + + The timer remaining duration indicates (in microseconds) how much time remains before + the generation of the next timer event on the corresponding timer. + In most cases this function assumes that the timer attribute structure was obtained by + calling qurt_timer_get_attr(). + + @note1hang This attribute is read-only and thus has no set operation defined for it. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_duration_t + + @param[in] attr Pointer to the timer attribute object. + @param[out] remaining Pointer to the destination variable for remaining time. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_remaining(qurt_timer_attr_t *attr, qurt_timer_duration_t *remaining); + +/**@ingroup func_qurt_timer_attr_set_type + Sets the timer type in the specified timer attribute structure. + + The timer type specifies the functional behavior of the timer: \n + - A one-shot timer (#QURT_TIMER_ONESHOT) waits for the specified timer duration + and then generates a single timer event. After this the timer is nonfunctional. \n + - A periodic timer (#QURT_TIMER_PERIODIC) repeatedly waits for the specified + timer duration and then generates a timer event. The result is a series of timer + events with interval equal to the timer duration. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_type_t + + @param[in,out] attr Pointer to the timer attribute structure. + @param[in] type Timer type. Values are: \n + - #QURT_TIMER_ONESHOT -- One-shot timer. \n + - #QURT_TIMER_PERIODIC -- Periodic timer. @tablebulletend + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_type(qurt_timer_attr_t *attr, qurt_timer_type_t type); + +/**@ingroup func_qurt_timer_attr_get_type + Gets the timer type from the specified timer attribute structure. + + @datatypes + #qurt_timer_attr_t \n + #qurt_timer_type_t + + @param[in] attr Pointer to the timer attribute structure. + @param[out] type Pointer to the destination variable for the timer type. + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_get_type(qurt_timer_attr_t *attr, qurt_timer_type_t *type); + +/**@ingroup func_qurt_timer_attr_set_group + Sets the timer group identifier in the specified timer attribute structure.\n + The timer group identifier specifies the group that the timer belongs to. Timer groups are + used to enable or disable one or more timers in a single operation. \n + The timer group identifier value must be between 0 and (#QURT_TIMER_MAX_GROUPS - 1). + See Section @xref{dox:timers}. + + @datatypes + #qurt_timer_attr_t + + @param[in,out] attr Pointer to the timer attribute object. + @param[in] group Timer group identifier; + Valid range is 0 to (#QURT_TIMER_MAX_GROUPS - 1). + + @return + None. + + @dependencies + None. + */ +void qurt_timer_attr_set_group(qurt_timer_attr_t *attr, unsigned int group); + +/**@ingroup func_qurt_timer_attr_get_group + Gets the timer group identifier from the specified timer attribute structure. 
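+
+ Timer groups are typically exercised as follows (an illustrative sketch only;
+ group 1 is an arbitrary choice, and attr is assumed to be initialized):
+ @code
+ qurt_timer_attr_set_group(&attr, 1u);   /* assign the timer to group 1 */
+ /* ... create the timer with these attributes ... */
+ (void)qurt_timer_group_disable(1u);     /* suspend all group-1 timers */
+ (void)qurt_timer_group_enable(1u);      /* enable them again */
+ @endcode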
+
+ @datatypes
+ #qurt_timer_attr_t
+
+ @param[in]  attr  Pointer to the timer attribute structure.
+ @param[out] group Pointer to the destination variable for the timer group identifier.
+
+ @return
+ None.
+
+ @dependencies
+ None.
+ */
+void qurt_timer_attr_get_group(qurt_timer_attr_t *attr, unsigned int *group);
+
+/**@ingroup func_qurt_timer_get_attr
+ @xreflabel{hdr:qurt_timer_get_attr}
+ Gets the timer attributes that were specified when the timer was created.
+
+ @datatypes
+ #qurt_timer_t \n
+ #qurt_timer_attr_t
+
+ @param[in]  timer Timer object.
+ @param[out] attr  Pointer to the destination structure for timer attributes.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EVAL -- Argument passed is not a valid timer.
+
+ @dependencies
+ None.
+ */
+int qurt_timer_get_attr(qurt_timer_t timer, qurt_timer_attr_t *attr);
+
+/**@ingroup func_qurt_timer_delete
+ Deletes the timer.\n
+ Destroys the specified timer and deallocates the timer object.
+
+ @datatypes
+ #qurt_timer_t
+
+ @param[in] timer Timer object.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EVAL -- Argument passed is not a valid timer.
+
+ @dependencies
+ None.
+ */
+int qurt_timer_delete(qurt_timer_t timer);
+
+/**@ingroup func_qurt_timer_sleep
+ Suspends the current thread for the specified amount of time.
+ The sleep duration value must be between #QURT_TIMER_MIN_DURATION and
+ #QURT_TIMER_MAX_DURATION (Section @xref{dox:timers}).
+
+ @datatypes
+ #qurt_timer_duration_t
+
+ @param[in] duration Interval (in microseconds) between when the thread is suspended
+                     and when it is re-awakened.
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EMEM -- Not enough memory to perform the operation.
+
+ @dependencies
+ None.
+ */
+
+int qurt_timer_sleep(qurt_timer_duration_t duration);
+
+/**@ingroup func_qurt_timer_group_disable
+ Disables all timers that are assigned to the specified timer group.
+ If a specified timer is already disabled, it is ignored.
+ If a specified timer has expired, it is not processed.
+ If the specified timer group is empty, nothing is done.
+
+ @note1hang When a timer is disabled its remaining time does not change, thus it
+            cannot generate a timer event.
+
+ @param[in] group Timer group identifier.
+
+ @return
+ #QURT_EOK -- Success.
+
+ @dependencies
+ None.
+ */
+int qurt_timer_group_disable (unsigned int group);
+
+/**@ingroup func_qurt_timer_group_enable
+ Enables all timers that are assigned to the specified timer group.
+ If a specified timer is already enabled, it is ignored.
+ If a specified timer has expired, it is processed.
+ If the specified timer group is empty, nothing is done.
+
+ @param[in] group Timer group identifier.
+
+ @return
+ #QURT_EOK -- Success.
+
+ @dependencies
+ None.
+ */
+int qurt_timer_group_enable (unsigned int group);
+
+
+/**
+  Notifies the timer server of recovery from power collapse. The server
+  must account for any missed interrupts during power collapse.
+ */
+void qurt_timer_recover_pc (void);
+
+/**
+  Determines whether the Qtimer is initialized.
+
+  @return
+  0 -- Not initialized. \n
+  Nonzero -- Initialized.
+ */
+static inline int qurt_timer_is_init (void) {return 1;}
+
+/**@ingroup func_qurt_timer_get_ticks
+ Gets the current ticks. The ticks are accumulated since the RTOS
+ started. Each tick is equal to a single timer clock
+ cycle, where the frequency is 32 kHz on RGPT or 19.2 MHz on Qtimer.
+
+ @return
+ Ticks since the system started.
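+
+ For example (sketch; uses the tick-to-microsecond conversion macro defined in
+ this header):
+ @code
+ unsigned long long ticks = qurt_timer_get_ticks();
+ unsigned long long us    = QURT_TIMER_TIMETICK_TO_US(ticks);  /* ticks -> us */
+ @endcode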
+ */
+unsigned long long qurt_timer_get_ticks (void);
+
+#define qurt_timer_timetick_from_us(us) QURT_SYSCLOCK_TIMETICK_FROM_US(us)
+
+
+#ifdef __cplusplus
+} /* closing brace for extern "C" */
+#endif
+
+#endif /* QURT_TIMER_H */
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_tlb.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_tlb.h
new file mode 100755
index 0000000000000..b1b2d261d31c0
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_tlb.h
@@ -0,0 +1,215 @@
+#ifndef QURT_TLB_H
+#define QURT_TLB_H
+
+/**
+  @file qurt_tlb.h
+  @brief Prototypes of TLB API
+  The TLB APIs allow explicit control of the portion of the TLB between TLB_first_replaceable and TLB_LAST_REPLACEABLE.
+  Both are nonconfigurable for the time being. This portion of the TLB is permanently assigned/locked unless manually removed
+  by qurt_tlb_remove. Implementation does not change depending on the configuration, such as whether CONFIG_STATIC is set or not.
+  In CONFIG_STATIC=y, TLB_LAST_REPLACEABLE is set to the last TLB index, which indicates that the entire TLB is permanently
+  assigned and is not backed up by a page table (the page table does not exist). TLB indices are maintained through a 64-bit bitmask.
+  A new entry is placed in the first available slot.
+
+EXTERNAL FUNCTIONS
+   None.
+
+INITIALIZATION AND SEQUENCING REQUIREMENTS
+   None.
+
+Copyright (c) 2013, 2021, 2023
+All Rights Reserved.
+Confidential and Proprietary - Qualcomm Technologies, Inc.
+=============================================================================*/
+
+#include
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+
+/*=============================================================================
+                        FUNCTIONS
+=============================================================================*/
+
+/**@ingroup func_qurt_tlb_entry_create
+ Creates a new TLB entry with the specified mapping attributes in the TLB of the Hexagon processor. \n
+ @note1hang If the specified attributes are not valid (such as if the address is not aligned with the
+            size), the entry is not created, and an error result is returned.\n
+ @note1cont To set the G bit in the new TLB entry, set the ASID argument to -1.
+
+ @datatypes
+ #qurt_addr_t \n
+ #qurt_paddr_t \n
+ #qurt_mem_cache_mode_t \n
+ #qurt_perm_t
+
+ @param[out] entry_id      TLB entry identifier.
+ @param[in]  vaddr         Virtual memory address.
+ @param[in]  paddr         Physical memory address.
+ @param[in]  size          Size of memory region to map (in bytes).
+ @param[in]  cache_attribs Cache mode (writeback, and so on).
+ @param[in]  perms         Access permissions.
+ @param[in]  asid          ASID (space ID).
+
+ @return
+ #QURT_EOK -- TLB entry successfully created.\n
+ #QURT_EFATAL -- Entry is not created; the TLB is full. \n
+ #QURT_ETLBCREATESIZE -- Entry is not created; the incorrect size was specified. \n
+ #QURT_ETLBCREATEUNALIGNED -- Entry is not created; an unaligned address was specified. \n
+ #QURT_EINVALID -- Invalid cache attributes / permissions provided.
+
+ */
+int qurt_tlb_entry_create (unsigned int *entry_id, qurt_addr_t vaddr, qurt_paddr_t paddr, qurt_size_t size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perms, int asid);
+
+/**@ingroup func_qurt_tlb_entry_create_64
+ Creates a new TLB entry with the specified mapping attributes in the TLB of the Hexagon processor.
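+
+ Usage sketch (illustrative only; the addresses, size, and attributes shown are
+ placeholders and must be valid, properly aligned values on a real target):
+ @code
+ unsigned int id;
+ int rc = qurt_tlb_entry_create_64(&id, 0x20000000u, 0x120000000ull, 0x1000u,
+                                   QURT_MEM_CACHE_WRITEBACK,
+                                   QURT_PERM_READ | QURT_PERM_WRITE, -1);
+ if (rc == QURT_EOK) {
+     /* Mapping installed; remove it later with qurt_tlb_entry_delete(id). */
+ }
+ @endcode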
\n + @note1hang If the specified attributes are not valid (the address is not aligned with the + size), the entry is not created, and an error result is returned.\n + @note1cont To set the G bit in the new TLB entry, set the asid argument to -1. + + @param[out] entry_id TLB entry identifier. + @param[in] vaddr Virtual memory address. + @param[in] paddr_64 64-bit physical memory address. + @param[in] size Size of memory region to map (in bytes). + @param[in] cache_attribs Cache mode (writeback, and so on). + @param[in] perms Access permissions. + @param[in] asid ASID (space ID). + + @return + #QURT_EOK -- TLB entry successfully created.\n + #QURT_EFATAL -- Entry was not created; the TLB is full. \n + #QURT_ETLBCREATESIZE -- Entry was not created; the incorrect size was specified. \n + #QURT_ETLBCREATEUNALIGNED -- Entry was not created; an unaligned address was specified. \n + #QURT_EINVALID -- Invalid cache attributes / permissions provided. + + */ +int qurt_tlb_entry_create_64 (unsigned int *entry_id, qurt_addr_t vaddr, qurt_paddr_64_t paddr_64, qurt_size_t size, qurt_mem_cache_mode_t cache_attribs, qurt_perm_t perms, int asid); + +/**@ingroup func_qurt_tlb_entry_delete + Deletes the specified TLB entry from the TLB of the Hexagon processor. + If the specified entry does not exist, no deletion occurs and an error result is returned. + + @param[in] entry_id TLB entry identifier. + + @return + #QURT_EOK -- TLB entry successfully deleted. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_delete (unsigned int entry_id); + +/**@ingroup func_qurt_tlb_entry_query + Searches for the specified TLB entry in the TLB of the Hexagon processor. + If the TLB entry is found, its entry identifier is returned. + + @datatypes + #qurt_addr_t + + @param[out] entry_id TLB entry identifier. + @param[in] vaddr Virtual memory address. + @param[in] asid ASID (space ID). + + @return + #QURT_EOK -- TLB entry successfully returned. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_query (unsigned int *entry_id, qurt_addr_t vaddr, int asid); + +/**@ingroup func_qurt_tlb_entry_set + Sets the TLB entry by storing an entry at the specified location + in the TLB of the Hexagon processor. + + @param[in] entry_id TLB entry identifier. + @param[in] entry 64-bit TLB entry to store. + + @return + #QURT_EOK -- Entry successfully stored in the TLB. \n + #QURT_EFATAL -- Entry not set at the specified location. + + @dependencies + None. + **/ +int qurt_tlb_entry_set (unsigned int entry_id, unsigned long long int entry); + +/**@ingroup func_qurt_tlb_entry_get + Gets the TLB entry. \n + Returns the specified 64-bit TLB entry in the TLB of the Hexagon processor. + + @param[in] entry_id TLB entry identifier. + @param[out] entry 64-bit TLB entry. + + @return + #QURT_EOK -- TLB entry successfully returned. \n + #QURT_EFATAL -- TLB entry does not exist. + + @dependencies + None. + **/ +int qurt_tlb_entry_get (unsigned int entry_id, unsigned long long int *entry); + +/**@ingroup func_qurt_tlb_get_pager_physaddrs + Searches the TLB of the Hexagon processor, and returns all physical addresses that belong to the pager. + Each returned address indicates the starting address of an active page. + +The function return value indicates the number of addresses returned. + + @param[out] pager_phys_addrs Pointer to the return array of pager physical addresses. + + @return + Integer -- Number of addresses returned in array. + + @dependencies + None. 
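+
+ A call sketch (illustrative only; the ownership and lifetime of the returned
+ array are assumed to be managed by the kernel):
+ @code
+ unsigned int *addrs = NULL;
+ unsigned int count = qurt_tlb_get_pager_physaddr(&addrs);
+ for (unsigned int i = 0; i < count; i++) {
+     /* addrs[i] is the starting physical address of an active page. */
+ }
+ @endcode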
+*/ + +unsigned int qurt_tlb_get_pager_physaddr(unsigned int** pager_phys_addrs); + +/**@ingroup func_qurt_tlb_get_pager_virtaddr + Searches the TLB of the Hexagon processor, and returns all virtual addresses that belong to the pager. + Each returned address indicates the starting address of an active page. + +The function return value indicates the number of addresses returned. + + @param[out] pager_virt_addrs Pointer to the return array of pager virtual addresses. + + @return + Integer -- Number of addresses returned in the array. + + @dependencies + None. +*/ + +unsigned int qurt_tlb_get_pager_virtaddr(unsigned int** pager_virt_addrs); + + +/**@ingroup func_qurt_tlb_entry_set2 + Sets the TLB entry by storing an entry at the specified location + in the TLB of the Hexagon processor. An additional option can be passed + to lock the TLB entry in the TLB of the Hexagon processor. + + @param[in] id TLB entry identifier. + @param[in] tlb 64-bit TLB entry to store. + @param[in] lock Nonzero value indicates that the TLB entry must be locked in the hardware TLB. + + @return + #QURT_EOK -- Entry successfully stored in the TLB. \n + #QURT_EFATAL -- Entry not set at the specified location. + + @dependencies + None. + **/ +int qurt_tlb_entry_set2(unsigned id, unsigned long long tlb, unsigned lock); + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TLB_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_tls.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_tls.h new file mode 100755 index 0000000000000..6ec3b39ff5cb0 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_tls.h @@ -0,0 +1,100 @@ +#ifndef QURT_TLS_H +#define QURT_TLS_H +/** + @file qurt_tls.h + @brief Prototypes of TLS APIs + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + + +/*============================================================================= + FUNCTIONS +=============================================================================*/ + +/**@ingroup func_qurt_tls_create_key + @xreflabel{sec:tls_create_key} + Creates a key for accessing a thread local storage data item.\n + Subsequent get and set operations use the key value. + + @note1hang The destructor function performs any clean-up operations needed by a thread + local storage item when its containing thread is deleted (Section @xref{sec:qurt_thread_exit}). + + @param[out] key Pointer to the newly created thread local storage key value. + @param[in] destructor Pointer to the key-specific destructor function. Passing NULL + specifies that no destructor function is defined for the key. + + @return + #QURT_EOK -- Key successfully created. \n + #QURT_ETLSAVAIL -- No free TLS key available. + + @dependencies + None. + */ +int qurt_tls_create_key (int *key, void (*destructor)(void *)); + +/**@ingroup func_qurt_tls_set_specific + Stores a data item to thread local storage along with the specified key. + + @param[in] key Thread local storage key value. + @param[in] value Pointer to user data value to store. + + @return + #QURT_EOK -- Data item successfully stored. \n + #QURT_EINVALID -- Invalid key. \n + #QURT_EFAILED -- Invoked from a non-thread context. 
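+
+ A typical key lifecycle for reference (an illustrative sketch only; the stored
+ value is hypothetical):
+ @code
+ int key;
+ if (qurt_tls_create_key(&key, NULL) == QURT_EOK) {   /* no destructor */
+     static int my_state = 42;                        /* hypothetical per-thread data */
+     (void)qurt_tls_set_specific(key, &my_state);
+     int *p = (int *)qurt_tls_get_specific(key);
+     /* ... use p ... */
+     (void)qurt_tls_delete_key(key);
+ }
+ @endcode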
+ */ +int qurt_tls_set_specific (int key, const void *value); + +/**@ingroup func_qurt_tls_get_specific + Loads the data item from thread local storage. \n + Returns the data item that is stored in thread local storage with the specified key. + The data item is always a pointer to user data. + + @param[in] key Thread local storage key value. + + @return + Pointer -- Data item indexed by key in thread local storage. \n + 0 (NULL) -- Key out of range. + + @dependencies + None. + */ +void * __attribute__((section(".text.qurt_tls_get_specific "))) qurt_tls_get_specific (int key); + + +/**@ingroup func_qurt_tls_delete_key + Deletes the specified key from thread local storage. + + @note1hang Explicitly deleting a key does not execute any destructor function that is + associated with the key (Section @xref{sec:tls_create_key}). + + @param[in] key Thread local storage key value to delete. + + @return + #QURT_EOK -- Key successfully deleted. \n + #QURT_ETLSENTRY -- Key already free. + + @dependencies + None. + */ +int qurt_tls_delete_key (int key); + + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TLS_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_trace.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_trace.h new file mode 100755 index 0000000000000..541f8f1d34bf6 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_trace.h @@ -0,0 +1,317 @@ +#ifndef QURT_TRACE_H +#define QURT_TRACE_H +/** + @file qurt_trace.h + @brief Prototypes of system call tracing helpers API + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2021-2023 by Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + GLOBAL VARIABLES +=============================================================================*/ +/** @cond internal_only */ +/** @addtogroup etm_macros +@{ */ +/* ETM trace types. */ +#define QURT_ETM_TYPE_PC_ADDR (1U<<0) /**< PC address.*/ +#define QURT_ETM_TYPE_MEMORY_ADDR (1U<<1) /**< Memory address. */ +#define QURT_ETM_TYPE_TESTBUS (1U<<2) /**< Test bus. */ +#define QURT_ETM_TYPE_CYCLE_ACCURATE (1U<<3) /**< Cycle accurate. */ +#define QURT_ETM_TYPE_CYCLE_COARSE (1U<<4) /**< Cycle coarse. */ +#define QURT_ETM_TYPE_PC_AND_MEMORY_ADDR (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_MEMORY_ADDR) /**< PC and memory address. */ +#define QURT_ETM_TYPE_PC_ADDR_AND_TESTBUS (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_TESTBUS) /**< PC address and test bus. */ +#define QURT_ETM_TYPE_MEMORY_ADDR_AND_TESTBUS (QURT_ETM_TYPE_MEMORY_ADDR|QURT_ETM_TYPE_TESTBUS) /**< Memory address and test bus.*/ +#define QURT_ETM_TYPE_PC_AND_MEMORY_ADDR_AND_TESTBUS (QURT_ETM_TYPE_PC_ADDR|QURT_ETM_TYPE_MEMORY_ADDR|QURT_ETM_TYPE_TESTBUS) /**< PC, memory address, and test bus. */ + +/* ETM routes. */ +#define QURT_ETM_ROUTE_TO_QDSS 0U /**< ETM route to QDSS. */ +#define QURT_ETM_ROUTE_TO_Q6ETB 1U /**< ETM route to Q6ETB. */ + +/* ETM filters. */ +#define QURT_ETM_TRACE_FILTER_ALL_DEFAULT 0U /*< Filter all as default. */ +#define QURT_ETM_TRACE_FILTER_HNUM0 (1U<<0) /*< Filter HNUM0. */ +#define QURT_ETM_TRACE_FILTER_HNUM1 (1U<<1) /*< Filter HNUM1. */ +#define QURT_ETM_TRACE_FILTER_HNUM2 (1U<<2) /*< Filter HNUM2. 
*/ +#define QURT_ETM_TRACE_FILTER_HNUM3 (1U<<3) /*< Filter HNUM3. */ +#define QURT_ETM_TRACE_FILTER_HNUM4 (1U<<4) /*< Filter HNUM4. */ +#define QURT_ETM_TRACE_FILTER_HNUM5 (1U<<5) /*< Filter HNUM5. */ +#define QURT_ETM_TRACE_FILTER_HNUM6 (1U<<6) /*< Filter HNUM6. */ +#define QURT_ETM_TRACE_FILTER_HNUM7 (1U<<7) /*< Filter HNUM7. */ +#define QURT_ETM_TRACE_FILTER_HNUM8 (1U<<8) /*< Filter HNUM8. */ +#define QURT_ETM_TRACE_FILTER_HNUM9 (1U<<9) /*< Filter HNUM9. */ +#define QURT_ETM_TRACE_FILTER_HNUM10 (1U<<10) /*< Filter HNUM10. */ +#define QURT_ETM_TRACE_FILTER_HNUM11 (1U<<11) /*< Filter HNUM11. */ +#define QURT_ETM_TRACE_FILTER_HNUM12 (1U<<12) /*< Filter HNUM12. */ +#define QURT_ETM_TRACE_FILTER_HNUM13 (1U<<13) /*< Filter HNUM13. */ +#define QURT_ETM_TRACE_FILTER_HNUM14 (1U<<14) /*< Filter HNUM14. */ +#define QURT_ETM_TRACE_FILTER_HNUM15 (1U<<15) /*< Filter HNUM15. */ +#define QURT_ETM_TRACE_FILTER_ALL QURT_ETM_TRACE_FILTER_ALL_DEFAULT + +#define QURT_ETM_TRACE_FILTER_CLUSTER0 (1<<16) /*< Filter trace cluster0 address. */ +#define QURT_ETM_TRACE_FILTER_CLUSTER1 (1<<17) /*< Filter trace cluster1 address. */ +#define QURT_ETM_TRACE_FILTER_PC_RANGE (1<<19) /*< Filter PC address range. */ + +/* ETM memory source - PC or data access */ +#define QURT_ETM_SOURCE_PC 0U /**< ETM memory source of SAC* is PC. */ +#define QURT_ETM_SOURCE_DATA 1U /**< ETM memory source of SAC* is data. */ + +/* Period between synchronization traces */ +#define QURT_ETM_ASYNC_PERIOD 0 /**< Async.*/ +#define QURT_ETM_ISYNC_PERIOD 1 /**< Isync.*/ +#define QURT_ETM_GSYNC_PERIOD 2 /**< Gsync. */ + +/* ETM enable flags */ +#define QURT_ETM_OFF 0U /**< ETM off. */ +#define QURT_ETM_ON 1U /**< ETM on. */ +/** @endcond */ +/** @} */ /* end_addtogroup etm_macros */ + +/** @addtogroup function_tracing_macro +@{ */ +/* ETM setup return values */ +#define QURT_ETM_SETUP_OK 0 /**< ETM setup OK. */ +#define QURT_ETM_SETUP_ERR 1 /**< ETM setup error. */ +/** @} */ /* end_addtogroup function_tracing_macro */ +/* ETM breakpoint types */ +#define QURT_ETM_READWRITE_BRKPT 0U /**< ETM read/write breakpoint. */ +#define QURT_ETM_READ_BRKPT 1U /**< ETM read breakpoint. */ +#define QURT_ETM_WRITE_BRKPT 2U /**< ETM write breakpoint. */ +#define QURT_ETM_BRKPT_INVALIDATE 3U /**< Invalidate breakpoint. */ +/** @addtogroup function_tracing_macro +@{ */ +/* ATB status flags */ +#define QURT_ATB_OFF 0 /**< ATB off. */ +#define QURT_ATB_ON 1 /**< ATB on. */ +/** @} */ /* end_addtogroup function_tracing_macro */ +/* DTM enable flags */ +#define QURT_DTM_OFF 0 /**< DTM off. */ +#define QURT_DTM_ON 1 /**< DTM on. */ + +/** @addtogroup function_tracing_datatypes +@{ */ +/**STM trace information. */ +typedef struct qurt_stm_trace_info { + /** @cond */ + unsigned int stm_port_addr[6]; /* STM port address to which trace data must be written.*/ + unsigned int thread_event_id; /* Event ID for context switches.*/ + unsigned int interrupt_event_id; /* Event ID for interrupts. */ + unsigned int marker; /* Marker value that must be written at the beginning of the trace. */ + /** @endcond */ +} qurt_stm_trace_info_t; +/** @} */ /* end_addtogroup function_tracing_datatypes */ +/*============================================================================= + GLOBAL FUNCTIONS +=============================================================================*/ + + +/**@ingroup func_qurt_trace_get_marker + Gets the kernel trace marker.\n + Returns the current value of the kernel trace marker. 
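+
+ For example, paired with qurt_trace_changed() (an illustrative sketch only; the
+ observed block is hypothetical, and mask 0x3 covers interrupt and
+ context-switch events):
+ @code
+ unsigned int marker = qurt_trace_get_marker();
+ critical_block();                        /* hypothetical code under observation */
+ if (qurt_trace_changed(marker, 0x3u) == 1) {
+     /* An interrupt or context switch was recorded during critical_block(). */
+ }
+ @endcode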
+ The marker consists of a hardware thread identifier and an index into the kernel trace
+ buffer. The trace buffer records kernel events.
+
+ @note1hang Using this function with qurt_trace_changed()
+            determines whether certain kernel events occurred in a block of code.
+
+ @return
+ Integer -- Kernel trace marker.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_trace_get_marker(void);
+
+/**@ingroup func_qurt_trace_changed
+ Determines whether specific kernel events have occurred. \n
+ Returns a value that indicates whether the specified kernel events are recorded in the
+ kernel trace buffer since the specified kernel trace marker was obtained.
+
+ The prev_trace_marker parameter specifies a kernel trace marker that was obtained by calling
+ qurt_trace_get_marker().
+ @cond rest_dist For more information on the mask value, see the description of the trace_mask element in
+ @xhyperref{80VB41992,80-VB419-92}. \n @endcond
+
+ @note1hang Used with qurt_trace_get_marker(), this function determines whether
+            certain kernel events occurred in a block of code.\n
+ @note1cont This function cannot determine whether a specific kernel event type has
+            occurred unless that event type has been enabled in the trace_mask element
+            of the system configuration file. \n
+ @note1cont QuRT supports the recording of interrupt and context switch events only (such as
+            a trace_mask value of 0x3).
+
+ @param[in] prev_trace_marker Previous kernel trace marker.
+ @param[in] trace_mask        Mask value that indicates which kernel events to check for.
+
+ @returns
+ 1 -- Kernel events of the specified type have occurred since the
+      specified trace marker was obtained.\n
+ 0 -- No kernel events of the specified type have occurred since the
+      specified trace marker was obtained.
+
+ @dependencies
+ None.
+*/
+int qurt_trace_changed(unsigned int prev_trace_marker, unsigned int trace_mask);
+
+/*=============================================================================
+                        CONSTANTS AND MACROS
+=============================================================================*/
+/** @addtogroup function_tracing_macro
+@{ */
+#ifndef QURT_DEBUG
+#define QURT_TRACE(str, ...) __VA_ARGS__
+    /**< Function tracing is implemented with the QURT_TRACE debug macro, which
+     optionally generates printf statements both before and after every function call that is
+     passed as a macro argument.
+
+     For example, the following macro call in the source code:
+     @code
+     QURT_TRACE(myfunc, my_func(33))
+
+     @endcode
+     generates the following debug output:
+     @code
+     myfile:nnn: my_func >>> calling my_func(33)
+     myfile:nnn: my_func <<< my_func(33) returned
+     @endcode
+     The debug output includes the source file and line number of the function call, along with
+     the text of the call. Compile the client source file with -D __FILENAME__
+     defined for its file name.
+
+     The library function qurt_printf() generates the debug output.
+     The QURT_DEBUG symbol controls generation of the debug output. If this symbol is
+     not defined, function tracing is not generated.\n
+     @note1hang The debug macro is accessed through the QuRT API header file.
+    */
+#else
+#define QURT_TRACE(str, ...) \
+    do { \
+        qurt_printf("%s:%d: %s: >>> calling %s\n",__FILENAME__,__LINE__,(str),#__VA_ARGS__); \
+        __VA_ARGS__; \
+        qurt_printf("%s:%d: %s: <<< %s returned\n",__FILENAME__,__LINE__,(str),#__VA_ARGS__); \
+    } while (0)
+#endif
+/** @} */ /* end_addtogroup function_tracing_macro */
+
+/**@ingroup func_qurt_etm_set_pc_range
+ Sets the PC address range for ETM filtering.
+ Depending on the Hexagon core design, a maximum of four PC ranges are supported.
+
+ @param[in] range_num 0 to 3.
+ @param[in] low_addr  Lower boundary of PC address range.
+ @param[in] high_addr Higher boundary of PC address range.
+
+ @returns
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_etm_set_pc_range(unsigned int range_num, unsigned int low_addr, unsigned int high_addr);
+
+/**@ingroup func_qurt_etm_set_range
+ Sets the address range for ETM filtering.
+ Allows the user to select the source type of the addresses: #QURT_ETM_SOURCE_PC or #QURT_ETM_SOURCE_DATA.
+
+ @param[in] addr_source_type Type of the address source:\n
+            - #QURT_ETM_SOURCE_PC \n
+            - #QURT_ETM_SOURCE_DATA @tablebulletend
+ @param[in] trig_block_num 0 to 3.
+ @param[in] pid PID of the process: \n
+            - Any valid PID number enables ASID-based trace filtering. \n
+            - QURT_ETM_NO_PID -- Disables ASID-based trace filtering. @tablebulletend
+ @param[in] low_addr  Lower boundary of PC address range.
+ @param[in] high_addr Higher boundary of PC address range.
+
+ @returns
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_etm_set_range(unsigned int addr_source_type, unsigned int trig_block_num, unsigned int pid, unsigned int low_addr, unsigned int high_addr);
+
+/**@ingroup func_qurt_etm_set_atb
+ Sets the advanced trace bus (ATB) state to notify QuRT that the ATB is actively enabled or disabled.
+ QuRT performs the corresponding actions at low power management.
+
+ @param[in] flag Values: \n
+            #QURT_ATB_ON \n
+            #QURT_ATB_OFF
+
+ @returns
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+*/
+unsigned int qurt_etm_set_atb(unsigned int flag);
+
+/**@ingroup func_qurt_etm_set_sync_period
+ Sets the period for types of synchronization trace packets. \n
+ ASYNC defines the period between alignment synchronization packets.
+ The period is in terms of bytes in the packet stream. \n
+ ISYNC defines the period between instruction synchronization packets.
+ The period is per thread and is defined as the bytes sent out for that thread. \n
+ GSYNC defines the period, in thread cycles, between GSYNC packets.
+
+ @param[in] sync_type Type of synchronization packets: \n
+            #QURT_ETM_ASYNC_PERIOD \n
+            #QURT_ETM_ISYNC_PERIOD \n
+            #QURT_ETM_GSYNC_PERIOD
+ @param[in] period Period value.
+
+ @return
+ #QURT_ETM_SETUP_OK -- Success. \n
+ #QURT_ETM_SETUP_ERR -- Failure.
+
+ @dependencies
+ None.
+ */
+unsigned int qurt_etm_set_sync_period(unsigned int sync_type, unsigned int period);
+
+/**@ingroup func_qurt_stm_trace_set_config
+ Sets up an STM port for tracing events.
+
+ @datatypes
+ #qurt_stm_trace_info_t
+
+ @param[in] stm_config_info Pointer to the STM trace information used to set up the trace
+            in the kernel.
+            The structure must have the following:\n
+            - One port address per hardware thread \n
+            - Event ID for context switches \n
+            - Event ID for interrupt tracing \n
+            - Header or marker to identify the beginning of the trace. @tablebulletend
+
+ @return
+ #QURT_EOK -- Success. \n
+ #QURT_EINVALID -- Failure; possibly because the passed port address is not in the page table.
+
+ @dependencies
+ None.
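+
+ Setup sketch (illustrative only; the port addresses, event IDs, and marker are
+ placeholder values that must match the target's actual STM configuration):
+ @code
+ qurt_stm_trace_info_t cfg = {
+     .stm_port_addr      = { 0u, 0u, 0u, 0u, 0u, 0u },
+     .thread_event_id    = 1u,
+     .interrupt_event_id = 2u,
+     .marker             = 0x53544D31u,   /* "STM1" */
+ };
+ unsigned int rc = qurt_stm_trace_set_config(&cfg);
+ @endcode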
+ */ +unsigned int qurt_stm_trace_set_config(qurt_stm_trace_info_t *stm_config_info); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TRACE_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_types.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_types.h new file mode 100755 index 0000000000000..bdb83a3fe2fb2 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_types.h @@ -0,0 +1,294 @@ +#ifndef QURT_TYPES_H +#define QURT_TYPES_H +/** + @file qurt_types.h + @brief Contains types common to all configurations + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) Qualcomm Technologies, Inc. +All Rights Reserved. +Confidential and Proprietary - Qualcomm Technologies, Inc. + +=============================================================================*/ + + +//#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*============================================================================= + CONSTANTS AND MACROS +=============================================================================*/ +#define PGA_BITFIELD_MASK(hi,lo) (((~0u)>>(31U-((hi)-(lo))))<<(lo)) +#define PGA_BITFIELD_GET(x,hi,lo) (((x)&PGA_BITFIELD_MASK((hi),(lo)))>>(lo)) +#define PGA_BITFIELD_INS(hi,lo,v) (((v)<<(lo))&PGA_BITFIELD_MASK((hi),(lo))) +#define PGA_BITFIELD_SET(x,hi,lo,v) ((x)=((x)&~PGA_BITFIELD_MASK((hi),(lo)))|PGA_BITFIELD_INS((hi),(lo),(v))) +#define QURT_PGATTR_C_GET(pga) PGA_BITFIELD_GET((pga).pga_value, 3U, 0U) /* Bits 3-0: cache */ +#define QURT_PGATTR_A_GET(pga) PGA_BITFIELD_GET((pga).pga_value, 5U, 4U) /* Bits 5-4: bus attr */ +#define QURT_PGATTR_C_SET(pga,v) PGA_BITFIELD_SET((pga).pga_value, 3U, 0U, (v)) /* Bits 3-0: cache */ +#define QURT_PGATTR_A_SET(pga,v) PGA_BITFIELD_SET((pga).pga_value, 5U, 4U, (v)) /* Bits 5-4: bus attr */ +#define QURT_PGATTR_MKRAW(v) ((qurt_pgattr_t){.pga_value = (v)}) +#define QURT_PGATTR_MK(c,a) QURT_PGATTR_MKRAW(PGA_BITFIELD_INS(3U,0U,(c))|PGA_BITFIELD_INS(5U,4U,(a))) + +/*return types for qurt_island_get_status2*/ +#define QURT_ISLAND_MODE_NORMAL 0U /**< Normal operating mode */ +#define QURT_ISLAND_MODE_ISLAND 1U /**< Island mode */ +#define QURT_ISLAND_MODE_EXITING 2U /**< In transition from Island mode to Normal mode */ + +/*============================================================================= + FORWARD DECLARATIONS & TYPEDEFS +=============================================================================*/ +/** @addtogroup memory_management_types +@{ */ +typedef unsigned int qurt_addr_t; /**< QuRT address type.*/ +typedef unsigned int qurt_paddr_t; /**< QuRT physical memory address type. */ +/** @cond rest_reg_dist */ +typedef unsigned long long qurt_addr_64_t; /**< QuRT 64-bit memory address type. */ +typedef unsigned long long qurt_paddr_64_t; /**< QuRT 64-bit physical memory address type. */ +typedef unsigned int qurt_mem_region_t; /**< QuRT memory regions type. */ +typedef unsigned int qurt_mem_fs_region_t; /**< QuRT memory FS region type. */ +/**@endcond */ +typedef unsigned int qurt_mem_pool_t; /**< QuRT memory pool type.*/ +typedef unsigned int qurt_size_t; /**< QuRT size type. */ +/** @cond */ +typedef unsigned long long qurt_mmu_entry_t;/**< QuRT MMU entry type. 
 */
+#define QURT_PHYSPOOL_NAME_LEN (32)
+typedef char qurt_physpool_name_t[QURT_PHYSPOOL_NAME_LEN];
+
+
+/*
+ * Mapping type
+ *
+ * QMEM_MAPPING_VIRTUAL is the default mode, in which the system
+ * picks an available range of virtual addresses and maps it to
+ * available contiguous physical addresses. Physical-to-virtual
+ * is not guaranteed to be 1:1; both virtual and physical memory are
+ * contiguous.
+ *
+ * In QMEM_MAPPING_IDEMPOTENT mode, the user provides the physical address;
+ * the kernel allocates 1:1 physical-to-virtual memory. The primary use
+ * of this mapping is to allocate memory with a 1:1 physical-to-virtual mapping.
+ *
+ * In QMEM_MAPPING_PHYS_CONTIGUOUS mode, the virtual address might
+ * not be the same as the physical address. However, the physical address of the
+ * memory region is guaranteed to be contiguous starting at the provided
+ * address; a fixed physical address must be provided. The primary
+ * use of this mapping is to allocate physical memory from a particular
+ * address, where 1:1 physical-to-virtual is not required.
+ *
+ * QMEM_MAPPING_NONE mode must be used to reserve a virtual memory
+ * area (VMA); no physical memory is reserved or mapped to this virtual
+ * space; all standard qmem_region APIs apply to a VMA, however the physical
+ * address is always INVALID_ADDR. qmem_region_create() in this mode
+ * returns a handle to the VMA; both virt_addr and phys_addr must
+ * be set to INVALID_ADDR, and the kernel allocates any available virtual
+ * memory of the specified size. Obtain the starting virtual address
+ * of the VMA through qmem_region_attr_getvirtaddr().
+ * The primary purpose of this mapping mode is to provide a mechanism for
+ * delayed binding in QuRT, for example, to reserve virtual memory and map it at
+ * some later time to possibly discontiguous physical blocks. Thus, a
+ * single VMA can be partitioned among several physical-virtual mappings
+ * created via qmem_region_create() with QMEM_VIRTUAL_FIXED mapping mode.
+ * Each VMA keeps track of associated mapped regions.
+ * Deletion of a VMA succeeds only if all associated "virtual_fixed"
+ * regions are freed prior to VMA deletion.
+ *
+ * Use QMEM_MAPPING_VIRTUAL_FIXED mode to create a region
+ * from virtual space that has been reserved via qmem_region_create()
+ * with QMEM_MAPPING_NONE mapping. A valid virt_addr is required. If
+ * phys_addr is specified, the kernel attempts to map it accordingly;
+ * if no phys_addr is specified, the kernel maps any available physical
+ * memory. All standard qmem_region APIs apply to such a region. Remapping
+ * a virtual range without first freeing the region is not permitted.
+ * When such a region is deleted, its corresponding VMA remains intact.
+ *
+ * QMEM_MAPPING_PHYS_DISCONTIGUOUS mode can obtain contiguous
+ * virtual memory while the physical memory can be discontiguous. This method
+ * tries to combine small physical memory blocks to satisfy the requested
+ * memory and is useful in cases where there is no contiguous block
+ * of the requested size. If the client does not need contiguous physical memory
+ * (for example, if the client does not use physical addressing), this helps
+ * use smaller physical memory blocks rather than requiring contiguous memory.
+ * Note: When memory is allocated through this method, the physical address is
+ * not returned to the caller by the qurt_mem_region_attr_get() API, as there might
+ * not be a single physical address.
+ *
+ */
+/**@endcond */
+/** QuRT memory region mapping type. */
+typedef enum {
+    QURT_MEM_MAPPING_VIRTUAL=0, /**< Default mode.
The region virtual address range maps to an
+                                available contiguous area of physical memory. For the most
+                                efficient use of virtual memory, the QuRT system
+                                chooses the base address in physical memory. This works for most memory
+                                use cases.*/
+    QURT_MEM_MAPPING_PHYS_CONTIGUOUS = 1,  /**< The region virtual address space must be mapped to a
+                                contiguous area of physical memory. This is necessary when the
+                                memory region is accessed by external devices that bypass Hexagon
+                                virtual memory addressing. The base address in physical
+                                memory must be explicitly specified.*/
+    QURT_MEM_MAPPING_IDEMPOTENT=2, /**< Region virtual address space maps
+                                to the identical area of physical memory. */
+    QURT_MEM_MAPPING_VIRTUAL_FIXED=3, /**< Virtual address space of the region maps either to the
+                                specified area of physical memory or (if no area is specified)
+                                to available physical memory. Use this mapping to create
+                                regions from virtual space that was reserved by calling
+                                qurt_mem_region_create() with the QURT_MEM_MAPPING_NONE mapping. */
+    QURT_MEM_MAPPING_NONE=4, /**< Reserves a virtual memory area (VMA). Remapping a virtual range is not
+                               permitted without first deleting the memory region. When such a region is
+                               deleted, its corresponding virtual memory addressing remains intact. */
+    QURT_MEM_MAPPING_VIRTUAL_RANDOM=7, /**< System chooses a random virtual address and
+                                maps it to available contiguous physical addresses.*/
+    QURT_MEM_MAPPING_PHYS_DISCONTIGUOUS=8, /**< While virtual memory is contiguous, allocates in discontiguous physical
+                                memory blocks. This helps when only smaller contiguous blocks
+                                than the requested size are available.
+                                The physical address is not provided as part of the get_attr call. */
+    QURT_MEM_MAPPING_INVALID=10, /**< Reserved as an invalid mapping type. */
+} qurt_mem_mapping_t;
+
+
+/** QuRT cache mode type. */
+typedef enum {
+    QURT_MEM_CACHE_WRITEBACK=7, /**< Write back. */
+    QURT_MEM_CACHE_NONE_SHARED=6, /**< Normal uncached memory that can be shared with other subsystems.*/
+    QURT_MEM_CACHE_WRITETHROUGH=5, /**< Write through. */
+    QURT_MEM_CACHE_WRITEBACK_NONL2CACHEABLE=0, /**< Write back non-L2-cacheable.*/
+    QURT_MEM_CACHE_WRITETHROUGH_NONL2CACHEABLE=1, /**< Write through non-L2-cacheable. */
+    QURT_MEM_CACHE_WRITEBACK_L2CACHEABLE=QURT_MEM_CACHE_WRITEBACK, /**< Write back L2 cacheable. */
+    QURT_MEM_CACHE_WRITETHROUGH_L2CACHEABLE=QURT_MEM_CACHE_WRITETHROUGH, /**< Write through L2 cacheable. */
+    QURT_MEM_CACHE_DEVICE = 4, /**< Volatile memory-mapped device. Access to device memory cannot be cancelled by interrupts, re-ordered, or replayed.*/
+    QURT_MEM_CACHE_NONE = 4, /**< Deprecated -- use #QURT_MEM_CACHE_DEVICE instead. */
+    QURT_MEM_CACHE_DEVICE_SFC = 2, /**< Enables placing limitations on the number of outstanding transactions. */
+    QURT_MEM_CACHE_INVALID=10, /**< Reserved as an invalid cache type. */
+} qurt_mem_cache_mode_t;
+
+/** Memory access permission. */
+#define QURT_PERM_NONE    0x0U /**< No permission. */
+#define QURT_PERM_READ    0x1U /**< Read permission. */
+#define QURT_PERM_WRITE   0x2U /**< Write permission. */
+#define QURT_PERM_EXECUTE 0x4U /**< Execution permission. */
+#define QURT_PERM_NODUMP  0x8U
+                   /**< Skip dumping the mapping. During a process domain dump, some mappings
+                        on host memory must be skipped to avoid a race condition
+                        where the memory is removed from the host and the DSP process
+                        crashes before the mapping is removed. */
+#define QURT_PERM_FULL (QURT_PERM_READ | QURT_PERM_WRITE | QURT_PERM_EXECUTE) /**< Read, write, and execute permission.
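+                                For example, a plain read/write data mapping would instead pass
+                                (QURT_PERM_READ | QURT_PERM_WRITE).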
*/ + +typedef unsigned char qurt_perm_t; + + +/** @cond rest_reg_dist*/ +/** QuRT cache type; specifies data cache or instruction cache. */ +typedef enum { + QURT_MEM_ICACHE, /**< Instruction cache.*/ + QURT_MEM_DCACHE /**< Data cache.*/ +} qurt_mem_cache_type_t; + +/** QuRT cache operation code type. */ +typedef enum { + QURT_MEM_CACHE_FLUSH, /**< Flush. */ + QURT_MEM_CACHE_INVALIDATE, /**< Invalidate */ + QURT_MEM_CACHE_FLUSH_INVALIDATE, /**< Flush invalidate. */ + QURT_MEM_CACHE_FLUSH_ALL, /**< Flush all. */ + QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL, /**< Flush invalidate all. */ + QURT_MEM_CACHE_TABLE_FLUSH_INVALIDATE, /**< Table flush invalidate. */ + QURT_MEM_CACHE_FLUSH_INVALIDATE_L2, /**< L2 flush invalidate.*/ +} qurt_mem_cache_op_t; + +/** QuRT memory region type. */ +typedef enum { + QURT_MEM_REGION_LOCAL=0, /**< Local. */ + QURT_MEM_REGION_SHARED=1, /**< Shared.*/ + QURT_MEM_REGION_USER_ACCESS=2, /**< User access. */ + QURT_MEM_REGION_FS=4, /**< FS. */ + QURT_MEM_REGION_INVALID=10, /**< Reserved as an invalid region type. */ +} qurt_mem_region_type_t; + +/* Cache and bus attributes are combined into a value of this type for convenience, + and macros for combining and extracting fields are defined here. */ +/** @cond */ +struct qurt_pgattr { + unsigned pga_value; /**< PGA value.*/ +}; +typedef struct qurt_pgattr qurt_pgattr_t; +/** @endcond */ +/** QuRT memory region attributes type.*/ +/* QMEM_MAPPING_IDEMPOTENT and QMEM_MAPPING_PHYS_CONTIGUOUS mode can specify physaddr. + virtaddr cannot be specified for a memory region, it can only be queried by the + qmem_attr_getvirtaddr() function. + */ +typedef struct { + /** @cond */ + qurt_mem_mapping_t mapping_type; + unsigned char perms; + unsigned short owner; + qurt_pgattr_t pga; + unsigned ppn; //physical page number (physical>>12) + qurt_addr_t virtaddr; + qurt_mem_region_type_t type; + qurt_size_t size; + /** @endcond */ +} qurt_mem_region_attr_t; + + +/** QuRT user physical memory pool type. */ +typedef struct { + /** @cond */ + char name[32]; + struct ranges{ + unsigned int start; + unsigned int size; + } ranges[MAX_POOL_RANGES]; + /** @endcond */ +} qurt_mem_pool_attr_t; + +/** QuRT memory pool status type.*/ +typedef struct _qurt_mem_pool_status { + + qurt_size_t contig_size; /**< Largest contiguous free memory in bytes. */ + qurt_size_t free_size; /**< Total free memory in bytes. */ + qurt_size_t total_size; /**< Total declared memory in bytes. */ + +} qurt_mem_pool_status_t; + +typedef enum { + HEXAGON_L1_I_CACHE = 0, /**< Hexagon L1 instruction cache. */ + HEXAGON_L1_D_CACHE = 1, /**< Hexagon L1 data cache. */ + HEXAGON_L2_CACHE = 2 /**< Hexagon L2 cache. */ +} qurt_cache_type_t; + +typedef enum { + FULL_SIZE = 0, /**< Fully shared cache, without partitioning. */ + HALF_SIZE = 1, /**< 1/2 for main, 1/2 for auxiliary. */ + THREE_QUARTER_SIZE = 2, /**< 3/4 for main, 1/4 for auxiliary. */ + SEVEN_EIGHTHS_SIZE = 3 /**< 7/8 for main, 1/8 for auxiliary; for L2 cache only. */ +} qurt_cache_partition_size_t; + +typedef enum { + QURT_PROCESS_CB_GENERIC, /**< generic unconditional cb called after image loading. */ + QURT_PROCESS_NOTE_CB_PRE_MAP, /**< note cb called before segment loading. */ + QURT_PROCESS_NOTE_CB_POST_MAP /**< note cb called after segment loading. 
*/ +} qurt_process_cb_type_t; + +typedef union { + void *ptr; + int num; +} qurt_process_callback_arg_t; + + +/**@endcond*/ + +/** @} */ /* end_addtogroup memory_management_types */ +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif /* QURT_TYPES_H */ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_user_dma.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_user_dma.h new file mode 100755 index 0000000000000..e05a6429fd703 --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_user_dma.h @@ -0,0 +1,44 @@ +#ifndef QURT_USER_DMA_H +#define QURT_USER_DMA_H + +/** + @file qurt_user_dma.h + @brief Definitions, macros, and prototypes used for handling user DMA. + + EXTERNALIZED FUNCTIONS + none + + INITIALIZATION AND SEQUENCING REQUIREMENTS + none + + Copyright (c) 2021 by Qualcomm Technologies, Inc. All Rights Reserved. + Confidential and Proprietary - Qualcomm Technologies, Inc. + ======================================================================*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/**@ingroup qurt_user_dma_dmsyncht + Sends the DMSyncht command to the user DMA engine. + + Call this function to ensure all posted DMA memory operations are + complete. + + This stalls the current thread until the instruction + is complete and returns. + + @return + QURT_EOK - On dmsyncht completion \n + QURT_ENOTSUPPORTED - User DMA not supported + + @dependencies + None. +*/ +int qurt_user_dma_dmsyncht(void); + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_vtlb.h b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_vtlb.h new file mode 100755 index 0000000000000..e064042e447ac --- /dev/null +++ b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/include/qurt/qurt_vtlb.h @@ -0,0 +1,76 @@ +/*============================================================================= + + qurt_vtlb.h + +GENERAL DESCRIPTION + +EXTERNAL FUNCTIONS + None. + +INITIALIZATION AND SEQUENCING REQUIREMENTS + None. + +Copyright (c) 2019, 2021, 2023 by Qualcomm Technologies, Inc. All Rights Reserved. +=============================================================================*/ +#ifndef QURT_VTLB_H +#define QURT_VTLB_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* +|| Names starting with "qurt_i_vtlb" are the internal low-level functions. +|| These should be considered subject to change. 
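+||
+|| Illustrative call sequence (a sketch only; the tlb_lo/tlb_hi encodings
+|| are target-specific and not documented in this header):
+||
+||   unsigned idx, info[2];
+||   if (qurt_i_vtlb_entry_create(&idx, tlb_lo, tlb_hi, QURT_VTLB_EXT_DEFAULT) == 0) {
+||       (void)qurt_i_vtlb_entry_read(idx, info);   // read back the entry
+||       (void)qurt_i_vtlb_entry_delete(idx);       // release it
+||   }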
+*/ + +int qurt_i_vtlb_entry_create(unsigned *pIndex, + unsigned tlb_lo, + unsigned tlb_hi, + unsigned extension); + +int qurt_i_vtlb_entry_create_with_pid(unsigned *pIndex, + unsigned tlb_lo, + unsigned tlb_hi, + unsigned extension, + unsigned target_pid); + +int qurt_i_vtlb_entry_delete(unsigned index); + +int qurt_i_vtlb_entry_read(unsigned index, unsigned *tlbinfo); + +int qurt_i_vtlb_entry_write(unsigned index, unsigned tlb_lo, unsigned tlb_hi, unsigned extension); + +int qurt_i_vtlb_entry_write_with_pid(unsigned index, unsigned tlb_lo, unsigned tlb_hi, unsigned extension, unsigned target_pid); + +int qurt_i_vtlb_entry_probe(const void *vaddr, unsigned *tlbinfo, unsigned *pIndex); + +int qurt_i_vtlb_entry_probe_with_pid(const void *vaddr, unsigned *tlbinfo, unsigned *pIndex, unsigned target_pid); + + +int qurt_i_vtlb_statistics(unsigned *stats); // Returns stats[0] -- total number of VTLB entries + // stats[1] -- number of available VTLB entries + // stats[2] -- max size of VTLB tree since boot + +//can return index to an entry that was specialed, change it to take addresses instead of pages +int qurt_i_vtlb_set_special(int index, unsigned pageno, unsigned asid, unsigned size); + +int qurt_i_vtlb_queue_ppage(unsigned pageno, unsigned vtlb_index); + +#define QURT_VTLB_EXT_DEFAULT 0U +#define QURT_VTLB_EXT_LOCKED 1U +#define QURT_VTLB_EXT_EXCLUDE_DUMP 2U /* Temporary ability to skip certain mappings in pd dump */ +#define QURT_VTLB_EXT_FREELIST 0x800000u + +#define QURT_VTLB_ERR_OVERLAP -64 +#define QURT_VTLB_ERR_TREE_NO_SPACE -65 +#define QURT_VTLB_ERR_INVALID_SIZE -68 +#define QURT_VTLB_ERR_INVALID_EXT -69 +#define QURT_VTLB_ERR_DEL_PGT_LOCKED -70 +#define QURT_VTLB_ERR_PGT_LOCK_CNT -71 + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif + +#endif // QURT_VTLB_H diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libposix.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libposix.a new file mode 100755 index 0000000000000..a9a8baba7faf1 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libposix.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libqurt.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libqurt.a new file mode 100755 index 0000000000000..0ba0327f99d81 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libqurt.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libqurtcfs.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libqurtcfs.a new file mode 100755 index 0000000000000..339de1f596ddb Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libqurtcfs.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libtimer_island.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libtimer_island.a new file mode 100755 index 0000000000000..98d0a1128a8a4 Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libtimer_island.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libtimer_main.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libtimer_main.a new file mode 100755 index 0000000000000..e95a77af5ed1a Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/libtimer_main.a differ diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libposix.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libposix.a new file mode 100755 
index 0000000000000..6aaca8da9e012
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libposix.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libqurt.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libqurt.a
new file mode 100755
index 0000000000000..ba96bbf241f10
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libqurt.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libqurtcfs.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libqurtcfs.a
new file mode 100755
index 0000000000000..339de1f596ddb
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libqurtcfs.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libtimer.a b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libtimer.a
new file mode 100755
index 0000000000000..f2f6d0b611216
Binary files /dev/null and b/prebuilts/Hexagon_SDK/6.2.0.1/rtos/qurt/computev79/lib/pic/libtimer.a differ
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/tools/HEXAGON_Tools/.lock b/prebuilts/Hexagon_SDK/6.2.0.1/tools/HEXAGON_Tools/.lock
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/dsp_capabilities_utils.h b/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/dsp_capabilities_utils.h
new file mode 100755
index 0000000000000..2cafe29dfe9aa
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/dsp_capabilities_utils.h
@@ -0,0 +1,164 @@
+/**=============================================================================
+@file
+   dsp_capabilities_utils.h
+
+@brief
+   Wrapper functions for FastRPC Capability APIs.
+
+Copyright (c) 2020-2021 Qualcomm Technologies Incorporated.
+All Rights Reserved. Qualcomm Proprietary and Confidential.
+=============================================================================**/
+#ifndef DSP_CAPABILITIES_UTILS_H
+#define DSP_CAPABILITIES_UTILS_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include
+#include
+#include "AEEStdErr.h"
+#include "remote.h"
+
+#if !defined (_WINDOWS)
+    #pragma weak remote_system_request
+#endif
+    /**
+     * Wrapper for FastRPC Capability API: query DSP support.
+     *
+     * @param[out] domain pointer to supported domain.
+     * @return 0 if query is successful.
+     *         non-zero in case of error; the return value indicates the error.
+     */
+int get_dsp_support(int *domain);
+
+    /**
+     * Wrapper for FastRPC Capability API: query VTCM information.
+     *
+     * @param[in] domain value of the domain to be queried.
+     * @param[out] capability capability value of the attribute queried.
+     * @param[in] attr value of the attribute to be queried.
+     * @return 0 if query is successful.
+     *         non-zero in case of error; the return value indicates the error.
+     */
+int get_vtcm_info(int domain, uint32_t *capability, uint32_t attr);
+
+    /**
+     * Wrapper for FastRPC Capability API: query unsigned pd support on CDSP domain.
+     *
+     * @return true if unsigned pd is supported.
+     *         false if unsigned pd is not supported or the capability query failed.
+     */
+
+bool get_unsignedpd_support(void);
+
+    /**
+     * Wrapper for FastRPC Capability API: query unsigned pd support.
+     *
+     * @param[in] domain_id value of the domain to be queried.
+     * @return true if unsigned pd is supported.
+     *         false if unsigned pd is not supported or the capability query failed.
+     */
+
+bool is_unsignedpd_supported(int domain_id);
+
+    /**
+     * is_valid_domain_id API: query whether a domain id is valid.
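+     *
+     * Example (a sketch only; CDSP_DOMAIN_ID is assumed to come from remote.h):
+     *
+     *   if (is_valid_domain_id(CDSP_DOMAIN_ID, 1)) {
+     *       // Safe to target the compute DSP.
+     *   }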
+     *
+     * @param[in] domain_id value of the domain to be queried.
+     * @param[in] compute_only when enabled, the domain value is compared only against the CDSP domains supported by the target.
+     * @return true if the value of the domain is valid.
+     *         false if the value of the domain is not valid.
+     */
+
+bool is_valid_domain_id(int domain_id, int compute_only);
+
+    /**
+     * get_domain API: get the domain struct from a domain value.
+     *
+     * @param[in] domain_id value of a domain
+     * @return Returns the domain struct of the domain if it is supported,
+     *         otherwise returns NULL.
+     *
+     */
+
+domain* get_domain(int domain_id);
+
+    /**
+     * get_domains_info API: get information for all the domains available on the device
+     *
+     * @param[in] domain_type pointer to domain type
+     * @param[in] num_domains pointer to number of domains
+     * @param[in] domains_info pointer to save discovered domains information.
+     * @return 0 if query is successful.
+     *         non-zero in case of error; the return value indicates the error.
+     *
+     * It is the user's responsibility to free the memory used to store the domains info,
+     * whose address is present in domains_info, before closing the application.
+     *
+     */
+
+int get_domains_info(char *domain_type, int *num_domains, fastrpc_domain **domains_info);
+
+    /**
+     * is_async_fastrpc_supported API: query whether a domain id supports async FastRPC
+     *
+     * @param[in] domain_id value of a domain
+     * @return Returns true or false stating support of Async FastRPC
+     *
+     */
+
+bool is_async_fastrpc_supported(int domain_id);
+
+    /**
+     * is_status_notification_supported API: query the DSP for STATUS_NOTIFICATION_SUPPORT information
+     *
+     * @param[in] domain_id value of a domain
+     * @return Returns true or false stating status notification support information
+     *
+     */
+bool is_status_notification_supported(int domain_id);
+
+    /**
+     * get_hmx_support_info API: query the DSP for HMX SUPPORT information
+     *
+     * @param[in] domain value of a domain
+     * @param[out] capability capability value of the attribute queried.
+     * @param[in] attr value of the attribute to be queried.
+     * @return 0 if query is successful.
+     *         non-zero in case of error; the return value indicates the error.
+     *
+     */
+int get_hmx_support_info(int domain, uint32_t *capability, uint32_t attr);
+
+    /**
+     * get_hex_arch_ver API: query the Hexagon processor architecture version information
+     *
+     * @param[in] domain value of a domain
+     * @param[out] capability capability value of the attribute queried.
+     *             The last byte of the capability value represents the architecture of the DSP being queried in hexadecimal format.
+     *             E.g., 0x8D73 represents a v73 architecture. The other bytes represent other capabilities depending on the device.
+     * @return 0 if query is successful.
+     *         non-zero in case of error; the return value indicates the error.
+     *
+     */
+int get_hex_arch_ver(int domain, uint32_t *capability);
+
+    /**
+     * get_hvx_support_info API: query the DSP for HVX SUPPORT information
+     *
+     * @param[in] domain value of a domain
+     * @param[out] capability capability value of the attribute queried.
+     * @param[in] attr value of the attribute to be queried.
+     * @return 0 if query is successful.
+     *         non-zero in case of error; the return value indicates the error.
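+     *
+     * Example (a sketch only; HVX_SUPPORT_128B is assumed to be a capability
+     * attribute defined in remote.h):
+     *
+     *   uint32_t cap = 0;
+     *   if (get_hvx_support_info(domain, &cap, HVX_SUPPORT_128B) == 0) {
+     *       // cap now holds the queried HVX capability value.
+     *   }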
+     *
+     */
+int get_hvx_support_info(int domain, uint32_t *capability, uint32_t attr);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //DSP_CAPABILITIES_UTILS_H
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/mem_utils.h b/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/mem_utils.h
new file mode 100755
index 0000000000000..16f002ef060fa
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/mem_utils.h
@@ -0,0 +1,54 @@
+/**=============================================================================
+@file
+   mem_utils.h
+
+@brief
+   Abstract operating system specific memory allocation APIs.
+
+Copyright (c) 2021 Qualcomm Technologies Incorporated.
+All Rights Reserved. Qualcomm Proprietary and Confidential.
+=============================================================================**/
+
+#ifndef MEM_UTILS_H_
+#define MEM_UTILS_H_
+
+
+#ifdef __cplusplus
+   extern "C" {
+#endif
+
+
+#ifdef _WINDOWS
+   #include
+#else
+#ifdef __hexagon__
+#else
+   #include
+   #include
+#endif
+#endif
+
+
+#ifndef MEMALIGN
+   #ifdef _WINDOWS
+      #define MEMALIGN(alignment,size) _aligned_malloc(size,alignment)
+   #else
+      #define MEMALIGN(alignment,size) memalign(alignment,size)
+   #endif
+#endif
+
+
+#ifndef ALIGNED_FREE
+   #ifdef _WINDOWS
+      #define ALIGNED_FREE(ptr) _aligned_free(ptr)
+   #else
+      #define ALIGNED_FREE(ptr) free(ptr)
+   #endif
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif //MEM_UTILS_H_
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/os_defines.h b/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/os_defines.h
new file mode 100755
index 0000000000000..a7b5947fa908e
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/os_defines.h
@@ -0,0 +1,97 @@
+/**=============================================================================
+@file
+   os_defines.h
+
+@brief
+   Abstract operating system specific defines, includes and global variables
+   to make it convenient for developers to code for multiple OS platforms.
+
+Copyright (c) 2021 Qualcomm Technologies Incorporated.
+All Rights Reserved. Qualcomm Proprietary and Confidential.
+=============================================================================**/
+
+#ifndef OS_DEFINES_H_
+#define OS_DEFINES_H_
+
+
+#ifdef __cplusplus
+   extern "C" {
+#endif
+
+
+/* Offset to differentiate HLOS and Hexagon error codes.
+   Stores the value of AEE_EOFFSET for Hexagon. */
+#ifndef DSP_OFFSET
+   #define DSP_OFFSET 0x80000400
+#endif
+
+
+/* Errno for connection reset by peer. */
+#ifndef ECONNRESET
+   #ifdef __hexagon__
+      #define ECONNRESET 104
+   #endif
+#endif
+
+
+/* Abstraction of different OS specific sleep APIs.
+   SLEEP accepts input in seconds. */
+#ifndef SLEEP
+   #ifdef __hexagon__
+      #define SLEEP(x) {/* Do nothing for simulator. */}
+   #else
+      #ifdef _WINDOWS
+         #define SLEEP(x) Sleep(1000*(x)) /* Sleep accepts input in milliseconds. */
+      #else
+         #define SLEEP(x) sleep(x) /* sleep accepts input in seconds. */
+      #endif
+   #endif
+#endif
+
+
+/* Include Windows specific header files. */
+#ifdef _WINDOWS
+   #include
+   #include
+   #define _CRT_SECURE_NO_WARNINGS 1
+   #define _WINSOCK_DEPRECATED_NO_WARNINGS 1
+   /* Including this file for custom implementation of getopt function. */
+   #include "getopt_custom.h"
+#endif
+
+
+/* Includes and defines for all HLOS except Windows */
+#if !defined(__hexagon__) && !defined (_WINDOWS)
+   #include "unistd.h"
+   #include
+#endif
+
+
+/* Includes and defines for Hexagon and all HLOS except Windows. */
+#if !defined (_WINDOWS)
+   /* Weak reference to remote symbol for compilation.
 */
+   #pragma weak remote_session_control
+   #pragma weak remote_handle_control
+   #pragma weak remote_handle64_control
+   #pragma weak fastrpc_mmap
+   #pragma weak fastrpc_munmap
+#endif
+
+
+/* Includes and defines for hexagon */
+#ifdef __hexagon__
+#endif
+
+
+/* Includes and defines for Android */
+#ifdef ANDROID
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+
+#endif //OS_DEFINES_H_
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/pd_status_notification.h b/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/pd_status_notification.h
new file mode 100755
index 0000000000000..6579058b8e451
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/pd_status_notification.h
@@ -0,0 +1,25 @@
+#include
+#include
+#include "AEEStdErr.h"
+#include "remote.h"
+#include "dsp_capabilities_utils.h"
+
+
+    /**
+     * request_status_notifications_enable API: Allows users to enable status notifications from the client PD.
+     *
+     * @param[in] domain_id value of a domain
+     * @param[in] context Context of the client
+     * @param[in] notif_callback_fn Callback function for status notification
+     * @return 0 if successful.
+     *         non-zero in case of error; the return value indicates the error.
+     *
+     */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+int request_status_notifications_enable(int domain_id, void *context, int(*notif_callback_fn)(void *context, int domain, int session, remote_rpc_status_flags_t status));
+#ifdef __cplusplus
+}
+#endif
diff --git a/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/time_utils.h b/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/time_utils.h
new file mode 100755
index 0000000000000..969747f26f309
--- /dev/null
+++ b/prebuilts/Hexagon_SDK/6.2.0.1/utils/examples/time_utils.h
@@ -0,0 +1,64 @@
+/**=============================================================================
+@file
+   time_utils.h
+
+@brief
+   Abstract operating system specific timing APIs.
+
+Copyright (c) 2021 Qualcomm Technologies Incorporated.
+All Rights Reserved. Qualcomm Proprietary and Confidential.
+=============================================================================**/
+
+#ifndef TIME_UTILS_H_
+#define TIME_UTILS_H_
+
+
+#ifdef __cplusplus
+   extern "C" {
+#endif
+
+#ifdef _WINDOWS
+   #include
+#else
+#ifdef __hexagon__
+   #include "hexagon_sim_timer.h"
+#else
+   #include
+#endif
+#endif
+
+unsigned long long get_time(void);
+void sleep_in_microseconds(unsigned long long);
+
+/* Abstraction of different OS specific usleep APIs.
+   USLEEP accepts input in microseconds. */
+#ifndef USLEEP
+   #ifdef __hexagon__
+      #define USLEEP(x) {/* Do nothing for simulator. */}
+   #else
+      #ifdef _WINDOWS
+         #define USLEEP(x) sleep_in_microseconds(x)
+      #else
+         #include
+         #define USLEEP(x) usleep(x)
+      #endif
+   #endif
+#endif
+
+/* Abstraction of different OS specific timer APIs.
+   GET_TIME returns the value of time. */
+#ifndef GET_TIME
+   #ifdef __hexagon__
+      #define GET_TIME hexagon_sim_read_pcycles
+   #else
+      #define GET_TIME get_time
+   #endif
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif //TIME_UTILS_H_
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuCommon.h
new file mode 100755
index 0000000000000..fdbfc1136d556
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuCommon.h
@@ -0,0 +1,50 @@
+//=============================================================================
+//
+//  Copyright (c) 2020-2023 Qualcomm Technologies, Inc.
+//  All Rights Reserved.
+//  Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN CPU Common components
+ *
+ *         This file defines versioning and other identification details
+ *         and supplements QnnCommon.h for CPU backend
+ */
+
+#ifndef QNN_CPU_COMMON_H
+#define QNN_CPU_COMMON_H
+
+#include "QnnCommon.h"
+
+/// CPU Backend identifier
+#define QNN_BACKEND_ID_CPU 3
+
+/// CPU interface provider
+#define QNN_CPU_INTERFACE_PROVIDER_NAME "CPU_QTI_AISW"
+
+// CPU API Version values
+#define QNN_CPU_API_VERSION_MAJOR 1
+#define QNN_CPU_API_VERSION_MINOR 1
+#define QNN_CPU_API_VERSION_PATCH 0
+
+// clang-format off
+/// Macro to set Qnn_ApiVersion_t for CPU backend
+#define QNN_CPU_API_VERSION_INIT                                 \
+  {                                                              \
+    {                                                            \
+      QNN_API_VERSION_MAJOR,     /*coreApiVersion.major*/        \
+      QNN_API_VERSION_MINOR,     /*coreApiVersion.minor*/        \
+      QNN_API_VERSION_PATCH      /*coreApiVersion.patch*/        \
+    },                                                           \
+    {                                                            \
+      QNN_CPU_API_VERSION_MAJOR, /*backendApiVersion.major*/     \
+      QNN_CPU_API_VERSION_MINOR, /*backendApiVersion.minor*/     \
+      QNN_CPU_API_VERSION_PATCH  /*backendApiVersion.patch*/     \
+    }                                                            \
+  }
+
+// clang-format on
+
+#endif // QNN_CPU_COMMON_H
\ No newline at end of file
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuGraph.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuGraph.h
new file mode 100755
index 0000000000000..750cfd0b501f1
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuGraph.h
@@ -0,0 +1,117 @@
+//=============================================================================
+//
+//  Copyright (c) 2022 Qualcomm Technologies, Inc.
+//  All Rights Reserved.
+//  Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN CPU component Graph API.
+ *
+ *         The interfaces in this file work with the top level QNN
+ *         API and supplements QnnGraph.h for CPU backend
+ */
+
+#ifndef QNN_CPU_GRAPH_H
+#define QNN_CPU_GRAPH_H
+
+#include "QnnGraph.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+/**
+ * @brief This enum provides different CPU graph configuration
+ *        options associated with QnnGraph
+ */
+typedef enum {
+  QNN_CPU_GRAPH_CONFIG_OPTION_OP_DEBUG_CALLBACK = 1,
+  QNN_CPU_GRAPH_CONFIG_OPTION_UNDEFINED         = 0x7fffffff
+} QnnCpuGraph_ConfigOption_t;
+
+/* @brief Callback function pointer to be filled in by the user.
+ *        This callback is called after each op execution.
+ *        Only the output tensor id and data buffer are valid and consumable.
+ *        Memory is owned by the backend and is valid throughout the callback.
+ *        The client should not update any parameter or argument of opConfig.
+ *        A NULL tensor/buffer indicates an invalid data buffer.
+ */
+typedef Qnn_ErrorHandle_t (*QnnCpuGraph_OpDebugCallback_t)(Qnn_OpConfig_t* opConfig,
+                                                           void* callBackParam);
+
+/* @brief Structure to be filled in by the user.
+ *        This structure holds the callback function and the callback reference data.
+ *        Memory is owned by the backend and is valid throughout the callback.
+ *        The client should not update any parameter or argument of opConfig.
+ *        A NULL callback function indicates no debug option.
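+ *
+ *        Usage sketch (hypothetical user callback; the QnnGraph custom config
+ *        plumbing is abbreviated):
+ *
+ *          static Qnn_ErrorHandle_t myOpDebugCb(Qnn_OpConfig_t* opConfig,
+ *                                               void* callBackParam) {
+ *              (void)opConfig; (void)callBackParam;  // inspect outputs here
+ *              return QNN_SUCCESS;
+ *          }
+ *
+ *          QnnCpuGraph_OpDebug_t dbg = QNN_CPU_GRAPH_OP_DEBUG_INIT;  // defined below
+ *          dbg.cpuGraphOpDebugCallback = myOpDebugCb;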
+ */
+typedef struct {
+  void* callBackParam;
+  QnnCpuGraph_OpDebugCallback_t cpuGraphOpDebugCallback;
+} QnnCpuGraph_OpDebug_t;
+
+// clang-format off
+/// QnnCpuGraph_OpDebug_t initializer macro
+#define QNN_CPU_GRAPH_OP_DEBUG_INIT        \
+  {                                        \
+    NULL, /*callBackParam*/                \
+    NULL  /*cpuGraphOpDebugCallback*/      \
+  }
+// clang-format on
+
+//=============================================================================
+// Public Functions
+//=============================================================================
+
+//------------------------------------------------------------------------------
+//   Implementation Definition
+//------------------------------------------------------------------------------
+
+/**
+ * @brief        Structure describing the set of configurations supported by graph.
+ *               Objects of this type are to be referenced through QnnGraph_CustomConfig_t.
+ *
+ *               The struct has two fields: option and a union of corresponding config values.
+ *               Based on the option, the corresponding item in the union can be used to specify
+ *               the config.
+ *               Below is the map between QnnCpuGraph_ConfigOption_t and config value
+ *
+ *               \verbatim embed:rst:leading-asterisk
+ *               +----+------------------------------------------------+------------------------------------+
+ *               | #  | Config Option                                  | Configuration Struct/value         |
+ *               +====+================================================+====================================+
+ *               | 1  | QNN_CPU_GRAPH_CONFIG_OPTION_OP_DEBUG_CALLBACK  | QnnCpuGraph_OpDebug_t              |
+ *               +----+------------------------------------------------+------------------------------------+
+ *               \endverbatim
+ */
+typedef struct {
+  QnnCpuGraph_ConfigOption_t option;
+  union UNNAMED {
+    QnnCpuGraph_OpDebug_t cpuGraphOpDebug;
+  };
+} QnnCpuGraph_CustomConfig_t;
+
+/// QnnCpuGraph_CustomConfig_t initializer macro
+#define QNN_CPU_GRAPH_CUSTOM_CONFIG_INIT                     \
+  {                                                          \
+    QNN_CPU_GRAPH_CONFIG_OPTION_UNDEFINED, /*option*/        \
+    {                                                        \
+      QNN_CPU_GRAPH_OP_DEBUG_INIT /*cpuGraphOpDebug*/        \
+    }                                                        \
+  }
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuOpPackage.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuOpPackage.h
new file mode 100755
index 0000000000000..97bdab8dfd3f9
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/CPU/QnnCpuOpPackage.h
@@ -0,0 +1,224 @@
+//==============================================================================
+//
+//  Copyright (c) 2020-2023 Qualcomm Technologies, Inc.
+//  All Rights Reserved.
+//  Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/** @file
+ *  @brief CPU Operation Package component API
+ *
+ *         Provides interface to interact with OpPackage libraries registered
+ *         with the CPU backend.
+ */
+
+#ifndef QNN_CPU_OP_PACKAGE_H
+#define QNN_CPU_OP_PACKAGE_H
+
+#include "CPU/QnnCpuCommon.h"
+#include "QnnGraph.h"
+#include "QnnOpPackage.h"
+#include "QnnTypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define QNN_CPUOPPACKAGE_TENSOR_DATA_FORMAT_FLAT_BUFFER 0
+
+/**
+ * @brief A value representing a tensor data format.
+ */
+typedef uint32_t QnnCpuOpPackage_TensorDataFormat_t;
+
+/**
+ * @brief A value representing profiling data in milliseconds.
+ */
+typedef double QnnCpuOpPackage_ProfileData_t;
+
+/**
+ * @brief An enum to specify a param type.
+ */
+typedef enum {
+  QNN_CPU_PARAMTYPE_SCALAR = 0,
+  QNN_CPU_PARAMTYPE_TENSOR = 1,
+  QNN_CPU_PARAMTYPE_STRING = 2,
+  // Unused, present to ensure 32 bits.
+ QNN_CPU_PARAMTYPE_UNDEFINED = 0xFFFFFFFF +} QnnCpuOpPackage_ParamType_t; + +/** + * @brief An enum to specify tensor data type. + */ +typedef enum { + QNN_CPU_DATATYPE_BOOL_8 = 0x0508, + QNN_CPU_DATATYPE_INT_8 = 0x0008, + QNN_CPU_DATATYPE_INT_32 = 0x0032, + QNN_CPU_DATATYPE_UINT_8 = 0x0108, + QNN_CPU_DATATYPE_UINT_32 = 0x0132, + QNN_CPU_DATATYPE_FLOAT_32 = 0x0232, + // Unused, present to ensure 32 bits. + QNN_CPU_DATATYPE_UNDEFINED = 0x7FFFFFFF +} QnnCpuOpPackage_DataType_t; + +/** + * @brief An enum to specify logging level. + */ +typedef enum { + QNN_CPU_MSG_ERROR = 1, + QNN_CPU_MSG_DEBUG = 2, + QNN_CPU_MSG_LOW = 3, + QNN_CPU_MSG_MED = 4, + QNN_CPU_MSG_HIGH = 5, + // Unused, present to ensure 32 bits + QNN_CPU_MSG_UNDEFINED = 0x7FFFFFFF +} QnnCpuOpPackage_MsgType_t; + +/** + * @brief An enum to specify the profiling type. + */ +typedef enum { + QNN_CPU_PROFILE_BASIC = 1, + QNN_CPU_PROFILE_DETAILED = 2, + // Unused, present to ensure 32 bits + QNN_CPU_PROFILE_UNDEFINED = 0x7FFFFFFF +} QnnCpuOpPackage_ProfileType_t; + +/** + * @brief A struct which defines the Global infrastructure. + */ +typedef struct _QnnOpPackage_GlobalInfrastructure_t { + // Message + void (*reportMessage)(QnnCpuOpPackage_MsgType_t msgType, const char* msg, ...); + + // Profile + void (*profile)(QnnCpuOpPackage_ProfileType_t profileType, + QnnCpuOpPackage_ProfileData_t timeInMsec); +} QnnCpuOpPackage_GlobalInfra_t; + +// clang-format off +/// QnnCpuOpPackage_GlobalInfra_t initializer macro +#define QNN_CPU_OP_PACKAGE_GLOBAL_INFRA_INIT \ + { \ + NULL, /*reportMessage*/ \ + NULL /*profile*/ \ + } +// clang-format on + +typedef Qnn_ErrorHandle_t (*QnnCpuOpPackage_OpImplFn_t)(void* opPkgNodeData); + +/** + * @brief A struct which defines the OpImpl definition. + */ +typedef struct _QnnOpPackage_OpImpl_t { + QnnCpuOpPackage_OpImplFn_t opImplFn; + void* userData; +} QnnCpuOpPackage_OpImpl_t; + +// clang-format off +/// QnnCpuOpPackage_OpImpl_t initializer macro +#define QNN_CPU_OP_PACKAGE_OPIMPL_INIT \ + { \ + NULL, /*kernelFn*/ \ + NULL /*userData*/ \ + } +// clang-format on + +/** + * @brief A struct which describes the properties of a tensor. + * + */ +typedef struct { + QnnCpuOpPackage_TensorDataFormat_t dataFormat; + QnnCpuOpPackage_DataType_t dataType; + uint32_t rank; + uint32_t* maxDimensions; + uint32_t* currentDimensions; + void* data; + Qnn_QuantizeParams_t quantizeParams; +} QnnCpuOpPackage_Tensor_t; + +// clang-format off +/// QnnCpuOpPackage_Tensor_t initializer macro +#define QNN_CPU_OP_PACKAGE_TENSOR_INIT \ + { \ + QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, /*dataFormat*/ \ + QNN_CPU_DATATYPE_UNDEFINED, /*dataType*/ \ + 0, /*rank*/ \ + NULL, /*maxDimensions*/ \ + NULL, /*currentDimensions*/ \ + NULL, /*data*/ \ + QNN_QUANTIZE_PARAMS_INIT /*quantizeParams*/ \ + } +// clang-format on + +/** + * @brief A struct which describes the parameters of a node. + * + */ +typedef struct { + QnnCpuOpPackage_ParamType_t type; + const char* name; + union { + double scalarParam; + const char* string; + QnnCpuOpPackage_Tensor_t* tensorParam; + }; +} QnnCpuOpPackage_Param_t; + +// clang-format off +/// QnnCpuOpPackage_Param_t initializer macro +#define QNN_CPU_OP_PACKAGE_PARAM_INIT \ + { \ + QNN_CPU_PARAMTYPE_UNDEFINED, /*type*/ \ + NULL, /*name*/ \ + { \ + 0 /*scalarParam*/ \ + } \ + } +// clang-format on + +/** + * @brief A struct which describes the node. 
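+ *        Typically initialized with QNN_CPU_OP_PACKAGE_NODE_INIT (defined below)
+ *        and then populated by the backend; an illustrative sketch only:
+ *
+ *          QnnCpuOpPackage_Node_t node = QNN_CPU_OP_PACKAGE_NODE_INIT;
+ *          node.name     = "conv0";   // hypothetical values
+ *          node.typeName = "Conv2d";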
+ *
+ */
+typedef struct _QnnOpPackage_Node_t {
+  const char* name;
+  const char* packageName;
+  const char* typeName;
+  uint32_t numOfParams;
+  QnnCpuOpPackage_Param_t** params;
+  uint32_t numOfInputs;
+  QnnCpuOpPackage_Tensor_t** inputs;
+  uint32_t numOfOutputs;
+  QnnCpuOpPackage_Tensor_t** outputs;
+} QnnCpuOpPackage_Node_t;
+
+// clang-format off
+/// QnnCpuOpPackage_Node_t initializer macro
+#define QNN_CPU_OP_PACKAGE_NODE_INIT \
+  {                                  \
+    NULL, /*name*/                   \
+    NULL, /*packageName*/            \
+    NULL, /*typeName*/               \
+    0,    /*numOfParams*/            \
+    NULL, /*params*/                 \
+    0,    /*numOfInputs*/            \
+    NULL, /*inputs*/                 \
+    0,    /*numOfOutputs*/           \
+    NULL  /*outputs*/                \
+  }
+// clang-format on
+
+/**
+ * @brief Graph infrastructure.
+ *
+ */
+typedef _QnnOpPackage_GraphInfrastructure_t QnnCpuOpPackage_GraphInfrastructure_t;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // QNN_CPU_OP_PACKAGE_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspBackend.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspBackend.h
new file mode 100755
index 0000000000000..e2b6c69dffbdf
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspBackend.h
@@ -0,0 +1,108 @@
+//=============================================================================
+//
+//  Copyright (c) 2022-2023 Qualcomm Technologies, Inc.
+//  All Rights Reserved.
+//  Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN DSP component Backend API.
+ *
+ *         The interfaces in this file work with the top level QNN
+ *         API and supplements QnnBackend.h for DSP backend
+ */
+
+#ifndef QNN_DSP_BACKEND_H
+#define QNN_DSP_BACKEND_H
+
+#include "QnnBackend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+//=============================================================================
+// Public Functions
+//=============================================================================
+
+//------------------------------------------------------------------------------
+//   Implementation Definition
+//------------------------------------------------------------------------------
+
+// clang-format off
+
+/* @brief Enum describing the set of custom configs supported by DSP backend.
+*/
+typedef enum {
+  /// The accelerator will always attempt to fold relu activation
+  /// into the immediately preceding convolution operation. This optimization
+  /// is correct when the quantization ranges for the convolution are equal to
+  /// or a subset of those of the Relu operation. For graphs where this cannot be
+  /// guaranteed, the client should set this option to true
+  QNN_DSP_BACKEND_CONFIG_OPTION_FOLD_RELU_ACTIVATION_INTO_CONV_OFF = 0,
+  /// The accelerator will always attempt to run all Convolution
+  /// operations using HMX instructions. Convolutions that have
+  /// short depth and/or weights that are not symmetric could
+  /// exhibit inaccurate results.
+  /// In such cases, clients must set this option to true to
+  /// guarantee correctness of the operation
+  QNN_DSP_BACKEND_CONFIG_OPTION_SHORT_DEPTH_CONV_ON_HMX_OFF = 1,
+  /// Every APP side user process that uses a DSP via FastRPC
+  /// has a corresponding dynamic user process domain on the DSP side.
+  /// By default, QNN opens the RPC session as an unsigned PD;
+  /// if this option is set to true,
+  /// the RPC session is opened as a signed PD (requires a signed .so).
+  QNN_DSP_BACKEND_CONFIG_OPTION_USE_SIGNED_PROCESS_DOMAIN = 2,
+  /// Set QnnDspBackend_DspArch_t for offline prepare mode
+  QNN_DSP_BACKEND_CONFIG_OPTION_ARCH = 3,
+  /// UNKNOWN enum option that must not be used
+  QNN_DSP_BACKEND_CONFIG_OPTION_UNKNOWN = 0x7fffffff
+} QnnDspBackend_ConfigOption_t;
+
+typedef enum {
+  QNN_DSP_BACKEND_DSP_ARCH_NONE    = 0,
+  QNN_DSP_BACKEND_DSP_ARCH_V65     = 65,
+  QNN_DSP_BACKEND_DSP_ARCH_V66     = 66,
+  QNN_DSP_BACKEND_DSP_ARCH_V68     = 68,
+  QNN_DSP_BACKEND_DSP_ARCH_V69     = 69,
+  QNN_DSP_BACKEND_DSP_ARCH_V73     = 73,
+  QNN_DSP_BACKEND_DSP_ARCH_UNKNOWN = 0x7fffffff
+} QnnDspBackend_DspArch_t;
+
+/**
+ * @brief        Structure describing the set of configurations supported by the backend.
+ *               Objects of this type are to be referenced through QnnBackend_CustomConfig_t.
+ */
+typedef struct QnnDspBackend_CustomConfig {
+  QnnDspBackend_ConfigOption_t option;
+  union UNNAMED {
+    bool foldReluActivationIntoConvOff;
+    bool shortDepthConvOnHmxOff;
+    bool useSignedProcessDomain;
+    QnnDspBackend_DspArch_t arch;
+  };
+} QnnDspBackend_CustomConfig_t;
+
+/// QnnDspBackend_CustomConfig_t initializer macro
+#define QNN_DSP_BACKEND_CUSTOM_CONFIG_INIT                \
+  {                                                       \
+    QNN_DSP_BACKEND_CONFIG_OPTION_UNKNOWN, /*option*/     \
+    {                                                     \
+      false /*foldReluActivationIntoConvOff*/             \
+    }                                                     \
+  }
+
+// clang-format on
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspCommon.h
new file mode 100755
index 0000000000000..8b5ad49d04d6e
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspCommon.h
@@ -0,0 +1,61 @@
+//=============================================================================
+//
+//  Copyright (c) 2022-2023 Qualcomm Technologies, Inc.
+//  All Rights Reserved.
+//  Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN DSP Common components
+ *
+ *         This file defines versioning and other identification details
+ *         and supplements QnnCommon.h for DSP backend
+ */
+
+#ifndef QNN_DSP_COMMON_H
+#define QNN_DSP_COMMON_H
+
+#include "QnnCommon.h"
+
+/// DSP Backend identifier
+#define QNN_BACKEND_ID_DSP 5
+
+/// DSP interface provider
+#define QNN_DSP_INTERFACE_PROVIDER_NAME "DSP_QTI_AISW"
+
+// DSP API Version values
+#define QNN_DSP_API_VERSION_MAJOR 5
+#define QNN_DSP_API_VERSION_MINOR 0
+#define QNN_DSP_API_VERSION_PATCH 1
+
+// clang-format off
+
+/// Macro to set Qnn_ApiVersion_t for DSP backend
+#define QNN_DSP_API_VERSION_INIT                                 \
+  {                                                              \
+    {                                                            \
+      QNN_API_VERSION_MAJOR,     /*coreApiVersion.major*/        \
+      QNN_API_VERSION_MINOR,     /*coreApiVersion.minor*/        \
+      QNN_API_VERSION_PATCH      /*coreApiVersion.patch*/        \
+    },                                                           \
+    {                                                            \
+      QNN_DSP_API_VERSION_MAJOR, /*backendApiVersion.major*/     \
+      QNN_DSP_API_VERSION_MINOR, /*backendApiVersion.minor*/     \
+      QNN_DSP_API_VERSION_PATCH  /*backendApiVersion.patch*/     \
+    }                                                            \
+  }
+
+// clang-format on
+
+// DSP Binary Version values
+#define QNN_DSP_BINARY_VERSION_MAJOR 1
+#define QNN_DSP_BINARY_VERSION_MINOR 0
+#define QNN_DSP_BINARY_VERSION_PATCH 0
+
+// DSP Context blob Version values
+#define QNN_DSP_CONTEXT_BLOB_VERSION_MAJOR 1
+#define QNN_DSP_CONTEXT_BLOB_VERSION_MINOR 0
+#define QNN_DSP_CONTEXT_BLOB_VERSION_PATCH 0
+
+#endif // QNN_DSP_COMMON_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspDevice.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspDevice.h
new file mode 100755
index 0000000000000..eecf62f5cbc02
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspDevice.h
@@ -0,0 +1,46 @@
+//=============================================================================
+//
+//  Copyright (c) 2022-2023 Qualcomm Technologies, Inc.
+//  All Rights Reserved.
+//  Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ *  @brief QNN DSP component Device API.
+ * + * The interfaces in this file work with the top level QNN + * API and supplements QnnDevice.h for DSP backend + */ +#ifndef QNN_DSP_DEVICE_H +#define QNN_DSP_DEVICE_H + +#include "QnnDevice.h" +#include "QnnDspPerfInfrastructure.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _QnnDevice_Infrastructure_t { + QnnDspPerfInfrastructure_CreatePowerConfigIdFn_t createPowerConfigId; + QnnDspPerfInfrastructure_DestroyPowerConfigIdFn_t destroyPowerConfigId; + QnnDspPerfInfrastructure_SetPowerConfigFn_t setPowerConfig; + QnnDspPerfInfrastructure_SetMemoryConfigFn_t setMemoryConfig; + QnnDspPerfInfrastructure_SetThreadConfigFn_t setThreadConfig; +} QnnDspDevice_Infrastructure_t; + +#define QNN_DSP_DEVICE_INFRASTRUCTURE_INIT \ + { \ + NULL, /*createPowerConfigId*/ \ + NULL, /*destroyPowerConfigId*/ \ + NULL, /*setPowerConfig*/ \ + NULL, /*setMemoryConfig*/ \ + NULL /*setThreadConfig*/ \ + } + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspGraph.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspGraph.h new file mode 100755 index 0000000000000..dd1c5220c8721 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspGraph.h @@ -0,0 +1,171 @@ +//============================================================================= +// +// Copyright (c) 2022-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================= + +/** + * @file + * @brief QNN DSP component Graph API. + * + * The interfaces in this file work with the top level QNN + * API and supplements QnnGraph.h for DSP backend + */ + +#ifndef QNN_DSP_GRAPH_H +#define QNN_DSP_GRAPH_H + +#include "QnnGraph.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= + +//============================================================================= +// Data Types +//============================================================================= + +/** + * @brief This enum provides different DSP graph optimization + * options that can be used to finalize the graph + * for optimum performance. + */ +typedef enum { + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD = 1, + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_RETRIES = 2, + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG = 3, + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC = 4, + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_UNKNOWN = 0x7fffffff +} QnnDspGraph_OptimizationType_t; + +// clang-format off + +/** + * @brief Struct describing the set of optimization types + * and the values associated with each optimization type. 
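+ *
+ *        For example (a sketch; see the value table below, and note that
+ *        QNN_DSP_GRAPH_OPTIMIZATION_OPTION_INIT is defined further down),
+ *        requesting the more aggressive finalize optimization strategy:
+ *
+ *          QnnDspGraph_OptimizationOption_t opt = QNN_DSP_GRAPH_OPTIMIZATION_OPTION_INIT;
+ *          opt.type       = QNN_DSP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+ *          opt.floatValue = 2.0f;  // 2 = more optimal graph, longer prepare time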
+ * + * Below is the Map between QnnDspGraph_OptimizationType_t and allowable values: + * + * \verbatim embed:rst:leading-asterisk + * +----+------------------------------------------------------------+-----------------------------------------------------------+ + * | # | OptimizationType option | Allowable values | + * +====+============================================================+===========================================================+ + * | 1 | QNN_DSP_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD | Reserved | + * +----+------------------------------------------------------------+-----------------------------------------------------------+ + * | 2 | QNN_DSP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_RETRIES | Reserved | + * +----+------------------------------------------------------------+-----------------------------------------------------------+ + * | 3 | QNN_DSP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG | Defines the optimization strategy used by the HTP backend | + * | | | | + * | | | 1 = Faster preparation time, less optimal graph | + * | | | | + * | | | 2 = More optimal graph but may take longer to prepare | + * +----+------------------------------------------------------------+-----------------------------------------------------------+ + * | 4 | QNN_DSP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC | Reserved | + * +----+------------------------------------------------------------+-----------------------------------------------------------+ + * \endverbatim + */ +typedef struct { + QnnDspGraph_OptimizationType_t type; + float floatValue; +} QnnDspGraph_OptimizationOption_t; + +/// QnnDspGraph_OptimizationOption_t initializer macro +#define QNN_DSP_GRAPH_OPTIMIZATION_OPTION_INIT \ + { \ + QNN_DSP_GRAPH_OPTIMIZATION_TYPE_UNKNOWN, /*type*/ \ + 0.0f /*floatValue*/ \ + } +// clang-format on + +/** + * @brief This enum provides different DSP graph configuration + * options associated with QnnGraph + */ +typedef enum { + QNN_DSP_GRAPH_CONFIG_OPTION_OPTIMIZATION = 1, + QNN_DSP_GRAPH_CONFIG_OPTION_ENCODING = 2, + QNN_DSP_GRAPH_CONFIG_OPTION_PRIORITY = 3, + QNN_DSP_GRAPH_CONFIG_OPTION_PRECISION = 4, + QNN_DSP_GRAPH_CONFIG_OPTION_UNKNOWN = 0x7fffffff +} QnnDspGraph_ConfigOption_t; + +typedef enum { + QNN_DSP_GRAPH_ENCODING_DYNAMIC = 1, + /** @deprecated + */ + QNN_DSP_GRAPH_ENCOING_DYNAMIC = QNN_DSP_GRAPH_ENCODING_DYNAMIC, + QNN_DSP_GRAPH_ENCODING_STATIC = 2, + /** @deprecated + */ + QNN_DSP_GRAPH_ENCOING_STATIC = QNN_DSP_GRAPH_ENCODING_STATIC, + QNN_DSP_GRAPH_ENCODING_UNKNOWN = 0x7fffffff, + /** @deprecated + */ + QNN_DSP_GRAPH_ENCOING_UNKNOW = QNN_DSP_GRAPH_ENCODING_UNKNOWN +} QnnDspGraph_Encoding_t; + +//============================================================================= +// Public Functions +//============================================================================= + +//------------------------------------------------------------------------------ +// Implementation Definition +//------------------------------------------------------------------------------ + +// clang-format off + +/** + * @brief Structure describing the set of configurations supported by graph. + * Objects of this type are to be referenced through QnnGraph_CustomConfig_t. + * + * The struct has two fields - option and a union of corresponding config values + * Based on the option corresponding item in the union can be used to specify + * config. 
+ * + * Below is the Map between QnnDspGraph_ConfigOption_t and config value + * + * \verbatim embed:rst:leading-asterisk + * +----+------------------------------------------+------------------------------------+ + * | # | Config Option | Configuration Struct/value | + * +====+==========================================+====================================+ + * | 1 | QNN_DSP_GRAPH_CONFIG_OPTION_OPTIMIZATION | QnnDspGraph_OptimizationOption_t | + * +----+------------------------------------------+------------------------------------+ + * | 2 | QNN_DSP_GRAPH_CONFIG_OPTION_ENCODING | QnnDspGraph_Encoding_t | + * +----+------------------------------------------+------------------------------------+ + * | 3 | QNN_DSP_GRAPH_CONFIG_OPTION_PRECISION | Qnn_Precision_t | + * +----+------------------------------------------+------------------------------------+ + * | 4 | QNN_DSP_GRAPH_CONFIG_OPTION_PRIORITY | Qnn_Priority_t | + * +----+------------------------------------------+------------------------------------+ + * \endverbatim + */ +typedef struct { + QnnDspGraph_ConfigOption_t option; + union { + QnnDspGraph_OptimizationOption_t optimizationOption; + QnnDspGraph_Encoding_t encoding; + Qnn_Priority_t priority; + Qnn_Precision_t precision; + }; +} QnnDspGraph_CustomConfig_t; + +// clang-format on +/// QnnDspGraph_CustomConfig_t initializer macro +#define QNN_DSP_GRAPH_CUSTOM_CONFIG_INIT \ + { \ + QNN_DSP_GRAPH_CONFIG_OPTION_UNKNOWN, /*option*/ \ + { \ + QNN_DSP_GRAPH_OPTIMIZATION_OPTION_INIT /*optimizationOption*/ \ + } \ + } + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspOpPackage.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspOpPackage.h new file mode 100755 index 0000000000000..c8760ecb6b798 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspOpPackage.h @@ -0,0 +1,42 @@ +//============================================================================== +// +// Copyright (c) 2022-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef QNN_DSP_OP_PACKAGE_HPP +#define QNN_DSP_OP_PACKAGE_HPP + +#include "QnnOpPackage.h" +#include "QnnTypes.h" +#include "Udo/UdoImplDsp.h" + +/** + * @brief A struct which defines the Global infrastructure. + */ +typedef struct _QnnOpPackage_GlobalInfrastructure_t { + /// include the UdoMalloc, UdoFree and so on + Udo_DspGlobalInfrastructure_t* dspGlobalInfra; +} QnnDspOpPackage_GlobalInfrastructure_t; + +/** + * @brief A struct which defines the operation info. 
+ */ +typedef struct _QnnOpPackage_OperationInfo_t { + char* opType; + uint32_t numOfStaticParams; + uint32_t numOfInputs; + uint32_t numOfOutputs; + + Udo_CreateOpFactoryFunction_t createOpFactory; + Udo_CreateOperationFunction_t createOperation; + Udo_ExecuteOpFunction_t executeOp; + Udo_ReleaseOpFunction_t releaseOp; + Udo_ReleaseOpFactoryFunction_t releaseOpFactory; + Udo_ValidateOperationFunction_t validateOp; + Udo_QueryOperationFunction_t queryOp; +} QnnDspOpPackage_OperationInfo_t; + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspPerfInfrastructure.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspPerfInfrastructure.h new file mode 100755 index 0000000000000..c9b1aa3020b9e --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspPerfInfrastructure.h @@ -0,0 +1,448 @@ +//============================================================================== +// +// Copyright (c) 2022-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** @file + * @brief QNN DSP component Performance Infrastructure API + * + * Provides interface to the client to control performance and system + * settings of the QNN DSP Accelerator + */ + +#ifndef QNN_DSP_PERF_INFRASTRUCTURE_H +#define QNN_DSP_PERF_INFRASTRUCTURE_H + +#include "QnnCommon.h" +#include "QnnTypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// max rpc polling time allowed - 9999 us +#define QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME 9999 + +//============================================================================= +// Data Types +//============================================================================= + +/** + * @brief QNN DSP PerfInfrastructure API result / error codes. + * + */ +typedef enum { + QNN_DSP_PERF_INFRASTRUCTURE_MIN_ERROR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE, + //////////////////////////////////////////////////////////////////////// + + QNN_DSP_PERF_INFRASTRUCTURE_NO_ERROR = QNN_SUCCESS, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_HANDLE_PTR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 0, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 1, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED_CONFIG = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 2, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_TRANSPORT = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 3, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 4, + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_FAILED = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 5, + + //////////////////////////////////////////////////////////////////////// + QNN_DSP_PERF_INFRASTRUCTURE_MAX_ERROR = QNN_MAX_ERROR_PERF_INFRASTRUCTURE, + /// UNDEFINED value that must not be used by client + QNN_DSP_PERF_INFRASTRUCTURE_ERROR_UNDEFINED = 0x7fffffff +} QnnDspPerfInfrastructure_Error_t; + +/** + * @brief Used to allow client start (non-zero value) or stop participating + * (zero value) in DCVS + * + */ +typedef uint32_t QnnDspPerfInfrastructure_DcvsEnable_t; + +/** + * @brief Allows client to set up the sleep latency in microseconds + * + */ +typedef uint32_t QnnDspPerfInfrastructure_SleepLatency_t; + +/** + * @brief Allows client to disable sleep or low power modes. 
+ * Pass a non-zero value to disable sleep in DSP + * + */ +typedef uint32_t QnnDspPerfInfrastructure_SleepDisable_t; + +/** + * @brief sets the minimum size by which user heap should grow + * when heap is exhausted. This API is expected to be + * called only once per backend and has a process wide impact + * + * Grow size provided in bytes and defaults to 16MB + */ +typedef uint32_t QnnDspPerfInfrastructure_MemGrowSize_t; + +/** + * @brief sets the vtcm size to use for graphs that + * are prepared offline. This API should be set up + * before users can finalize a graph offline. It allows + * the QNN DSP backend to configure the serialized + * context for the available vtcm on target + * + * VTCM size provided in MB and does not have a default + */ +typedef uint32_t QnnDspPerfInfrastructure_VtcmSize_t; + +/** + * @brief sets the number of HVX threads for QNN DSP + */ +typedef uint32_t QnnDspPerfInfrastructure_HvxThreadNumber_t; + +/** + * @brief These are the different voltage corners that can + * be requested by the client to influence the voting scheme + * for DCVS + * + */ +typedef enum { + /// Maps to HAP_DCVS_VCORNER_DISABLE. + /// Disable setting up voltage corner + DCVS_VOLTAGE_CORNER_DISABLE = 0x10, + /// Maps to HAP_DCVS_VCORNER_SVS2. + /// Set voltage corner to minimum value supported on platform + DCVS_VOLTAGE_VCORNER_MIN_VOLTAGE_CORNER = 0x20, + /// Maps to HAP_DCVS_VCORNER_SVS2. + /// Set voltage corner to SVS2 value for the platform + DCVS_VOLTAGE_VCORNER_SVS2 = 0x30, + /// Maps to HAP_DCVS_VCORNER_SVS. + /// Set voltage corner to SVS value for the platform + DCVS_VOLTAGE_VCORNER_SVS = 0x40, + /// Maps to HAP_DCVS_VCORNER_SVS_PLUS. + /// Set voltage corner to SVS_PLUS value for the platform + DCVS_VOLTAGE_VCORNER_SVS_PLUS = 0x50, + /// Maps to HAP_DCVS_VCORNER_NOM. + /// Set voltage corner to NOMINAL value for the platform + DCVS_VOLTAGE_VCORNER_NOM = 0x60, + /// Maps to HAP_DCVS_VCORNER_NOM_PLUS. + /// Set voltage corner to NOMINAL_PLUS value for the platform + DCVS_VOLTAGE_VCORNER_NOM_PLUS = 0x70, + /// Maps to HAP_DCVS_VCORNER_TURBO. + /// Set voltage corner to TURBO value for the platform + DCVS_VOLTAGE_VCORNER_TURBO = 0x80, + /// Maps to HAP_DCVS_VCORNER_TURBO_PLUS. + /// Set voltage corner to TURBO_PLUS value for the platform + DCVS_VOLTAGE_VCORNER_TURBO_PLUS = 0x90, + /// Maps to HAP_DCVS_VCORNER_MAX. + /// Set voltage corner to maximum value supported on the platform + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER = 0xA0, + /// UNKNOWN value that must not be used by client + DCVS_VOLTAGE_VCORNER_UNKNOWN = 0x7fffffff +} QnnDspPerfInfrastructure_VoltageCorner_t; + +/** + * @brief This enum defines all the possible power mode + * that a client can set to influence DCVS mode + */ +typedef enum { + /// Maps to HAP_DCVS_V2_ADJUST_UP_DOWN. + /// Allows for DCVS to adjust up and down + QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_ADJUST_UP_DOWN = 0x1, + /// Maps to HAP_DCVS_V2_ADJUST_ONLY_UP. + /// Allows for DCVS to adjust up only + QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_ADJUST_ONLY_UP = 0x2, + /// Maps to HAP_DCVS_V2_POWER_SAVER_MODE. + /// Higher thresholds for power efficiency + QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_POWER_SAVER_MODE = 0x4, + /// Maps to HAP_DCVS_V2_POWER_SAVER_AGGRESSIVE_MODE. + /// Higher thresholds for power efficiency with faster ramp down + QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_POWER_SAVER_AGGRESSIVE_MODE = 0x8, + /// Maps to HAP_DCVS_V2_PERFORMANCE_MODE. 
+ /// Lower thresholds for maximum performance
+ QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE = 0x10,
+ /// Maps to HAP_DCVS_V2_DUTY_CYCLE_MODE.
+ /// The value below applies only to HVX clients:
+ /// - For streaming class clients:
+ /// - detects periodicity based on HVX usage
+ /// - lowers clocks in the no HVX activity region of each period.
+ /// - For compute class clients:
+ /// - Lowers clocks when no HVX activity is detected and brings clocks up when HVX activity
+ /// is detected again.
+ /// - Latency involved in bringing up the clock will be at most 1 to 2 ms.
+ QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_DUTY_CYCLE_MODE = 0x20,
+ /// UNKNOWN value that must not be used by client
+ QNN_DSP_PERF_INFRASTRUCTURE_POWERMODE_UNKNOWN = 0x7fffffff
+} QnnDspPerfInfrastructure_PowerMode_t;
+
+/**
+ * @brief This enum defines all the possible performance
+ * options in Dsp Performance Infrastructure that
+ * relate to setting up power levels
+ */
+typedef enum {
+ /// config enum implies the usage of dcvsEnableConfig struct. For DCVS v2, if not provided,
+ /// this will default to false
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_ENABLE = 1,
+ /// config enum implies the usage of sleepLatencyConfig struct
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_SLEEP_LATENCY = 2,
+ /// config enum implies the usage of sleepDisableConfig struct
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_SLEEP_DISABLE = 3,
+ /// config enum implies the usage of dcvsPowerModeConfig struct. If not provided, power save mode
+ /// will be used
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_POWER_MODE = 4,
+ /// config enum implies the usage of dcvsVoltageCornerConfig struct
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_VOLTAGE_CORNER = 5,
+ /// config enum implies the usage of busVoltageCornerConfig struct
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_BUS_VOLTAGE_CORNER = 6,
+ /// config enum implies the usage of coreVoltageCornerConfig struct
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_CORE_VOLTAGE_CORNER = 7,
+ /// config enum implies the usage of rpcControlLatencyConfig struct
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY = 9,
+ /// config enum implies the usage of rpcPollingTimeConfig struct
+ /// this config is only supported on V69 and later
+ /// if enabled, this config is applied to the entire process
+ /// max allowed is QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME us
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME = 10,
+ /// config HMX timeout interval in us. The HMX is turned off after the set interval
+ /// time if there has been no interaction with it after an inference finishes.
+ QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_HMX_TIMEOUT_INTERVAL_US = 11, + /// UNKNOWN config option which must not be used + QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN = 0x7fffffff +} QnnDspPerfInfrastructure_PowerConfigOption_t; + +/** + * @brief Allows client to set up the RPC control latency in microseconds + * + */ +typedef uint32_t QnnDspPerfInfrastructure_RpcControlLatency_t; + +/** + * @brief Allows client to set up the RPC polling time in microseconds + */ +typedef uint32_t QnnDspPerfInfrastructure_RpcPollingTime_t; + +/** + * @brief Allows client to set up the HMX timeout interval in microseconds + */ +typedef uint32_t QnnDspPerfInfrastructure_HmxTimeoutIntervalUs_t; + +/** + * @brief This struct provides performance infrastructure configuration + * associated with setting up of power levels + */ +typedef struct { + QnnDspPerfInfrastructure_PowerConfigOption_t config; + union { + QnnDspPerfInfrastructure_DcvsEnable_t dcvsEnableConfig; + QnnDspPerfInfrastructure_SleepLatency_t sleepLatencyConfig; + QnnDspPerfInfrastructure_SleepDisable_t sleepDisableConfig; + QnnDspPerfInfrastructure_PowerMode_t dcvsPowerModeConfig; + QnnDspPerfInfrastructure_VoltageCorner_t dcvsVoltageCornerMinConfig; + QnnDspPerfInfrastructure_VoltageCorner_t dcvsVoltageCornerTargetConfig; + QnnDspPerfInfrastructure_VoltageCorner_t dcvsVoltageCornerMaxConfig; + QnnDspPerfInfrastructure_VoltageCorner_t busVoltageCornerMinConfig; + QnnDspPerfInfrastructure_VoltageCorner_t busVoltageCornerTargetConfig; + QnnDspPerfInfrastructure_VoltageCorner_t busVoltageCornerMaxConfig; + QnnDspPerfInfrastructure_VoltageCorner_t coreVoltageCornerMinConfig; + QnnDspPerfInfrastructure_VoltageCorner_t coreVoltageCornerTargetConfig; + QnnDspPerfInfrastructure_VoltageCorner_t coreVoltageCornerMaxConfig; + QnnDspPerfInfrastructure_RpcControlLatency_t rpcControlLatencyConfig; + QnnDspPerfInfrastructure_RpcPollingTime_t rpcPollingTimeConfig; + QnnDspPerfInfrastructure_HmxTimeoutIntervalUs_t hmxTimeoutIntervalUsConfig; + }; +} QnnDspPerfInfrastructure_PowerConfig_t; + +/// QnnDspPerfInfrastructure_PowerConfig_t initializer macro +#define QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT \ + { \ + QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN, /*config*/ \ + { \ + 0 /*dcvsEnableConfig*/ \ + } \ + } + +/** + * @brief This enum defines all the possible performance + * options in Dsp Performance Infrastructure that + * relate to system memory settings + */ +typedef enum { + /// sets memory grow size + QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_GROW_SIZE = 1, + /// set the size of VTCM configuration (in MB) to use + /// This setting is applicable only for off target usage. 
+ /// For on-target usage, refer QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_VTCM_USAGE_FACTOR + QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_VTCM_SIZE = 2, + /// set the vtcm usage factor on-target + QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_VTCM_USAGE_FACTOR = 3, + /// UNKNOWN config option that must not be used + QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_UNKNOWN = 0x7fffffff +} QnnDspPerfInfrastructure_MemoryConfigOption_t; + +/** + * @brief This enum defines all the possible performance + * options in Dsp Performance Infrastructure that + * relate to thread settings + */ +typedef enum { + /// sets number of HVX threads + QNN_DSP_PERF_INFRASTRUCTURE_THREAD_CONFIGOPTION_NUMBER_OF_HVX_THREADS = 1, + /// UNKNOWN config option that must not be used + QNN_DSP_PERF_INFRASTRUCTURE_THREAD_CONFIGOPTION_UNKNOWN = 0x7fffffff +} QnnDspPerfInfrastructure_ThreadConfigOption_t; + +/** + * @brief This enum defines all the possible vtcm + * usage configuration. These settings apply only + * for on-target libraries + * + */ +typedef enum { + /// use all the vtcm available on target + QNN_DSP_PERF_INFRASTRUCTURE_VTCM_USE_FULL = 1, + /// use bare minimal vtcm available on target. This is + /// not supported in the current release. + QNN_DSP_PERF_INFRASTRUCTURE_VTCM_USE_MIN = 2, + QNN_DSP_PERF_INFRASTRUCTURE_VTCM_USE_UNKNOWN = 0x7fffffff +} QnnDspPerfInfrastructure_VtcmUsageFactor_t; + +/** + * @brief Provides performance infrastructure configuration + * options that are memory specific + */ +typedef struct { + QnnDspPerfInfrastructure_MemoryConfigOption_t config; + union { + QnnDspPerfInfrastructure_MemGrowSize_t memGrowSizeConfig; + QnnDspPerfInfrastructure_VtcmSize_t vtcmSizeInMB; + QnnDspPerfInfrastructure_VtcmUsageFactor_t vtcmUsageConfig; + }; +} QnnDspPerfInfrastructure_MemoryConfig_t; + +/// QnnDspPerfInfrastructure_MemoryConfig_t initializer macro +#define QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIG_INIT \ + { \ + QNN_DSP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_UNKNOWN, /*config*/ \ + { \ + 0 /*memGrowSizeConfig*/ \ + } \ + } + +/** + * @brief Provides performance infrastructure configuration + * options that are thread specific + */ +typedef struct { + QnnDspPerfInfrastructure_ThreadConfigOption_t config; + union { + QnnDspPerfInfrastructure_HvxThreadNumber_t numHvxThreads; + }; +} QnnDspPerfInfrastructure_ThreadConfig_t; + +/// QnnDspPerfInfrastructure_ThreadConfig_t initializer macro +#define QNN_DSP_PERF_INFRASTRUCTURE_THREAD_CONFIG_INIT \ + { \ + QNN_DSP_PERF_INFRASTRUCTURE_THREAD_CONFIGOPTION_UNKNOWN, /*config*/ \ + { \ + 0 /*numHvxThreads*/ \ + } \ + } + +//============================================================================= +// API Methods +//============================================================================= + +/** + * @brief This API allows client to create power configuration id that + * has to be used to set different performance modes. + * Power configuration id has to be destroyed by client when not needed. + * + * @param[out] powerConfigId Pointer to power configuration id to be created. + * + * + * @return Error code + * \n QNN_SUCCESS: No error encountered + * \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if power configuration + * id is NULL + */ +typedef Qnn_ErrorHandle_t (*QnnDspPerfInfrastructure_CreatePowerConfigIdFn_t)( + uint32_t* powerConfigId); + +/** + * @brief This API allows client to destroy power configuration id. + * + * @param[in] powerConfigId A power configuration id to be destroyed. 
+ *
+ *
+ * @return Error code
+ * \n QNN_SUCCESS: No error encountered
+ * \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if power configuration
+ * id does not exist
+ */
+typedef Qnn_ErrorHandle_t (*QnnDspPerfInfrastructure_DestroyPowerConfigIdFn_t)(
+    uint32_t powerConfigId);
+
+/**
+ * @brief This API allows client to set up system power configuration that
+ * will enable different performance modes. This API uses the
+ * HAP_power_dcvs_v3_payload struct to configure HAP power parameters.
+ * For a detailed description of the HAP power parameters, refer to the
+ * Hexagon SDK HAP_power_dcvs_v3_payload documentation.
+ *
+ * @param[in] powerConfigId A power client id to associate calls to system
+ * power settings. A value of 0 implies NULL power client id
+ * and can override every other setting in the user process. To
+ * enable power settings for multiple clients in the same
+ * process, use a non-zero power client id.
+ *
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ * of config options for performance configuration.
+ * NULL is allowed and indicates no config options are provided.
+ *
+ * @return Error code
+ * \n QNN_SUCCESS: No error encountered
+ * \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if power configuration
+ * does not exist
+ */
+typedef Qnn_ErrorHandle_t (*QnnDspPerfInfrastructure_SetPowerConfigFn_t)(
+    uint32_t powerConfigId, const QnnDspPerfInfrastructure_PowerConfig_t** config);
+
+/**
+ * @brief This API allows clients to set up configuration associated with
+ * system memory
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ * of config options for system memory configuration.
+ * NULL is allowed and indicates no config options are provided.
+ *
+ * @return Error code
+ * \n QNN_SUCCESS: No error encountered
+ */
+typedef Qnn_ErrorHandle_t (*QnnDspPerfInfrastructure_SetMemoryConfigFn_t)(
+    const QnnDspPerfInfrastructure_MemoryConfig_t** config);
+
+/**
+ * @brief This API allows clients to set up configuration for threads
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ * of config options for thread configuration.
+ * NULL is allowed and indicates no config options are provided.
+ *
+ * @note This function should be called after QnnBackend_initialize and
+ * before Context and Graph calls
+ *
+ * @return Error code
+ * \n QNN_SUCCESS: No error encountered
+ * \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED_CONFIG if invalid
+ * config or value passed
+ * \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if config is NULL
+ * \n QNN_DSP_PERF_INFRASTRUCTURE_ERROR_TRANSPORT if unable to set the
+ * settings in DSP
+ */
+typedef Qnn_ErrorHandle_t (*QnnDspPerfInfrastructure_SetThreadConfigFn_t)(
+    const QnnDspPerfInfrastructure_ThreadConfig_t** config);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif // QNN_DSP_PERF_INFRASTRUCTURE_H
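The function-pointer typedefs above make up the whole perf-infrastructure surface: create a power config id, vote with it via NULL-terminated config arrays, and destroy it when done. A minimal sketch of the intended call flow, assuming the three pointers have already been resolved from the DSP backend's device infrastructure (resolution is backend-specific and not shown):

```c
// Hedged sketch: function pointers are assumed to be resolved elsewhere.
static Qnn_ErrorHandle_t vote_stable_clocks(
    QnnDspPerfInfrastructure_CreatePowerConfigIdFn_t createId,
    QnnDspPerfInfrastructure_SetPowerConfigFn_t setPowerConfig,
    QnnDspPerfInfrastructure_DestroyPowerConfigIdFn_t destroyId) {
  uint32_t powerConfigId = 0;
  Qnn_ErrorHandle_t err = createId(&powerConfigId);  // non-zero id per the docs above
  if (err != QNN_SUCCESS) return err;

  // Opt out of DCVS so the corner voted below is not adjusted behind our back.
  QnnDspPerfInfrastructure_PowerConfig_t dcvs = QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT;
  dcvs.config = QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_ENABLE;
  dcvs.dcvsEnableConfig = 0;  // 0 = do not participate in DCVS

  // Request the TURBO voltage corner as the DCVS target.
  QnnDspPerfInfrastructure_PowerConfig_t corner = QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT;
  corner.config = QNN_DSP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_VOLTAGE_CORNER;
  corner.dcvsVoltageCornerTargetConfig = DCVS_VOLTAGE_VCORNER_TURBO;

  // Both Set*Config entry points take NULL-terminated arrays of pointers.
  const QnnDspPerfInfrastructure_PowerConfig_t* configs[] = {&dcvs, &corner, NULL};
  err = setPowerConfig(powerConfigId, configs);

  destroyId(powerConfigId);  // a real client would keep the id alive while the vote matters
  return err;
}
```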
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProfile.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProfile.h
new file mode 100755
index 0000000000000..04c1897aa7e18
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProfile.h
@@ -0,0 +1,244 @@
+//==============================================================================
+//
+// Copyright (c) 2022-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief QNN DSP Profile component API.
+ *
+ * Requires DSP backend to be initialized.
+ * Should be used with the QnnProfile API but has DSP-backend-specific
+ * definitions for different QnnProfile data structures
+ *
+ */
+
+#ifndef QNN_DSP_PROFILE_H
+#define QNN_DSP_PROFILE_H
+
+#include "QnnProfile.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the ARM processor
+ * when client invokes QnnContext_createFromBinary. The value
+ * returned is time in microseconds.
+ *
+ * @note context load binary host rpc time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_HOST_RPC_TIME_MICROSEC 1002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the DSP processor
+ * when client invokes QnnContext_createFromBinary. The value
+ * returned is time in microseconds.
+ *
+ * @note context load binary dsp rpc time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_DSP_RPC_TIME_MICROSEC 1003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the time taken to create the context on the
+ * accelerator when client invokes QnnContext_createFromBinary.
+ * The value returned is time in microseconds.
+ *
+ * @note context load binary accelerator time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_ACCEL_TIME_MICROSEC 1004
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the ARM processor
+ * when client invokes QnnGraph_finalize.
+ * The value returned is time in microseconds.
+ *
+ * @note graph finalize host rpc time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_HOST_RPC_TIME_MICROSEC 2001
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the DSP processor
+ * when client invokes QnnGraph_finalize.
+ * The value returned is time in microseconds.
+ *
+ * @note graph finalize dsp rpc time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_DSP_RPC_TIME_MICROSEC 2002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to finalizing the graph on the accelerator
+ * when client invokes QnnGraph_finalize.
+ * The value returned is time in microseconds.
+ *
+ * @note graph finalize accelerator time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_ACCEL_TIME_MICROSEC 2003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the ARM processor
+ * when client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ * The value returned is time in microseconds.
+ *
+ * @note graph execute host rpc time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HOST_RPC_TIME_MICROSEC 3001
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the DSP processor
+ * when client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ * The value returned is time in microseconds.
+ *
+ * @note graph execute dsp rpc time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_DSP_RPC_TIME_MICROSEC 3002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to executing the graph on the accelerator
+ * when client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ * The value returned is number of processor cycles taken.
+ *
+ * @note graph execute accelerator time may be available only on
+ * QNN_PROFILE_LEVEL_DETAILED levels
+ *
+ * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have
+ * multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE.
+ * There will be a sub-event for each node that was added to the graph
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_CYCLE 3003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to executing the graph on the accelerator
+ * when client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ * The value returned is time taken in microseconds
+ *
+ * @note graph execute accelerator time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ *
+ * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have
+ * multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE / QNN_PROFILE_EVENTUNIT_MICROSEC
+ * There will be a sub-event for each node that was added to the graph
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_MICROSEC 3004
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to time taken for miscellaneous work, i.e. time
+ * that cannot be attributed to a node but is still needed to
+ * execute the graph on the accelerator. This occurs when client invokes
+ * QnnGraph_execute or QnnGraph_executeAsync.
+ * The value returned is time taken in microseconds
+ *
+ * @note graph execute misc accelerator time is available only on
+ * QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_MISC_ACCEL_TIME_MICROSEC 3005
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to time taken for a graph yield instance to
+ * release all its resources to the other graph.
+ * The value returned is time taken in microseconds.
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_RELEASE_TIME 3006
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to time a graph spends waiting for a higher
+ * priority graph to finish execution.
+ * The value returned is time taken in microseconds
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_WAIT_TIME 3007
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to time a graph spends re-acquiring resources
+ * and restoring vtcm.
+ * The value returned is time taken in microseconds
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_RESTORE_TIME 3008
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the number of times that a yield occurred
+ * during execution
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_COUNT 3009
+
+/**
+ * @brief QnnProfile_EventType_t definition for time a graph waits to get
+ * VTCM. This should be constant UNLESS we need another graph to yield.
+ * The value returned is time taken in microseconds.
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_VTCM_ACQUIRE_TIME 3010
+
+/**
+ * @brief QnnProfile_EventType_t definition for time a graph waits to get
+ * HMX + HVX, and turn them all on.
+ * The value returned is time taken in microseconds.
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_RESOURCE_POWER_UP_TIME 3011
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the ARM processor
+ * when client invokes QnnContext_free, which consequently deinitializes the graph.
+ * The value returned is time in microseconds.
+ *
+ * @note graph deinit host rpc time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_DEINIT_HOST_RPC_TIME_MICROSEC 4001
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the DSP processor
+ * when client invokes QnnContext_free, which consequently deinitializes the graph.
+ * The value returned is time in microseconds.
+ *
+ * @note graph deinit dsp rpc time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_DEINIT_DSP_RPC_TIME_MICROSEC 4002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the time taken to deinitialize the graph on the
+ * accelerator when client invokes QnnContext_free, which consequently
+ * deinitializes the graph. The value returned is time in microseconds.
+ *
+ * @note graph deinit accelerator time may be available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_DSP_PROFILE_EVENTTYPE_GRAPH_DEINIT_ACCEL_TIME_MICROSEC 4003
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // QNN_DSP_PROFILE_H
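These event types arrive as plain QnnProfile_EventType_t codes when a client walks profile data, so a small translation helper keeps logs readable. A sketch using only the codes defined in this header; the event iteration itself goes through the generic QnnProfile API and is not shown:

```c
// Hedged sketch: map the DSP-specific execute-phase event codes to labels.
static const char* dspExecuteEventLabel(QnnProfile_EventType_t type) {
  switch (type) {
    case QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HOST_RPC_TIME_MICROSEC:
      return "execute: host RPC (us)";
    case QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_DSP_RPC_TIME_MICROSEC:
      return "execute: DSP RPC (us)";
    case QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_CYCLE:
      return "execute: accelerator (cycles)";
    case QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_MICROSEC:
      return "execute: accelerator (us)";
    case QNN_DSP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_COUNT:
      return "execute: yield count";
    default:
      return "other/unknown DSP event";
  }
}
```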
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProperty.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProperty.h
new file mode 100755
index 0000000000000..39669338e35f8
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/QnnDspProperty.h
@@ -0,0 +1,30 @@
+//==============================================================================
+//
+// Copyright (c) 2022-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef QNN_DSP_PROPERTY_H
+#define QNN_DSP_PROPERTY_H
+
+#include "QnnProperty.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+/**
+ * @brief Property key for determining whether a backend supports unsigned pd.
+ */
+#define QNN_PROPERTY_CUSTOM_DSP_UNSIGNED_PD_SUPPORT QNN_PROPERTY_GROUP_CUSTOM + 1
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // QNN_DSP_PROPERTY_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoBase.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoBase.h
new file mode 100755
index 0000000000000..942e5997ab5ff
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoBase.h
@@ -0,0 +1,509 @@
+//==============================================================================
+//
+// Copyright (c) 2019-2021 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef SNPE_UDO_BASE_H
+#define SNPE_UDO_BASE_H
+
+#include <stdint.h>
+
+// Provide values to use for API version.
+#define API_VERSION_MAJOR 1
+#define API_VERSION_MINOR 6
+#define API_VERSION_TEENY 0
+
+/** @addtogroup c_plus_plus_apis C++
+@{ */
+
+// Defines a bitmask of enum values.
+typedef uint32_t SnpeUdo_Bitmask_t;
+typedef SnpeUdo_Bitmask_t Udo_Bitmask_t;
+
+// A string of characters, rather than an array of bytes.
+// Assumed to be UTF-8.
+typedef char* SnpeUdo_String_t;
+typedef SnpeUdo_String_t Udo_String_t;
+
+// The maximum allowable length of a SnpeUdo_String_t in bytes,
+// including null terminator. SNPE will truncate strings longer
+// than this.
+#define SNPE_UDO_MAX_STRING_SIZE 1024
+
+/**
+ * An enum which holds the various error types.
+ * The error types are divided into classes:
+ * 0 - 99 : generic errors
+ * 100 - 200 : errors related to configuration
+ *
+ */
+typedef enum
+{
+ /// No Error
+ SNPE_UDO_NO_ERROR = 0, UDO_NO_ERROR = 0,
+ /// Unsupported value for core type
+ SNPE_UDO_WRONG_CORE = 1, UDO_WRONG_CORE = 1,
+ /// Invalid attribute/argument passed into UDO API
+ SNPE_UDO_INVALID_ARGUMENT = 2, UDO_INVALID_ARGUMENT = 2,
+ /// Unsupported feature error
+ SNPE_UDO_UNSUPPORTED_FEATURE = 3, UDO_UNSUPPORTED_FEATURE = 3,
+ /// Error relating to memory allocation
+ SNPE_UDO_MEM_ALLOC_ERROR = 4, UDO_MEM_ALLOC_ERROR = 4,
+ /* Configuration Specific errors */
+ /// No op with given attributes available in library
+ SNPE_UDO_WRONG_OPERATION = 100, UDO_WRONG_OPERATION = 100,
+ /// Unsupported value for core type in UDO configuration
+ SNPE_UDO_WRONG_CORE_TYPE = 101, UDO_WRONG_CORE_TYPE = 101,
+ /// Wrong number of params in UDO definition
+ SNPE_UDO_WRONG_NUM_OF_PARAMS = 102, UDO_WRONG_NUM_OF_PARAMS = 102,
+ /// Wrong number of dimensions for tensor(s) in UDO definition
+ SNPE_UDO_WRONG_NUM_OF_DIMENSIONS = 103, UDO_WRONG_NUM_OF_DIMENSIONS = 103,
+ /// Wrong number of input tensors in UDO definition
+ SNPE_UDO_WRONG_NUM_OF_INPUTS = 104, UDO_WRONG_NUM_OF_INPUTS = 104,
+ /// Wrong number of output tensors in UDO definition
+ SNPE_UDO_WRONG_NUM_OF_OUTPUTS = 105, UDO_WRONG_NUM_OF_OUTPUTS = 105,
+ SNPE_UDO_PROGRAM_CACHE_NOT_FOUND = 106, UDO_PROGRAM_CACHE_NOT_FOUND = 106,
+ SNPE_UDO_UNKNOWN_ERROR = 0xFFFFFFFF, UDO_UNKNOWN_ERROR = 0xFFFFFFFF
+} SnpeUdo_ErrorType_t;
+
+typedef SnpeUdo_ErrorType_t Udo_ErrorType_t;
+
+/**
+ * An enum which holds the various data types.
+ * Designed to be used as single values or combined into a bitfield parameter
+ * (0x1, 0x2, 0x4, etc)
+ * \n FIXED_XX types are targeted for data in tensors.
+ * \n UINT / INT types are targeted for scalar params + */ +typedef enum +{ + /// data type: 16-bit floating point + SNPE_UDO_DATATYPE_FLOAT_16 = 0x01, UDO_DATATYPE_FLOAT_16 = 0x01, + /// data type: 32-bit floating point + SNPE_UDO_DATATYPE_FLOAT_32 = 0x02, UDO_DATATYPE_FLOAT_32 = 0x02, + /// data type: 4-bit fixed point + SNPE_UDO_DATATYPE_FIXED_4 = 0x04, UDO_DATATYPE_FIXED_4 = 0x04, + /// data type: 8-bit fixed point + SNPE_UDO_DATATYPE_FIXED_8 = 0x08, UDO_DATATYPE_FIXED_8 = 0x08, + /// data type: 16-bit fixed point + SNPE_UDO_DATATYPE_FIXED_16 = 0x10, UDO_DATATYPE_FIXED_16 = 0x10, + /// data type: 32-bit fixed point + SNPE_UDO_DATATYPE_FIXED_32 = 0x20, UDO_DATATYPE_FIXED_32 = 0x20, + /// data type: 8-bit unsigned integer + SNPE_UDO_DATATYPE_UINT_8 = 0x100, UDO_DATATYPE_UINT_8 = 0x100, + /// data type: 16-bit unsigned integer + SNPE_UDO_DATATYPE_UINT_16 = 0x200, UDO_DATATYPE_UINT_16 = 0x200, + /// data type: 32-bit unsigned integer + SNPE_UDO_DATATYPE_UINT_32 = 0x400, UDO_DATATYPE_UINT_32 = 0x400, + /// data type: 8-bit signed integer + SNPE_UDO_DATATYPE_INT_8 = 0x1000, UDO_DATATYPE_INT_8 = 0x1000, + /// data type: 16-bit signed integer + SNPE_UDO_DATATYPE_INT_16 = 0x2000, UDO_DATATYPE_INT_16 = 0x2000, + /// data type: 32-bit signed integer + SNPE_UDO_DATATYPE_INT_32 = 0x4000, UDO_DATATYPE_INT_32 = 0x4000, + SNPE_UDO_DATATYPE_LAST = 0xFFFFFFFF, UDO_DATATYPE_LAST = 0xFFFFFFFF +} SnpeUdo_DataType_t; + +typedef SnpeUdo_DataType_t Udo_DataType_t; + +/** + * An enum which holds the various layouts. + * Designed to be used as single values or combined into a bitfield parameter + * (0x1, 0x2, 0x4, etc) + */ +typedef enum +{ + /// data layout (4D): NHWC (batch-height-width-channel) + SNPE_UDO_LAYOUT_NHWC = 0x01, UDO_LAYOUT_NHWC = 0x01, + /// data layout (4D): NCHW (batch-channel-height-width) + SNPE_UDO_LAYOUT_NCHW = 0x02, UDO_LAYOUT_NCHW = 0x02, + /// data layout (5D): NDHWC (batch-dimension-height-width-channel) + SNPE_UDO_LAYOUT_NDHWC = 0x04, UDO_LAYOUT_NDHWC = 0x04, + SNPE_UDO_LAYOUT_GPU_OPTIMAL1 = 0x08, UDO_LAYOUT_GPU_OPTIMAL1 = 0x08, + SNPE_UDO_LAYOUT_GPU_OPTIMAL2 = 0x10, UDO_LAYOUT_GPU_OPTIMAL2 = 0x10, + SNPE_UDO_LAYOUT_DSP_OPTIMAL1 = 0x11, UDO_LAYOUT_DSP_OPTIMAL1 = 0x11, + SNPE_UDO_LAYOUT_DSP_OPTIMAL2 = 0x12, UDO_LAYOUT_DSP_OPTIMAL2 = 0x12, + // Indicates no data will be allocated for this tensor. + // Used to specify optional inputs/outputs positionally. + SNPE_UDO_LAYOUT_NULL = 0x13, UDO_LAYOUT_NULL = 0x13, + SNPE_UDO_LAYOUT_LAST = 0xFFFFFFFF, UDO_LAYOUT_LAST = 0xFFFFFFFF +} SnpeUdo_TensorLayout_t; + +typedef SnpeUdo_TensorLayout_t Udo_TensorLayout_t; + +/** + * An enum which holds the UDO library Core type . 
+ * Designed to be used as single values or combined into a bitfield parameter + * (0x1, 0x2, 0x4, etc) + */ +typedef enum +{ + /// Library target IP Core is undefined + SNPE_UDO_CORETYPE_UNDEFINED = 0x00, UDO_CORETYPE_UNDEFINED = 0x00, + /// Library target IP Core is CPU + SNPE_UDO_CORETYPE_CPU = 0x01, UDO_CORETYPE_CPU = 0x01, + /// Library target IP Core is GPU + SNPE_UDO_CORETYPE_GPU = 0x02, UDO_CORETYPE_GPU = 0x02, + /// Library target IP Core is DSP + SNPE_UDO_CORETYPE_DSP = 0x04, UDO_CORETYPE_DSP = 0x04, + SNPE_UDO_CORETYPE_LAST = 0xFFFFFFFF, UDO_CORETYPE_LAST = 0xFFFFFFFF +} SnpeUdo_CoreType_t; + +typedef SnpeUdo_CoreType_t Udo_CoreType_t; + +/** + * An enum to specify the parameter type : Scalar or Tensor + */ +typedef enum +{ + /// UDO static param type: scalar + SNPE_UDO_PARAMTYPE_SCALAR = 0x00, UDO_PARAMTYPE_SCALAR = 0x00, + /// UDO static param type: string + SNPE_UDO_PARAMTYPE_STRING = 0x01, UDO_PARAMTYPE_STRING = 0x01, + /// UDO static param type: tensor + SNPE_UDO_PARAMTYPE_TENSOR = 0x02, UDO_PARAMTYPE_TENSOR = 0x02, + SNPE_UDO_PARAMTYPE_LAST = 0xFFFFFFFF, UDO_PARAMTYPE_LAST = 0xFFFFFFFF +} SnpeUdo_ParamType_t; + +typedef SnpeUdo_ParamType_t Udo_ParamType_t; + +/** + * An enum to specify quantization type + */ +typedef enum +{ + /// Tensor Quantization type: NONE. Signifies unquantized tensor data + SNPE_UDO_QUANTIZATION_NONE = 0x00, UDO_QUANTIZATION_NONE = 0x00, + /// Tensor Quantization type: Tensorflow-style + SNPE_UDO_QUANTIZATION_TF = 0x01, UDO_QUANTIZATION_TF = 0x01, + SNPE_UDO_QUANTIZATION_QMN = 0x02, UDO_QUANTIZATION_QMN = 0x02, + SNPE_UDO_QUANTIZATION_LAST = 0xFFFFFFFF, UDO_QUANTIZATION_LAST = 0xFFFFFFFF +} SnpeUdo_QuantizationType_t; + +typedef SnpeUdo_QuantizationType_t Udo_QuantizationType_t; + +/** + * @brief A struct which is used to provide a version number using 3 values : major, minor, teeny + * + */ +typedef struct +{ + /// version field: major - for backward-incompatible changes + uint32_t major; + /// version field: minor - for backward-compatible feature updates + uint32_t minor; + /// version field: teeny - for minor bug-fixes and clean-up + uint32_t teeny; +} SnpeUdo_Version_t; + +typedef SnpeUdo_Version_t Udo_Version_t; + +/** + * @brief A struct returned from version query, contains the Library version and API version + * + */ +typedef struct +{ + /// Version of UDO library. Controlled by users + SnpeUdo_Version_t libVersion; + /// Version of SNPE UDO API used in compiling library. Determined by SNPE + SnpeUdo_Version_t apiVersion; +} SnpeUdo_LibVersion_t; + +/** + * @brief A struct returned from version query, contains the package version + * + */ +typedef struct +{ + /// Version of UDO API used in package. + Udo_Version_t apiVersion; +} Udo_PkgVersion_t; + +/** + * @brief A union to hold the value of a generic type. Allows defining a parameter struct + * in a generic way, with a "value" location that holds the data regardless of the type. 
+ * + */ +typedef union +{ + /// value type: float + float floatValue; + /// value type: unsigned 32-bit integer + uint32_t uint32Value; + /// value type: signed 32-bit integer + int32_t int32Value; + /// value type: unsigned 16-bit integer + uint16_t uint16Value; + /// value type: signed 16-bit integer + int16_t int16Value; + /// value type: unsigned 8-bit integer + uint8_t uint8Value; + /// value type: signed 8-bit integer + int8_t int8Value; +} SnpeUdo_Value_t; + +typedef SnpeUdo_Value_t Udo_Value_t; + +/** + * @brief A struct which defines a scalar parameter : name, data type, and union of values + * + */ +typedef struct +{ + /// The parameter data type : float, int, etc. + SnpeUdo_DataType_t dataType; + /// a union of specified type which holds the data + SnpeUdo_Value_t dataValue; +} SnpeUdo_ScalarParam_t; + +typedef SnpeUdo_ScalarParam_t Udo_ScalarParam_t; + +/** + * @brief A struct which defines the quantization parameters in case of Tensorflow style quantization + * + */ +typedef struct +{ + /// minimum value of the quantization range of data + float minValue; + /// maximum value of the quantization range of data + float maxValue; +} SnpeUdo_TFQuantize_t; + +typedef SnpeUdo_TFQuantize_t Udo_TFQuantize_t; + +/** + * @brief A struct which defines the quantization type, and union of supported quantization structs + * + */ +typedef struct +{ + /// quantization type (only TF-style currently supported) + SnpeUdo_QuantizationType_t quantizeType; + union + { + /// TF-style min-max quantization ranges + SnpeUdo_TFQuantize_t TFParams; + }; +} SnpeUdo_QuantizeParams_t; + +typedef SnpeUdo_QuantizeParams_t Udo_QuantizeParams_t; + +/** + * @brief A struct which defines the datatype associated with a specified core-type + * This should be used to denote the datatypes for a single tensor info, depending + * on the intended execution core. + * + */ +typedef struct +{ + /// The IP Core + SnpeUdo_CoreType_t coreType; + /// The associated datatype for this coreType + SnpeUdo_DataType_t dataType; +} SnpeUdo_PerCoreDatatype_t; + +typedef SnpeUdo_PerCoreDatatype_t Udo_PerCoreDatatype_t; + +/** + * @brief A struct which defines a tensor parameter : name, data type, layout, quantization, more. + * Also holds a pointer to the tensor data. + * + */ +typedef struct +{ + /// The maximum allowable dimensions of the tensor. The memory held in + /// _tensorData_ is guaranteed to be large enough for this. + uint32_t* maxDimensions; + /// The current dimensions of the tensor. An operation may modify the current + /// dimensions of its output, to indicate cases where the output has been + /// "resized". + /// Note that for static parameters, the current and max dimensions must + /// match. + uint32_t* currDimensions; + /// Quantization params applicable to the tensor. Currently only supports + /// Tensorflow quantization style. + SnpeUdo_QuantizeParams_t quantizeParams; + /// Number of dimensions to the tensor: 3D, 4D, etc. + uint32_t tensorRank; + /// The parameter data type: float, int, etc. + SnpeUdo_DataType_t dataType; + /// The tensor layout type: NCHW, NHWC, etc. + SnpeUdo_TensorLayout_t layout; + /// Opaque pointer to tensor data. User may be required to re-interpret the pointer + /// based on core-specific definitions. 
+ void* tensorData; +} SnpeUdo_TensorParam_t; + +typedef SnpeUdo_TensorParam_t Udo_TensorParam_t; + +/** + * @brief struct which defines a UDO parameter - a union of scalar, tensor and string parameters + * + */ +typedef struct +{ + /// Type is scalar or tensor + SnpeUdo_ParamType_t paramType; + /// The param name, for example : "offset", "activation_type" + SnpeUdo_String_t paramName; + union + { + /// scalar param value + SnpeUdo_ScalarParam_t scalarParam; + /// tensor param value + SnpeUdo_TensorParam_t tensorParam; + /// string param value + SnpeUdo_String_t stringParam; + }; +} SnpeUdo_Param_t; + +typedef SnpeUdo_Param_t Udo_Param_t; + +/** + * @brief A struct which defines Operation information which is specific for IP core (CPU, GPU, DSP ...) + * + */ +typedef struct +{ + /// The IP Core + SnpeUdo_CoreType_t udoCoreType; + /// Bitmask, defines supported internal calculation types (like FLOAT_32, etc) + /// Based on SnpeUdo_DataType + SnpeUdo_Bitmask_t operationCalculationTypes; +} SnpeUdo_OpCoreInfo_t; + +typedef SnpeUdo_OpCoreInfo_t Udo_OpCoreInfo_t; + +/** + * @brief A struct which defines the common and core-specific Operation information + * + */ +typedef struct +{ + /// Operation type + SnpeUdo_String_t operationType; + /// A bitmask describing which IP Cores (CPU, GPU, DSP ...) support this operation + /// Translated based on SnpeUdo_CoreType + SnpeUdo_Bitmask_t supportedByCores; + /// Number of static parameters defined by the op + uint32_t numOfStaticParams; + /// Array of static parameters. Can be scalar or tensor params + SnpeUdo_Param_t* staticParams; + /// Number of input tensors this op receives + uint32_t numOfInputs; + /// Array of input tensor names to this operation + SnpeUdo_String_t* inputNames; + /// Number of output tensors this op receives + uint32_t numOfOutputs; + /// Array of output tensor names to this operation + SnpeUdo_String_t* outputNames; + /// Number of cores that the op can execute on + uint32_t numOfCoreInfo; + /// Array of per-core information entries + SnpeUdo_OpCoreInfo_t* opPerCoreInfo; +} SnpeUdo_OperationInfo_t; + +typedef SnpeUdo_OperationInfo_t Udo_OperationInfo_t; + +/** + * @brief A struct which provides the implementation library info : type, name + * + */ +typedef struct +{ + /// Defines the IP Core that this implementation library is targeting + SnpeUdo_CoreType_t udoCoreType; + /// library name. will be looked at in the standard library path + SnpeUdo_String_t libraryName; +} SnpeUdo_LibraryInfo_t; + +typedef SnpeUdo_LibraryInfo_t Udo_LibraryInfo_t; + +/** + * @brief A struct returned by the registration library and contains information on the UDO package : + * name, operations, libraries, etc. + * + */ +typedef struct +{ + /// A string containing the package name + SnpeUdo_String_t packageName; + /// A bitmask describing supported IP cores (CPU, GPU, DSP ...) + /// Translated based on SnpeUdo_CoreType + SnpeUdo_Bitmask_t supportedCoreTypes; + /// The number of implementation libraries in the package + uint32_t numOfImplementationLib; + /// Array of implementation libraries names/types + SnpeUdo_LibraryInfo_t* implementationLib; + /// A string containing all operation types separated by space + SnpeUdo_String_t operationsString; + /// Number of supported operations + uint32_t numOfOperations; + /// Array of Operation info structs. 
Each entry describes one
+ /// Operation (name, params, inputs, outputs)
+ SnpeUdo_OperationInfo_t* operationsInfo;
+} SnpeUdo_RegInfo_t;
+
+typedef SnpeUdo_RegInfo_t Udo_RegInfo_t;
+
+/**
+* @brief A struct returned by the implementation library which contains information on the
+* specific library: name, IP Core, operations, etc.
+*
+*/
+typedef struct
+{
+ /// Defines the IP Core that this implementation library is targeting
+ SnpeUdo_CoreType_t udoCoreType;
+ /// A string containing the package name
+ SnpeUdo_String_t packageName;
+ /// A string containing all operation types separated by space
+ SnpeUdo_String_t operationsString;
+ /// Number of supported operations
+ uint32_t numOfOperations;
+} SnpeUdo_ImpInfo_t;
+
+typedef SnpeUdo_ImpInfo_t Udo_ImpInfo_t;
+
+/**
+ * @brief This struct defines an operation. It is used for validation
+ * or creation of an operation.
+ * In case of using it for creation, the static params which are tensors
+ * contain pointers to the real data (weights, for example), and input/output
+ * tensors also include pointers to the buffers used.
+ */
+typedef struct
+{
+ /// The IP Core that the operation is defined for - CPU, GPU, DSP...
+ SnpeUdo_CoreType_t udoCoreType;
+ /// Operation type
+ SnpeUdo_String_t operationType;
+ /// The number of static parameters provided in the staticParams array.
+ /// This number has to match the number provided by the UDO Registration library information
+ uint32_t numOfStaticParams;
+ /// Array of static parameters
+ SnpeUdo_Param_t* staticParams;
+ /// The number of input parameters provided in the inputs array.
+ /// This number has to match the number provided by the UDO Registration library information
+ uint32_t numOfInputs;
+ /// Array of input tensors, providing layout, data type, sizes, etc
+ /// When used to create an operation, also contains the initial location of the data
+ SnpeUdo_TensorParam_t* inputs;
+ /// The number of output parameters provided in the outputs array.
+ /// This number has to match the number provided by the UDO Registration library information
+ uint32_t numOfOutputs;
+ /// Array of output tensors, providing layout, data type, sizes, etc
+ /// When used to create an operation, also contains the initial location of the data
+ SnpeUdo_TensorParam_t* outputs;
+} SnpeUdo_OpDefinition_t;
+
+typedef SnpeUdo_OpDefinition_t Udo_OpDefinition_t;
+
+/** @} */ /* end_addtogroup c_plus_plus_apis C++ */
+
+#endif //SNPE_UDO_BASE_H
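The structs above compose naturally: a scalar SnpeUdo_Param_t wraps a SnpeUdo_ScalarParam_t, which wraps the value union, and the *_Bitmask_t fields are tested with bitwise AND against the enum flags. A small illustrative sketch; the parameter name and values are made up:

```c
// Hedged sketch: build a scalar static param and probe a core-type bitmask.
SnpeUdo_Param_t axisParam;
axisParam.paramType = SNPE_UDO_PARAMTYPE_SCALAR;
axisParam.paramName = (SnpeUdo_String_t) "axis";   // example name, not from the SDK
axisParam.scalarParam.dataType = SNPE_UDO_DATATYPE_INT_32;
axisParam.scalarParam.dataValue.int32Value = 1;

// supportedByCores/supportedCoreTypes are bitmasks built from SnpeUdo_CoreType_t.
SnpeUdo_Bitmask_t cores = SNPE_UDO_CORETYPE_CPU | SNPE_UDO_CORETYPE_DSP;
int runsOnDsp = (cores & SNPE_UDO_CORETYPE_DSP) != 0;  // evaluates to 1
```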
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoFlatten.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoFlatten.h
new file mode 100755
index 0000000000000..84a8fe310908e
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoFlatten.h
@@ -0,0 +1,78 @@
+//==============================================================================
+//
+// Copyright (c) 2019 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#include "DSP/Udo/UdoBase.h"
+
+#define HVX_ALIGNMENT 128
+#define DSP_STRUCT_ALIGNMENT 8
+#define DSP_ALIGN(X, ALIGNMENT) (((X) + ALIGNMENT - 1) & (~((ALIGNMENT)-1)))
+
+typedef struct dspStaticParamsMeta {
+ uint32_t size;
+ uint32_t numParams;
+} dspStaticParamsMeta_t;
+
+typedef struct tensorParamInfo {
+ SnpeUdo_TensorLayout_t layout;
+ SnpeUdo_QuantizeParams_t quantizeInfo;
+ SnpeUdo_DataType_t dataType;
+ uint32_t paddingFor8byteAlignment;
+} tensorParamInfo_t;
+
+typedef struct udoString {
+ uint32_t sizeStruct; // aligned
+ uint32_t lengthString; // does not include null character
+ // followed by a string
+} udoString_t; // allocate mem for string for 8 byte alignment
+
+typedef struct dims {
+ uint32_t size;
+ uint32_t rank;
+ uint32_t ds; // rank # of max dimensions followed by rank # of current dimensions for tensors
+} dims_t;
+
+typedef struct tensorData {
+ uint32_t structSize;
+ uint32_t dataSize;
+ // followed by actual tensor data
+} tensorData_t;
+
+typedef struct dspStaticParamDescriptor {
+ uint32_t size; // including size of descriptor (including dims + data for tensors) (or including string for strings)
+ SnpeUdo_ParamType_t paramType;
+ union { // not used for string data
+ SnpeUdo_ScalarParam_t scalarInfo;
+ tensorParamInfo_t tensorInfo;
+ };
+ udoString_t name;
+ // followed by char*
+ // in case of tensor, followed by dim_stride and tensor_data
+ // in case of string, followed by udo_string and char*
+} dspStaticParamDescriptor_t;
+
+typedef struct paramSizes {
+ uint32_t descriptorSize;
+ uint32_t nameStructSize;
+ uint32_t dimsSize;
+ uint32_t dataStructSize;
+ uint32_t dataSize;
+ uint32_t stringDataStructSize;
+} paramSizes_t;
+
+typedef struct dspStaticParams {
+ dspStaticParamsMeta_t meta;
+ dspStaticParamDescriptor_t paramDesc;
+} dspStaticParams_t;
+
+
+int
+SnpeUdo_flattenStaticParams (SnpeUdo_Param_t** paramList, uint32_t numParams, uint32_t* flattenedSize, void** flattened);
+
+void
+SnpeUdo_freeFlattenedStaticParams (void** flattened);
+
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImpl.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImpl.h
new file mode 100755
index 0000000000000..bcc767a3c4a0f
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImpl.h
@@ -0,0 +1,343 @@
+//==============================================================================
+//
+// Copyright (c) 2019-2021 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef SNPE_UDO_IMPL_H
+#define SNPE_UDO_IMPL_H
+
+#include <stdbool.h>
+
+#include "DSP/Udo/UdoShared.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/** @addtogroup c_plus_plus_apis C++
+@{ */
+
+typedef struct _SnpeUdo_OpFactory_t* SnpeUdo_OpFactory_t;
+typedef struct _SnpeUdo_Operation_t* SnpeUdo_Operation_t;
+
+typedef SnpeUdo_OpFactory_t Udo_OpFactory_t;
+typedef SnpeUdo_Operation_t Udo_Operation_t;
+
+/**
+ * @brief Initialize the shared library's data structures. Calling any other
+ * library function before this one will result in an error.
+ *
+ * @param[in] globalInfrastructure Global core-specific infrastructure to be
+ * used by operations created in this library. The definition and
+ * semantics of this object will be defined in the corresponding
+ * implementation header for the core type.
+ * @return Error code
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_initImplLibrary(void* globalInfrastructure);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_InitImplLibraryFunction_t)(void*);
+
+/**
+ * @brief A function to query the API version of the UDO implementation library.
+ * The function populates a SnpeUdo_LibVersion_t struct, which contains a SnpeUdo_Version_t
+ * struct for API version and library version.
+ *
+ * @param[in, out] version A pointer to struct which contains major, minor, teeny information for
+ * library and api versions.
+ *
+ * @return Error code
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_getImplVersion(SnpeUdo_LibVersion_t** version);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_getImplVersion_t)(SnpeUdo_LibVersion_t** version);
+
+/**
+ * @brief Release the shared library's data structures, and invalidate any
+ * handles returned by the library. The behavior of any outstanding
+ * asynchronous calls made to this library when this function is called
+ * is undefined. All library functions (except SnpeUdo_initImplLibrary) will
+ * return an error after this function has been successfully called.
+ *
+ * It should be possible to call SnpeUdo_initImplLibrary after calling this
+ * function, and re-initialize the library.
+ *
+ * @return Error code
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_terminateImplLibrary(void);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_TerminateImplLibraryFunction_t)(void);
+
+
+/**
+ * @brief A function to query info on the UDO implementation library.
+ * The function populates a structure which contains information about
+ * operations that are part of this library
+ *
+ * @param[in, out] implementationInfo A pointer to struct which contains information
+ * on the operations
+ *
+ * @return Error code
+ *
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_getImpInfo(SnpeUdo_ImpInfo_t** implementationInfo);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_GetImpInfoFunction_t)(SnpeUdo_ImpInfo_t** implementationInfo);
+
+typedef SnpeUdo_GetImpInfoFunction_t Udo_GetImpInfoFunction_t;
+
+/**
+ * @brief A function to create an operation factory.
+ * The function receives the operation type, and an array of static parameters,
+ * and returns an operation factory handle
+ *
+ * @param[in] udoCoreType The Core type to create the operation on. An error will
+ * be returned if this does not match the core type of the library.
+ *
+ * @param[in] perFactoryInfrastructure CreateOpFactory infrastructure appropriate to this
+ * core type. The definition and semantics of this object will be defined
+ * in the corresponding implementation header for the core type.
+ *
+ * @param[in] operationType A string containing the Operation type, for example "MY_CONV"
+ *
+ * @param[in] numOfStaticParams The number of static parameters.
+ *
+ * @param[in] staticParams Array of static parameters
+ *
+ * @param[in,out] opFactory Handle to Operation Factory, to be used when creating operations
+ *
+ * @return Error Code
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_createOpFactory(SnpeUdo_CoreType_t udoCoreType,
+ void* perFactoryInfrastructure,
+ SnpeUdo_String_t operationType,
+ uint32_t numOfStaticParams,
+ SnpeUdo_Param_t* staticParams,
+ SnpeUdo_OpFactory_t* opFactory);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_CreateOpFactoryFunction_t)(SnpeUdo_CoreType_t,
+ void*,
+ SnpeUdo_String_t,
+ uint32_t,
+ SnpeUdo_Param_t*,
+ SnpeUdo_OpFactory_t*);
+
+typedef SnpeUdo_CreateOpFactoryFunction_t Udo_CreateOpFactoryFunction_t;
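Each public entry point is paired with a function-pointer typedef because the implementation library is loaded dynamically by the runtime. A hedged sketch of how such resolution might look with dlopen/dlsym; the library path is a placeholder and error handling is abbreviated:

```c
#include <dlfcn.h>
#include <stddef.h>

// Hedged sketch: resolve and call the UDO impl init entry point at runtime.
// globalInfra comes from the core-specific runtime (see the header text above).
static SnpeUdo_ErrorType_t load_udo_impl(const char* path, void* globalInfra) {
  void* handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
  if (handle == NULL) return SNPE_UDO_UNKNOWN_ERROR;
  SnpeUdo_InitImplLibraryFunction_t initFn =
      (SnpeUdo_InitImplLibraryFunction_t)dlsym(handle, "SnpeUdo_initImplLibrary");
  if (initFn == NULL) {
    dlclose(handle);
    return SNPE_UDO_UNKNOWN_ERROR;
  }
  return initFn(globalInfra);
}
```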
+
+/**
+ * @brief A function to release the resources allocated for an operation factory
+ * created by this library.
+ *
+ * @param[in] opFactory The operation factory to release. Upon success this handle will be invalidated.
+ *
+ * @return Error Code
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_releaseOpFactory(SnpeUdo_OpFactory_t opFactory);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_ReleaseOpFactoryFunction_t)(SnpeUdo_OpFactory_t);
+
+typedef SnpeUdo_ReleaseOpFactoryFunction_t Udo_ReleaseOpFactoryFunction_t;
+
+/**
+ * @brief A function to create an operation from the factory.
+ * The function receives an array of inputs and an array of outputs, and creates an operation
+ * instance, returning the operation instance handle.
+ *
+ * @param[in] opFactory OpFactory instance containing the parameters for this operation.
+ *
+ * @param[in] perOpInfrastructure Per-Op infrastructure for this operation. The definition
+ * and semantics of this object will be defined in the implementation header
+ * appropriate to this core type.
+ *
+ * @param[in] numOfInputs The number of input tensors this operation will receive.
+ *
+ * @param[in] inputs Array of input tensors, providing both the sizes and initial
+ * location of the data.
+ *
+ * @param[in] numOfOutputs Number of output tensors this operation will produce.
+ *
+ * @param[in] outputs Array of output tensors, providing both the sizes and
+ * initial location of the data.
+ *
+ * @param[in,out] operation Handle for newly created operation instance.
+ *
+ * @return Error Code
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_createOperation(SnpeUdo_OpFactory_t opFactory,
+ void* perOpInfrastructure,
+ uint32_t numOfInputs,
+ SnpeUdo_TensorParam_t* inputs,
+ uint32_t numOfOutputs,
+ SnpeUdo_TensorParam_t* outputs,
+ SnpeUdo_Operation_t* operation);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_CreateOperationFunction_t)(SnpeUdo_OpFactory_t,
+ void*,
+ uint32_t,
+ SnpeUdo_TensorParam_t*,
+ uint32_t,
+ SnpeUdo_TensorParam_t*,
+ SnpeUdo_Operation_t*);
+
+typedef SnpeUdo_CreateOperationFunction_t Udo_CreateOperationFunction_t;
+
+/**
+ * @brief A pointer to a notification function.
+ *
+ * The notification function supports the non-blocking (e.g. asynchronous) execution use-case.
+ * In case an "executeUdoOp" function is called with "blocking" set to zero, and a
+ * notify function, this function will be called by the implementation library at the
+ * end of execution. The implementation library will pass the notify function the ID
+ * that was provided to it when "executeUdoOp" was called.
+ *
+ * @param[in] ID 32-bit value, that was provided to executeUdoOp by the calling entity.
+ * Can be used to track the notifications, in case of multiple execute calls issued.
+ *
+ * @return Error code
+ *
+ */
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_ExternalNotify_t)(const uint32_t ID);
+
+typedef SnpeUdo_ExternalNotify_t Udo_ExternalNotify_t;
+
+/**
+ * @brief Operation execution function.
+ *
+ * Calling this function will run the operation on a set of inputs, generating a set of outputs.
+ * The call can be blocking (synchronous) or non-blocking (asynchronous). To support the
+ * non-blocking mode, the calling entity can pass an ID and a notification function.
+ * At the end of the execution this notification function would be called, passing it the ID.
+ * NOTE: Asynchronous execution mode not supported in this release.
+ *
+ * @param[in] operation handle to the operation on which execute is invoked
+ * @param[in] blocking flag to indicate execution mode.
+ * If set, execution is blocking,
+ * e.g. SnpeUdo_executeOp call does not return until execution is done.
+ * If not set, SnpeUdo_executeOp returns immediately, and the
+ * library will call the notification function (if set) when execution is done.
+ *
+ * @param[in] ID 32-bit number that can be used by the calling entity to track execution
+ * in case of non-blocking execution.
+ * For example, it can be a sequence number, increased by one on each call.
+ *
+ * @param[in] notifyFunc Pointer to notification function. If the pointer is set, and execution is
+ * non-blocking, the library will call this function at end of execution,
+ * passing the number provided as ID.
+ *
+ * @return Error code
+ *
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_executeOp(SnpeUdo_Operation_t operation,
+                  bool blocking,
+                  const uint32_t ID,
+                  SnpeUdo_ExternalNotify_t notifyFunc);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_ExecuteOpFunction_t)(SnpeUdo_Operation_t,
+                               bool,
+                               const uint32_t,
+                               SnpeUdo_ExternalNotify_t);
+
+typedef SnpeUdo_ExecuteOpFunction_t Udo_ExecuteOpFunction_t;
+
+/**
+ * @brief A function to set the inputs & outputs. Part of the SnpeUdo_Operation struct,
+ * returned from creation of a new operation instance.
+ * Not supported in this release.
+ *
+ * This function allows the calling entity to change some of the inputs and outputs
+ * between calls to execute.
+ * Note that the change is limited to changing the pointer to the tensor data only.
+ * Any other change may be rejected by the implementation library, causing
+ * immediate invalidation of the operation instance.
+ *
+ * @param[in] operation Operation on which IO tensors are set
+ *
+ * @param[in] inputs Array of tensor parameters. The calling entity may provide a subset of the
+ * operation inputs, providing only those that it wants to change.
+ *
+ * @param[in] outputs Array of tensor parameters. The calling entity may provide a subset of the
+ * operation outputs, providing only those that it wants to change.
+ *
+ * @return Error code
+ *
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_setOpIO(SnpeUdo_Operation_t operation,
+                SnpeUdo_TensorParam_t* inputs,
+                SnpeUdo_TensorParam_t* outputs);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_SetOpIOFunction_t)(SnpeUdo_Operation_t,
+                             SnpeUdo_TensorParam_t*,
+                             SnpeUdo_TensorParam_t*);
+
+typedef SnpeUdo_SetOpIOFunction_t Udo_SetOpIOFunction_t;
+
+/**
+ * @brief A function to return execution times.
+ *
+ * This function can be called to query the operation execution times on the IP core
+ * on which the operation is run. The time is provided in microseconds.
+ *
+ * @param[in] operation Handle to operation whose execution time is being profiled
+ *
+ * @param[in,out] executionTime Pointer to a uint32 value. This function writes the operation
+ * execution time in usec into this value.
+ *
+ * @return Error code
+ *
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_profileOp(SnpeUdo_Operation_t operation, uint32_t *executionTime);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_ProfileOpFunction_t)(SnpeUdo_Operation_t, uint32_t*);
+
+typedef SnpeUdo_ProfileOpFunction_t Udo_ProfileOpFunction_t;
+
+/**
+ * @brief A function to release the operation instance.
+ * \n When it is called, the implementation library needs to release all resources
+ * allocated for this operation instance.
+ * \n Note that all function pointers which are part of SnpeUdo_Operation become
+ * invalid once the SnpeUdo_releaseOp call returns.
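+ *
+ * End-of-lifecycle sketch (editorial illustration; creation of "op" via
+ * SnpeUdo_createOperation and error checks are elided):
+ * @code
+ *   SnpeUdo_executeOp(op, true /*blocking*/, 0 /*ID*/, NULL /*notifyFunc*/);
+ *   SnpeUdo_releaseOp(op);  // op and its function pointers are now invalid
+ * @endcode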
+ *
+ * @param[in] operation Handle to operation to be released
+ * @return Error code
+ *
+ */
+SnpeUdo_ErrorType_t
+SnpeUdo_releaseOp(SnpeUdo_Operation_t operation);
+
+typedef SnpeUdo_ErrorType_t
+(*SnpeUdo_ReleaseOpFunction_t)(SnpeUdo_Operation_t);
+
+typedef SnpeUdo_ReleaseOpFunction_t Udo_ReleaseOpFunction_t;
+
+/** @} */ /* end_addtogroup c_plus_plus_apis C++ */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif //SNPE_UDO_IMPL_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImplDsp.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImplDsp.h
new file mode 100755
index 0000000000000..522c6050a402d
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoImplDsp.h
@@ -0,0 +1,199 @@
+//==============================================================================
+//
+// Copyright (c) 2019-2021 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+// Header to be used by a DSP Hexnn UDO Implementation library
+
+#ifndef SNPE_UDO_IMPL_DSP_H
+#define SNPE_UDO_IMPL_DSP_H
+#include
+#include "DSP/Udo/UdoImpl.h"
+
+/** @addtogroup c_plus_plus_apis C++
+@{ */
+
+/**
+ * @brief A function to validate that a set of params is supported by an operation.
+ * This function is HexNN specific; its use case is when the registration library is not in use.
+ * Optional function.
+ *
+ * @param[in] operationType Operation type
+ * @param[in] numOfStaticParams Number of static params defined by the op
+ * @param[in] staticParams Array of static params to the op
+ * @return Error code, indicating if the operation can be created on this set of configurations or not.
+ *
+ */
+
+SnpeUdo_ErrorType_t
+SnpeUdo_validateOperation (SnpeUdo_String_t operationType,
+                           uint32_t numOfStaticParams,
+                           const SnpeUdo_Param_t* staticParams);
+
+typedef SnpeUdo_ErrorType_t (*SnpeUdo_ValidateOperationFunction_t) (SnpeUdo_String_t,
+                                                                    uint32_t,
+                                                                    const SnpeUdo_Param_t*);
+
+typedef SnpeUdo_ValidateOperationFunction_t Udo_ValidateOperationFunction_t;
+
+// enum used for indicating input/output tensor data layouts on DSP, plain vs d32
+typedef enum {
+  SNPE_UDO_DSP_TENSOR_LAYOUT_PLAIN = 0x00, UDO_DSP_TENSOR_LAYOUT_PLAIN = 0x00,
+  SNPE_UDO_DSP_TENSOR_LAYOUT_D32 = 0x01, UDO_DSP_TENSOR_LAYOUT_D32 = 0x01
+} SnpeUdo_HexNNTensorLayout_t;
+
+typedef SnpeUdo_HexNNTensorLayout_t Udo_HexNNTensorLayout_t;
+
+/**
+ * @brief A function to query numbers of inputs and outputs,
+ * quantization type of each input and each output as arrays,
+ * and data layout (plain vs d32) of each input and each output as arrays
+ * of an operation.
+ * inputsQuantTypes and inputsLayouts should point to arrays of size numOfInputs
+ * outputsQuantTypes and outputsLayouts should point to arrays of size numOfOutputs
+ *
+ * Note: inputsLayouts and outputsLayouts can point to NULL, in this case, it is
+ * assumed all inputs and/or outputs have plain data layouts, i.e.
no D32 + * + * @param[in] operationType Operation type + * @param[in] numOfStaticParams Number of static params defined by the op + * @param[in] staticParams Array of static params to the op + * @param[in,out] numOfInputs Number of input tensors to the op + * @param[in,out] inputsQuantTypes Array of Quantization info for each input tensor + * @param[in,out] inputsLayouts Array of layout type for each input tensor + * @param[in,out] numOfOutputs Number of output tensors to the op + * @param[in,out] outputsQuantTypes Array of Quantization info for each output tensor + * @param[in,out] outputsLayouts Array of layout type for each output tensor + * @return error code, indicating status of query + */ + +SnpeUdo_ErrorType_t +SnpeUdo_queryOperation (SnpeUdo_String_t operationType, + uint32_t numOfStaticParams, + const SnpeUdo_Param_t* staticParams, + uint32_t* numOfInputs, + SnpeUdo_QuantizationType_t** inputsQuantTypes, + SnpeUdo_HexNNTensorLayout_t** inputsLayouts, + uint32_t* numOfOutputs, + SnpeUdo_QuantizationType_t** outputsQuantTypes, + SnpeUdo_HexNNTensorLayout_t** outputsLayouts); + +typedef SnpeUdo_ErrorType_t (*SnpeUdo_QueryOperationFunction_t) (SnpeUdo_String_t, + uint32_t, + const SnpeUdo_Param_t*, + uint32_t*, + SnpeUdo_QuantizationType_t**, + SnpeUdo_HexNNTensorLayout_t**, + uint32_t*, + SnpeUdo_QuantizationType_t**, + SnpeUdo_HexNNTensorLayout_t**); + +typedef SnpeUdo_QueryOperationFunction_t Udo_QueryOperationFunction_t; + +// Global infrastructure functions supported by Hexagon-NN v2 +typedef void (*workerThread_t) (void* perOpInfrastructure, void* userData); +typedef int (*udoSetOutputTensorSize_t) (void* perOpInfrastructure, uint32_t outIdx, uint32_t size); +typedef int (*udoGetInputD32Paddings_t) (void* perOpInfrastructure, uint32_t inIdx, + uint32_t* heightPadBefore, uint32_t* heightPadAfter, + uint32_t* widthPadBefore, uint32_t* widthPadAfter, + uint32_t* depthPadBefore, uint32_t* depthPadAfter); +typedef int (*udoSetOutputD32ShapeSizePaddings_t) (void* perOpInfrastructure, uint32_t outIdx, + uint32_t batch, + uint32_t height, uint32_t heightPadBefore, uint32_t heightPadAfter, + uint32_t width, uint32_t widthPadBefore, uint32_t widthPadAfter, + uint32_t depth, uint32_t depthPadBefore, uint32_t depthPadAfter, + SnpeUdo_DataType_t dataType); +typedef void* (*udoMemalign_t) (size_t n, size_t size); +typedef void* (*udoMalloc_t) (size_t size); +typedef void* (*udoCalloc_t) (size_t n, size_t size); +typedef void (*udoFree_t) (void* ptr); +typedef uint32_t (*udoGetVtcmSize_t) (void* perOpInfrastructure); +typedef void* (*udoGetVtcmPtr_t) (void* perOpInfrastructure); +typedef uint32_t (*udoVtcmIsReal_t) (void* perOpInfrastructure); +typedef void (*udoRunWorkerThreads_t) (void* perOpInfrastructure, uint32_t nThreads, workerThread_t w, void* userData); + +typedef struct hexNNv2GlobalInfra { + udoSetOutputTensorSize_t udoSetOutputTensorSize; + udoGetInputD32Paddings_t udoGetInputD32Paddings; + udoSetOutputD32ShapeSizePaddings_t udoSetOutputD32ShapeSizePaddings; + udoMemalign_t udoMemalign; + udoMalloc_t udoMalloc; + udoCalloc_t udoCalloc; + udoFree_t udoFree; + udoGetVtcmSize_t udoGetVtcmSize; + udoGetVtcmPtr_t udoGetVtcmPtr; + udoVtcmIsReal_t udoVtcmIsReal; + udoRunWorkerThreads_t udoRunWorkerThreads; +} SnpeUdo_HexNNv2GlobalInfra_t; + +typedef SnpeUdo_HexNNv2GlobalInfra_t Udo_HexNNv2GlobalInfra_t; + +// hexnn types +typedef enum hexnnInfraType { + UDO_INFRA_HEXNN_V2, + UDO_INFRA_HEXNN_V3 // reserved, do not use +} SnpeUdo_HexNNInfraType_t; + +typedef SnpeUdo_HexNNInfraType_t 
Udo_HexNNInfraType_t; + +typedef struct { + Udo_CreateOpFactoryFunction_t create_op_factory; + Udo_CreateOperationFunction_t create_operation; + Udo_ExecuteOpFunction_t execute_op; + Udo_ReleaseOpFunction_t release_op; + Udo_ReleaseOpFactoryFunction_t release_op_factory; + Udo_ValidateOperationFunction_t validate_op; + Udo_QueryOperationFunction_t query_op; +} udo_func_package_t; + +/** + * @brief Infrastructures needed by a developer of DSP Hexnn UDO Implementation library. + * + * The framework/runtime which loads the Hexnn UDO implementation library provides + * this infrastructure to the loaded library by calling "SnpeUdo_initImplLibrary" + * function, and passing it (cast to void*). The Hexnn UDO library is expected + * to cast it back to this structure. + * + */ +typedef struct dspGlobalInfrastructure { + SnpeUdo_Version_t dspInfraVersion; // api version + SnpeUdo_HexNNInfraType_t infraType; + SnpeUdo_HexNNv2GlobalInfra_t hexNNv2Infra; +} SnpeUdo_DspGlobalInfrastructure_t; + +typedef SnpeUdo_DspGlobalInfrastructure_t Udo_DspGlobalInfrastructure_t; + +/** + * hexnn v2 per op factory infrastructure + * + * The framework/runtime passes per op factory infrastructure as a void pointer + * to HexNN UDO implementation library by calling function "SnpeUdo_createOpFactory". + * UDO implementation library is expected to cast it back to this following struct. + * + */ +typedef struct hexnnv2OpFactoryInfra { + unsigned long graphId; +} SnpeUdo_HexNNv2OpFactoryInfra_t; + +typedef SnpeUdo_HexNNv2OpFactoryInfra_t Udo_HexNNv2OpFactoryInfra_t; + +/** + * hexnn v2 per operation infrastructure + * + * The framework/runtime passes per operation infrastructure as a void pointer + * to HexNN UDO implementation library by calling function "SnpeUdo_createOperation". + * UDO implementation library is expected to cast it to the following type and save it. + * + * This is needed to be passed back into some functions from global infrastructure. + * + */ +typedef void* SnpeUdo_HexNNv2OpInfra_t; + +typedef SnpeUdo_HexNNv2OpInfra_t Udo_HexNNv2OpInfra_t; + +/** @} */ /* end_addtogroup c_plus_plus_apis C++ */ + +#endif // SNPE_UDO_IMPL_DSP_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoShared.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoShared.h new file mode 100755 index 0000000000000..8c17c1d5b35f1 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/DSP/Udo/UdoShared.h @@ -0,0 +1,48 @@ +//============================================================================== +// +// Copyright (c) 2019-2021 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef SNPE_UDO_SHARED_H +#define SNPE_UDO_SHARED_H + +#include "DSP/Udo/UdoBase.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** @addtogroup c_plus_plus_apis C++ +@{ */ + +/** + * @brief A function to return the various versions as they relate to the UDO + * The function returns a struct containing the the following: + * libVersion: the version of the implementation library compiled for the UDO. Set by user + * apiVersion: the version of the UDO API used in compiling the implementation library. 
+ * Set by SNPE + * + * @param[in, out] version A pointer to Version struct of type SnpeUdo_LibVersion_t + * + * @return Error code + * + */ +SnpeUdo_ErrorType_t +SnpeUdo_getVersion (SnpeUdo_LibVersion_t** version); + +typedef SnpeUdo_ErrorType_t +(*SnpeUdo_GetVersionFunction_t) (SnpeUdo_LibVersion_t** version); + +typedef SnpeUdo_GetVersionFunction_t Udo_GetVersionFunction_t; + +#ifdef __cplusplus +} // extern "C" +#endif + +/** @} */ /* end_addtogroup c_plus_plus_apis C++ */ + +#endif // SNPE_UDO_SHARED_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuBackend.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuBackend.h new file mode 100755 index 0000000000000..d7050c875f6db --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuBackend.h @@ -0,0 +1,71 @@ +//============================================================================== +// +// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** + * @file + * @brief A header which defines the QNN GPU specialization of the QnnBackend.h interface. + */ + +#ifndef QNN_GPU_BACKEND_H +#define QNN_GPU_BACKEND_H + +#ifdef __cplusplus +#include +#else +#include +#endif + +#include "QnnBackend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** +* @brief This enum defines QNN GPU custom Backend config options. +*/ +typedef enum { + /// If non-zero, tuning mode will be enabled + QNN_GPU_BACKEND_CONFIG_OPTION_ENABLE_TUNING_MODE = 0, + /// The Performance cache directory. Must be non-null + QNN_GPU_BACKEND_CONFIG_OPTION_PERFORMANCE_CACHE_DIR = 1, + /// If non-zero, the performance cache will be ignored when initializing + QNN_GPU_BACKEND_CONFIG_OPTION_INVALIDATE_PERFORMANCE_CACHE = 2, + /// Unused, present to ensure 32 bits. + QNN_GPU_BACKEND_CONFIG_OPTION_UNDEFINED = 0x7FFFFFFF, +} QnnGpuBackend_ConfigOption_t; + +/** + * @brief A struct which defines the QNN GPU Backend custom configuration options. + * Objects of this type are to be referenced through QnnBackend_CustomConfig_t. + */ +typedef struct { + QnnGpuBackend_ConfigOption_t option; + union UNNAMED { + uint8_t enableTuningMode; + const char* performanceCacheDir; + uint8_t invalidatePerformanceCache; + }; +} QnnGpuBackend_CustomConfig_t; + +// clang-format off +/// QnnGpuBackend_CustomConfig_t initializer macro +#define QNN_GPU_BACKEND_CUSTOM_CONFIG_INIT \ + { \ + QNN_GPU_BACKEND_CONFIG_OPTION_UNDEFINED, /*option*/ \ + { \ + false /*enableTuningMode*/ \ + } \ + } +// clang-format on + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuCommon.h new file mode 100755 index 0000000000000..8fd9c18afb46b --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuCommon.h @@ -0,0 +1,49 @@ +//============================================================================== +// +// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** + * @file + * @brief A header which defines common QNN GPU macros. 
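+ *
+ * For illustration (editorial note, not SDK sample code): the version macros
+ * below can populate a Qnn_ApiVersion_t for comparison against the version
+ * reported by the backend, e.g.
+ * @code
+ *   Qnn_ApiVersion_t expectedVersion = QNN_GPU_API_VERSION_INIT;
+ * @endcode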
+ */ + +#ifndef QNN_GPU_COMMON_H +#define QNN_GPU_COMMON_H + +#include "QnnCommon.h" + +/// GPU Backend identifier +#define QNN_BACKEND_ID_GPU 4 + +/// GPU interface provider +#define QNN_GPU_INTERFACE_PROVIDER_NAME "GPU_QTI_AISW" + +// GPU API Version values +#define QNN_GPU_API_VERSION_MAJOR 3 +#define QNN_GPU_API_VERSION_MINOR 7 +#define QNN_GPU_API_VERSION_PATCH 0 + +// clang-format off + +/// Macro to set Qnn_ApiVersion_t for GPU backend +#define QNN_GPU_API_VERSION_INIT \ + { \ + { \ + QNN_API_VERSION_MAJOR, /*coreApiVersion.major*/ \ + QNN_API_VERSION_MINOR, /*coreApiVersion.major*/ \ + QNN_API_VERSION_PATCH /*coreApiVersion.major*/ \ + }, \ + { \ + QNN_GPU_API_VERSION_MAJOR, /*backendApiVersion.major*/ \ + QNN_GPU_API_VERSION_MINOR, /*backendApiVersion.minor*/ \ + QNN_GPU_API_VERSION_PATCH /*backendApiVersion.patch*/ \ + } \ + } + +// clang-format on + +#endif // QNN_GPU_COMMON_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuContext.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuContext.h new file mode 100755 index 0000000000000..42599e4280971 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuContext.h @@ -0,0 +1,78 @@ +//============================================================================== +// +// Copyright (c) 2021-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** + * @file + * @brief A header which defines the QNN GPU specialization of the QnnContext.h interface. + */ + +#ifndef QNN_GPU_CONTEXT_H +#define QNN_GPU_CONTEXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief This enum defines QNN GPU custom context config options. + */ +typedef enum { + /// Sets performance hint options via QnnGpuContext_PerfHint_t + QNN_GPU_CONTEXT_CONFIG_OPTION_PERF_HINT = 0, + /// If non-zero, OpenGL buffers will be used + QNN_GPU_CONTEXT_CONFIG_OPTION_USE_GL_BUFFERS = 1, + /// The kernel disk cache directory. Must be non-null + QNN_GPU_CONTEXT_CONFIG_OPTION_KERNEL_REPO_DIR = 2, + /// If non-zero, the kernel disk cache will be ignored when initializing + QNN_GPU_CONTEXT_CONFIG_OPTION_INVALIDATE_KERNEL_REPO = 3, + /// Unused, present to ensure 32 bits. + QNN_GPU_CONTEXT_CONFIG_OPTION_UNDEFINED = 0x7FFFFFFF +} QnnGpuContext_ConfigOption_t; + +/** + * @brief An enum which defines the different GPU performance hint options. + */ +typedef enum { + /// Sets the GPU performance hint to high performance, this is the default + QNN_GPU_CONTEXT_PERF_HINT_HIGH = 0, + /// Sets the GPU performance hint to normal performance + QNN_GPU_CONTEXT_PERF_HINT_NORMAL = 1, + /// Sets the GPU performance hint to low performance + QNN_GPU_CONTEXT_PERF_HINT_LOW = 2 +} QnnGpuContext_PerfHint_t; + +/** + * @brief A struct which defines the QNN GPU context custom configuration options. + * Objects of this type are to be referenced through QnnContext_CustomConfig_t. 
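+ *
+ * A configuration sketch (editorial illustration; passing the struct to the
+ * context through a QnnContext_Config_t list is assumed):
+ * @code
+ *   QnnGpuContext_CustomConfig_t customConfig = QNN_GPU_CONTEXT_CUSTOM_CONFIG_INIT;
+ *   customConfig.option   = QNN_GPU_CONTEXT_CONFIG_OPTION_PERF_HINT;
+ *   customConfig.perfHint = QNN_GPU_CONTEXT_PERF_HINT_NORMAL;
+ * @endcode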
+ */ +typedef struct { + QnnGpuContext_ConfigOption_t option; + union UNNAMED { + QnnGpuContext_PerfHint_t perfHint; + uint8_t useGLBuffers; + const char* kernelRepoDir; + uint8_t invalidateKernelRepo; + }; +} QnnGpuContext_CustomConfig_t; + +// clang-format off +/// QnnGpuContext_CustomConfig_t initializer macro +#define QNN_GPU_CONTEXT_CUSTOM_CONFIG_INIT \ + { \ + QNN_GPU_CONTEXT_CONFIG_OPTION_UNDEFINED, /*option*/ \ + { \ + QNN_GPU_CONTEXT_PERF_HINT_HIGH /*perfHint*/ \ + } \ + } +// clang-format on + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuGraph.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuGraph.h new file mode 100755 index 0000000000000..e0652d44883ef --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuGraph.h @@ -0,0 +1,72 @@ +//============================================================================== +// +// Copyright (c) 2020-2021 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** + * @file + * @brief A header which defines the QNN GPU specialization of the QnnGraph.h interface. + */ + +#ifndef QNN_GPU_GRAPH_H +#define QNN_GPU_GRAPH_H + +#ifdef __cplusplus +#include +#else +#include +#endif + +#include "QnnGraph.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief An enum which defines the different tensor optimization options. A + * tensor may be optimized to the specified QnnGpu_Precision_t when it + * is a graph tensor that is not a graph input or a graph output and + * does not connect two operations from different op packages. + */ +typedef enum { + /// Sets the precision mode to floating point 32-bit (FP32) + QNN_GPU_PRECISION_FP32 = 0, + /// Sets the precision mode to floating point 16-bit (FP16) + QNN_GPU_PRECISION_FP16 = 1, + /// Sets the precision mode to FP16 for storage and FP32 for calculations + QNN_GPU_PRECISION_HYBRID = 2, + /// Uses the tensor data type provided by the user (default) + QNN_GPU_PRECISION_USER_PROVIDED = 3, +} QnnGpu_Precision_t; + +/** + * @brief A struct which defines the QNN GPU graph custom configuration options. + * Objects of this type are to be referenced through QnnGraph_CustomConfig_t. + */ +typedef struct { + QnnGpu_Precision_t precision; + uint8_t disableMemoryOptimizations; + uint8_t disableNodeOptimizations; + uint8_t disableQueueRecording; +} QnnGpuGraph_CustomConfig_t; + +// clang-format off +/// QnnGpuGraph_CustomConfig_t initializer macro +#define QNN_GPU_GRAPH_CUSTOM_CONFIG_INIT \ + { \ + QNN_GPU_PRECISION_USER_PROVIDED, /*precision*/ \ + 0u, /*disableMemoryOptimizations*/ \ + 0u, /*disableNodeOptimizations*/ \ + 0u /*disableQueueRecording*/ \ + } +// clang-format on + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuMem.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuMem.h new file mode 100755 index 0000000000000..1c6cd5c3e032a --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuMem.h @@ -0,0 +1,52 @@ +//============================================================================== +// +// Copyright (c) 2024 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. 
+// +//============================================================================== + +/** + * @file + * @brief A header which defines the QNN GPU specialization of the QnnMem.h interface. + */ + +#ifndef QNN_GPU_MEM_H +#define QNN_GPU_MEM_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void* QnnGpuMem_Buffer_t; + +/** + * @brief This enum defines QNN GPU memory type + */ +typedef enum { QNN_GPU_MEM_OPENCL = 0, QNN_GPU_MEM_UNDEFINED = 0x7FFFFFF } QnnGpu_MemType_t; + +/** + * @brief A struct which defines the QNN GPU memory preallocated by the client. + * Objects of this type are to be referenced through Qnn_MemInfoCustom_t. + */ +typedef struct { + QnnGpu_MemType_t memType; + union { + QnnGpuMem_Buffer_t buffer; + }; +} QnnGpu_MemInfoCustom_t; + +// clang-format off +/// QnnGpu_MemInfoCustom_t initializer macro +#define QNN_GPU_MEMINFO_CUSTOM_INIT \ + { \ + QNN_GPU_MEM_UNDEFINED, /*memType*/ \ + NULL /* buffer*/ \ + } +// clang-format on + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuOpPackage.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuOpPackage.h new file mode 100755 index 0000000000000..5413f50ba2267 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GPU/QnnGpuOpPackage.h @@ -0,0 +1,682 @@ +//============================================================================== +// +// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** + * @file + * @brief A header which defines the QNN GPU specialization of the QnnOpPackage.h interface. + */ + +#ifndef QNN_GPU_OP_PACKAGE_H +#define QNN_GPU_OP_PACKAGE_H + +#ifdef __cplusplus +#include +#else +#include +#endif + +#include "GPU/QnnGpuCommon.h" +#include "GPU/QnnGpuGraph.h" +#include "QnnOpPackage.h" +#include "QnnTypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// QnnOpPackage_GlobalInfrastructure_t specialization. 
+//============================================================================= + +/** + * @brief A struct which is used to communicate device constant properties + */ +typedef struct { + /// GPU device version string + char deviceVersion[128]; + /// GPU driver interface version {major, minor} + uint32_t interfaceVersion[2]; + /// GPU Adreno(TM) tier string + char tierName[8]; + /// GPU driver version {product, major, minor, patch} + uint32_t compilerVersion[4]; + /// GPU device max work group size + size_t maxWorkGroupSize; + /// GPU device image 2D max width + size_t image2dMaxWidth; + /// GPU device image 2D max height + size_t image2dMaxHeight; + /// GPU device max memory allocation size + size_t maxBufferAllocSize; + /// GPU device addr alignment in bits + uint32_t baseAddrAlignment; + /// GPU device image 2D Array max width + size_t image2dArrayMaxWidth; + /// GPU device image 2D Array max height + size_t image2dArrayMaxHeight; + /// GPU device image 2D Array max depth + size_t image2dArrayMaxDepth; +} QnnGpu_DeviceProperties_t; + +/** + * @brief A QNN GPU struct specializing QnnOpPackage_GlobalInfrastructure_t + */ +typedef struct _QnnOpPackage_GlobalInfrastructure_t { + /// GPU backend version (as returned by QnnBackend_getApiVersion()) + const Qnn_ApiVersion_t* sdkApiVersion; + /// GPU device properties + const QnnGpu_DeviceProperties_t* deviceProperties; + /// Null terminated path to the OpenCL driver used by the backend + const char* driverPath; +} QnnGpuOpPackage_GlobalInfrastructure_t; + +//============================================================================= +// QnnOpPackage_PackageInfo_t specialization. +//============================================================================= + +/** + * @brief A struct having op package specific information + */ +typedef struct _QnnOpPackage_PackageInfo_t { + /// Null terminated hash key string of all kernel sources + const char* kernelRepoHash; +} QnnGpuOpPackage_PackageInfo_t; + +//============================================================================= +// QnnOpPackage_Optimization_t specialization. +//============================================================================= + +/** + * @brief An enum to specify the QNN GPU optimization type + * + */ +typedef enum { + /// Undefined option only used for QNN_GPU_OP_PACKAGE_OPTIMIZATION_INIT + QNN_GPU_OPTIMIZATION_TYPE_UNDEFINED = 0, + /// Super node optimization + QNN_GPU_OPTIMIZATION_TYPE_SUPER_NODE = 2, +} QnnGpuOpPackage_OptimizationType_t; + +/** + * @brief A struct representing a super node connection constraint. + */ +typedef struct { + /// Producer node corresponding to QnnGpuOpPackage_SuperNodeOptimization_t::operations + uint32_t producer; + /// Output tensor index corresponding to the producer node + uint32_t producerOutputIndex; + /// Consumer node corresponding to QnnGpuOpPackage_SuperNodeOptimization_t::operations + uint32_t consumer; + /// Output tensor index corresponding to the consumer node + uint32_t consumerInputIndex; +} QnnGpuOpPackage_SuperNodeConnectionConstraint_t; + +/** + * @brief An enum to specify the source of a tensor in an op def for a tensor constraint. + * + */ +typedef enum { + /// Tensor is an op def output + QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_SOURCE_OUTPUT = 1, + QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_SOURCE_INPUT = 2, +} QnnGpuOpPackage_TensorConstraintSource_t; + +/** + * @brief An enum to specify the tensor constraint type. + * + */ +typedef enum { + /// Add a Qnn_DataType_t to the whitelist of allowable types. 
+ /// If no data type constraint is present for a tensor, all data types are allowed. + QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_CONSTRAINT_DATA_TYPE = 1, + /// Tensor must match it's rank + QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_CONSTRAINT_RANK = 2, + /// Tensor must match one of it's dimensions + QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_CONSTRAINT_DIMENSION = 3, + /// Add a Qnn_TensorType_t to the whitelist of allowable tensor types. + /// If no tensor type constraint is present for a tensor, all types are allowed. + QNN_GPU_OPTIMIZATION_SUPER_NODE_TENSOR_CONSTRAINT_TENSOR_TYPE = 4, +} QnnGpuOpPackage_TensorConstraintType_t; + +/** + * @brief A struct representing a tensor constraint. + */ +typedef struct { + /// Operation corresponding to QnnGpuOpPackage_SuperNodeOptimization_t::operations + uint32_t operationIndex; + /// Source of the tensor in the Qnn_OpConfig_t + QnnGpuOpPackage_TensorConstraintSource_t source; + union { + /// Tensor index in the Qnn_OpConfig_t, used only for inputs and outputs + uint32_t index; + /// Tensor parameter name in the Qnn_OpConfig_t, used only for parameters + const char* name; + }; + /// Type of tensor constraint + QnnGpuOpPackage_TensorConstraintType_t type; + union { + /// Tensor data type for Qnn_DataType_t constraints + Qnn_DataType_t dataType; + /// Tensor type for Qnn_TensorType_t constraints + Qnn_TensorType_t tensorType; + /// Tensor rank for rank constraints + uint32_t rank; + struct { + /// Tensor dimension index for dimension constraints + uint32_t index; + /// Tensor dimension size for dimension constraints + uint32_t size; + } dimension; + }; +} QnnGpuOpPackage_TensorConstraint_t; + +typedef struct { + /// Null-terminated array of comma separated lists of operations used for matching super node ops. + /// An asterisk (*) may be used to represent any operation type. + const char** operations; + /// Null-terminated array of pointers to super node connection constraints + QnnGpuOpPackage_SuperNodeConnectionConstraint_t** connectionConstraints; + /// Null-terminated array of pointers to super node tensor constraints + QnnGpuOpPackage_TensorConstraint_t** tensorConstraints; +} QnnGpuOpPackage_SuperNodeOptimization_t; + +// clang-format off +/// QnnGpuOpPackage_SuperNodeOptimization_t initializer macro +#define QNN_GPU_OP_PACKAGE_SUPER_NODE_OPTIMIZATION_INIT \ + { \ + NULL, /*operations*/ \ + NULL, /*connectionConstraints*/ \ + NULL, /*tensorConstraints*/ \ + } +// clang-format on + +/** + * @brief A struct representing a QNN GPU optimization. + */ +typedef struct _QnnOpPackage_Optimization_t { + /// Type of optimization + QnnGpuOpPackage_OptimizationType_t type; + /// Op package assigned name of the optimization + const char* name; + union { + /// Super node optimization, used when type is QNN_GPU_OPTIMIZATION_TYPE_SUPER_NODE + const QnnGpuOpPackage_SuperNodeOptimization_t* superNode; + }; +} QnnGpuOpPackage_Optimization_t; + +/// QnnGpuOpPackage_Optimization_t initializer macro +#define QNN_GPU_OP_PACKAGE_OPTIMIZATION_INIT \ + { \ + QNN_GPU_OPTIMIZATION_TYPE_UNDEFINED, NULL, { NULL } \ + } + +//============================================================================= +// QnnOpPackage_GraphInfrastructure_t specialization. 
+//============================================================================= + +/** + * @brief A QNN GPU struct specializing QnnOpPackage_GraphInfrastructure_t + */ +typedef struct _QnnOpPackage_GraphInfrastructure_t { + /// GPU precision mode, user-supplied hint used for optimal kernel selection + QnnGpu_Precision_t precisionMode; +} QnnGpuOpPackage_GraphInfrastructure_t; + +//============================================================================= +// QNN GPU Memory Object +//============================================================================= + +/** + * @brief An enum to specify the QNN GPU memory object type + * + */ +typedef enum { + /// Host memory, only used for Qnn_Param_t tensors + QNN_GPU_MEM_OBJ_TYPE_HOST = 0, + /// GPU driver buffer memory object + QNN_GPU_MEM_OBJ_TYPE_BUFFER = 1, + /// GPU driver image 2D memory object + QNN_GPU_MEM_OBJ_TYPE_IMAGE2D = 2, + /// GPU driver image 2D array memory object + QNN_GPU_MEM_OBJ_TYPE_IMAGE2D_ARRAY = 3, + /// Aggregation of GPU driver image 2D memory objects + QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D = 4, + /// Aggregation of GPU driver image 2D array memory objects + QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D_ARRAY = 5, + /// Memory type is unclaimed and can be specified by the op package via the \n + /// QnnGpu_OutputClaim_t struct + QNN_GPU_MEM_OBJ_TYPE_UNCLAIMED = 6, +} QnnGpu_MemoryObjectType_t; + +/** + * @brief An enum to specify the QNN GPU memory layout + * + */ +typedef enum { + /// HWC layout + QNN_GPU_MEM_LAYOUT_HWC = 0, + /// HCW layout + QNN_GPU_MEM_LAYOUT_HCW = 1, + /// CHW layout + QNN_GPU_MEM_LAYOUT_CHW = 2, + /// Undefined + QNN_GPU_MEM_LAYOUT_UNDEFINED = 0x7FFFFFFF, +} QnnGpu_MemoryLayout_t; + +/** + * @brief A struct to specify blockSize for weight Tensor and tensorId for weight Param tensor + */ +typedef struct { + // Block Quantization, block Sizes + uint32_t* bqBlockSize; + /// Tensor Id for Quantization encodings + uint32_t bqEncodingTensorId; +} QnnGpu_BlockEncodingInfo_t; + +// clang-format off +/// QnnGpu_MemoryObject_t initializer macro +#define QNN_GPU_BLOCK_ENCODING_INFO_INIT \ + { \ + NULL, /*bqBlockSize*/ \ + 0u /*bqEncodingTensorId*/ \ + } +// clang-format on + +/** + * @brief A QNN GPU struct specifying a memory object + * This struct is used with the following kernel argument types: + * - QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READ + * - QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READWRITE + * - QNN_GPU_KERNEL_ARG_TYPE_OP_OUTPUT_WRITE + * - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READ + * - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READWRITE + * - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_WRITE + */ +typedef struct { + /// Type of memory object + QnnGpu_MemoryObjectType_t type; + /// Data type of the memory object + Qnn_DataType_t dataType; + /// Memory object dimensions \n + /// Size is numDimensions. Uses the following type dependent format: \n + /// QNN_GPU_MEM_OBJ_TYPE_BUFFER -> {numElements} \n + /// QNN_GPU_MEM_OBJ_TYPE_IMAGE2D -> {height,width} \n + /// QNN_GPU_MEM_OBJ_TYPE_IMAGE2D_ARRAY -> {height,width,array_size} \n + /// QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D -> {num_batches,height,width} \n + /// QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D_ARRAY -> {num_batches,height,width,array_size} + uint32_t* dimensions; + /// Memory object offsets \n + /// Size is numDimensions. \n + /// Indicates where the data store starts in the memory object. \n + uint32_t* offsets; + /// Number of dimensions in memory object \n + /// Size is numDimensions. 
Has the following type dependent size: \n + /// QNN_GPU_MEM_OBJ_TYPE_BUFFER -> 1 \n + /// QNN_GPU_MEM_OBJ_TYPE_IMAGE2D -> 2 \n + /// QNN_GPU_MEM_OBJ_TYPE_IMAGE2D_ARRAY -> 3 \n + /// QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D -> 3 \n + /// QNN_GPU_MEM_OBJ_TYPE_AGGREGATED_IMAGE2D_ARRAY -> 4 + uint32_t numDimensions; + /// Memory object layout \n + /// Op package specific layout identifier \n + /// Default is QNN_GPU_MEM_LAYOUT_UNDEFINED if not already specified by a prior operation + QnnGpu_MemoryLayout_t layout; + /// Block Quantization Tensor Information + QnnGpu_BlockEncodingInfo_t blockEncodingInfo; +} QnnGpu_MemoryObject_t; + +// clang-format off +/// QnnGpu_MemoryObject_t initializer macro +#define QNN_GPU_MEMORY_OBJECT_INIT \ + { \ + QNN_GPU_MEM_OBJ_TYPE_UNCLAIMED, /*type*/ \ + QNN_DATATYPE_UNDEFINED, /*dataType*/ \ + NULL, /*dimensions*/ \ + NULL, /*offsets*/ \ + 0u, /*numDimensions*/ \ + QNN_GPU_MEM_LAYOUT_UNDEFINED, /*layout*/ \ + QNN_GPU_BLOCK_ENCODING_INFO_INIT /*blockEncodingInfo*/ \ + } +// clang-format on + +//============================================================================= +// QnnOpPackage_Node_t specialization. +//============================================================================= + +/** + * @brief A QNN GPU struct specifying a storage tensor + */ +typedef struct { + /// Tensor ID + uint32_t id; + /// Tensor's associated memory object + const QnnGpu_MemoryObject_t* memoryObject; +} QnnGpu_TensorStorageType_t; + +// clang-format off +/// QnnGpu_TensorStorageType_t initializer macro +#define QNN_GPU_TENSOR_STORAGE_TYPE_INIT \ + { \ + 0u, /*id*/ \ + NULL /*memoryObject*/ \ + } +// clang-format on + +/** + * @brief A QNN GPU struct specializing QnnOpPackage_Node_t + */ +typedef struct _QnnOpPackage_Node_t { + /// Optimization index, see QnnOpPackage_Info_t, ignore when only one op config provided + uint32_t optimization; + /// Null-terminated array of operation config pointers + /// Only one pointer provided when no optimizations performed + const Qnn_OpConfig_t** configs; + /// Null-terminated array of tensor storage type pointers called out in the config + const QnnGpu_TensorStorageType_t** storageTypes; + /// Kernel variant index, if set then used by OpPackage to determine kernel selection + int32_t kernelVariant; +} QnnGpuOpPackage_Node_t; + +//============================================================================= +// QnnOpPackage_OpImpl_t specialization. +//============================================================================= + +/** + * @brief A QNN GPU struct specifying an output tensor claim. Using the principle + * of least work, operations must output a memory object type that is most + * convenient for itself. Only QNN_TENSOR_TYPE_NATIVE tensor types may + * be claimed. + */ +typedef struct { + /// Index into the Qnn_OpConfig_t provided in QnnGpuOpPackage_Node_t + uint32_t opConfigIndex; + /// Index into the operation outputs to identify the tensor + uint32_t outputIndex; + /// Specification of the claimed memory object + const QnnGpu_MemoryObject_t* memoryObject; +} QnnGpu_OutputClaim_t; + +// clang-format off +/// QnnGpu_OutputClaim_t initializer macro +#define QNN_GPU_OUTPUT_CLAIM_INIT \ + { \ + 0u, /*opConfigIndex*/ \ + 0u, /*outputIndex*/ \ + NULL /*memoryObject*/ \ + } +// clang-format on + +/** + * @brief An enum to specify the kernel argument type. 
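+ *
+ * For instance (editorial sketch), an argument that reads the first input of
+ * the first op config could be described with the QnnGpu_KernelArg_t struct
+ * defined below:
+ * @code
+ *   QnnGpu_KernelArg_t arg = QNN_GPU_KERNEL_ARG_INIT;
+ *   arg.type = QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READ;
+ *   arg.tensor.opConfigIndex = 0;  // first Qnn_OpConfig_t in the node
+ *   arg.tensor.tensorIndex   = 0;  // first operation input
+ * @endcode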
+ *
+ */
+typedef enum {
+  /// Operation input tensor used as kernel input
+  QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READ = 0,
+  /// Operation input tensor used as kernel output
+  QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READWRITE = 1,
+  /// Operation output tensor used as kernel output
+  QNN_GPU_KERNEL_ARG_TYPE_OP_OUTPUT_WRITE = 2,
+  /// Operation internal tensor used as kernel input
+  QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READ = 3,
+  /// Operation internal tensor used as kernel input/output
+  QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READWRITE = 4,
+  /// Operation internal tensor used as kernel output
+  QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_WRITE = 5,
+  /// Plain old data kernel argument
+  QNN_GPU_KERNEL_ARG_TYPE_DATA = 6,
+  /// Local memory kernel argument
+  QNN_GPU_KERNEL_ARG_TYPE_LOCAL = 7,
+  /// Null pointer kernel argument
+  QNN_GPU_KERNEL_ARG_TYPE_NULL_PTR = 8,
+  /// Operation tensor parameter used as kernel input
+  QNN_GPU_KERNEL_ARG_TYPE_OP_TENSOR_PARAM = 9,
+} QnnGpu_KernelArgType_t;
+
+/**
+ * @brief A QNN GPU struct specifying a kernel argument corresponding to a tensor.
+ *        This struct is used with the following kernel argument types:
+ *        - QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READ
+ *        - QNN_GPU_KERNEL_ARG_TYPE_OP_INPUT_READWRITE
+ *        - QNN_GPU_KERNEL_ARG_TYPE_OP_OUTPUT_WRITE
+ *        - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READ
+ *        - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_READWRITE
+ *        - QNN_GPU_KERNEL_ARG_TYPE_INTERNAL_WRITE
+ */
+typedef struct {
+  /// Index into the Qnn_OpConfig_t provided in QnnGpuOpPackage_Node_t, ignored for INTERNAL types
+  uint32_t opConfigIndex;
+  /// Index into the operation input or output list, or the internal tensor list
+  uint32_t tensorIndex;
+  /// Batch element index for aggregated tensor types
+  uint32_t element;
+} QnnGpu_TensorKernelArg_t;
+
+// clang-format off
+/// QnnGpu_TensorKernelArg_t initializer macro
+#define QNN_GPU_TENSOR_KERNEL_ARG_INIT \
+  {                                    \
+    0u, /*opConfigIndex*/              \
+    0u, /*tensorIndex*/                \
+    0u  /*element*/                    \
+  }
+// clang-format on
+
+/**
+ * @brief An enum to specify the kernel data argument type.
+ *
+ */
+typedef enum {
+  QNN_GPU_KERNEL_ARG_CL_TYPE_CHAR = 0,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_UCHAR = 1,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_SHORT = 2,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_USHORT = 3,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_INT = 4,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_UINT = 5,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_LONG = 6,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_ULONG = 7,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_FLOAT = 8,
+  QNN_GPU_KERNEL_ARG_CL_TYPE_DOUBLE = 9,
+} QnnGpu_DataKernelArgType_t;
+
+/**
+ * @brief A QNN GPU struct specifying a kernel argument corresponding to plain old data.
+ *        This struct is used only with the QNN_GPU_KERNEL_ARG_TYPE_DATA arg type.
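+ *
+ *        Illustrative initialization (editorial sketch, using the initializer
+ *        macro defined below):
+ * @code
+ *   QnnGpu_DataKernelArg_t scaleArg = QNN_GPU_DATA_KERNEL_ARG_INIT;
+ *   scaleArg.type     = QNN_GPU_KERNEL_ARG_CL_TYPE_FLOAT;
+ *   scaleArg.qnnFloat = 0.125f;  // assumed scalar passed to the kernel
+ * @endcode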
+ */ +typedef struct { + /// Data type of the data + QnnGpu_DataKernelArgType_t type; + union { + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_CHAR + int8_t qnnChar; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_UCHAR + uint8_t qnnUChar; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_SHORT + int16_t qnnShort; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_USHORT + uint16_t qnnUShort; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_INT + int32_t qnnInt; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_UINT + uint32_t qnnUInt; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_LONG + int64_t qnnLong; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_ULONG + uint64_t qnnULong; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_FLOAT + float qnnFloat; + /// Used with QNN_GPU_KERNEL_ARG_CL_TYPE_DOUBLE + double qnnDouble; + }; +} QnnGpu_DataKernelArg_t; + +/// QnnGpu_DataKernelArg_t initializer macro +#define QNN_GPU_DATA_KERNEL_ARG_INIT \ + { \ + QNN_GPU_KERNEL_ARG_CL_TYPE_CHAR, /*type*/ \ + { \ + 0 /*qnnChar*/ \ + } \ + } + +/** + * @brief A QNN GPU struct specifying a kernel argument corresponding to a local memory type. + * This struct is used only with the QNN_GPU_KERNEL_ARG_TYPE_LOCAL arg type. + */ +typedef struct { + /// Size of the memory requested in bytes + uint32_t size; +} QnnGpu_LocalKernelArg_t; + +/// QnnGpu_LocalKernelArg_t initializer macro +#define QNN_GPU_LOCAL_KERNEL_ARG_INIT \ + { 0u /*size*/ } + +/** + * @brief A QNN GPU struct specifying a kernel argument. + * Note that the QNN_GPU_KERNEL_ARG_TYPE_NULL_PTR type does not have an entry in + * the union. + */ +typedef struct { + /// Type of kernel argument + QnnGpu_KernelArgType_t type; + union { + /// Tensor type argument + QnnGpu_TensorKernelArg_t tensor; + /// Plain old data argument + QnnGpu_DataKernelArg_t data; + /// Local memory argument + QnnGpu_LocalKernelArg_t local; + }; +} QnnGpu_KernelArg_t; + +/// QnnGpu_KernelArg_t initializer macro +#define QNN_GPU_KERNEL_ARG_INIT \ + { \ + QNN_GPU_KERNEL_ARG_TYPE_NULL_PTR, /*type*/ \ + { \ + QNN_GPU_TENSOR_KERNEL_ARG_INIT /*tensor*/ \ + } \ + } + +/** + * @brief An enum to specify the kernel source type. + * + */ +typedef enum { + QNN_GPU_KERNEL_SOURCE_TYPE_TEXT = 0, + QNN_GPU_KERNEL_SOURCE_TYPE_BINARY = 1, +} QnnGpu_KernelSourceType_t; + +/** + * @brief This enum defines QNN GPU kernel tuning options. + */ +typedef enum { + /// local work size tuning + QNN_GPU_KERNEL_TUNING_LOCAL_WORK_SIZE = 0, + QNN_GPU_KERNEL_TUNING_UNDEFINED = 0x7FFFFFFF +} QnnGpu_KernelTuningOption_t; + +/** + * @brief This struct provides local-work-size tuning configuration. + */ +typedef struct { + uint32_t minValue[3]; + uint32_t maxValue[3]; + uint32_t stepSize[3]; +} QnnGpu_KernelLocalWorkSizeTuning_t; + +/** + * @brief This struct provides QNN GPU kernel tuning configuration. + */ +typedef struct { + QnnGpu_KernelTuningOption_t option; + union UNNAMED { + QnnGpu_KernelLocalWorkSizeTuning_t lws; + }; +} QnnGpu_KernelTuningConfig_t; + +/** + * @brief A QNN GPU struct specifying a kernel. 
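+ *
+ *        A partial population sketch (editorial illustration; "sourceStr" and
+ *        the work sizes are placeholders):
+ * @code
+ *   QnnGpu_Kernel_t kernel = QNN_GPU_KERNEL_INIT;
+ *   kernel.kernelSource       = sourceStr;          // OpenCL C source text
+ *   kernel.sourceLength       = strlen(sourceStr);
+ *   kernel.sourceType         = QNN_GPU_KERNEL_SOURCE_TYPE_TEXT;
+ *   kernel.name               = "my_elementwise_kernel";
+ *   kernel.globalWorkDim      = 1;
+ *   kernel.globalWorkSizes[0] = totalElements;      // assumed element count
+ * @endcode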
+ */ +typedef struct { + /// Kernel source code or binary + const void* kernelSource; + /// Length of kernel source/binary in bytes + size_t sourceLength; + /// Type of kernel source + QnnGpu_KernelSourceType_t sourceType; + /// Null terminated build options string used for kernel compilation + const char* buildOptions; + /// Rank of the globalWorkSizes + size_t globalWorkDim; + /// Global work sizes used by enqueuing the kernel + size_t globalWorkSizes[3]; + /// Rank of the localWorkSizes + size_t localWorkDim; + /// Local work sizes used by enqueuing the kernel + size_t localWorkSizes[3]; + /// Null-terminated array of kernel arguments in the order they appear in the kernel function + QnnGpu_KernelArg_t** args; + /// Null terminated name of the kernel + const char* name; + /// If non-zero, kernel will be enqueued during execute even if it is static + uint32_t isDynamic; + /// Null-terminated array to provide kernel tuning configurations. + QnnGpu_KernelTuningConfig_t** tuningConfigs; + /// Reserved field, must be null + void* reserved; +} QnnGpu_Kernel_t; + +// clang-format off +/// QnnGpu_Kernel_t initializer macro +#define QNN_GPU_KERNEL_INIT \ + { \ + NULL, /*kernelSource*/ \ + 0u, /*sourceLength*/ \ + QNN_GPU_KERNEL_SOURCE_TYPE_TEXT, /*sourceType*/ \ + NULL, /*buildOptions*/ \ + 0u, /*globalWorkDim*/ \ + {0u}, /*globalWorkSizes*/ \ + 0u, /*localWorkDim*/ \ + {0u}, /*localWorkSizes*/ \ + NULL, /*args*/ \ + NULL, /*name*/ \ + 0u, /*isDynamic*/ \ + NULL, /*tuningConfigs*/ \ + NULL /*reserved*/ \ + } +// clang-format on + +/** + * @brief A QNN GPU struct specifying an operation. + */ +typedef struct _QnnOpPackage_OpImpl_t { + /// Null-terminated array of output claims + QnnGpu_OutputClaim_t** outputClaims; + /// Null-terminated array of tensor requests + QnnGpu_MemoryObject_t** memoryObjects; + /// Null-terminated array of kernels + QnnGpu_Kernel_t** kernels; +} QnnGpu_Operation_t; + +// clang-format off +/// QnnGpu_Operation_t initializer macro +#define QNN_GPU_OPERATION_INIT \ + { \ + NULL, /*outputClaims*/ \ + NULL, /*memoryObjects*/ \ + NULL, /*kernels*/ \ + } +// clang-format on + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GenAiTransformer/QnnGenAiTransformerCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GenAiTransformer/QnnGenAiTransformerCommon.h new file mode 100755 index 0000000000000..3adb43819b8b3 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/GenAiTransformer/QnnGenAiTransformerCommon.h @@ -0,0 +1,50 @@ +//============================================================================= +// +// Copyright (c) 2024 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. 
+// +//============================================================================= + +/** @file + * @brief QNN GenAiTransformer Common components + * + * This file defines versioning and other identification details + * and supplements QnnCommon.h for GenAiTransformer backend + */ + +#ifndef QNN_GENAI_TRANSFORMER_COMMON_H +#define QNN_GENAI_TRANSFORMER_COMMON_H + +#include "QnnCommon.h" + +/// GenAiTransformer Backend identifier +#define QNN_BACKEND_ID_GENAI_TRANSFORMER 14 + +/// GenAiTransformer interface provider +#define QNN_GENAI_TRANSFORMER_INTERFACE_PROVIDER_NAME "GENAI_TRANSFORMER_QTI_AISW" + +// GenAiTransformer API Version values +#define QNN_GENAI_TRANSFORMER_API_VERSION_MAJOR 1 +#define QNN_GENAI_TRANSFORMER_API_VERSION_MINOR 0 +#define QNN_GENAI_TRANSFORMER_API_VERSION_PATCH 0 + +// clang-format off +/// Macro to set Qnn_ApiVersion_t for GENAI_TRANSFORMER backend +#define QNN_GENAI_TRANSFORMER_API_VERSION_INIT \ + { \ + { \ + QNN_API_VERSION_MAJOR, /*coreApiVersion.major*/ \ + QNN_API_VERSION_MINOR, /*coreApiVersion.major*/ \ + QNN_API_VERSION_PATCH /*coreApiVersion.major*/ \ + }, \ + { \ + QNN_GENAI_TRANSFORMER_API_VERSION_MAJOR, /*backendApiVersion.major*/ \ + QNN_GENAI_TRANSFORMER_API_VERSION_MINOR, /*backendApiVersion.minor*/ \ + QNN_GENAI_TRANSFORMER_API_VERSION_PATCH /*backendApiVersion.patch*/ \ + } \ + } + +// clang-format on + +#endif // QNN_GENAI_TRANSFORMER_COMMON_H \ No newline at end of file diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaBackend.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaBackend.h new file mode 100755 index 0000000000000..e756b8042ec09 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaBackend.h @@ -0,0 +1,76 @@ +//============================================================================= +// +// Copyright (c) 2022 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================= + +/** @file + * @brief QNN HTA component Backend API. + * + * The interfaces in this file work with the top level QNN + * API and supplements QnnBackend.h for HTA backend + */ + +#ifndef QNN_HTA_BACKEND_H +#define QNN_HTA_BACKEND_H + +#include "QnnBackend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= + +//============================================================================= +// Data Types +//============================================================================= + +/* @brief Enum describing the set of features supported by HTA backend. + This is used as a bitmask, so assign unique bits to each entries. +*/ +typedef enum { + /// The accelerator will always attempt to fold relu activation + /// into the immediate preceding convolution operation. This optimization + /// is correct when quantization ranges for convolution are equal or + /// subset of the Relu operation. 
For graphs, where this cannot be + /// guranteed, the client should set this flag + QNN_HTA_FOLD_RELU_ACTIVATION_INTO_CONV_OFF = 1 << 0, + /// UNKNOWN enum event that must not be used + QNN_HTA_BACKEND_FEATURES_UNKNOWN = 0x7fffffff +} QnnHtaBackend_Features_t; + +//============================================================================= +// Public Functions +//============================================================================= + +//------------------------------------------------------------------------------ +// Implementation Definition +//------------------------------------------------------------------------------ + +// clang-format off + +/** + * @brief Structure describing the set of configurations supported by the backend. + * Objects of this type are to be referenced through QnnBackend_CustomConfig_t. + */ +typedef struct { + /// field to save the features that are passed + /// via QnnHtaBackend_Features_t + uint32_t bitmaskFeatures; +} QnnHtaBackend_CustomConfig_t ; + +/// QnnHtaBackend_CustomConfig_t initializer macro +#define QNN_HTA_BACKEND_CUSTOM_CONFIG_INIT \ + { 0 /*bitmaskFeatures*/ } + +// clang-format on +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaCommon.h new file mode 100755 index 0000000000000..1eb8e1f0a99a4 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaCommon.h @@ -0,0 +1,62 @@ +//============================================================================= +// +// Copyright (c) 2022 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================= + +/** @file + * @brief QNN HTA Common components + * + * This file defines versioning and other identification details + * and supplements QnnCommon.h for HTA backend + */ + +#ifndef QNN_HTA_COMMON_H +#define QNN_HTA_COMMON_H + +#include "QnnCommon.h" + +/// HTA Backend identifier +#define QNN_BACKEND_ID_HTA 7 + +/// HTA interface provider +#define QNN_HTA_INTERFACE_PROVIDER_NAME "HTA_QTI_AISW" + +// HTA API Version values + +#define QNN_HTA_API_VERSION_MAJOR 2 +#define QNN_HTA_API_VERSION_MINOR 0 +#define QNN_HTA_API_VERSION_PATCH 0 + +// clang-format off + +/// Macro to set Qnn_ApiVersion_t for HTA backend +#define QNN_HTA_API_VERSION_INIT \ + { \ + { \ + QNN_API_VERSION_MAJOR, /*coreApiVersion.major*/ \ + QNN_API_VERSION_MINOR, /*coreApiVersion.major*/ \ + QNN_API_VERSION_PATCH /*coreApiVersion.major*/ \ + }, \ + { \ + QNN_HTA_API_VERSION_MAJOR, /*backendApiVersion.major*/ \ + QNN_HTA_API_VERSION_MINOR, /*backendApiVersion.minor*/ \ + QNN_HTA_API_VERSION_PATCH /*backendApiVersion.patch*/ \ + } \ + } + +// clang-format on + +// HTA Binary Version values +#define QNN_HTA_BINARY_VERSION_MAJOR 2 +#define QNN_HTA_BINARY_VERSION_MINOR 0 +#define QNN_HTA_BINARY_VERSION_PATCH 0 + +// HTA Context blob Version values +#define QNN_HTA_CONTEXT_BLOB_VERSION_MAJOR 1 +#define QNN_HTA_CONTEXT_BLOB_VERSION_MINOR 1 +#define QNN_HTA_CONTEXT_BLOB_VERSION_PATCH 0 + +#endif // QNN_HTA_COMMON_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaDevice.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaDevice.h new file mode 100755 index 0000000000000..d31f5232e21f3 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaDevice.h @@ -0,0 +1,41 @@ 
+//============================================================================= +// +// Copyright (c) 2022 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================= + +/** @file + * @brief QNN HTA component Device API. + * + * The interfaces in this file work with the top level QNN + * API and supplements QnnDevice.h for HTA backend + */ +#ifndef QNN_HTA_DEVICE_H +#define QNN_HTA_DEVICE_H + +#include "QnnDevice.h" +#include "QnnHtaPerfInfrastructure.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _QnnDevice_Infrastructure_t { + QnnHtaPerfInfrastructure_SetPowerConfigFn_t setPowerConfig; +} QnnHtaDevice_Infrastructure_t; + +// clang-format off +/// QnnHtaDevice_Infrastructure_t initializer macro +#define QNN_HTA_DEVICE_INFRASTRUCTURE_INIT \ + { \ + NULL, /*setPowerConfig*/ \ + } +// clang-format on + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif \ No newline at end of file diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaGraph.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaGraph.h new file mode 100755 index 0000000000000..0abbb9bc5114d --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaGraph.h @@ -0,0 +1,123 @@ +//============================================================================= +// +// Copyright (c) 2022 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================= + +/** @file + * @brief QNN HTA component Graph API. + * + * The interfaces in this file work with the top level QNN + * API and supplements QnnGraph.h for HTA backend + */ + +#ifndef QNN_HTA_GRAPH_H +#define QNN_HTA_GRAPH_H + +#include "QnnGraph.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= + +//============================================================================= +// Data Types +//============================================================================= + +/** + * @brief This enum provides different HTA graph optimization + * options that can be used to finalize the graph + * for optimum performance + */ +typedef enum QnnHtaGraph_OptimizationType { + QNN_HTA_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD = 1, + QNN_HTA_GRAPH_OPTIMIZATION_TYPE_FINALIZE_RETRIES = 2, + QNN_HTA_GRAPH_OPTIMIZATION_TYPE_UNKNOWN = 0x7fffffff +} QnnHtaGraph_OptimizationType_t; + +/* @brief Struct describing the set of optimization type + * and the value associated with the optimization + */ +typedef struct QnnHtaGraph_OptimizationOption { + QnnHtaGraph_OptimizationType_t type; + float floatValue; +} QnnHtaGraph_OptimizationOption_t; + +// clang-format off +/// QnnHtaGraph_OptimizationOption_t initializer macro +#define QNN_HTA_GRAPH_OPTIMIZATION_OPTION_INIT \ + { \ + QNN_HTA_GRAPH_OPTIMIZATION_TYPE_UNKNOWN, /*type*/ \ + 0.0f /*floatValue*/ \ + } +// clang-format on + +/** + * @brief This enum provides different HTA graph configuration + * options associated with QnnGraph + */ +typedef enum QnnHtaGraph_ConfigOption { + QNN_HTA_GRAPH_CONFIG_OPTION_OPTIMIZATION = 1, + QNN_HTA_GRAPH_CONFIG_OPTION_PRIORITY = 2, + QNN_HTA_GRAPH_CONFIG_OPTION_UNKNOWN = 0x7fffffff +} QnnHtaGraph_ConfigOption_t; 
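+
+// Illustrative pairing of option and value (editorial sketch, not SDK code);
+// the QnnHtaGraph_CustomConfig_t struct carrying these fields is defined below:
+//   QnnHtaGraph_CustomConfig_t cfg = QNN_HTA_GRAPH_CUSTOM_CONFIG_INIT;
+//   cfg.option   = QNN_HTA_GRAPH_CONFIG_OPTION_PRIORITY;
+//   cfg.priority = QNN_PRIORITY_HIGH;  // assumed Qnn_Priority_t value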
+
+//=============================================================================
+// Public Functions
+//=============================================================================
+
+//------------------------------------------------------------------------------
+// Implementation Definition
+//------------------------------------------------------------------------------
+
+// clang-format off
+
+/**
+ * @brief Structure describing the set of configurations supported by graph.
+ *        Objects of this type are to be referenced through QnnGraph_CustomConfig_t.
+ *
+ *        The struct has two fields: option and a union of corresponding config values.
+ *        Based on the option, the corresponding item in the union can be used to specify
+ *        the config.
+ *        Below is the map between QnnHtaGraph_ConfigOption_t and config value.
+ *
+ * \verbatim embed:rst:leading-asterisk
+ *  +----+------------------------------------------+------------------------------------+
+ *  | #  | Config Option                            | Configuration Struct/value         |
+ *  +====+==========================================+====================================+
+ *  | 1  | QNN_HTA_GRAPH_CONFIG_OPTION_OPTIMIZATION | QnnHtaGraph_OptimizationOption_t   |
+ *  +----+------------------------------------------+------------------------------------+
+ *  | 2  | QNN_HTA_GRAPH_CONFIG_OPTION_PRIORITY     | Qnn_Priority_t                     |
+ *  +----+------------------------------------------+------------------------------------+
+ * \endverbatim
+ */
+typedef struct {
+  QnnHtaGraph_ConfigOption_t option;
+  union {
+    QnnHtaGraph_OptimizationOption_t optimizationOption;
+    Qnn_Priority_t priority;
+  };
+} QnnHtaGraph_CustomConfig_t;
+
+/// QnnHtaGraph_CustomConfig_t initializer macro
+#define QNN_HTA_GRAPH_CUSTOM_CONFIG_INIT \
+  { \
+    QNN_HTA_GRAPH_CONFIG_OPTION_UNKNOWN, /*option*/ \
+    { \
+      QNN_HTA_GRAPH_OPTIMIZATION_OPTION_INIT /*optimizationOption*/ \
+    } \
+  }
+
+// clang-format on
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaPerfInfrastructure.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaPerfInfrastructure.h
new file mode 100755
index 0000000000000..4f6e0c22c274b
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaPerfInfrastructure.h
@@ -0,0 +1,134 @@
+//==============================================================================
+//
+// Copyright (c) 2022 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/** @file
+ * @brief QNN HTA component Performance Infrastructure API
+ *
+ *        Provides an interface for the client to control performance and system
+ *        settings of the QNN HTA Accelerator
+ */
+
+#ifndef QNN_HTA_PERF_INFRASTRUCTURE_H
+#define QNN_HTA_PERF_INFRASTRUCTURE_H
+
+#include "QnnCommon.h"
+#include "QnnTypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+/**
+ * @brief QNN HTA PerfInfrastructure API result / error codes.
+ *
+ */
+typedef enum {
+  QNN_HTA_PERF_INFRASTRUCTURE_MIN_ERROR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE,
+  ////////////////////////////////////////////////////////////////////////
+
+  QNN_HTA_PERF_INFRASTRUCTURE_NO_ERROR = QNN_SUCCESS,
+  QNN_HTA_PERF_INFRASTRUCTURE_ERROR_INVALID_HANDLE_PTR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 0,
+  QNN_HTA_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 1,
+  QNN_HTA_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED_CONFIG = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 2,
+  QNN_HTA_PERF_INFRASTRUCTURE_ERROR_TRANSPORT = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 3,
+
+  ////////////////////////////////////////////////////////////////////////
+  QNN_HTA_PERF_INFRASTRUCTURE_MAX_ERROR = QNN_MAX_ERROR_PERF_INFRASTRUCTURE
+} QnnHtaPerfInfrastructure_Error_t;
+
+/**
+ * @brief This enum defines all the possible performance
+ *        options in the HTA Performance Infrastructure that
+ *        relate to setting up power levels
+ */
+typedef enum {
+  /// This config enum implies the usage of the powerModeConfig struct. If not
+  /// provided, it is used as a type identifier
+  QNN_HTA_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_POWER_MODE = 1,
+  /// UNKNOWN config option which must not be used
+  QNN_HTA_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN = 0x7fffffff
+} QnnHtaPerfInfrastructure_PowerConfigOption_t;
+
+/**
+ * @brief This enum defines all the possible power modes
+ *        that a client can set
+ */
+typedef enum {
+  /// default mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_DEFAULT = 0,
+  /// low power saver mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_LOW_POWER_SAVER = 1,
+  /// power saver mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_POWER_SAVER = 2,
+  /// high power saver mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_HIGH_POWER_SAVER = 3,
+  /// balanced mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_BALANCED = 4,
+  /// high performance mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_HIGH_PERFORMANCE = 5,
+  /// burst mode
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_BURST = 6,
+  /// UNKNOWN value that must not be used by client
+  QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_UNKNOWN = 0x7fffffff
+} QnnHtaPerfInfrastructure_PowerMode_t;
+
+/**
+ * @brief This struct provides performance infrastructure configuration
+ *        associated with setting up power levels
+ */
+typedef struct {
+  QnnHtaPerfInfrastructure_PowerConfigOption_t config;
+  // Organized as a union for future expansion flexibility, as defined by PowerConfigOption_t
+  union {
+    QnnHtaPerfInfrastructure_PowerMode_t powerModeConfig;
+  };
+} QnnHtaPerfInfrastructure_PowerConfig_t;
+
+/// QnnHtaPerfInfrastructure_PowerConfig_t initializer macro
+#define QNN_HTA_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT \
+  { \
+    QNN_HTA_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN, /*config*/ \
+    { \
+      QNN_HTA_PERF_INFRASTRUCTURE_POWERMODE_UNKNOWN /*powerModeConfig*/ \
+    } \
+  }
+
+//=============================================================================
+// API Methods
+//=============================================================================
+
+/**
+ * @brief This API allows the client to set up a system power configuration that
+ *        will enable different performance modes.
+ *
+ * @param[in] clientId A power client id to associate calls to system
+ *            power settings. A value of 0 implies a NULL power client id
+ *            and can override every other setting in the user process. To
+ *            enable power settings for multiple clients in the same
+ *            process, use a non-zero power client id.
+ *
+ *
+ * @param[in] config Pointer to a NULL-terminated array
+ *            of config options for performance configuration.
+ *            NULL is allowed and indicates no config options are provided.
+ *
+ * @return Error code
+ *         \n QNN_SUCCESS: No error encountered
+ */
+typedef Qnn_ErrorHandle_t (*QnnHtaPerfInfrastructure_SetPowerConfigFn_t)(
+    uint32_t clientId, const QnnHtaPerfInfrastructure_PowerConfig_t** config);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // QNN_HTA_PERF_INFRASTRUCTURE_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaProfile.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaProfile.h
new file mode 100755
index 0000000000000..f069dbbedf6b7
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTA/QnnHtaProfile.h
@@ -0,0 +1,199 @@
+//==============================================================================
+//
+// Copyright (c) 2022 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief QNN HTA Profile component API.
+ *
+ *        Requires the HTA backend to be initialized.
+ *        Should be used with the QnnProfile API, but has HTA backend
+ *        specific definitions for the different QnnProfile data structures
+ *
+ */
+
+#ifndef QNN_HTA_PROFILE_H
+#define QNN_HTA_PROFILE_H
+
+#include "QnnProfile.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when the client invokes QnnContext_createFromBinary. The value
+ *        returned is time in microseconds.
+ *
+ * @note context load binary host time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_HOST_TIME_MICROSEC 1002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the HTA processor
+ *        when the client invokes QnnContext_createFromBinary. The value
+ *        returned is time in microseconds.
+ *
+ * @note context load binary HTA time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_HTA_TIME_MICROSEC 1003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the time taken to create the context on the
+ *        accelerator when the client invokes QnnContext_createFromBinary.
+ *        The value returned is time in microseconds.
+ *
+ * @note context load binary accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_ACCEL_TIME_MICROSEC 1004
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when the client invokes QnnGraph_finalize.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph finalize host time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_FINALIZE_HOST_TIME_MICROSEC 2001
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the HTA processor
+ *        when the client invokes QnnGraph_finalize.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph finalize HTA time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_FINALIZE_HTA_TIME_MICROSEC 2002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to finalizing the graph on the accelerator
+ *        when the client invokes QnnGraph_finalize.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph finalize accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_FINALIZE_ACCEL_TIME_MICROSEC 2003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when the client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph execute host time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HOST_TIME_MICROSEC 3001
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the HTA processor
+ *        when the client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph execute HTA time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HTA_TIME_MICROSEC 3002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to executing the graph on the accelerator
+ *        when the client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is the number of processor cycles taken.
+ *
+ * @note graph execute accelerator time may be available only on the
+ *       QNN_PROFILE_LEVEL_DETAILED level
+ *
+ * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have
+ *       multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE.
+ *       There will be a sub-event for each node that was added to the graph
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_CYCLE 3003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to executing the graph on the accelerator
+ *        when the client invokes QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is the time taken in microseconds
+ *
+ * @note graph execute accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ *
+ * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have
+ *       multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE / QNN_PROFILE_EVENTUNIT_MICROSEC.
+ *       There will be a sub-event for each node that was added to the graph
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_MICROSEC 3004
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the time taken for miscellaneous work, i.e. time
+ *        that cannot be attributed to a node but is still needed to
+ *        execute the graph on the accelerator. This occurs when the client invokes
+ *        QnnGraph_execute or QnnGraph_executeAsync.
+ *        The value returned is the time taken in microseconds
+ *
+ * @note graph execute misc accelerator time is available only on the
+ *       QNN_PROFILE_LEVEL_DETAILED level
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_EXECUTE_MISC_ACCEL_TIME_MICROSEC 3005
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the ARM processor
+ *        when the client invokes QnnContext_free, which consequently deinitializes the graph.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph deinit host time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_DEINIT_HOST_TIME_MICROSEC 4001
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the remote procedure call on the HTA processor
+ *        when the client invokes QnnContext_free, which consequently deinitializes the graph.
+ *        The value returned is time in microseconds.
+ *
+ * @note graph deinit HTA time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_DEINIT_HTA_TIME_MICROSEC 4002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ *        that corresponds to the time taken to deinitialize the graph on the
+ *        accelerator when the client invokes QnnContext_free, which consequently
+ *        deinitializes the graph. The value returned is time in microseconds.
+ *
+ * @note graph deinit accelerator time may be available on both
+ *       QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTA_PROFILE_EVENTTYPE_GRAPH_DEINIT_ACCEL_TIME_MICROSEC 4003
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // QNN_HTA_PROFILE_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpCommon.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpCommon.h
new file mode 100755
index 0000000000000..8b1d458a04b8e
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpCommon.h
@@ -0,0 +1,98 @@
+//=============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ * @brief QNN HTP Common components
+ *
+ *        This file defines versioning and other identification details
+ *        and supplements QnnCommon.h for the HTP backend
+ */
+
+#ifndef QNN_HTP_COMMON_H
+#define QNN_HTP_COMMON_H
+
+#include "QnnCommon.h"
+
+/// HTP Backend identifier
+#define QNN_BACKEND_ID_HTP 6
+
+/// HTP interface provider
+#define QNN_HTP_INTERFACE_PROVIDER_NAME "HTP_QTI_AISW"
+
+// HTP API Version values
+#define QNN_HTP_API_VERSION_MAJOR 5
+#define QNN_HTP_API_VERSION_MINOR 34
+#define QNN_HTP_API_VERSION_PATCH 0
+
+// clang-format off
+
+/// Macro to set Qnn_ApiVersion_t for HTP backend
+#define QNN_HTP_API_VERSION_INIT \
+  { \
+    { \
+      QNN_API_VERSION_MAJOR, /*coreApiVersion.major*/ \
+      QNN_API_VERSION_MINOR, /*coreApiVersion.minor*/ \
+      QNN_API_VERSION_PATCH /*coreApiVersion.patch*/ \
+    }, \
+    { \
+      QNN_HTP_API_VERSION_MAJOR, /*backendApiVersion.major*/ \
+      QNN_HTP_API_VERSION_MINOR, /*backendApiVersion.minor*/ \
+      QNN_HTP_API_VERSION_PATCH /*backendApiVersion.patch*/ \
+    } \
+  }
+
+// clang-format on
+
+// DSP Context blob Version values
+#define QNN_HTP_CONTEXT_BLOB_VERSION_MAJOR 3
+#define QNN_HTP_CONTEXT_BLOB_VERSION_MINOR 2
+#define QNN_HTP_CONTEXT_BLOB_VERSION_PATCH 3
+
+/* ==== CDSP Security Library Versioning ==== */
+/* ==== This information is only intended for OEMs ==== */
+
+/* Security versioning for DSP libraries is supported from V73 onwards */
+#define QNN_HTP_NATIVE_LIB_SECURITY_VERSIONING_MIN_ARCH 73
+
+/* Here we define the CDSP library versions for different targets.
+ * The version is increased whenever there is a security fix from CDSP.
+ * The versioning starts from 1.0.0 for each new target.
+ * */
+
+/* V73 Security Issues:
+ * List of security issues fixed for V73 and the fixed version
+ * */
+#define QNN_HTP_V73_NATIVE_LIB_SECURITY_VERSION_MAJOR 1
+#define QNN_HTP_V73_NATIVE_LIB_SECURITY_VERSION_MINOR 0
+#define QNN_HTP_V73_NATIVE_LIB_SECURITY_VERSION_PATCH 0
+
+/* V75 Security Issues:
+ * List of security issues fixed for V75 and the fixed version
+ * */
+// HTP Native library version values for V75
+#define QNN_HTP_V75_NATIVE_LIB_SECURITY_VERSION_MAJOR 1
+#define QNN_HTP_V75_NATIVE_LIB_SECURITY_VERSION_MINOR 0
+#define QNN_HTP_V75_NATIVE_LIB_SECURITY_VERSION_PATCH 0
+
+/* V79 Security Issues:
+ * List of security issues fixed for V79 and the fixed version
+ * */
+// HTP Native library version values for V79
+#define QNN_HTP_V79_NATIVE_LIB_SECURITY_VERSION_MAJOR 1
+#define QNN_HTP_V79_NATIVE_LIB_SECURITY_VERSION_MINOR 0
+#define QNN_HTP_V79_NATIVE_LIB_SECURITY_VERSION_PATCH 0
+
+/* V81 Security Issues:
+ * List of security issues fixed for V81 and the fixed version
+ * */
+// HTP Native library version values for V81
+#define QNN_HTP_V81_NATIVE_LIB_SECURITY_VERSION_MAJOR 1
+#define QNN_HTP_V81_NATIVE_LIB_SECURITY_VERSION_MINOR 0
+#define QNN_HTP_V81_NATIVE_LIB_SECURITY_VERSION_PATCH 0
+
+#endif // QNN_HTP_COMMON_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpContext.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpContext.h
new file mode 100755
index 0000000000000..8266817e2dc41
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpContext.h
@@ -0,0 +1,164 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All rights reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief QNN HTP component Context API.
+ *
+ *        The interfaces in this file work with the top level QNN
+ *        API and supplement QnnContext.h for the HTP backend
+ */
+
+#ifndef QNN_HTP_CONTEXT_H
+#define QNN_HTP_CONTEXT_H
+
+#include "QnnContext.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+/**
+ * @brief This enum provides different HTP context configuration
+ *        options associated with QnnContext
+ */
+typedef enum {
+  QNN_HTP_CONTEXT_CONFIG_OPTION_WEIGHT_SHARING_ENABLED = 1,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_REGISTER_MULTI_CONTEXTS = 2,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_FILE_READ_MEMORY_BUDGET = 3,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_DSP_MEMORY_PROFILING_ENABLED = 4,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_SHARE_RESOURCES = 5,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_IO_MEM_ESTIMATION = 6,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_PREPARE_ONLY = 7,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_INIT_ACCELERATION = 8,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_SKIP_VALIDATION_ON_BINARY_SECTION = 9,
+  QNN_HTP_CONTEXT_CONFIG_OPTION_UNKNOWN = 0x7fffffff
+} QnnHtpContext_ConfigOption_t;
+
+typedef struct {
+  // Handle referring to the first context associated with a group. When a new
+  // group is to be registered, the following value must be 0.
+  Qnn_ContextHandle_t firstGroupHandle;
+  // Max spill-fill buffer to be allocated for the group of contexts, in bytes.
+  // The value that is passed during the registration of the first context to
+  // a group is taken. Subsequent configuration of this value is disregarded.
+  uint64_t maxSpillFillBuffer;
+} QnnHtpContext_GroupRegistration_t;
+
+//=============================================================================
+// Public Functions
+//=============================================================================
+
+//------------------------------------------------------------------------------
+// Implementation Definition
+//------------------------------------------------------------------------------
+
+// clang-format off
+
+/**
+ * @brief Structure describing the set of configurations supported by context.
+ *        Objects of this type are to be referenced through QnnContext_CustomConfig_t.
+ *
+ *        The struct has two fields: option and a union of config values.
+ *        Based on the option, the corresponding item in the union can be used to specify
+ *        the config.
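+ *
+ *        For example, enabling weight sharing might look like the following
+ *        (an illustrative sketch, assuming the QnnContext_Config_t /
+ *        QNN_CONTEXT_CONFIG_OPTION_CUSTOM wrapper from QnnContext.h; error
+ *        handling omitted):
+ *
+ * @code
+ *   QnnHtpContext_CustomConfig_t customConfig = QNN_HTP_CONTEXT_CUSTOM_CONFIG_INIT;
+ *   customConfig.option               = QNN_HTP_CONTEXT_CONFIG_OPTION_WEIGHT_SHARING_ENABLED;
+ *   customConfig.weightSharingEnabled = true;
+ *
+ *   QnnContext_Config_t contextConfig;
+ *   contextConfig.option       = QNN_CONTEXT_CONFIG_OPTION_CUSTOM;
+ *   contextConfig.customConfig = &customConfig;
+ *   const QnnContext_Config_t* contextConfigs[] = {&contextConfig, NULL};
+ *   // contextConfigs is then passed to QnnContext_create(...).
+ * @endcode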
+ *
+ *        Below is the map between QnnHtpContext_ConfigOption_t and config value.
+ *
+ * \verbatim embed:rst:leading-asterisk
+ *  +----+---------------------------------------------------------------------+---------------------------------------+
+ *  | #  | Config Option                                                       | Configuration Struct/value            |
+ *  +====+=====================================================================+=======================================+
+ *  | 1  | QNN_HTP_CONTEXT_CONFIG_OPTION_WEIGHT_SHARING_ENABLED                | bool                                  |
+ *  +----+---------------------------------------------------------------------+---------------------------------------+
+ *  | 2  | QNN_HTP_CONTEXT_CONFIG_OPTION_REGISTER_MULTI_CONTEXTS               | QnnHtpContext_GroupRegistration_t     |
+ *  +----+---------------------------------------------------------------------+---------------------------------------+
+ *  | 3  | QNN_HTP_CONTEXT_CONFIG_OPTION_FILE_READ_MEMORY_BUDGET               | uint64_t                              |
+ *  +----+---------------------------------------------------------------------+---------------------------------------+
+ *  | 4  | QNN_HTP_CONTEXT_CONFIG_OPTION_DSP_MEMORY_PROFILING_ENABLED          | bool                                  |
+ *  +----+---------------------------------------------------------------------+---------------------------------------+
+ *  | 5  | QNN_HTP_CONTEXT_CONFIG_OPTION_SHARE_RESOURCES                       | bool                                  |
+ *  +----+---------------------------------------------------------------------+---------------------------------------+
+ *  | 6  | QNN_HTP_CONTEXT_CONFIG_OPTION_IO_MEM_ESTIMATION                     | bool                                  |
+ *  +----+---------------------------------------------------------------------+---------------------------------------+
+ *  | 7  | QNN_HTP_CONTEXT_CONFIG_OPTION_PREPARE_ONLY                          | bool                                  |
+ *  +----+---------------------------------------------------------------------+---------------------------------------+
+ *  | 8  | QNN_HTP_CONTEXT_CONFIG_OPTION_INIT_ACCELERATION                     | bool                                  |
+ *  +----+---------------------------------------------------------------------+---------------------------------------+
+ *  | 9  | QNN_HTP_CONTEXT_CONFIG_OPTION_SKIP_VALIDATION_ON_BINARY_SECTION     | bool                                  |
+ *  +----+---------------------------------------------------------------------+---------------------------------------+
+ * \endverbatim
+ */
+typedef struct QnnHtpContext_CustomConfig {
+  QnnHtpContext_ConfigOption_t option;
+  union UNNAMED {
+    // This field sets weight sharing, which is false by default
+    bool weightSharingEnabled;
+    QnnHtpContext_GroupRegistration_t groupRegistration;
+    // - Init time may be impacted depending on the value set below
+    // - The value should be greater than 0 and less than or equal to the file size
+    // - If set to 0, the feature is not utilized
+    // - If set to greater than the file size, min(fileSize, fileReadMemoryBudgetInMb) is used
+    // - As an example, if the value 2 is passed, it would translate to (2 * 1024 * 1024) bytes
+    uint64_t fileReadMemoryBudgetInMb;
+    bool dspMemoryProfilingEnabled;
+    // This field enables resource sharing across different contexts, enhancing RAM and virtual
+    // address (VA) space utilization. When this flag is activated, graphs are expected to execute
+    // sequentially. Note that this configuration option is only supported when using the
+    // QnnContext_createFromBinaryListAsync API.
+    bool shareResources;
+    // This field enables I/O memory estimation during the QnnContext_createFromBinary API when multiple
+    // PDs are available. When enabled, it estimates the total size of the I/O tensors required by
+    // the context to ensure sufficient space on the PD before deserialization. This feature helps
+    // with memory registration failures in large models.
+    // Note that enabling this feature increases peak RAM usage during the context initialization phase
+    // in QnnContext_createFromBinary, but sustained RAM remains unaffected.
+    bool ioMemEstimation;
+    // This field enables model preparation without mapping its content on the DSP side. It is
+    // useful when a model needs to be prepared on the device but executed through a serialized
+    // binary method. This prevents extra mapping onto the DSP VA space. Set this flag only when
+    // creating the context.
+    bool isPrepareOnly;
+    // This field enables initialization acceleration, which is disabled by default.
+    // If set to true, the DSP will utilize all hardware threads to accelerate deserialization.
+    // It is not recommended to execute graphs simultaneously, as this will significantly degrade
+    // performance.
+    // Note that this feature may not be effective for small graphs with a small number of ops.
+    bool initAcceleration;
+    // This field enables skipping the crc32 check during LoRA super adapter apply, which is
+    // disabled by default. If set to true, the crc32 check for non-base adapters in the super
+    // adapter apply use case will be skipped to improve time cost.
+    // Note that the base adapter in a super adapter never does a crc32 check; therefore, its
+    // apply time cost won't improve by turning this config option on.
+    bool skipValidationOnBinarySection;
+  };
+} QnnHtpContext_CustomConfig_t;
+
+/// QnnHtpContext_CustomConfig_t initializer macro
+#define QNN_HTP_CONTEXT_CUSTOM_CONFIG_INIT \
+  { \
+    QNN_HTP_CONTEXT_CONFIG_OPTION_UNKNOWN, /*option*/ \
+    { \
+      false /*weightSharingEnabled*/ \
+    } \
+  }
+
+// clang-format on
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpDevice.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpDevice.h
new file mode 100755
index 0000000000000..e70c23577264b
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpDevice.h
@@ -0,0 +1,178 @@
+//=============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//=============================================================================
+
+/** @file
+ * @brief QNN HTP Device components
+ *
+ *        This file defines structures and supplements QnnDevice.h for the QNN HTP device
+ */
+
+#pragma once
+
+#include "QnnCommon.h"
+#include "QnnDevice.h"
+#include "QnnHtpPerfInfrastructure.h"
+#include "QnnTypes.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * This is used to represent the HTP hardware architecture.
+ * Since QnnDevice only supports V68 or newer, using a legacy ARCH will result in an error.
+ */
+typedef enum {
+  QNN_HTP_DEVICE_ARCH_NONE = 0,
+  QNN_HTP_DEVICE_ARCH_V68 = 68,
+  QNN_HTP_DEVICE_ARCH_V69 = 69,
+  QNN_HTP_DEVICE_ARCH_V73 = 73,
+  QNN_HTP_DEVICE_ARCH_V75 = 75,
+  QNN_HTP_DEVICE_ARCH_V79 = 79,
+  QNN_HTP_DEVICE_ARCH_V81 = 81,
+  QNN_HTP_DEVICE_ARCH_UNKNOWN = 0x7fffffff
+} QnnHtpDevice_Arch_t;
+
+/**
+ * Data structure to configure a device to set the minimum HTP arch;
+ * the driver will use ops that are compatible with this HTP arch.
+ */
+typedef struct {
+  uint32_t deviceId;
+  QnnHtpDevice_Arch_t arch;
+} QnnHtpDevice_Minimum_Arch_t;
+
+/**
+ * Data structure to configure a device to run in the signed/unsigned domain.
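+ *
+ * An illustrative sketch (assumes the QnnHtpDevice_CustomConfig_t wrapper
+ * declared later in this file and the QnnDevice_Config_t /
+ * QNN_DEVICE_CONFIG_OPTION_CUSTOM mechanism from QnnDevice.h):
+ * @code
+ *   QnnHtpDevice_CustomConfig_t customConfig;
+ *   customConfig.option                                        = QNN_HTP_DEVICE_CONFIG_OPTION_SIGNEDPD;
+ *   customConfig.useSignedProcessDomain.deviceId               = 0;     // placeholder device id
+ *   customConfig.useSignedProcessDomain.useSignedProcessDomain = true;  // run in a signed PD
+ *
+ *   QnnDevice_Config_t deviceConfig;
+ *   deviceConfig.option       = QNN_DEVICE_CONFIG_OPTION_CUSTOM;
+ *   deviceConfig.customConfig = &customConfig;
+ *   const QnnDevice_Config_t* deviceConfigs[] = {&deviceConfig, NULL};
+ *   // deviceConfigs is then passed to QnnDevice_create(...).
+ * @endcode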
+ */
+typedef struct {
+  uint32_t deviceId;
+  bool useSignedProcessDomain;
+} QnnHtpDevice_UseSignedProcessDomain_t;
+
+typedef void* QnnHtpDevice_UseCustomSetting_t;
+
+/**
+ * Enum listing the available custom config options.
+ */
+typedef enum {
+  QNN_HTP_DEVICE_CONFIG_OPTION_SOC = 0,
+  QNN_HTP_DEVICE_CONFIG_OPTION_ARCH = 1,
+  QNN_HTP_DEVICE_CONFIG_OPTION_SIGNEDPD = 2,
+  QNN_HTP_DEVICE_CONFIG_OPTION_CUSTOM = 3,
+  QNN_HTP_DEVICE_CONFIG_OPTION_RESERVED = 0x7fff0000,
+  QNN_HTP_DEVICE_CONFIG_OPTION_UNKNOWN = 0x7fffffff
+} QnnHtpDevice_ConfigOption_t;
+
+/**
+ * Data structure for custom config.
+ */
+typedef struct {
+  QnnHtpDevice_ConfigOption_t option;
+  union UNNAMED {
+    // This field sets the SoC model
+    uint32_t socModel;
+    // This field updates the minimum HTP arch
+    QnnHtpDevice_Minimum_Arch_t arch;
+    // This structure is used to enable/disable the signed/unsigned PD
+    QnnHtpDevice_UseSignedProcessDomain_t useSignedProcessDomain;
+    // This structure is used to enable a custom setting
+    QnnHtpDevice_UseCustomSetting_t useCustomSetting;
+    // Reserved for internal purposes
+    void* reserved;
+  };
+} QnnHtpDevice_CustomConfig_t;
+
+// For deviceType in QnnDevice_HardwareDeviceInfoV1_t
+typedef enum {
+  QNN_HTP_DEVICE_TYPE_ON_CHIP = 0,  // HTP cores are inside the SoC
+  QNN_HTP_DEVICE_TYPE_UNKNOWN = 0x7fffffff
+} QnnHtpDevice_DeviceType_t;
+
+/**
+ * @brief QNN HTP Device core type
+ * This enumeration provides information about the core type inside the SOC.
+ *
+ * For online operation, the caller should retrieve this information from
+ * `QnnDevice_getPlatformInfo`. For offline operation, the caller needs to create a
+ * `QnnDevice_CoreInfo_t` with the correct core type, and then use it to create the
+ * `QnnDevice_PlatformInfo_t`.
+ */
+typedef enum {
+  QNN_HTP_CORE_TYPE_NSP = 0,
+  QNN_HTP_CORE_TYPE_HPASS = 1,
+
+  // supported coreType values are < QNN_CORE_TYPE_MAX
+  QNN_HTP_CORE_TYPE_MAX,
+  QNN_HTP_CORE_TYPE_UNKNOWN = 0x7fffffff
+} QnnHtpDevice_CoreType_t;
+
+/**
+ * This structure provides info about the NSP device inside the SoC.
+ * For online operation, the caller should get this info from QnnDevice_getPlatformInfo.
+ * For offline operation, the caller needs to create this structure and fill in the correct
+ * information for QnnDevice_create.
+ */
+typedef struct {
+  size_t vtcmSize;       // The VTCM for this device in megabytes;
+                         // users cannot request a VTCM size exceeding this value
+  uint32_t socModel;     // An enum value defined in the QNN header that represents the SoC model
+  bool signedPdSupport;  // This field is true if the device supports signed PD
+  bool dlbcSupport;      // This field is true if the device supports DLBC
+  QnnHtpDevice_Arch_t arch;  // This field shows the architecture of this device
+} QnnHtpDevice_OnChipDeviceInfoExtension_t;
+
+/**
+ * This structure is used in QnnDevice_HardwareDeviceInfoV1_t.
+ * QnnDevice_getPlatformInfo uses this structure to list the supported device features/info.
+ */
+typedef struct _QnnDevice_DeviceInfoExtension_t {
+  QnnHtpDevice_DeviceType_t devType;
+  union UNNAMED {
+    QnnHtpDevice_OnChipDeviceInfoExtension_t onChipDevice;
+  };
+} QnnHtpDevice_DeviceInfoExtension_t;
+
+/**
+ * @brief QNN HTP Device PerfInfrastructure specialization structure.
+ *        Objects of this type are to be referenced through QnnDevice_getInfrastructure.
+ *
+ *        Contains function pointers for each interface method for the
+ *        HTP PerfInfrastructure.
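+ *
+ *        An illustrative sketch of driving these entry points (assumes
+ *        `qnnInterface` is a resolved QNN interface table and that
+ *        deviceId/coreId 0 are valid; error handling omitted):
+ * @code
+ *   QnnDevice_Infrastructure_t deviceInfra = NULL;
+ *   qnnInterface.deviceGetInfrastructure(&deviceInfra);
+ *   QnnHtpDevice_Infrastructure_t* htpInfra = (QnnHtpDevice_Infrastructure_t*)deviceInfra;
+ *   // Check htpInfra->infraType == QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF before use.
+ *   QnnHtpDevice_PerfInfrastructure_t perfInfra = htpInfra->perfInfra;
+ *
+ *   uint32_t powerConfigId = 0;
+ *   perfInfra.createPowerConfigId(0, 0, &powerConfigId);  // deviceId 0, coreId 0
+ *   // Fill a QnnHtpPerfInfrastructure_PowerConfig_t, then pass it as a
+ *   // NULL-terminated array:
+ *   //   const QnnHtpPerfInfrastructure_PowerConfig_t* configs[] = {&cfg, NULL};
+ *   //   perfInfra.setPowerConfig(powerConfigId, configs);
+ *   perfInfra.destroyPowerConfigId(powerConfigId);
+ * @endcode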
+ */ +typedef struct { + QnnHtpPerfInfrastructure_CreatePowerConfigIdFn_t createPowerConfigId; + QnnHtpPerfInfrastructure_DestroyPowerConfigIdFn_t destroyPowerConfigId; + QnnHtpPerfInfrastructure_SetPowerConfigFn_t setPowerConfig; + QnnHtpPerfInfrastructure_SetMemoryConfigFn_t setMemoryConfig; +} QnnHtpDevice_PerfInfrastructure_t; + +/// QnnHtpDevice_PerfInfrastructure_t initializer macro +#define QNN_HTP_DEVICE_PERF_INFRASTRUCTURE_INIT \ + { \ + NULL, /*createPowerConfigId*/ \ + NULL, /*destroyPowerConfigId*/ \ + NULL, /*setPowerConfig*/ \ + NULL /*setMemoryConfig*/ \ + } + +typedef enum { + QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF = 0, + QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_UNKNOWN = 0x7fffffff +} QnnHtpDevice_InfrastructureType_t; + +typedef struct _QnnDevice_Infrastructure_t { + QnnHtpDevice_InfrastructureType_t infraType; + union UNNAMED { + QnnHtpDevice_PerfInfrastructure_t perfInfra; + }; +} QnnHtpDevice_Infrastructure_t; + +// clang-format on +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpGraph.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpGraph.h new file mode 100755 index 0000000000000..f7e49e9fb8bc3 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpGraph.h @@ -0,0 +1,299 @@ +//============================================================================= +// +// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================= + +/** + * @file + * @brief QNN HTP component Graph API. + * + * The interfaces in this file work with the top level QNN + * API and supplements QnnGraph.h for HTP backend + */ + +#ifndef QNN_HTP_GRAPH_H +#define QNN_HTP_GRAPH_H + +#include "QnnGraph.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= +/** + * @brief QnnHtpGraph config value macro. Represents to use the maximum + * available number of the resource. + * + * Currently only applicable for QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE. + */ +#define QNN_HTP_GRAPH_CONFIG_OPTION_MAX 0 + +//============================================================================= +// Data Types +//============================================================================= + +/** + * @brief This enum provides different HTP graph optimization + * options that can be used to finalize the graph + * for optimum performance. + */ +typedef enum { + QNN_HTP_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD = 1, + QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_RETRIES = 2, + QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG = 3, + QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC = 4, + QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC_WEIGHTS = 5, + QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_SPARSE_WEIGHTS_COMPRESSION = 6, + QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_SLC_ALLOCATOR = 7, + QNN_HTP_GRAPH_OPTIMIZATION_TYPE_UNKNOWN = 0x7fffffff +} QnnHtpGraph_OptimizationType_t; + +// clang-format off + +/** + * @brief Struct describing the set of optimization types + * and the values associated with each optimization type. 
+ * + * Below is the Map between QnnHtpGraph_OptimizationType_t and allowable values: + * + * \verbatim embed:rst:leading-asterisk + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | # | OptimizationType option | Allowable values | + * +====+====================================================================+=====================================================================+ + * | 1 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_SCHEDULE_THRESHOLD | Reserved | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 2 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_RETRIES | Reserved | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 3 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG | Defines the optimization strategy used by the HTP backend | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 4 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC | Reserved | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 5 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC_WEIGHTS | Enables DLBC weights compression | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 6 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_SPARSE_WEIGHTS_COMPRESSION | Enables Weight Sparsity Compression | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * | 7 | QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_SLC_ALLOCATOR | Enables System Level Cache Allocator usage | + * +----+--------------------------------------------------------------------+---------------------------------------------------------------------+ + * \endverbatim + */ +typedef struct { + QnnHtpGraph_OptimizationType_t type; + float floatValue; +} QnnHtpGraph_OptimizationOption_t; + +/** + * @brief This struct encapsulates all the VTCM configurations for parallel graph execution. + * + * @code + * |<-- (1) 8MB Total Hardware VTCM -->| + * |<-- (2) 7MB Addressable -->| + * +------+------+------+------+------+------+------+------+ + * | CV | | | | | | | | + * +------+------+------+------+------+------+------+------+ + * |<-- (4) Graph A -->|<-- (4) Graph B -->| + * + * A |> 0 MB (3) Graph Offset + * B |-------------------> 3 MB + * @endcode + */ +typedef struct { + /// (4) above, the amount of VTCM used by a graph + uint32_t sizeInBytes; + /// (3) above, where in the addressable region to start VTCM. + /// Note: (3) + (4) <= (2) + uint32_t offsetInBytes; + /// (2) Addressable portion of VTCM. + /// Set to less than hardware size so Graph(s) can coexist with other VTCM clients. + uint32_t sizeTotalInBytes; + + // For ABI compatibility in the future. + // Set to 0 for now. + uint32_t reserved[3]; +} QnnHtpGraph_VtcmConfig_t; + +/** + * @brief This enum defines whether graph concurrency (i.e. multiple graphs running concurrently) + * is possible, and how to behave when circumstances for concurrency aren't possible. 
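+ *
+ *        An illustrative sketch matching the VTCM diagram above (the values are
+ *        placeholders; the parallel-execution struct is declared below):
+ * @code
+ *   QnnHtpGraph_ParallelGraphExecutionConfig_t parallelConfig = {0};
+ *   parallelConfig.concurrency                 = QNN_HTP_GRAPH_CONCURRENCY_OPTION_ALL_SHARED;
+ *   parallelConfig.vtcmConfig.sizeInBytes      = 3 * 1024 * 1024;  // (4) this graph's VTCM share
+ *   parallelConfig.vtcmConfig.offsetInBytes    = 0;                // (3) 0 MB for Graph A
+ *   parallelConfig.vtcmConfig.sizeTotalInBytes = 7 * 1024 * 1024;  // (2) addressable VTCM
+ *   // Graph B would use the same sizeTotalInBytes with offsetInBytes = 3 * 1024 * 1024.
+ * @endcode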
+ */
+typedef enum {
+  /// This graph will not be able to run concurrently with other graphs.
+  QNN_HTP_GRAPH_CONCURRENCY_OPTION_NONE = 0,
+  QNN_HTP_GRAPH_CONCURRENCY_OPTION_DEFAULT = QNN_HTP_GRAPH_CONCURRENCY_OPTION_NONE,
+  /// Graph will try to run concurrently, sharing all resources on the DSP (VTCM, HMX, HVX, etc).
+  QNN_HTP_GRAPH_CONCURRENCY_OPTION_ALL_SHARED = 1,
+  // Unused, present to ensure 32 bits.
+  QNN_HTP_GRAPH_CONCURRENCY_OPTION_UNKNOWN = 0x7fffffff
+} QnnHtpGraph_ConcurrencyOption_t;
+
+/**
+ * @brief This struct encapsulates all the configurations for parallel graph execution.
+ */
+typedef struct {
+  QnnHtpGraph_ConcurrencyOption_t concurrency;
+  QnnHtpGraph_VtcmConfig_t vtcmConfig;
+
+  // For ABI compatibility in the future.
+  // Set to 0 for now.
+  uint32_t reserved[4];
+} QnnHtpGraph_ParallelGraphExecutionConfig_t;
+
+/// The settings in this struct are only applicable
+/// for DSP architectures >= V81.
+/// Use on other SOCs will return an error.
+///
+/// Values will be defaulted to the SOC's TURBO frequency
+/// (SOC as identified by Qnn_DeviceHandle_t).
+///
+/// On automotive SDKs, HMX OP Bounding will be enabled by default.
+///
+/// On non-automotive SDKs, using this setting will enable
+/// HMX OP Bounding. It is off by default.
+typedef struct QnnHtp_HmxBoundingInfo {
+  /// Target HMX freq in Hz.
+  /// Can be derived from sysMonApp (HexagonSDK) or QProfiler.
+  float targetHmxFreqHz;
+  /// Target DSP Core freq in Hz.
+  /// Can be derived from sysMonApp (HexagonSDK) or QProfiler.
+  float targetDspCoreFreq;
+} QnnHtp_HmxBoundingInfo_t;
+
+/// QnnHtpGraph_OptimizationOption_t initializer macro
+#define QNN_HTP_GRAPH_OPTIMIZATION_OPTION_INIT \
+  { \
+    QNN_HTP_GRAPH_OPTIMIZATION_TYPE_UNKNOWN, /*type*/ \
+    0.0f /*floatValue*/ \
+  }
+// clang-format on
+
+/**
+ * @brief This enum provides different HTP graph configuration
+ *        options associated with QnnGraph
+ */
+typedef enum {
+  QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION = 1,
+  QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION = 2,
+  QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE_IN_MB = 3,
+  QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE_IN_MB,
+  QNN_HTP_GRAPH_CONFIG_OPTION_FOLD_RELU_ACTIVATION_INTO_CONV_OFF = 4,
+  QNN_HTP_GRAPH_CONFIG_OPTION_SHORT_DEPTH_CONV_ON_HMX_OFF = 5,
+  QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS = 6,
+  QNN_HTP_GRAPH_CONFIG_OPTION_FINALIZE_CONFIG = 7,
+  QNN_HTP_GRAPH_CONFIG_OPTION_NUM_CORES = 8,
+  QNN_HTP_GRAPH_CONFIG_OPTION_PARALLEL_GRAPH_EXECUTION_CONFIG = 9,
+  QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE_IN_BYTES = 10,
+  QNN_HTP_GRAPH_CONFIG_OPTION_HMX_BOUNDING = 11,
+  QNN_HTP_GRAPH_CONFIG_OPTION_WEIGHTS_PACKING = 12,
+  QNN_HTP_GRAPH_CONFIG_OPTION_ASSUME_SAME_QUANT = 13,
+  QNN_HTP_GRAPH_CONFIG_OPTION_RESERVED = 0x7fff0000,
+  QNN_HTP_GRAPH_CONFIG_OPTION_UNKNOWN = 0x7fffffff
+} QnnHtpGraph_ConfigOption_t;
+
+//=============================================================================
+// Public Functions
+//=============================================================================
+
+//------------------------------------------------------------------------------
+// Implementation Definition
+//------------------------------------------------------------------------------
+
+/**
+ * @brief A struct for different config parameters in a key-value format.
+ */
+typedef struct {
+  const char* key;
+  Qnn_Scalar_t value;
+} QnnHtpGraph_FinalizeConfig_t;
+
+/**
+ * @brief Structure describing the set of configurations supported by graph.
+ *        Objects of this type are to be referenced through QnnGraph_CustomConfig_t.
+ *
+ *        The struct has two fields: option and a union of corresponding config values.
+ *        Based on the option, the corresponding item in the union can be used to specify
+ *        the config.
+ *
+ *        Below is the map between QnnHtpGraph_ConfigOption_t and config value.
+ *
+ * \verbatim embed:rst:leading-asterisk
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | #  | Config Option                                                                     | Configuration Struct/value                   |
+ *  +====+===================================================================================+==============================================+
+ *  | 1  | QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION                                          | QnnHtpGraph_OptimizationOption_t             |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 2  | QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION                                             | Qnn_Precision_t                              |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 3  | QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE_IN_MB/QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE | uint32_t                                     |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 4  | QNN_HTP_GRAPH_CONFIG_OPTION_FOLD_RELU_ACTIVATION_INTO_CONV_OFF                    | bool                                         |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 5  | QNN_HTP_GRAPH_CONFIG_OPTION_SHORT_DEPTH_CONV_ON_HMX_OFF                           | bool                                         |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 6  | QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS                                       | uint32_t                                     |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 7  | QNN_HTP_GRAPH_CONFIG_OPTION_FINALIZE_CONFIG                                       | QnnHtpGraph_FinalizeConfig_t                 |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 8  | QNN_HTP_GRAPH_CONFIG_OPTION_NUM_CORES                                             | uint32_t                                     |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 9  | QNN_HTP_GRAPH_CONFIG_OPTION_PARALLEL_GRAPH_EXECUTION_CONFIG                       | QnnHtpGraph_ParallelGraphExecutionConfig_t   |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 10 | QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE_IN_BYTES                                    | uint32_t                                     |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 11 | QNN_HTP_GRAPH_CONFIG_OPTION_HMX_BOUNDING                                          | QnnHtp_HmxBoundingInfo_t                     |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 12 | QNN_HTP_GRAPH_CONFIG_OPTION_WEIGHTS_PACKING                                       | bool                                         |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ *  | 13 | QNN_HTP_GRAPH_CONFIG_OPTION_ASSUME_SAME_QUANT                                     | bool                                         |
+ *  +----+-----------------------------------------------------------------------------------+----------------------------------------------+
+ * \endverbatim
+ *
+ *        Values in the range 0x7fff0000 - 0x7ffffffe (QNN_HTP_GRAPH_CONFIG_OPTION_RESERVED)
+ *        are reserved for internal purposes.
+ *
+ * NOTE: Option #6 (i.e. QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS) can only be
+ *       set prior to the first execution of the graph. Subsequent executions will not use
+ *       an updated value if the user changes it after the first execution.
+ */
+typedef struct {
+  QnnHtpGraph_ConfigOption_t option;
+  union {
+    QnnHtpGraph_OptimizationOption_t optimizationOption;
+    Qnn_Precision_t precision;
+    uint32_t vtcmSizeInMB;
+    bool foldReluActivationIntoConvOff;
+    bool shortDepthConvOnHmxOff;
+    uint64_t numHvxThreads;
+    void* reserved;
+    QnnHtpGraph_FinalizeConfig_t finalizeConfig;
+    uint32_t numCores;
+    QnnHtpGraph_ParallelGraphExecutionConfig_t parallelGraphExecutionConfig;
+    uint32_t vtcmSizeInBytes;
+    QnnHtp_HmxBoundingInfo_t hmxBoundingInfo;
+    bool weightsPacking;
+    bool assumeSameQuant;
+  };
+} QnnHtpGraph_CustomConfig_t;
+
+// clang-format on
+/// QnnHtpGraph_CustomConfig_t initializer macro
+#define QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT \
+  { \
+    QNN_HTP_GRAPH_CONFIG_OPTION_UNKNOWN, /*option*/ \
+    { \
+      QNN_HTP_GRAPH_OPTIMIZATION_OPTION_INIT /*optimizationOption*/ \
+    } \
+  }
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpMem.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpMem.h
new file mode 100755
index 0000000000000..adc9ef2c52504
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpMem.h
@@ -0,0 +1,85 @@
+//==============================================================================
+//
+// Copyright (c) 2022-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef QNN_HTP_MEMORY_INFRASTRUCTURE_2_H
+#define QNN_HTP_MEMORY_INFRASTRUCTURE_2_H
+
+#include "QnnCommon.h"
+
+/**
+ * @file
+ * @brief QNN HTP Memory Infrastructure component API.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// VTCM
+//=============================================================================
+
+// clang-format off
+
+/**
+ * @brief Raw memory address that exists ONLY on the QURT
+ *        side.
+ */
+typedef uint32_t QnnHtpMem_QurtAddress_t;
+
+/**
+ * @brief Configuration for the custom shared buffer memory type.
+ *        This shared buffer is a contiguous chunk of memory identified
+ *        by a single file descriptor which will be used by multiple tensors
+ *        based on the offset provided.
+ *        Each QnnMem_register call with a different offset will return a
+ *        unique memory handle
+ */
+typedef struct {
+  // File descriptor for memory, must be set to QNN_MEM_INVALID_FD if not applicable
+  int32_t fd;
+  // Offset to be used in the contiguous shared buffer
+  uint64_t offset;
+} QnnHtpMem_SharedBufferConfig_t;
+
+// clang-format off
+
+/**
+ * @brief QNN Memory Type
+ */
+typedef enum {
+  QNN_HTP_MEM_QURT = 0,
+  QNN_HTP_MEM_SHARED_BUFFER = 1,
+  QNN_HTP_MEM_UNDEFINED = 0x7FFFFFFF
+} QnnHtpMem_Type_t;
+
+// clang-format off
+
+/**
+ * @brief Descriptor used for the QNN API
+ */
+typedef struct {
+  // Memory type identified by QnnHtpMem_Type_t
+  QnnHtpMem_Type_t type;
+  // Total size of the buffer.
+  // For memory type QURT, it is the size of a tensor.
+  // For memory type SHARED BUFFER, it is the total size of the buffer.
+  uint64_t size;
+
+  union {
+    QnnHtpMem_QurtAddress_t qurtAddress;
+    QnnHtpMem_SharedBufferConfig_t sharedBufferConfig;
+  };
+} QnnMemHtp_Descriptor_t;
+
+// clang-format on
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpPerfInfrastructure.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpPerfInfrastructure.h
new file mode 100755
index 0000000000000..f92317ac94bf2
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpPerfInfrastructure.h
@@ -0,0 +1,511 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/** @file
+ * @brief QNN HTP component Performance Infrastructure API
+ *
+ *        Provides an interface for the client to control performance and system
+ *        settings of the QNN HTP Accelerator
+ */
+
+#ifndef QNN_HTP_PERF_INFRASTRUCTURE_H
+#define QNN_HTP_PERF_INFRASTRUCTURE_H
+
+#include "QnnCommon.h"
+#include "QnnTypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// max rpc polling time allowed - 9999 us
+#define QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME 9999
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+/**
+ * @brief QNN HTP PerfInfrastructure API result / error codes.
+ *
+ */
+typedef enum {
+  QNN_HTP_PERF_INFRASTRUCTURE_MIN_ERROR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE,
+  ////////////////////////////////////////////////////////////////////////
+
+  QNN_HTP_PERF_INFRASTRUCTURE_NO_ERROR = QNN_SUCCESS,
+  QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_HANDLE_PTR = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 0,
+  QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 1,
+  QNN_HTP_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED_CONFIG = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 2,
+  QNN_HTP_PERF_INFRASTRUCTURE_ERROR_TRANSPORT = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 3,
+  QNN_HTP_PERF_INFRASTRUCTURE_ERROR_UNSUPPORTED = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 4,
+  QNN_HTP_PERF_INFRASTRUCTURE_ERROR_MEM_ALLOC = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 5,
+  QNN_HTP_PERF_INFRASTRUCTURE_ERROR_FAILED = QNN_MIN_ERROR_PERF_INFRASTRUCTURE + 6,
+
+  ////////////////////////////////////////////////////////////////////////
+  QNN_HTP_PERF_INFRASTRUCTURE_MAX_ERROR = QNN_MAX_ERROR_PERF_INFRASTRUCTURE,
+  /// UNDEFINED value that must not be used by client
+  QNN_HTP_PERF_INFRASTRUCTURE_ERROR_UNDEFINED = 0x7fffffff
+} QnnHtpPerfInfrastructure_Error_t;
+
+/**
+ * @brief Allows client to consider (non-zero value) DCVS enable/disable
+ *        and option parameters, otherwise (zero value)
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_SetDcvsEnable_t;
+
+/**
+ * @brief Allows client to start (non-zero value) or stop (zero value)
+ *        participating in DCVS
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_DcvsEnable_t;
+
+/**
+ * @brief Allows client to consider (non-zero value) latency parameter,
+ *        otherwise (zero value)
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_SetSleepLatency_t;
+
+/**
+ * @brief Allows client to set up the sleep latency in microseconds
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_SleepLatency_t;
+
+/**
+ * @brief Allows client to consider (non-zero value) sleep disable
+ *        parameter, otherwise (zero value)
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_SetSleepDisable_t;
+
+/**
+ * @brief Allows client to disable sleep or low power modes.
+ *        Pass a non-zero value to disable sleep in HTP
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_SleepDisable_t;
+
+/**
+ * @brief Allows client to consider (non-zero value) bus clock
+ *        params, otherwise (zero value)
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_SetBusParams_t;
+
+/**
+ * @brief Allows client to consider (non-zero value) core clock
+ *        params, otherwise (zero value)
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_SetCoreParams_t;
+
+/**
+ * @brief Allows client to set up the RPC control latency in microseconds
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_RpcControlLatency_t;
+
+/**
+ * @brief Allows client to set up the RPC polling time in microseconds
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_RpcPollingTime_t;
+
+/**
+ * @brief Allows client to set up the adaptive polling time in microseconds
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_AdaptivePollingTime_t;
+
+/**
+ * @brief Allows client to set up the HMX timeout interval in microseconds
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_HmxTimeoutIntervalUs_t;
+
+/**
+ * @brief Sets the minimum size by which the user heap should grow
+ *        when the heap is exhausted. This API is expected to be
+ *        called only once per backend and has a process-wide impact.
+ *
+ *        The grow size is provided in bytes and defaults to 16 MB
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_MemGrowSize_t;
+
+/**
+ * @brief Allows client to set default values for the HMX frequency.
+ *        If set to 1, the HMX vote will scale with the DCVS corner; if 0,
+ *        the HMX vote needs to be specified manually.
+ *
+ */
+typedef uint32_t QnnHtpPerfInfrastructure_HmxDefault_Vote_t;
+
+/**
+ * @brief Perf modes to specify the clock frequency level within the
+ *        target voltage corner; currently applies only to the HMX config.
+ */
+typedef enum {
+  // To select max frequency at target voltage corner.
+  QNN_HTP_PERF_INFRASTRUCTURE_CLK_PERF_HIGH = 0,
+  // To select min frequency at target voltage corner.
+  QNN_HTP_PERF_INFRASTRUCTURE_CLK_PERF_LOW,
+  /// UNKNOWN value that must not be used by client
+  QNN_HTP_PERF_INFRASTRUCTURE_CLK_PERF_UNKNOWN = 0x7fffffff
+} QnnHtpPerfInfrastructure_ClkPerfMode_t;
+
+/**
+ * @brief These are the different voltage corners that can
+ *        be requested by the client to influence the voting scheme
+ *        for DCVS
+ *
+ */
+typedef enum {
+  /// Maps to HAP_DCVS_VCORNER_DISABLE.
+  /// Disable setting up voltage corner
+  DCVS_VOLTAGE_CORNER_DISABLE = 0x10,
+  /// Maps to HAP_DCVS_VCORNER_SVS2.
+  /// Set voltage corner to minimum value supported on platform
+  DCVS_VOLTAGE_VCORNER_MIN_VOLTAGE_CORNER = 0x20,
+  /// Maps to HAP_DCVS_VCORNER_SVS2.
+  /// Set voltage corner to SVS2 value for the platform
+  DCVS_VOLTAGE_VCORNER_SVS2 = 0x30,
+  /// Maps to HAP_DCVS_VCORNER_SVS.
+  /// Set voltage corner to SVS value for the platform
+  DCVS_VOLTAGE_VCORNER_SVS = 0x40,
+  /// Maps to HAP_DCVS_VCORNER_SVS_PLUS.
+  /// Set voltage corner to SVS_PLUS value for the platform
+  DCVS_VOLTAGE_VCORNER_SVS_PLUS = 0x50,
+  /// Maps to HAP_DCVS_VCORNER_NOM.
+  /// Set voltage corner to NOMINAL value for the platform
+  DCVS_VOLTAGE_VCORNER_NOM = 0x60,
+  /// Maps to HAP_DCVS_VCORNER_NOM_PLUS.
+  /// Set voltage corner to NOMINAL_PLUS value for the platform
+  DCVS_VOLTAGE_VCORNER_NOM_PLUS = 0x70,
+  /// Maps to HAP_DCVS_VCORNER_TURBO.
+  /// Set voltage corner to TURBO value for the platform
+  DCVS_VOLTAGE_VCORNER_TURBO = 0x80,
+  /// Maps to HAP_DCVS_VCORNER_TURBO_PLUS.
+  /// Set voltage corner to TURBO_PLUS value for the platform
+  DCVS_VOLTAGE_VCORNER_TURBO_PLUS = 0x90,
+  /// Maps to HAP_DCVS_VCORNER_TURBO_L2.
+  /// Set voltage corner to TURBO_L2 value for the platform
+  DCVS_VOLTAGE_VCORNER_TURBO_L2 = 0x92,
+  /// Maps to HAP_DCVS_VCORNER_TURBO_L3.
+  /// Set voltage corner to TURBO_L3 value for the platform
+  DCVS_VOLTAGE_VCORNER_TURBO_L3 = 0x93,
+  /// Maps to HAP_DCVS_VCORNER_MAX.
+  /// Set voltage corner to maximum value supported on the platform
+  DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER = 0xA0,
+  /// UNKNOWN value that must not be used by client
+  DCVS_VOLTAGE_VCORNER_UNKNOWN = 0x7fffffff
+} QnnHtpPerfInfrastructure_VoltageCorner_t;
+
+/**
+ * @brief These are the expanded voltage corners that can
+ *        be requested by the client to influence the voting scheme
+ *        for DCVS
+ *
+ */
+typedef enum {
+  /// Maps to HAP_DCVS_EXP_VCORNER_DISABLE.
+  /// Disable setting up voltage corner
+  DCVS_EXP_VCORNER_DISABLE = 0,
+  /// Maps to HAP_DCVS_EXP_VCORNER_MIN.
+  /// Set voltage corner to minimum value supported on platform
+  DCVS_EXP_VCORNER_MIN = 0x100,
+  /// Maps to HAP_DCVS_EXP_VCORNER_LOW_SVS_D2.
+  /// Set voltage corner to LOWSVS_D2 value for the platform
+  DCVS_EXP_VCORNER_LOW_SVS_D2 = 0x134,
+  /// Maps to HAP_DCVS_EXP_VCORNER_LOW_SVS_D1.
+ /// Set voltage corner to LOWSVS_D1 value for the platform + DCVS_EXP_VCORNER_LOW_SVS_D1 = 0x138, + /// Maps to HAP_DCVS_EXP_VCORNER_LOW_SVS. + /// Set voltage corner to LOWSVS value for the platform + DCVS_EXP_VCORNER_LOW_SVS = 0x140, + /// Maps to HAP_DCVS_EXP_VCORNER_SVS. + /// Set voltage corner to SVS value for the platform + DCVS_EXP_VCORNER_SVS = 0x180, + /// Maps to HAP_DCVS_EXP_VCORNER_SVS_L1. + /// Set voltage corner to SVS_L1 value for the platform + DCVS_EXP_VCORNER_SVS_L1 = 0x1C0, + /// Maps to HAP_DCVS_EXP_VCORNER_NOM. + /// Set voltage corner to NOM value for the platform + DCVS_EXP_VCORNER_NOM = 0x200, + /// Maps to HAP_DCVS_EXP_VCORNER_NOM_L1. + /// Set voltage corner to NOM_L1 value for the platform + DCVS_EXP_VCORNER_NOM_L1 = 0x240, + /// Maps to HAP_DCVS_EXP_VCORNER_TUR. + /// Set voltage corner to TURBO value for the platform + DCVS_EXP_VCORNER_TUR = 0x280, + /// Maps to HAP_DCVS_EXP_VCORNER_TUR_L1. + /// Set voltage corner to TURBO_L1 value for the platform + DCVS_EXP_VCORNER_TUR_L1 = 0x2A0, + /// Maps to HAP_DCVS_EXP_VCORNER_TUR_L2. + /// Set voltage corner to TURBO_L2 value for the platform + DCVS_EXP_VCORNER_TUR_L2 = 0x2B0, + /// Maps to HAP_DCVS_EXP_VCORNER_TUR_L3. + /// Set voltage corner to TURBO_L3 value for the platform + DCVS_EXP_VCORNER_TUR_L3 = 0x2C0, + /// Maps to HAP_DCVS_EXP_VCORNER_MAX. + /// Selects the maximum voltage corner defined for the chipset + DCVS_EXP_VCORNER_MAX = 0xFFFF, + /// UNKNOWN value that must not be used by client + DCVS_EXP_VCORNER_UNKNOWN = 0x7fffffff +} QnnHtpPerfInfrastructure_ExpVoltageCorner_t; + +/** + * @brief This enum defines all the possible power mode + * that a client can set to influence DCVS mode + */ +typedef enum { + /// Maps to HAP_DCVS_V2_ADJUST_UP_DOWN. + /// Allows for DCVS to adjust up and down + QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_ADJUST_UP_DOWN = 0x1, + /// Maps to HAP_DCVS_V2_ADJUST_ONLY_UP. + /// Allows for DCVS to adjust up only + QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_ADJUST_ONLY_UP = 0x2, + /// Maps to HAP_DCVS_V2_POWER_SAVER_MODE. + /// Higher thresholds for power efficiency + QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_POWER_SAVER_MODE = 0x4, + /// Maps to HAP_DCVS_V2_POWER_SAVER_AGGRESSIVE_MODE. + /// Higher thresholds for power efficiency with faster ramp down + QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_POWER_SAVER_AGGRESSIVE_MODE = 0x8, + /// Maps to HAP_DCVS_V2_PERFORMANCE_MODE. + /// Lower thresholds for maximum performance + QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE = 0x10, + /// Maps to HAP_DCVS_V2_DUTY_CYCLE_MODE. + /// The below value applies only for HVX clients: + /// - For streaming class clients: + /// - detects periodicity based on HVX usage + /// - lowers clocks in the no HVX activity region of each period. + /// - For compute class clients: + /// - Lowers clocks on no HVX activity detects and brings clocks up on detecting HVX activity + /// again. + /// - Latency involved in bringing up the clock will be at max 1 to 2 ms. 
+  QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_DUTY_CYCLE_MODE = 0x20,
+  /// UNKNOWN value that must not be used by client
+  QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_UNKNOWN = 0x7fffffff
+} QnnHtpPerfInfrastructure_PowerMode_t;
+
+/**
+ * @brief This struct provides the performance infrastructure configuration
+ * associated with setting up DCVS v3, which allows the bus and core
+ * operating corners to be selected separately
+ */
+typedef struct {
+  uint32_t contextId;
+  QnnHtpPerfInfrastructure_SetDcvsEnable_t setDcvsEnable;
+  QnnHtpPerfInfrastructure_DcvsEnable_t dcvsEnable;
+  QnnHtpPerfInfrastructure_PowerMode_t powerMode;
+  QnnHtpPerfInfrastructure_SetSleepLatency_t setSleepLatency;
+  QnnHtpPerfInfrastructure_SleepLatency_t sleepLatency;
+  QnnHtpPerfInfrastructure_SetSleepDisable_t setSleepDisable;
+  QnnHtpPerfInfrastructure_SleepDisable_t sleepDisable;
+  QnnHtpPerfInfrastructure_SetBusParams_t setBusParams;
+  QnnHtpPerfInfrastructure_VoltageCorner_t busVoltageCornerMin;
+  QnnHtpPerfInfrastructure_VoltageCorner_t busVoltageCornerTarget;
+  QnnHtpPerfInfrastructure_VoltageCorner_t busVoltageCornerMax;
+  QnnHtpPerfInfrastructure_SetCoreParams_t setCoreParams;
+  QnnHtpPerfInfrastructure_VoltageCorner_t coreVoltageCornerMin;
+  QnnHtpPerfInfrastructure_VoltageCorner_t coreVoltageCornerTarget;
+  QnnHtpPerfInfrastructure_VoltageCorner_t coreVoltageCornerMax;
+} QnnHtpPerfInfrastructure_DcvsV3_t;
+
+/**
+ * @brief This struct provides the performance infrastructure configuration
+ * associated with setting up HMX v2, which allows the HMX corner to be
+ * selected separately. If hmxPickDefault is 1, all voltage corner
+ * params will be ignored. Ensure the same contextId is used as for the
+ * DCVS vote.
+ */
+typedef struct {
+  QnnHtpPerfInfrastructure_HmxDefault_Vote_t hmxPickDefault;
+  QnnHtpPerfInfrastructure_ExpVoltageCorner_t hmxVoltageCornerMin;
+  QnnHtpPerfInfrastructure_ExpVoltageCorner_t hmxVoltageCornerTarget;
+  QnnHtpPerfInfrastructure_ExpVoltageCorner_t hmxVoltageCornerMax;
+  QnnHtpPerfInfrastructure_ClkPerfMode_t hmxPerfMode;
+} QnnHtpPerfInfrastructure_HmxV2_t;
+
+/**
+ * @brief This enum defines all the possible performance
+ * options in the HTP Performance Infrastructure that
+ * relate to setting up power levels
+ */
+typedef enum {
+  /// config enum implies the usage of Dcvs v3
+  QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3 = 1,
+  /// config enum implies the usage of rpcControlLatencyConfig struct
+  QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY = 2,
+  /// config enum implies the usage of rpcPollingTimeConfig struct
+  /// this config is only supported on V69 and later
+  /// if enabled, this config is applied to the entire process
+  /// max allowed is QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME us
+  QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME = 3,
+  /// config HMX timeout interval in us. The HMX is turned off after the set
+  /// interval if there has been no interaction with it once an inference finishes.
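+  /// The interval value is supplied via the hmxTimeoutIntervalUsConfig union member below.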
+ QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_HMX_TIMEOUT_INTERVAL_US = 4, + /// config HMX V2 voting parameters only on supported chips + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_HMX_V2 = 5, + /// config enum implies the usage of adaptivePollingTime struct + /// this config can only be enabled in the RPC polling mode + /// if enabled, this config is applied to the entire process + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_ADAPTIVE_POLLING_TIME = 6, + /// UNKNOWN config option which must not be used + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN = 0x7fffffff +} QnnHtpPerfInfrastructure_PowerConfigOption_t; + +/** + * @brief This struct provides performance infrastructure configuration + * associated with setting up of power levels + */ +typedef struct { + QnnHtpPerfInfrastructure_PowerConfigOption_t option; + union UNNAMED { + QnnHtpPerfInfrastructure_DcvsV3_t dcvsV3Config; + QnnHtpPerfInfrastructure_RpcControlLatency_t rpcControlLatencyConfig; + QnnHtpPerfInfrastructure_RpcPollingTime_t rpcPollingTimeConfig; + QnnHtpPerfInfrastructure_HmxTimeoutIntervalUs_t hmxTimeoutIntervalUsConfig; + QnnHtpPerfInfrastructure_HmxV2_t hmxV2Config; + QnnHtpPerfInfrastructure_AdaptivePollingTime_t adaptivePollingTimeConfig; + }; +} QnnHtpPerfInfrastructure_PowerConfig_t; + +/// QnnHtpPerfInfrastructure_PowerConfig_t initializer macro +#define QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIG_INIT \ + { \ + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_UNKNOWN, /*config*/ \ + { \ + 0 /*dcvsV3Config*/ \ + } \ + } + +/** + * @brief This enum defines all the possible performance + * options in Htp Performance Infrastructure that + * relate to system memory settings + */ +typedef enum { + /// sets memory grow size + QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_GROW_SIZE = 1, + /// UNKNOWN config option that must not be used + QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_UNKNOWN = 0x7fffffff +} QnnHtpPerfInfrastructure_MemoryConfigOption_t; + +/** + * @brief Provides performance infrastructure configuration + * options that are memory specific + */ +typedef struct { + QnnHtpPerfInfrastructure_MemoryConfigOption_t option; + union UNNAMED { + QnnHtpPerfInfrastructure_MemGrowSize_t memGrowSizeConfig; + }; +} QnnHtpPerfInfrastructure_MemoryConfig_t; + +/// QnnHtpPerfInfrastructure_MemoryConfig_t initializer macro +#define QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIG_INIT \ + { \ + QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_UNKNOWN, /*config*/ \ + { \ + 0 /*memGrowSizeConfig*/ \ + } \ + } + +//============================================================================= +// API Methods +//============================================================================= + +/** + * @brief This API allows client to create power configuration id that + * has to be used to set different performance modes. + * Power configuration id has to be destroyed by client when not needed. + * + * @param[in] deviceId Hardware Device on which this config id needs to be created. + * + * @param[in] coreId Core/NSP on which this config id needs to be created. + * + * @param[out] powerConfigId Pointer to power configuration id to be created. 
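+ *
+ * A minimal usage sketch (hedged): createPowerConfigIdFn is an illustrative
+ * variable assumed to hold this function pointer, and device/core ids of 0
+ * are placeholders:
+ * @code
+ * uint32_t powerConfigId = 0;
+ * Qnn_ErrorHandle_t err = createPowerConfigIdFn(0, 0, &powerConfigId);
+ * if (err == QNN_SUCCESS) {
+ *     // powerConfigId can now be passed to the set-power-config call below,
+ *     // and must eventually be released via the destroy call.
+ * }
+ * @endcode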
+ *
+ * @return Error code
+ * \n QNN_SUCCESS: No error encountered
+ * \n QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if deviceId/coreId
+ * or power configuration id is NULL
+ */
+typedef Qnn_ErrorHandle_t (*QnnHtpPerfInfrastructure_CreatePowerConfigIdFn_t)(
+    uint32_t deviceId, uint32_t coreId, uint32_t* powerConfigId);
+
+/**
+ * @brief This API allows the client to destroy a power configuration id.
+ *
+ * @param[in] powerConfigId A power configuration id to be destroyed.
+ *
+ * @return Error code
+ * \n QNN_SUCCESS: No error encountered
+ * \n QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if power configuration
+ * id does not exist
+ * \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION: SSR occurrence (successful recovery)
+ * \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION_FATAL: SSR occurrence (unsuccessful recovery)
+ */
+typedef Qnn_ErrorHandle_t (*QnnHtpPerfInfrastructure_DestroyPowerConfigIdFn_t)(
+    uint32_t powerConfigId);
+
+/**
+ * @brief This API allows the client to set up a system power configuration that
+ * will enable different performance modes. This API uses the
+ * HAP_power_dcvs_v3_payload struct to configure HAP power parameters.
+ * For a detailed description of the HAP power parameters, refer to the
+ * Hexagon SDK HAP_power_dcvs_v3_payload documentation.
+ *
+ * @param[in] powerConfigId A power client id to associate calls to system
+ * power settings. A value of 0 implies a NULL power client id
+ * and can override every other setting in the user process. To
+ * enable power settings for multiple clients in the same
+ * process, use a non-zero power client id.
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ * of config options for performance configuration.
+ * NULL is allowed and indicates no config options are provided.
+ *
+ * @return Error code
+ * \n QNN_SUCCESS: No error encountered
+ * \n QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if power configuration
+ * does not exist
+ * \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION: SSR occurrence (successful recovery)
+ * \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION_FATAL: SSR occurrence (unsuccessful recovery)
+ */
+typedef Qnn_ErrorHandle_t (*QnnHtpPerfInfrastructure_SetPowerConfigFn_t)(
+    uint32_t powerConfigId, const QnnHtpPerfInfrastructure_PowerConfig_t** config);
+
+/**
+ * @brief This API allows clients to set up a configuration associated with
+ * system memory on a specific device.
+ *
+ * @param[in] deviceId Hardware Device on which this config needs to be applied.
+ *
+ * @param[in] coreId Core/NSP on which this config needs to be applied.
+ *
+ * @param[in] config Pointer to a NULL terminated array
+ * of config options for system memory configuration.
+ * NULL is allowed and indicates no config options are provided.
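+ *
+ * A hedged usage sketch: setMemoryConfigFn is an illustrative variable assumed
+ * to hold this function pointer, and the 32 MB grow size is an arbitrary example:
+ * @code
+ * QnnHtpPerfInfrastructure_MemoryConfig_t memCfg = QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIG_INIT;
+ * memCfg.option = QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_GROW_SIZE;
+ * memCfg.memGrowSizeConfig = 32u * 1024u * 1024u; // grow size in bytes
+ * const QnnHtpPerfInfrastructure_MemoryConfig_t* memConfigs[] = {&memCfg, NULL};
+ * Qnn_ErrorHandle_t err = setMemoryConfigFn(deviceId, coreId, memConfigs);
+ * @endcode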
+ *
+ * @return Error code
+ * \n QNN_SUCCESS: No error encountered
+ * \n QNN_HTP_PERF_INFRASTRUCTURE_ERROR_INVALID_INPUT if deviceId/coreId
+ * or memory configuration does not exist
+ * \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION: SSR occurrence (successful recovery)
+ * \n QNN_COMMON_ERROR_SYSTEM_COMMUNICATION_FATAL: SSR occurrence (unsuccessful recovery)
+ */
+typedef Qnn_ErrorHandle_t (*QnnHtpPerfInfrastructure_SetMemoryConfigFn_t)(
+    uint32_t deviceId, uint32_t coreId, const QnnHtpPerfInfrastructure_MemoryConfig_t** config);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // QNN_HTP_PERF_INFRASTRUCTURE_H
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProfile.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProfile.h
new file mode 100755
index 0000000000000..92381d17b0440
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProfile.h
@@ -0,0 +1,567 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/**
+ * @file
+ * @brief QNN HTP Profile component API.
+ *
+ * Requires HTP backend to be initialized.
+ * Should be used with the QnnProfile API but has HTP backend
+ * specific definitions for different QnnProfile data structures
+ *
+ */
+
+#ifndef QNN_HTP_PROFILE_H
+#define QNN_HTP_PROFILE_H
+
+#include "QnnProfile.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//=============================================================================
+// Macros
+//=============================================================================
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the ARM processor
+ * when client invokes QnnContext_createFromBinary. The value
+ * returned is time in microseconds.
+ *
+ * @note context load binary host rpc time maybe available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_HOST_RPC_TIME_MICROSEC 1002
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the HTP processor
+ * when client invokes QnnContext_createFromBinary. The value
+ * returned is time in microseconds.
+ *
+ * @note context load binary htp rpc time maybe available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_HTP_RPC_TIME_MICROSEC 1003
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the time taken to create the context on the
+ * accelerator when client invokes QnnContext_createFromBinary.
+ * The value returned is time in microseconds.
+ *
+ * @note context load binary accelerator time maybe available on both
+ * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels
+ */
+#define QNN_HTP_PROFILE_EVENTTYPE_CONTEXT_LOAD_BIN_ACCEL_TIME_MICROSEC 1004
+
+/**
+ * @brief QnnProfile_EventType_t definition to get profile information
+ * that corresponds to the remote procedure call on the ARM processor
+ * when client invokes QnnGraph_finalize.
+ * The value returned is time in microseconds.
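+ * (Its HTP-side counterpart, event 2002, is defined below.)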
+ * + * @note graph finalize host rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_HOST_RPC_TIME_MICROSEC 2001 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the remote procedure call on the HTP processor + * when client invokes QnnGraph_finalize. + * The value returned is time in microseconds. + * + * @note graph finalize htp rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_HTP_RPC_TIME_MICROSEC 2002 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to finalize the graph on the accelerator + * when client invokes QnnGraph_finalize. + * The value returned is time in microseconds. + * + * @note graph finalize accelerator time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_ACCEL_TIME_MICROSEC 2003 + +/* Graph Performance Estimate Support + * + **/ +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to Performance Estimates for the graph + * when client invokes QnnGraph_finalize. + * This is just a dummy event which will print only the heading + * with no value or unit. + * @note HTP Performance Estimates maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE 2004 + +/** + * @brief QnnProfile_EventType_t definition to get perf mode at which + * the perf estimates are collected during QnnGraph_finalize. + * The value returned is the perf mode in string with no unit. + * + * @note Perf mode maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_MODE 2005 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to simulated execution cycles during + * QnnGraph_finalize. + * The value returned is number of cycles. + * + * @note Simulated execution cycles maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_SIM_EXEC_CYCLES 2006 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to a lower estimate of simulated execution + * cycles during QnnGraph_finalize. + * The value returned is number of cycles. + * + * @note Simulated execution cycles lower estimate maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_SIM_EXEC_LOWER_CYCLES 2007 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to a upper estimate of simulated execution + * cycles during QnnGraph_finalize. + * The value returned is number of cycles. 
+ * + * @note Simulated execution cycles upper estimate maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_SIM_EXEC_UPPER_CYCLES 2008 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to DDR information for each HTP during + * QnnGraph_finalize. + * This is just a dummy event which will print only the heading + * with no value or unit. + * + * @note DDR Information for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_BANDWIDTH_STATS 2009 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the HTP ID on chip during QnnGraph_finalize. + * The value returned is the HTP ID with no unit. + * + * @note HTP ID's maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_BANDWIDTH_STATS_HTP_ID 2010 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the Graph defined inputs or the total reads + * (in bytes) from DDR for graph input related tensors (weights, + * bias, activations) which do not have predecessors. + * The value returned is the num of blocks in bytes. + * + * @note Graph defined inputs for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_INPUT_FILL 2011 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total reads (in bytes) from DDR for + * compiler generated fill operators which have predecessors and + * successors and originate on the same HTP. + * The value returned is the num of blocks in bytes. + * + * @note Intermediate Fill Information for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_INTERMEDIATE_FILL 2012 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total writes (in bytes) from DDR for + * compiler generated fill operators which have predecessors and + * successors and originate on the same HTP. + * The value returned is the num of blocks in bytes. + * + * @note Intermediate Spill Information for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_INTERMEDIATE_SPILL 2013 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total reads (in bytes) from DDR for + * fills which were generated by a different HTP core and do not + * have a predecessor, but have a successor. + * The value returned is the num of blocks in bytes. 
+ * + * @note Inter HTP Fill Information for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_INTER_HTP_FILL 2014 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total writes (in bytes) from DDR for + * fills which were generated by a different HTP core and do not + * have a successor, but have a predecessor. + * The value returned is the num of blocks in bytes. + * + * @note Inter HTP Spill Information for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_INTER_HTP_SPILL 2015 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total writes (in bytes) to DDR for + * graph output related tensors which do not have successors. + * The value returned is the num of blocks in bytes. + * + * @note Graph output related tensors for each HTP maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_OUTPUT_SPILL 2016 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the total number of missing ops which do + * not have any cost associated with them while getting the graph + * performance estimates. + * The value returned is the num of missing ops with no unit. + * + * @note Number of missing cost ops maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_MISSING_COST_OPS 2017 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the op ids of the missing ops which do + * not have any cost associated with them while getting the graph + * performance estimates. + * The value returned is the opname along with the op id (decimal + * format) of the ops which does not have any costs associated + * with them. + * + * @note Opname and Op ids of missing cost ops are available only with + * QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_FINALIZE_PERF_ESTIMATE_MISSING_COST_OPID 2018 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the remote procedure call on the ARM processor + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. + * The value returned is time in microseconds. + * + * @note graph execute host rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HOST_RPC_TIME_MICROSEC 3001 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the remote procedure call on the HTP processor + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. + * The value returned is time in microseconds. + * + * @note graph execute htp rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_HTP_RPC_TIME_MICROSEC 3002 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to execute the graph on the accelerator + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. 
+ * The value returned is number of processor cycles taken. + * + * @note graph execute accelerator time maybe available only on + * QNN_PROFILE_LEVEL_DETAILED levels + * + * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have + * multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE. + * There will be a sub-event for each node that was added to the graph + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_CYCLE 3003 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to execute the graph on the accelerator + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. + * The value indicates execute including wait/resource acquisition + * time on the accelerator, if applicable in multi-threaded scenarios. + * The value returned is time taken in microseconds + * + * @note graph execute accelerator time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + * + * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have + * multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE / QNN_PROFILE_EVENTUNIT_MICROSEC + * There will be a sub-event for each node that was added to the graph + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_MICROSEC 3004 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to time taken for miscellaneous work i.e. time + * that cannot be attributed to a node but are still needed to + * execute the graph on the accelerator. This occurs when client invokes + * QnnGraph_execute or QnnGraph_executeAsync. + * The value returned is time taken in microseconds + * + * @note graph execute misc accelerator time is available only on + * QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_MISC_ACCEL_TIME_MICROSEC 3005 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to time taken for a graph yield instance to + * release all its resources to the other graph. + * The value returned is time taken in microseconds. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_RELEASE_TIME 3006 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to time a graph spends waiting for a higher + * priority graph to finish execution. + * The value returned is time taken in microseconds + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_WAIT_TIME 3007 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to time a graph spends re-acquiring resources + * and restoring vtcm. + * The value returned is time taken in microseconds + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_INSTANCE_RESTORE_TIME 3008 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the number of times that a yield occured + * during execution + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_YIELD_COUNT 3009 + +/** + * @brief QnnProfile_EventType_t definition for time a graph waits to get + * VTCM. This should be constant UNLESS we need another graph to yield. + * The value returned is time taken in microseconds. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_VTCM_ACQUIRE_TIME 3010 + +/** + * @brief QnnProfile_EventType_t definition for time a graph waits to get + * HMX + HVX, and turn them all on. + * The value returned is time taken in microseconds. 
+ */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_RESOURCE_POWER_UP_TIME 3011 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to execute the graph on the accelerator + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. + * The value indicates execute excluding wait/resource acquisition + * time on the accelerator, if applicable in multi-threaded scenarios. + * The value returned is time taken in microseconds + * + * @note graph execute accelerator time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + * + * @note When QNN_PROFILE_LEVEL_DETAILED is used, this event can have + * multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE / QNN_PROFILE_EVENTUNIT_MICROSEC + * There will be a sub-event for each node that was added to the graph + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_EXCL_WAIT_TIME_MICROSEC 3012 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the remote procedure call on the ARM processor + * when client invokes QnnContext_free which in consequence deinit graph. + * The value returned is time in microseconds. + * + * @note graph deinit host rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_DEINIT_HOST_RPC_TIME_MICROSEC 4001 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the remote procedure call on the HTP processor + * when client invokes QnnContext_free which in consequence deinit graph. + * The value returned is time in microseconds. + * + * @note graph deinit htp rpc time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_DEINIT_HTP_RPC_TIME_MICROSEC 4002 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to the time taken to deinit graph on the + * accelerator when client invokes QnnContext_free which in consequence + * deinit graph. The value returned is time in microseconds. + * + * @note graph deinit accelerator time maybe available on both + * QNN_PROFILE_LEVEL_BASIC and QNN_PROFILE_LEVEL_DETAILED levels + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_DEINIT_ACCEL_TIME_MICROSEC 4003 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. This value represents the amount of time an op spends + * waiting for execution on the main thread since the last op on the main + * thread due to scheduling and can be interpreted appropriately in + * conjunction with the unit. + * + * @note node wait information is available on QNN_HTP_PROFILE_LEVEL_LINTING level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_WAIT 5001 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. This value represents the amount of time at least one + * background op is running during the execution of an op on the main thread + * and can be interpreted appropriately in conjunction with the unit. + * + * @note node overlap information is available on QNN_HTP_PROFILE_LEVEL_LINTING level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_OVERLAP 5002 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. 
This value represents the amount of time at least one + * background op that is not being waited upon to finish is running during + * the wait period of an op on the main thread and can be interpreted + * appropriately in conjunction with the unit. + * + * @note node wait overlap information is available on QNN_HTP_PROFILE_LEVEL_LINTING + * level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_WAIT_OVERLAP 5003 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. This value represents a bitmask denoting the resources + * an op uses. + * + * @note node specific information is available on QNN_HTP_PROFILE_LEVEL_LINTING level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_RESOURCEMASK 5004 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. This value represents the ID of an op running in parallel to + * an op running on the main thread or on HMX. + * + * @note node specific information is available on QNN_HTP_PROFILE_LEVEL_LINTING level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_CRITICAL_BG_OP_ID 5005 + +/** + * @brief QnnProfile_EventType_t definition to get data related to execution of + * an operation. This value represents the ID of an op running on threads other + * than the main or the HMX thread when the main and the HMX threads are not + * executing any op. + * + * @note node specific information is available on QNN_HTP_PROFILE_LEVEL_LINTING level + */ +#define QNN_HTP_PROFILE_EVENTTYPE_NODE_WAIT_BG_OP_ID 5006 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to execute the graph's critical path on the accelerator + * when client invokes QnnGraph_execute or QnnGraph_executeAsync. + * The value returned is number of processor cycles taken. + * + * @note graph execute accelerator time maybe available only on + * QNN_HTP_PROFILE_LEVEL_LINTING levels + * + * @note When QNN_HTP_PROFILE_LEVEL_LINTING is used, this event can have + * multiple sub-events of type QNN_PROFILE_EVENTTYPE_NODE. + * There will be a sub-event for each node that was added to the graph + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_CRITICAL_ACCEL_TIME_CYCLE 6001 + +/** + * @brief Linting QnnProfile_Level_t definition that allows collecting in-depth + * performance metrics for each op in the graph including main thread + * execution time and time spent on parallel background ops. + */ +#define QNN_HTP_PROFILE_LEVEL_LINTING 7001 + +/** + * @brief QnnProfile_EventType_t definition to get number of HVX threads + * configured by a graph. Different graphs can have a different + * value. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_NUMBER_OF_HVX_THREADS 8001 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to applying binary section for updatable tensors + * when client invokes QnnContext_ApplyBinarySection. + * It refers to the total time the entire API takes. + * The value returned is time taken in microseconds. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_APPLY_BINARY_SECTION_QNN 9001 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to applying binary section for updatable tensors + * when client invokes QnnContext_ApplyBinarySection. + * It refers to the time of callTransport. + * The value returned is time taken in microseconds. 
+ */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_APPLY_BINARY_SECTION_RPC 9002 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to applying binary section for updatable tensors + * when client invokes QnnContext_ApplyBinarySection. + * It refers to the remote procedure call on the HTP processor. + * The value returned is time taken in microseconds. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_APPLY_BINARY_SECTION_QNN_ACC 9003 + +/** + * @brief QnnProfile_EventType_t definition to get profile information + * that corresponds to applying binary section for updatable tensors + * when client invokes QnnContext_ApplyBinarySection. + * It refers to the Hexnn call + * The value returned is time taken in microseconds. + */ +#define QNN_HTP_PROFILE_EVENTTYPE_GRAPH_APPLY_BINARY_SECTION_ACC 9004 + + + +#ifdef __cplusplus +} +#endif + +#endif // QNN_HTP_PROFILE_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProperty.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProperty.h new file mode 100755 index 0000000000000..51440061dc611 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpProperty.h @@ -0,0 +1,30 @@ +//============================================================================== +// +// Copyright (c) 2022 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef QNN_HTP_PROPERTY_H +#define QNN_HTP_PROPERTY_H + +#include "QnnProperty.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= +/** + * @brief Property key for determining whether a backend supports unsigned pd. + */ +#define QNN_PROPERTY_CUSTOM_HTP_UNSIGNED_PD_SUPPORT QNN_PROPERTY_GROUP_CUSTOM + 1 + +#ifdef __cplusplus +} +#endif + +#endif // QNN_HTP_PROPERTY_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpSystemContext.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpSystemContext.h new file mode 100755 index 0000000000000..dcfedcb3f6450 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/QnnHtpSystemContext.h @@ -0,0 +1,119 @@ +//============================================================================== +// +// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +// All rights reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +/** + * @file + * @brief QNN HTP component System Context API. + * + * The interfaces in this file work with the top level QNN + * API and supplements QnnSystemContext.h for HTP backend + */ + +#ifndef QNN_HTP_SYSTEM_CONTEXT_H +#define QNN_HTP_SYSTEM_CONTEXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +//============================================================================= +// Macros +//============================================================================= +typedef enum { + // Following version with hwInfoBlobVersion as: + // - Major 0, Minor: 0, Patch: 1 + QNN_SYSTEM_CONTEXT_HTP_HW_INFO_BLOB_VERSION_V1 = 0x01, + // Unused, present to ensure 32 bits. 
+  QNN_SYSTEM_CONTEXT_HTP_HW_INFO_BLOB_UNDEFINED = 0x7FFFFFFF
+} QnnHtpSystemContext_HwInfoBlobVersion_t;
+
+// This struct gets populated within a binary blob as part of hwInfoBlob in
+// QnnSystemContext_BinaryInfoV#_t struct in QnnSystemContext.h
+typedef struct QnnHtpSystemContext_HwBlobInfoV1 {
+  // This value represents the index of the list of graphs registered
+  // to this context as specified in QnnSystemContext_GraphInfo_t*
+  uint32_t graphListIndex;
+  // Stores the spill-fill buffer size used by each of the graphs
+  uint64_t spillFillBufferSize;
+} QnnHtpSystemContext_HwBlobInfoV1_t;
+
+typedef struct {
+  QnnHtpSystemContext_HwInfoBlobVersion_t version;
+  union UNNAMED {
+    QnnHtpSystemContext_HwBlobInfoV1_t contextBinaryHwInfoBlobV1_t;
+  };
+} QnnHtpSystemContext_HwBlobInfo_t;
+
+typedef enum {
+  // Following version with GraphInfoBlobVersion as:
+  // - Major 0, Minor: 0, Patch: 1
+  QNN_SYSTEM_CONTEXT_HTP_GRAPH_INFO_BLOB_VERSION_V1 = 0x01,
+  // Unused, present to ensure 32 bits.
+  QNN_SYSTEM_CONTEXT_HTP_GRAPH_INFO_BLOB_UNDEFINED = 0x7FFFFFFF
+} QnnHtpSystemContext_GraphInfoBlobVersion_t;
+
+// This struct gets populated within a binary blob as part of GraphInfoBlob in
+// QnnSystemContext_BinaryInfoV#_t struct in QnnSystemContext.h
+typedef struct {
+  // Stores the spill-fill buffer size used by each of the graphs
+  uint64_t spillFillBufferSize;
+  // HTP vtcm size (MB)
+  uint32_t vtcmSize;
+  // Optimization level
+  uint32_t optimizationLevel;
+  // Htp Dlbc
+  uint8_t htpDlbc;
+  // Number of HVX threads to reserve
+  uint64_t numHvxThreads;
+} QnnHtpSystemContext_GraphBlobInfoV1_t;
+
+typedef struct {
+  QnnHtpSystemContext_GraphInfoBlobVersion_t version;
+  union UNNAMED {
+    QnnHtpSystemContext_GraphBlobInfoV1_t contextBinaryGraphBlobInfoV1;
+  };
+} QnnHtpSystemContext_GraphBlobInfo_t;
+
+typedef enum {
+  // Following version with ContextInfoBlobVersion as:
+  // - Major 0, Minor: 0, Patch: 1
+  QNN_SYSTEM_CONTEXT_HTP_CONTEXT_INFO_BLOB_VERSION_V1 = 0x01,
+  // Unused, present to ensure 32 bits.
+  QNN_SYSTEM_CONTEXT_HTP_CONTEXT_INFO_BLOB_UNDEFINED = 0x7FFFFFFF
+} QnnHtpSystemContext_ContextInfoBlobVersion_t;
+
+typedef struct {
+  /// An integer representation of SocUtility::DspArch
+  uint32_t dspArch;
+} QnnHtpSystemContext_ContextBlobInfoV1_t;
+
+typedef struct {
+  QnnHtpSystemContext_ContextInfoBlobVersion_t version;
+  union UNNAMED {
+    QnnHtpSystemContext_ContextBlobInfoV1_t contextBinaryContextBlobInfoV1;
+  };
+} QnnHtpSystemContext_ContextBlobInfo_t;
+
+//=============================================================================
+// Data Types
+//=============================================================================
+
+//=============================================================================
+// Public Functions
+//=============================================================================
+
+//=============================================================================
+// Implementation Definition
+//=============================================================================
+
+// clang-format on
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
\ No newline at end of file
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/afuncs.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/afuncs.h
new file mode 100755
index 0000000000000..28b5685f29750
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/afuncs.h
@@ -0,0 +1,338 @@
+//==============================================================================
+//
+// Copyright (c) 2018, 2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef AFUNCS_H
+#define AFUNCS_H 1
+
+#include <stdint.h>
+#include <math.h>
+#include "dtype.h"
+#ifndef __hexagon__
+#include <string.h> // for memcpy etc
+#endif
+// #include "asm_define.h"
+#include "builtin_intrinsics.h"
+#include "macros_attribute.h"
+
+struct tile_data {
+    uint8_t **addr;
+    uint32_t offset_t_col;
+    uint32_t offset_t_row;
+    uint32_t width;
+    uint32_t height;
+    uint32_t depth;
+};
+
+// Define order: .addr, .offset_t_col, .offset_t_row, .width, .height, .depth
+#define TILEDATA(adrtab, next_tab_col, next_tab_row, h, w, d) \
+    { \
+        (uint8_t **)(adrtab), static_cast<uint32_t>(next_tab_col), static_cast<uint32_t>(next_tab_row), \
+                static_cast<uint32_t>(w), static_cast<uint32_t>(h), static_cast<uint32_t>(d) \
+    }
+
+/*=======================================*/
+/* Auxiliary functions                   */
+/*=======================================*/
+#if defined(__hexagon__)
+inline int32_t max_i32(int32_t a, int32_t b)
+{
+    return Q6_R_max_RR(a, b);
+}
+inline int32_t min_i32(int32_t a, int32_t b)
+{
+    return Q6_R_min_RR(a, b);
+}
+inline uint32_t max_u32(uint32_t a, uint32_t b)
+{
+    return Q6_R_maxu_RR(a, b);
+}
+inline uint32_t min_u32(uint32_t a, uint32_t b)
+{
+    return Q6_R_minu_RR(a, b);
+}
+#else
+inline int32_t max_i32(int32_t a, int32_t b)
+{
+    return (a < b) ? b : a;
+}
+inline int32_t min_i32(int32_t a, int32_t b)
+{
+    return (a < b) ? a : b;
+}
+inline uint32_t max_u32(uint32_t a, uint32_t b)
+{
+    return (a < b) ? b : a;
+}
+inline uint32_t min_u32(uint32_t a, uint32_t b)
+{
+    return (a < b) ? a : b;
+}
+#endif
+
+[[maybe_unused]] inline ALWAYSINLINE int64_t roundf_i64(float val)
+{
+    // add 0.5 (with same sign as val) and then conversion to int truncates toward 0.
+    // values exactly halfway will round away from 0 (like roundf).
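+    // For example, roundf_i64(2.5f) == 3 and roundf_i64(-2.5f) == -3.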
+ + return (int64_t)(val + copysignf(0.5f, val)); +} + +[[maybe_unused]] inline ALWAYSINLINE NN_INT32_T roundf_i32(float val) +{ + // add 0.5 (with same sign as val) and then conversion to int truncates toward 0. + // values exactly halfway will round away from 0 (like roundf). + + return (int)(val + copysignf(0.5f, val)); +} +// same thing for rounding to unsigned range; -ve inputs will give 0. +// +[[maybe_unused]] inline ALWAYSINLINE uint32_t roundf_u32(float val) +{ + // add 0.5f and then convert to uint (trunc towards 0; -ve values are clipped to 0). +#ifdef __hexagon__ + // use intrinsic since conv of -ve float to unsigned is 'undefined behaviour' in C. + return Q6_R_convert_sf2uw_R_chop(val + 0.5f); +#else + return (val < 0.5f) ? 0 : (uint32_t)(val + 0.5f); +#endif +} + +[[maybe_unused]] inline ALWAYSINLINE NN_INT32_T roundd_i32(double val) +{ + // add 0.5 (with same sign as val) and then conversion to int truncates toward 0. + // values exactly halfway will round away from 0 (like round). + + return (int)(val + copysign(0.5, val)); +} + +[[maybe_unused]] inline ALWAYSINLINE NN_INT32_T saturate_u8(NN_INT32_T val) +{ +#ifdef __hexagon__ + return Q6_R_satub_R(val); +#else + return (val < 0) ? 0 : ((val > 255) ? 255 : val); +#endif +} + +[[maybe_unused]] inline ALWAYSINLINE NN_INT32_T saturate_u16(NN_INT32_T val) +{ +#ifdef __hexagon__ + return Q6_R_satuh_R(val); +#else + return (val < 0) ? 0 : ((val > 65535) ? 65535 : val); +#endif +} + +[[maybe_unused]] static inline ALWAYSINLINE NN_INT32_T saturate_i16(NN_INT32_T val) +{ +#ifdef __hexagon__ + return Q6_R_sath_R(val); +#else + return (val < -32768) ? -32768 : ((val > 32767) ? 32767 : val); +#endif +} + +/** + * @brief low-cost frexpf (but only the exponent result); + * Generates only a few instructions on hexagon. + * + * Input must not be inf,nan, zero, or denormal. + * + * returns: + * -1 if abs(x) is in range 0.25 ... 0.249999 + * 0 if abs(x) is in range 0.5 ... 0.99999 + * 1 if abs(x) is in range 1.0 .. 1.9999 + * etc + * + * If the value -126 is returned, x is a zero or denormal; + * 129 is returned for inf or NaN. for other cases the value is the same + * as what frexpf (in math.h) generates for the exponent. + */ +[[maybe_unused]] inline ALWAYSINLINE constexpr int flt_getexp(float x) +{ + union { + float f; + uint32_t u32; + } const uu = {x}; + return ((uu.u32 >> 23u) & 0xFFu) - 126; +} +/** + * @brief low-cost frexpf (but only the 'fraction' result); + * Generates only a few instructions on hexagon. + * + * Input must not be inf,nan, zero, or denormal. + * + * returns a value in the range [0.5, 1.0) (or in (-1.0,-0.5] when x < 0) + * such that x = flt_getmant(x) * powf2(2.0, flt_getexp(x)) + * + */ +[[maybe_unused]] inline ALWAYSINLINE constexpr float flt_getmant(float x) +{ + union { + float f; + uint32_t u32; + } uu = {x}; + uu.u32 = (uu.u32 & 0x807fffffu) | (uint32_t(126) << 23u); // force exponent = 126 + return uu.f; +} + +/** + * @brief returns the mantissa of x, as a 24-bit number + * in the range 0x800000 .. 0xFFFFFF + * + * Input must not be inf,nan, zero, or denormal. + * + * Sign is discarded. same as powf(2,24) * flt_getmant(fabsf(x)). + */ +[[maybe_unused]] inline ALWAYSINLINE constexpr int32_t flt_getfrac(float x) +{ + union { + float f; + uint32_t u32; + } const uu = {x}; + int32_t const m = (uu.u32 & 0x007fffffu) | (uint32_t(1) << 23u); + return m; +} + +// +// This 'normalizes' a float to 0.5 .. 
0.9999 (sign is retained)
+// Same result as the return value from frexpf, without using a function call
+// Results are not valid if x is 0, denormal, or inf/nan
+//
+[[maybe_unused]] inline ALWAYSINLINE float flt_getfrac_norm(float x)
+{
+    union {
+        float f;
+        uint32_t u32;
+    } uu = {x};
+    uu.u32 = (uu.u32 & 0x807fffffu) | (uint32_t(126) << 23u); // force exponent = 126
+    return uu.f;
+}
+/**
+ * @brief low-cost 2.0**n for integer n.
+ * Same as powf(2.0f, iexpo) without a function call;
+ *
+ * Constraint: iexpo must be in range -126..127
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr float flt_power2(uint32_t const iexpo)
+{
+    uint32_t const a = (iexpo + 127) & 0xFFu;
+    union {
+        uint32_t u32;
+        float f;
+    } const uu = {a << 23u};
+    return uu.f;
+}
+/**
+ * @brief low-cost ldexpf
+ * Same as ldexpf(val, iexpo) without a function call;
+ *
+ * Constraint: iexpo must be in range -126..127
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr float flt_ldexp(float val, int iexpo)
+{
+    return val * flt_power2(iexpo);
+}
+/**
+ * @brief low-cost 2.0**n for integer n.
+ * Same as pow(2.0d, iexpo) without a function call;
+ *
+ * Constraint: iexpo must be in range -1022..1023
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr double double_power2(uint32_t const iexpo)
+{
+    uint64_t const a = (iexpo + 1023) & 0x7FFu;
+    union {
+        uint64_t u64;
+        double d;
+    } const uu = {a << 52u};
+    return uu.d;
+}
+/**
+ * @brief low-cost ldexp
+ * Same as ldexp(val, iexpo) without a function call;
+ *
+ * Constraint: iexpo must be in range -1022..1023
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr double double_ldexp(double val, int iexpo)
+{
+    return val * double_power2(iexpo);
+}
+
+/**
+ * @brief returns the exponent and mantissa of x, as a n-bit number
+ *
+ * Constraint: iexpo must be in range -126..127
+ * Input must not be negative, inf,nan, zero, or denormal.
+ */
+template <unsigned MBITS> inline constexpr std::pair<int32_t, uint32_t> get_scalefactor(float x)
+{
+    union {
+        float f;
+        uint32_t u32;
+    } const uu = {x};
+
+    uint32_t inval = uu.u32;
+    uint32_t const mask = hnnx::safe_lshift(1, MBITS) - 1;
+    inval = hnnx::safe_rshift(inval + hnnx::safe_lshift(1, (24 - MBITS - 1)),
+                              (24 - MBITS)); // possibly overflows into exponent, but that's OK.
+    uint32_t const m = ((inval & mask) | hnnx::safe_lshift(1u, (MBITS - 1)));
+    int32_t const e = int32_t(hnnx::safe_rshift(inval, (MBITS - 1)) & 0xFFu) - 126;
+    return {e, m};
+}
+
+/**
+ * @brief returns the parameters for scaling.
+ * bit 31-24: left shift amount
+ * bit 23-16: right shift amount
+ * bit 15- 0: scale factor
+ *
+ * Input must not be inf,nan, zero, negative or denormal.
+ *
+ */
+[[maybe_unused]] inline ALWAYSINLINE constexpr uint32_t get_scaling_params(float x, int max_sl, int max_sr)
+{
+    auto [e, m] = get_scalefactor<15>(x);
+    // Set a sl or sr amount to perform a multiply of 2^exponent by mantissa.
+    int sl = (e > 0) ? e : 0;
+    int sr = (e > 0) ? 0 : -e;
+    // The max_sl allows the addition of extra left shifts when working with small numbers having negative exponents.
+    // For every extra left shift, there is an offsetting right shift added so that the net right shift amount
+    // required from the exponent stays the same. The max_sr parameter provides a ceiling to the required offsetting
+    // right shifts, preventing the total right shift requirement from being large enough to erase data through shifting.
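+    // Illustration (values assumed): for e = -3 (so sl = 0, sr = 3), a max_sl of 3
+    // or more and a max_sr of 6 would yield sl = 3, sr = 6; the net shift sr - sl
+    // still equals 3.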
+ if (sl == 0 && sr > 0) { + sl = min_i32(max_sl, max_i32(max_sr - sr, 0)); + sr = sr + sl; + } + return ((uint32_t(sl) & 0x0FFu) << 24u) | ((uint32_t(sr) & 0x0FFu) << 16u) | uint32_t(m); +} + +/** + * @brief given a scale in float and a recip shift amount + * return a quantized scale multiplier and change recip shamt inplace + * + */ +inline uint32_t get_quantized_multipiler(const float scale_f, int &recip_shamt) +{ + recip_shamt = (scale_f <= 1.0f) ? 0 : flt_getexp(scale_f); + uint32_t scale = static_cast(roundf(flt_ldexp(scale_f, (31 - recip_shamt)))); + scale = (scale < 0x7fffffffu) ? scale : 0x7FFFFFFFu; + return scale; +} + +/** + * @brief given a scale in float and a recip shift amount + * return a quantized scale multiplier and change recip shamt inplace + * + */ +//Now with corrected spelling +inline uint32_t get_quantized_multiplier(const float scale_f, int &recip_shamt) +{ + return get_quantized_multipiler(scale_f, recip_shamt); +} +#endif /*AFUNCS_H*/ diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/allocator.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/allocator.h new file mode 100755 index 0000000000000..844bcf4c7ec50 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/allocator.h @@ -0,0 +1,236 @@ +//============================================================================== +// +// Copyright (c) 2020 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef ALLOCATOR_H +#define ALLOCATOR_H 1 + +#include +#include +#include +#include "dtype_enum.h" +#include "weak_linkage.h" +#include "macros_attribute.h" +#include "forward_classes.h" +#include "hexagon_nn_types.h" + +enum class MemoryClass { + Plain, + TCM, + UnCached, // for spill/fill DDR + XXX_LAST_MEMORY_TYPE, + Default = Plain +}; + +PUSH_VISIBILITY(default) + +extern bool TrackedAllocError; + +class Graph; +class HexagonNNEnv; +namespace fa { +struct PoolDesc; +struct BigBuff; +struct RuntimeAllocator; +} // namespace fa +namespace hnnx { + +class Serializer; +class Deserializer; + +// some options flags (powers of 2) for calls to Tensor::allocate +enum AllocOptions { + uncached_int8 = 0x1, // override MemoryClass to UnCached. + uncached_int16 = 0x2, + uncached_fp16 = 0x4 +}; + +/* + * Maybe FIXME: It seems like FancyAllocator has just about all the same interfaces as Allocator, + * is all this pimpl stuff needed, or could we just inherit Allocator and have a unique_ptr + * in our graph? + */ + +class Allocator { + public: + // MIN_ALIGN, MAX_ALIGN: + // - both must be powers of 2 + // - 8 <= MIN_ALIGN <= MAX_ALIGN + // All allocations will be aligned to at least MIN_ALIGN, both start and end of each region. + // This includes sub-allocations in memory pools. + // Alignment requests > MAX_ALIGN may be treated as MAX_ALIGN if allocated in DDR. + // + static constexpr unsigned MIN_ALIGN = 256; + static constexpr unsigned MAX_ALIGN = 256; + + // The alignment used by TCM allocation; >= MIN_ALIGN + static constexpr unsigned TCM_ALLOC_ALIGN = 2048; + + static void *vacant() { return (void *)2; } // special value for 'vacant' slot. 
+    enum Mode { AllocVirtual, AllocPhysical, AllocTemp, AllocTempEnd, AllocComplete, LastMode = AllocComplete };
+
+    // AllocTemp/AllocTempEnd are used in Virtual mode, to set a 'Temp Physical' mode
+    // where allocation is done to physical memory, but into memory blocks which
+    // are discarded when we return via AllocTempEnd (so, AllocTempEnd is not possible as an actual
+    // current mode).
+    // This is intended to support nesting (multiple levels of AllocTemp), where each
+    // AllocTempEnd discards all allocs since the matching AllocTemp; but
+    // currently nesting is not supported, so AllocTemp must be followed by AllocTempEnd,
+    // which actually takes you back to AllocVirtual.
+    // AllocComplete allows no further allocations. A deserialized allocator
+    // is in this state.
+
+    API_EXPORT Allocator(Mode mode_in, Graph &graph_in) : graph(graph_in), mode(mode_in){};
+    API_EXPORT virtual ~Allocator() = 0;
+
+    Graph &graph;
+
+    // Either allocates enough, or dips into a buffer (and changes the buffer pointer and size parameter accordingly).
+    // al is an alignment parameter; it must be a power of 2 or the code below won't work.
+    API_EXPORT void *tracked_aligned_alloc(size_t al, size_t bytes, fa::BigBuff *const bb = nullptr);
+    API_EXPORT void tracked_free(void *aligned_ptr) noexcept;
+
+    API_EXPORT virtual void allocate_n(void **arrp, size_t n, size_t block_size, size_t alignment, MemoryClass memclass,
+                                       unsigned options, DType dtype);
+
+    // options for allocate_persistent_blocks.
+    // if 'allnew' is *not* present, it is assumed that all of the pointers
+    // are either null, or point to existing persistent blocks. The 'null' ones
+    // are replaced with new allocations, and the ref counts are increased in both cases.
+    // with 'allnew': pointers are assumed to contain garbage. Equivalent to zeroing the
+    // pointer table first.
+    //
+    // zoneB: with this, ref counts are updated in the 'B' zone instead of A.
+    //
+    // incref: overrides 'allnew'; all of the existing pointers are required to be valid persistent
+    // blocks; the ref counts are increased by 1
+    // decref: overrides 'incref' and 'allnew'; all of the pointers are required to be valid persistent
+    // blocks; the ref counts are reduced by 1. If total refs are zero, block is freed.
+    // the pointer table is not updated.
+    //
+    // infinite: newly alloc'd blocks get refcount set to a huge number, instead of 1.
+    // Currently this is used when deserializing, since we can't free things immediately when in Crate.
+    //
+    enum persistent_options {
+        allnew = 1u, // assume existing pointers are garbage, allocate them all.
+        zoneB = 2u, // reference count in zone B instead of A.
+        incref = 4u, // enforce that all existing are persistent; incref them.
+        decref = 8u,
+        infinite = 16u, // refcounts on new blocks, set to a huge # instead of 1.
+    };
+
+    // allocate n 'persistent' blocks of the given size/alignment, and update the table.
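+    // 'options' is a bitwise OR of the persistent_options flags above.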
+    // allocate n 'persistent' blocks of the given size/alignment, and update the table.
+    API_EXPORT virtual void allocate_persistent_blocks(void **table, size_t nblocks, size_t block_size,
+                                                       size_t alignment, unsigned options);
+
+    API_EXPORT inline void *allocate(const void *oldval, size_t block_size, size_t alignment, MemoryClass memclass,
+                                     unsigned options, DType dtype)
+    {
+        PUSH_WARNING()
+        DISABLE_WARNING("-Wcast-qual", MSVC_NO_EQUIV)
+        void *tmp = const_cast<void *>(oldval);
+        POP_WARNING()
+        allocate_n(&tmp, 1, block_size, alignment, memclass, options, dtype);
+        return tmp;
+    }
+
+    API_EXPORT Mode get_mode() const { return mode; }
+    API_EXPORT virtual void set_mode(Mode new_mode);
+
+    API_EXPORT virtual void set_tcm_pool(void *base, size_t size);
+
+    API_EXPORT virtual void set_largest_memory_alloc_size(size_t size);
+
+    /*
+     * Serialize all the internal data for the allocator.
+     * Memory regions / pools, etc.
+     */
+    API_EXPORT virtual void serialize(Serializer &) const;
+    /*
+     * Deserialize the allocator, restore internal data from buffer.
+     */
+    API_EXPORT virtual void deserialize(HexagonNNEnv &env, Deserializer &dctx,
+                                        hexagon_nn_wide_address_const_t params_weights = 0U,
+                                        const size_t params_weights_length = 0,
+                                        hexagon_nn_wide_iovec_t const &weights = NULL_IOVEC);
+
+    API_EXPORT virtual int find_replaceable_mempool(unsigned const replaceable_pool_seq,
+                                                    fa::PoolDesc &found_pool) const;
+
+    // LCOV_EXCL_START [SAFTYSWCCB-1542]
+    API_EXPORT static inline constexpr size_t fixup_alignment(size_t align)
+    {
+        static_assert(MIN_ALIGN >= 8 && (MIN_ALIGN & (MIN_ALIGN - 1)) == 0, "bad MIN_ALIGN");
+        static_assert(MAX_ALIGN >= MIN_ALIGN && (MAX_ALIGN & (MAX_ALIGN - 1)) == 0, "bad MAX_ALIGN");
+        if (MIN_ALIGN < MAX_ALIGN) {
+            return std::max<size_t>(MIN_ALIGN, std::min<size_t>(MAX_ALIGN, align));
+        } else {
+            return MIN_ALIGN;
+        }
+    }
+    // LCOV_EXCL_STOP
+
+    API_EXPORT static inline constexpr size_t round_up_align(size_t n, size_t align)
+    {
+        return (n + (align - 1)) & ~(align - 1);
+    }
+    template <typename T> API_EXPORT static inline T *round_up_align(T *p, size_t align)
+    {
+        return (T *)round_up_align((size_t)p, align);
+    }
+
+  protected:
+    Mode mode = AllocVirtual;
+};
+
+//
+// this is a 'shim' class to help in making dummy allocators. It defines overrides
+// for all of the pure-virtual methods, so you don't need to define them yourself.
+//
+class FakeAllocator : public Allocator {
+  public:
+    API_EXPORT FakeAllocator(Allocator::Mode mode_in, Graph &graph_in) : Allocator(mode_in, graph_in){};
+    API_EXPORT virtual ~FakeAllocator();
+};
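`fixup_alignment` and `round_up_align` rely on the standard power-of-two trick: add `align - 1`, then mask off the low bits. A self-contained illustration of the same identity:

```cpp
#include <cassert>
#include <cstddef>

// Mirrors Allocator::round_up_align(): valid only for power-of-two 'align',
// which is why fixup_alignment() clamps requests into [MIN_ALIGN, MAX_ALIGN].
constexpr size_t round_up_align(size_t n, size_t align)
{
    return (n + (align - 1)) & ~(align - 1);
}

static_assert(round_up_align(1, 256) == 256);
static_assert(round_up_align(256, 256) == 256); // already aligned: unchanged
static_assert(round_up_align(257, 256) == 512);

int main()
{
    // With a non-power-of-two align (e.g. 3) the mask trick would break,
    // since ~(align - 1) is no longer a contiguous high-bit mask.
    assert(round_up_align(1000, 64) == 1024);
}
```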
+
+// this is an accessor which is used by the Dma 'Fill' operation
+// to get a source pointer for reading const, based on (pool_id, offset).
+// It also holds the base pointer for ddr spill area.
+// Maybe other things could be added later.
+class MemPoolRunTimeAccessor {
+    hexagon_nn_wide_address_t spill_area;
+    fa::PoolDesc const *pool_table; // pool_table[0] is for poolid=1
+    unsigned max_pool_id;
+
+  public:
+    API_EXPORT MemPoolRunTimeAccessor(hexagon_nn_wide_address_const_t spill_area_in, fa::PoolDesc const *const pt,
+                                      unsigned const pt_size)
+        : spill_area(spill_area_in), pool_table(pt), max_pool_id(pt_size)
+    {
+    }
+    API_EXPORT MemPoolRunTimeAccessor() : spill_area(0), pool_table(nullptr), max_pool_id(0) {}
+    API_EXPORT MemPoolRunTimeAccessor(MemPoolRunTimeAccessor const &) = default;
+    API_EXPORT MemPoolRunTimeAccessor &operator=(MemPoolRunTimeAccessor const &) = default;
+
+    // pool ids are >= 1, <= num_pools
+    API_EXPORT constexpr unsigned num_pools() const { return max_pool_id; } //LCOV_EXCL_LINE [SAFTYSWCCB-1542]
+    // map pool_id to base address of the data, for persistent pool; also get 'is_weights' flag.
+    // implementation in runtime_alloc.h
+    std::pair get_persistent_pool_base_iswts(unsigned pool_id) const;
+    API_EXPORT hexagon_nn_wide_address_t get_spill_area() const { return spill_area; }
+
+    // used to construct the ConstExtentDescriptor during prep
+    // implementation in fa_alloc.h
+    API_EXPORT fa::PoolDesc const *get_descriptor(unsigned pool_id) const;
+};
+
+} // namespace hnnx
+
+POP_VISIBILITY()
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/bake_defs.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/bake_defs.h
new file mode 100755
index 0000000000000..11d01bcb31b95
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/bake_defs.h
@@ -0,0 +1,244 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef BAKE_DEFS
+#define BAKE_DEFS 1
+#include
+#include
+#include
+#include
+
+#include "executable.h"
+
+// Contains defs for host-side and target side, so try not
+// to add too many 'host only' things.
+
+#ifdef __hexagon__
+#define HNNX_ARCH_CAN_RUN_BAKED 1
+#endif
+
+namespace hnnx {
+
+namespace bake {
+
+using tgt_ptr_word = unsigned;
+using tgt_sizet_word = unsigned;
+static constexpr unsigned tgt_ptr_bytes = sizeof(tgt_ptr_word);
+static constexpr unsigned tgt_sizet_bytes = sizeof(tgt_sizet_word);
+static constexpr bool op_has_graphp = false;
+static constexpr unsigned tensor_uptr_ptrs = 2;
+static constexpr unsigned max_opaquet_align = 1024; // must be power of 2
+
+// This should be OK as a first approx: includes hexagon and x86-32
+static constexpr bool host_can_run_baked = sizeof(void *) == tgt_ptr_bytes;
+
+inline unsigned constexpr round_up(unsigned x, unsigned m)
+{
+    return ((x + (m - 1)) / m) * m;
+}
+
+// functions to calculate size, align of various things. They
+// are included in target build so we can static_assert that sizes are what we think they are.
+// (all must be constexpr).
+
+// {size, alignment} of typical_op
+inline constexpr std::pair<unsigned, unsigned> typical_op_tgt_size_align(unsigned n_in, unsigned n_out)
+{
+    // 1 pointer per input, plus tensor_uptr_ptrs per output; but if n_in = n_out == 0, it's 1 pointer.
+    // (for a 'fill' byte).
+    unsigned num_io_ptrs = n_in + n_out * tensor_uptr_ptrs;
+    if (num_io_ptrs == 0) num_io_ptrs = 1; // n_in = n_out = 0 case
+    return {tgt_ptr_bytes * ((op_has_graphp ? 
2 : 1) // vptr, and maybe Graph * + + num_io_ptrs), // inputs and outputs + tgt_ptr_bytes}; // align +} + +// 'tensor_op_tgt_size_align is used for crate accounting of ShapeWrapperOp, ConstWrapperOp, DummyOp +// In a proper 'baked graph' we don't need to insert these, just the tensors... + +inline constexpr std::pair tensor_op_tgt_size_align(unsigned n_out) +{ + // happens to be the same as TypicalOp with no inputs... + return typical_op_tgt_size_align(0, n_out); +} + +// {size, alignment, extra} of typical_op_with_compiler +// extra_len is the len of the extra data +// extra_align is its alignment. +// The 3rd return value is the offset of the 'extra' within the image. +// +inline constexpr std::tuple +typical_op_extra_tgt_size_align(unsigned n_in, unsigned n_out, unsigned extra_len, unsigned extra_align) +{ + std::pair base_size = typical_op_tgt_size_align(n_in, n_out); + unsigned extra_offs = base_size.first; + if (extra_len > 0) { + extra_align = std::max(extra_align, base_size.second); + extra_len = round_up(extra_len, extra_align); + extra_offs = round_up(extra_offs, extra_align); + base_size.first = extra_offs + extra_len; + base_size.second = extra_align; + } + return {base_size.first, base_size.second, extra_offs}; +} + +// {size, alignment} of variadic op (without the in, out array contents)! +constexpr std::pair variadic_op_tgt_size_align(unsigned n_in, unsigned n_out) +{ + const unsigned cratevec_words = 2; + return {tgt_ptr_bytes * (1 // vptr + + (op_has_graphp ? 1 : 0) // Graph * + + 2 * cratevec_words), // two cratevecs + tgt_ptr_bytes}; // align +} +// {size, alignment} of simple_op_wrapper (without the in, out array contents)! +constexpr std::pair simplewrap_op_tgt_size_align(unsigned n_in, unsigned n_out) +{ + // this is just one more pointer than a variadic op... + const auto var_result = variadic_op_tgt_size_align(n_in, n_out); + return {var_result.first + tgt_ptr_bytes, var_result.second}; +} + +// {size, alignment} of a ChunkPreloadOp +constexpr std::pair chunk_preload_op_tgt_size_align() +{ + return {tgt_ptr_bytes * (1 // vptr + + (op_has_graphp ? 1 : 0) // Graph * + + 2), // ptr, len; + tgt_ptr_bytes}; // align +} + +// +// {size_align} of Shape object +// +constexpr std::pair shape_tgt_size_align(unsigned rank) +{ + // tgt_sizet_bytes * (1 + 1 + 2 * rank) = + // vtable ptr + // shapeflag flags + padding[] + // std::array dims + // std::array max_dims + // + rank = std::array pad + return {round_up(tgt_sizet_bytes * (1 + 1 + 1 + 2 * rank) + rank, tgt_sizet_bytes), tgt_sizet_bytes}; +} + +// +// {size_align} of DynamicShape object +// +constexpr std::pair dynamic_shape_tgt_size_align(const unsigned rank) +{ + // std::array dims == tgt_sizet_bytes * rank + // (shapeflag flags + padding[]) + vtable ptr + dynamic_state = (3 * tgt_sizet_bytes) + return {round_up(tgt_sizet_bytes * rank + (4 * tgt_sizet_bytes), tgt_sizet_bytes), tgt_sizet_bytes}; +} + +// +// {size_align} of interface object (may or may not be quantized) +// +constexpr std::pair interface_tgt_size_align(bool is_quantized) +{ + return {tgt_sizet_bytes + (is_quantized ? round_up(3 * 4, tgt_sizet_bytes) : 0), tgt_sizet_bytes}; +} + +// {size_align} of Tensors, of three different forms: +// +// 'general' tensor +// +constexpr std::pair tensor_general_tgt_size_align() +{ + return {tgt_sizet_bytes * 4 + 2 * tgt_ptr_bytes, tgt_sizet_bytes}; +} + +// 'shape' tensor, of given rank. +// +constexpr std::pair tensor_shape_tgt_size_align(unsigned rank) +{ + return {tgt_sizet_bytes * ((rank == 0 ? 
1 : rank) + 1), tgt_sizet_bytes};
+}
+
+// 'scalar' tensor, need to know if the interface is 'quantized' or not
+// Note, this assumes all values are <= size_t bytes.
+//
+constexpr std::pair<unsigned, unsigned> tensor_scalar_tgt_size_align(bool is_quantized)
+{
+    const unsigned ifc_size = interface_tgt_size_align(is_quantized).first;
+    return {tgt_sizet_bytes * 2 + ifc_size, tgt_sizet_bytes};
+}
+// sizeof OpExtraInfo on target: {long long, 2 * unsigned, char *, 4 * padbyte}
+constexpr std::pair<unsigned, unsigned> OpExtraInfo_size_align = {24, 8};
+
+// The size of a SliceDispatchOp for the given number of slices.
+// Currently it's always the same regardless of 'nslices'; we may introduce a 'right-sized'
+// variant, in which case 'exact=true' will get the 'real' size; but exact = false will always
+// give the full size.
+constexpr std::pair<unsigned, unsigned> slice_dispatch_op_size_align(unsigned const nslices, bool const exact = false)
+{
+    return {tgt_sizet_bytes * ((op_has_graphp ? 5 : 4) + 3 * Executable::MAX_OP_SLICES), tgt_sizet_bytes};
+}
+
+// The size of a Predicated Op
+constexpr std::pair<unsigned, unsigned> pred_op_size_align()
+{
+    return {tgt_sizet_bytes * ((op_has_graphp ? 5 : 4) + 3), tgt_sizet_bytes};
+}
+
+// this is used in e.g.
+//   if constexpr(host_can_run_baked) static_assert(size_align_matches<T>(N_IN, N_OUT));
+
+template <typename T, typename SZAL> constexpr bool size_align_matches(SZAL sz)
+{
+    return sizeof(T) == std::get<0>(sz) && alignof(T) == std::get<1>(sz);
+}
+
+// This is a utility to check that a type T has a given size and alignment, using static_assert;
+// Just need to include a call to 'do-nothing' bake::check_size_align<T>::template check<...>();
+// The static assert is *disabled* unless compiling on hexagon (or compatible host).
+//
+// It's more complex than it needs to be, since it's designed to make sure the type and
+// numbers wind up in the error message, e.g. you could end up with
+//   error: static_assert failed due to requirement 'claimed(40) == actual(48)' "size not as claimed"
+//        static_assert(claimed(CLAIMED_SIZE) == actual(ACTUAL_SIZE), "size not as claimed");
+//   ... note: in instantiation of function template specialization 'check_szal::check_size_align<..., ...>'
+//
+template <typename T> struct check_size_align {
+    static constexpr int claimed(int K) { return K; }
+    static constexpr int actual(int K) { return K; }
+    template <int CLAIMED_SIZE, int ACTUAL_SIZE> static constexpr bool check_size()
+    {
+        static_assert(claimed(CLAIMED_SIZE) == actual(ACTUAL_SIZE), "size not as claimed");
+        return CLAIMED_SIZE == ACTUAL_SIZE;
+    }
+    template <int CLAIMED_ALIGN, int ACTUAL_ALIGN> static constexpr bool check_align()
+    {
+        static_assert(claimed(CLAIMED_ALIGN) == actual(ACTUAL_ALIGN), "align not as claimed");
+        return CLAIMED_ALIGN == ACTUAL_ALIGN;
+    }
+
+    template <int CLAIMED_SIZE, int ACTUAL_SIZE, int CLAIMED_ALIGN, int ACTUAL_ALIGN> static constexpr bool check()
+    {
+        bool result = true;
+        if constexpr (host_can_run_baked) {
+            result = check_size<CLAIMED_SIZE, ACTUAL_SIZE>() && check_align<CLAIMED_ALIGN, ACTUAL_ALIGN>();
+        }
+        return result;
+    }
+};
+
+} // namespace bake
+
+//
+// op_opaque_tgt_info must be specialized for each OpaqueT used in TypicalOpWithCompiler
+//
+template <typename OpaqueT> struct op_opaque_tgt_info {
+    // static constexpr unsigned length = ..;    // length of the struct on target CPU
+    // static constexpr unsigned alignment = ..; // alignment on target CPU
+};
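As a sanity check of the accounting above, the arithmetic can be reproduced standalone with the constants defined at the top of bake_defs.h (32-bit target words, two pointers per tensor unique_ptr, `op_has_graphp == false`):

```cpp
#include <cstdio>
#include <utility>

// Reproduces the typical-op crate accounting with the stated constants.
constexpr unsigned tgt_ptr_bytes = 4;
constexpr unsigned tensor_uptr_ptrs = 2;

constexpr std::pair<unsigned, unsigned> typical_op_size_align(unsigned n_in, unsigned n_out)
{
    unsigned num_io_ptrs = n_in + n_out * tensor_uptr_ptrs;
    if (num_io_ptrs == 0) num_io_ptrs = 1; // degenerate 0-in/0-out op keeps a fill byte
    return {tgt_ptr_bytes * (1 /*vptr*/ + num_io_ptrs), tgt_ptr_bytes};
}

// A 2-input, 1-output op: vptr + 2 input ptrs + 2 ptrs for the output's
// unique_ptr -> 5 words -> 20 bytes, aligned to a 4-byte word.
static_assert(typical_op_size_align(2, 1).first == 20);
static_assert(typical_op_size_align(2, 1).second == 4);

int main() { std::printf("%u bytes\n", typical_op_size_align(2, 1).first); }
```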
+
+} // namespace hnnx
+
+#endif // BAKE_DEFS
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/builtin_intrinsics.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/builtin_intrinsics.h
new file mode 100755
index 0000000000000..3496b792f25aa
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/builtin_intrinsics.h
@@ -0,0 +1,247 @@
+//==============================================================================
+//
+// Copyright (c) 2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+// Compiler builtin intrinsic functions should be specified in this file
+
+#ifndef BUILTIN_INTRINSICS_H_
+#define BUILTIN_INTRINSICS_H_
+
+#include
+#include
+#include
+#include
+
+// Branch prediction
+#if defined(__clang__)
+
+#define HEX_LIKELY(x) __builtin_expect(!!(x), 1)
+#define HEX_UNLIKELY(x) __builtin_expect(!!(x), 0)
+
+#define HEX_ASSUME __builtin_assume
+#define HEX_UNREACHABLE __builtin_unreachable
+
+#elif defined(_MSC_VER)
+
+#define HEX_LIKELY(x) (x)
+#define HEX_UNLIKELY(x) (x)
+
+#define HEX_ASSUME __assume
+#define HEX_UNREACHABLE() __assume(0)
+
+#elif defined(__GNUC__)
+// No equivalent of __builtin_assume in GNUC, hence leaving it empty.
+#define HEX_ASSUME(cond)
+
+#define HEX_LIKELY(x) __builtin_expect(!!(x), 1)
+#define HEX_UNLIKELY(x) __builtin_expect(!!(x), 0)
+#define HEX_UNREACHABLE __builtin_unreachable
+
+#endif // defined(__clang__)
+
+// Overflow detection
+#if defined(__clang__) || defined(__GNUC__)
+
+#define HEX_ADD_OVERFLOW __builtin_add_overflow
+#define HEX_MUL_OVERFLOW __builtin_mul_overflow
+
+#elif defined(_MSC_VER)
+
+#include
+
+template <typename _T> static inline bool HEX_ADD_OVERFLOW(_T a, _T b, _T *out)
+{
+    *out = a + b;
+    return ((b > 0) && (a > std::numeric_limits<_T>::max() - b)) ||
+           ((b < 0) && (a < std::numeric_limits<_T>::min() - b));
+}
+
+template <typename _T> static inline bool HEX_MUL_OVERFLOW(_T a, _T b, _T *out)
+{
+    *out = a * b;
+    return ((b > 0) && (a > std::numeric_limits<_T>::max() / b || a < std::numeric_limits<_T>::min() / b)) ||
+           ((b < 0) && (a > std::numeric_limits<_T>::min() / b || a < std::numeric_limits<_T>::max() / b));
+}
+
+#endif // __clang__
+
+// Count bits
+
+#include
+
+template <typename _T> static inline int HEX_COUNT_ONE_BIT(_T x)
+{
+    return std::bitset<sizeof(_T) * 8>(x).count();
+}
+
+#define HEX_COUNT_ONE_BIT_ULL HEX_COUNT_ONE_BIT
+#define HEX_COUNT_ONE_BIT_UL HEX_COUNT_ONE_BIT
+
+#if defined(__clang__) || defined(__GNUC__)
+
+#define HEX_COUNT_LEADING_ZERO __builtin_clz
+#define HEX_COUNT_LEADING_ZERO_UL __builtin_clzl
+#define HEX_COUNT_LEADING_ZERO_ULL __builtin_clzll
+
+#define HEX_COUNT_TRAILING_ZERO __builtin_ctz
+#define HEX_COUNT_TRAILING_ZERO_UL __builtin_ctzl
+#define HEX_COUNT_TRAILING_ZERO_ULL __builtin_ctzll
+
+#elif defined(_MSC_VER)
+
+#include
+
+// Returns the number of leading 0-bits in x, starting at the most significant
+// bit position. If x is 0, the result is undefined.
+static inline int HEX_COUNT_LEADING_ZERO_ULL(unsigned long long x)
+{
+    unsigned long where;
+    if (_BitScanReverse64(&where, x)) return static_cast<int>(63 - where);
+    return 64; // Undefined behavior
+}
+
+static inline int HEX_COUNT_LEADING_ZERO(unsigned int x)
+{
+    unsigned long where;
+    if (_BitScanReverse(&where, x)) return static_cast<int>(31 - where);
+    return 32; // Undefined Behavior.
+} + +static inline int HEX_COUNT_LEADING_ZERO_UL(unsigned long x) +{ + return sizeof(x) == 8 ? HEX_COUNT_LEADING_ZERO_ULL(x) : HEX_COUNT_LEADING_ZERO(static_cast(x)); +} + +// Returns the number of trailing 0-bits in x, starting at the least significant +// bit position. If x is 0, the result is undefined. +static inline int HEX_COUNT_TRAILING_ZERO_ULL(unsigned long long x) +{ + unsigned long where; + if (_BitScanForward64(&where, x)) return static_cast(where); + return 64; // Undefined Behavior. +} + +static inline int HEX_COUNT_TRAILING_ZERO(unsigned int x) +{ + unsigned long where; + if (_BitScanForward(&where, x)) return static_cast(where); + return 32; // Undefined Behavior. +} + +static inline int HEX_COUNT_TRAILING_ZERO_UL(unsigned long x) +{ + return sizeof(x) == 8 ? HEX_COUNT_TRAILING_ZERO_ULL(x) : HEX_COUNT_TRAILING_ZERO(static_cast(x)); +} + +#endif // defined(__clang__) + +// Atomic operation + +#if defined(__clang__) || defined(__GNUC__) + +#define HEX_ATOMIC_FETCH_AND_ADD __sync_fetch_and_add + +#define HEX_ATOMIC_FETCH_AND_AND __sync_fetch_and_and +#define HEX_ATOMIC_FETCH_AND_OR __sync_fetch_and_or + +#define HEX_ATOMIC_VAL_COMPARE_AND_SWAP __sync_val_compare_and_swap +#define HEX_ATOMIC_BOOL_COMPARE_AND_SWAP __sync_bool_compare_and_swap + +#elif defined(_MSC_VER) + +#include + +#define HEX_ATOMIC_FETCH_AND_ADD(_p, _v) \ + (sizeof *(_p) == sizeof(__int64) ? _InterlockedExchangeAdd64((__int64 *)(_p), (__int64)(_v)) \ + : _InterlockedExchangeAdd((long *)(_p), (long)(_v))) + +template static inline _T HEX_ATOMIC_FETCH_AND_AND(_T volatile *_p, _T _v) +{ + _InterlockedAnd((long *)_p, (long)_v); + return static_cast<_T>(*_p); +} + +template static inline _T HEX_ATOMIC_FETCH_AND_OR(_T volatile *_p, _T _v) +{ + _InterlockedOr((long *)_p, (long)_v); + return static_cast<_T>(*_p); +} + +#define HEX_ATOMIC_VAL_COMPARE_AND_SWAP(_p, _old, _new) \ + (sizeof *(_p) == sizeof(__int64) \ + ? _InterlockedCompareExchange64((__int64 *)(_p), (__int64)(_new), (__int64)(_old)) \ + : _InterlockedCompareExchange((long *)(_p), (long)(_new), (long)(_old))) + +#define HEX_ATOMIC_BOOL_COMPARE_AND_SWAP(_p, _old, _new) (HEX_ATOMIC_VAL_COMPARE_AND_SWAP(_p, _old, _new) == (_old)) + +#endif // defined(__clang__) + +namespace hnnx { + +/** + * @brief promote_shift_operand reflects the integral promotions for small integer types. + * safe_lshift/safe_rshift must be aware of these promotions, since the C++ standard only + * defines the behavior for shift operations where the RHS is between 0 and + * 1 less than the bit-width of the *promoted* type of the LHS. + */ +template struct promote_shift_operand { + typedef T type; +}; + +template <> struct promote_shift_operand { + using type = int; +}; +template <> struct promote_shift_operand { + using type = int; +}; +template <> struct promote_shift_operand { + using type = int; +}; +template <> struct promote_shift_operand { + using type = int; +}; +template <> struct promote_shift_operand { + using type = int; +}; + +template using promote_shift_operand_t = typename promote_shift_operand::type; + +// The following portable template functions are replacements for the +// built-in shift operations, << and >>, that provide the following guarantees: +// +// 1. Both the left and right operands of the shift will be treated as unsigned. +// This, by construction, prevents any undefined or implementation-defined +// behavior that may arise when shifting negative-valued expressions. +// 2. 
The right operand will be bit-masked in a way that guarantees +// that its value is in the range [0, bitwidth(promoted_left_operand) - 1] + +template constexpr unsigned get_safe_shift_mask() +{ + return unsigned(CHAR_BIT * sizeof(promote_shift_operand_t>>) - 1); +} + +template ()> +constexpr auto safe_lshift(T const value, S const shift_amount) +{ + static_assert(std::is_integral::value && std::is_integral::value, + "safe_lshift only makes sense for integral parameters"); + assert((static_cast(shift_amount) & ~mask) == 0 && "shift_amount is out of range"); + return value << shift_amount; +} + +template ()> +constexpr auto safe_rshift(T const value, S const shift_amount) +{ + static_assert(std::is_integral::value && std::is_integral::value, + "safe_rshift only makes sense for integral parameters"); + assert((static_cast(shift_amount) & ~mask) == 0 && "shift_amount is out of range"); + return value >> shift_amount; +} + +} // namespace hnnx + +#endif /* BUILTIN_INTRINSICS_H_ */ diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/c_tricks.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/c_tricks.h new file mode 100755 index 0000000000000..0531625039312 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/c_tricks.h @@ -0,0 +1,21 @@ +//============================================================================== +// +// Copyright (c) 2020 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef C_TRICKS_H +#define C_TRICKS_H 1 + +#define CTRICKS_PASTER2(A, B) A##B +#define CTRICKS_PASTER(A, B) CTRICKS_PASTER2(A, B) + +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) + +#define PROBABLY(x) __builtin_expect(!(!(x)), 1) +#define YEAHRIGHT(x) __builtin_expect(!(!(x)), 1) + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cc_pp.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cc_pp.h new file mode 100755 index 0000000000000..c4363d8cb3e6f --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cc_pp.h @@ -0,0 +1,26 @@ +//============================================================================== +// +// Copyright (c) 2020 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef CC_PP_H +#define CC_PP_H 1 + +/* + * C++ Preprocessor Definitions + */ + +#ifdef __cplusplus +#define EXTERN_C_BEGIN extern "C" { +#define EXTERN_C_END \ + } \ + ; +#else +#define EXTERN_C_BEGIN /* NOTHING */ +#define EXTERN_C_END /* NOTHING */ +#endif + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/check_hvx.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/check_hvx.h new file mode 100755 index 0000000000000..bd12354b0a314 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/check_hvx.h @@ -0,0 +1,35 @@ +//============================================================================== +// +// Copyright (c) 2022-2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. 
+// +//============================================================================== + +#include "cc_pp.h" +#include "macros_attribute.h" +#include "weak_linkage.h" + +#ifndef CHECK_HVX_H +#define CHECK_HVX_H 1 + +// +// This makes sure that we have an HVX context (or not). Does nothing on H2 or +// QuRT, but on x86, makes use of a TLS variable to do the check. +// + +#ifdef __hexagon__ + +static inline void check_hvx() {} +static inline void check_not_hvx() {} + +#else + +PUSH_VISIBILITY(default) +API_EXPORT void check_hvx(); +API_EXPORT void check_not_hvx(); +POP_VISIBILITY() + +#endif + +#endif // CHECK_HVX_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_descriptor.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_descriptor.h new file mode 100755 index 0000000000000..a7f50569eb471 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_descriptor.h @@ -0,0 +1,207 @@ +//============================================================================== +// +// Copyright (c) 2023 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef CONST_EXTENT_DESCRIPTOR_H +#define CONST_EXTENT_DESCRIPTOR_H 1 + +#include +#include +#include +#include +#include "forward_classes.h" +#include "serialize_defs.h" +#include "pickle_header_tags.h" +#include "const_extent_shared.h" + +namespace hnnx { + +// This class is used, on both encoder and decoder, to contain a 'const extent descriptor' in its raw form, (just an array of uint32) +// and provide higher-level access to the contents. + +class ConstExtentDesc { + protected: + using table_t = std::vector; + // The 'table' may or may not contain the 'padding' section at the end; this is not accessed, + // and the serialize method will always generate the required padding. + table_t table; + // some values broken out from the header... + unsigned extab_n = 0, extab_idx = 0; // number of extents, and word index where they start + unsigned mptab_n = 0, mptab_idx = 0; // number of memory pools, and word index where they start. + unsigned desc_len = 0; // length of the entire descriptor in bytes (0 if invalid descriptor) + + bool scan_table(); // sanity check, and unpacks the above; returns true if OK. + + public: + static uint8_t constexpr EXTENT_FLAGS_BITFIELD_LSB = 8; + static uint8_t constexpr EXTENT_FLAGS_BITFIELD_WIDTH = 8; + + /// + /// @brief Values for 8b flags in extent record + /// + static uint8_t constexpr EXTENT_FLAG_RESERVED_0 = (1 << 0); + static uint8_t constexpr EXTENT_FLAG_RESERVED_1 = (1 << 1); + static uint8_t constexpr EXTENT_FLAG_RESERVED_2 = (1 << 2); + static uint8_t constexpr EXTENT_FLAG_RESERVED_3 = (1 << 3); + static uint8_t constexpr EXTENT_FLAG_IS_FAR_HINT = (1 << 4); ///< Contents maybe far + static uint8_t constexpr EXTENT_FLAG_RESERVED_5 = (1 << 5); + static uint8_t constexpr EXTENT_FLAG_RESERVED_6 = (1 << 6); + static uint8_t constexpr EXTENT_FLAG_RESERVED_7 = (1 << 7); + + // Return from 'extent_info'. + struct extab_entry { + uint32_t extent_flags; + uint32_t align; // a power of 2, >= 64 + uint64_t offset; // offset, in bytes, from the start of the descriptor, to where the data is. + uint64_t length; // length of the data in bytes. + }; + // Return from 'mempool_info'. 
+    // Note: if 'adjust_offset' is true, the 'offset' field from the containing extent will be added to offset,
+    // so that the offset is from the start of the descriptor, instead of the start of the containing extent.
+    struct mempool_entry {
+        uint32_t mempool_id; // a mempool id >=2 indicating a const mempool
+        uint32_t extent_id; // an extent_id, >=1
+        uint64_t offset; // offset in bytes of the data from the start of the extent (see note above)
+        uint64_t length; // length in bytes of the data
+    };
+    // optional name of the const_extent this descriptor corresponds to. Used for matching in weight_sharing.
+    std::string name = std::string{};
+
+    ConstExtentDesc() {}
+    ConstExtentDesc(table_t &&table_in);
+    void serialize(Serializer &) const;
+    inline bool load_table(table_t &&table_in)
+    {
+        table = std::move(table_in);
+        return scan_table();
+    }
+
+    constexpr bool is_valid() const { return desc_len != 0; }
+
+    constexpr unsigned descriptor_length() const { return desc_len; }
+
+    constexpr unsigned num_extents() const { return extab_n; }
+    constexpr unsigned num_mempools() const { return mptab_n; }
+
+    // unpack a row of the extent table
+    // NOTE: extent_id is 1-based, must be 1 .. num_extents()
+    extab_entry extent_info(unsigned extent_id) const;
+
+    // unpack a row of the mempool table.
+    // note: idx is not a mempool idx, it is a 1-based row in range 1...num_mempools();
+    // if adjust_offset, the offset of the containing extent is added to the offset
+    // of the mempool in the returned value.
+    mempool_entry mempool_info(unsigned idx, bool adjust_offset = false) const;
+
+    // The ordering of the data and the descriptors is such that:
+    //
+    //  (1) extent_info(1).offset >= descriptor_length()
+    //      mempool_info(1,true).offset >= descriptor_length()
+    //  (2) for i >= 2,
+    //      extent_info(i).offset >= extent_info(i-1).offset + extent_info(i-1).length
+    //      mempool_info(i,true).offset >= mempool_info(i-1,true).offset + mempool_info(i-1).length
+    //
+
+#if !defined(PREPARE_DISABLED)
+    ///
+    /// @brief Memory pool record iterator
+    /// @details Use to iterate over records in the memory pool table in a constant
+    /// extent descriptor
+    ///
+    class mempool_iterator {
+      public:
+        using iterator_category = std::input_iterator_tag;
+        using value_type = ConstExtentDesc::mempool_entry;
+        using difference_type = std::ptrdiff_t;
+        using pointer = value_type *;
+        using reference = value_type &;
+
+        ///
+        /// @brief Constructor
+        /// @param [in] cedesc A valid constant extent descriptor instance
+        /// @param [in] index Record index (zero-based!)
+        ///
+        explicit mempool_iterator(ConstExtentDesc const &cedesc, uint32_t index) : _cedesc(cedesc), _index(index) {}
+
+        ///
+        /// @brief Increment record
+        /// @return Iterator
+        ///
+        mempool_iterator &operator++()
+        {
+            // Increment IFF valid constant extent descriptor and mempool record
+            // index within range
+            _index += (_cedesc.is_valid() && (_index < _cedesc.mptab_n)) ? 
1 : 0; + return *this; + } + + /// + /// @brief Equality operator + /// @return true if iterators are equal + /// + bool operator==(mempool_iterator const &other) const { return _index == other._index; } + + /// + /// @brief Inequality operator + /// @return true if iterators are not equal + /// + bool operator!=(mempool_iterator const &other) const { return !(*this == other); } + + /// + /// @brief Dereference iterator + /// + reference operator*(); + + private: + /// + /// @brief Reference to a constant extent descriptor instance + /// @details It contains the blob representing constant extent segment + /// + ConstExtentDesc const &_cedesc; + + /// + /// @brief Current index + /// + uint32_t _index; + + /// + /// @brief Mempool record entry + /// @details It is assigned when on iterator dereference + /// + value_type _entry; + }; + + /// + /// @brief Return mempool iterator initialized to the first record + /// @return Mempool iterator + /// + mempool_iterator begin() { return mempool_iterator(*this, 0); } + + /// + /// @brief Return mempool iterator beyond the last record + /// @warning Intended to be used as a sentinel + /// @return Mempool iterator + /// + mempool_iterator end() { return mempool_iterator(*this, mptab_n); } +#endif +}; +#ifndef PREPARE_DISABLED +// Called at the end of serializing a graph, if 'const extent' mode is enabled. +// See comment in const_extent_descriptor.cc for full details. +// LCOV_EXCL_START [SAFTYSWCCB-1542] +size_t write_aligned_const_info(Graph const &gr, Serializer &sctx, unsigned buried_aux_n_words = 0); +#else +inline constexpr size_t write_aligned_const_info(Graph const &gr, Serializer const &sctx, unsigned = 0) +{ + return 0; +} +// LCOV_EXCL_STOP +#endif + +} // namespace hnnx + +#endif // CONST_EXTENT_DESCRIPTOR_H diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_shared.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_shared.h new file mode 100755 index 0000000000000..39c95e26ed561 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/const_extent_shared.h @@ -0,0 +1,81 @@ +//============================================================================== +// +// Copyright (c) 2024 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef CONST_EXTENT_SHARED_H_ +#define CONST_EXTENT_SHARED_H_ + +namespace hnnx { +// definitions pertaining to the 'const extent descriptor'. + +constexpr unsigned CONST_EXTENT_DESC_MAGIC = 0x71c43c9b; +// if a const extent descriptor has a 'cbname' in it, the last 32-bit slot +// is this value. The 0x3e, 0x00 is the ">\0" at the end of the cbname +constexpr unsigned CONST_EXTENT_CBNAME_TAG = 0xebbe003e; + +// This must be a power of 2, and >= 64. +// This is effectively a 'quiet' minimum on options.serialize_const_alignment, which sets +// the actual alignment. +// It is not necessary for the decoder to know what value of alignment was used in the encoder. 
+constexpr unsigned CONST_EXTENT_MIN_ALIGN = 256; +// +// this is a (non-quiet) maximum on options.serialize_const_alignment +constexpr unsigned CONST_EXTENT_MAX_ALIGN = 1024 * 1024; + +/// +/// @brief Size of const extent descriptor header +/// +constexpr unsigned CONST_EXTENT_HEADER_SIZE_WORDS = 4u; +constexpr unsigned CONST_EXTENT_HEADER_SIZE_BYTES = CONST_EXTENT_HEADER_SIZE_WORDS * 4u; + +/// +/// @brief Size of an extent record +/// @details Const extent descriptor contains a table of such records +/// +constexpr unsigned CONST_EXTENT_RECORD_SIZE_WORDS = 4u; +constexpr unsigned CONST_EXTENT_RECORD_SIZE_BYTES = CONST_EXTENT_RECORD_SIZE_WORDS * 4u; + +/// +/// @brief Offset of extent record table relative to const extent descriptor +/// @details Both byte and words offsets are listed +/// +constexpr unsigned CONST_EXTENT_RECORD_TAB_OFFSET_WORDS = 4u; +constexpr unsigned CONST_EXTENT_RECORD_TAB_OFFSET_BYTES = CONST_EXTENT_RECORD_TAB_OFFSET_WORDS * 4u; + +/// +/// @brief Size of mempool record in a const extent descriptor +/// @details Both byte and word sizes are provided +/// +constexpr unsigned CONST_EXTENT_MEMPOOL_RECORD_SIZE_WORDS = 4u; +constexpr unsigned CONST_EXTENT_MEMPOOL_RECORD_SIZE_BYTES = CONST_EXTENT_MEMPOOL_RECORD_SIZE_WORDS * 4u; + +// This function is used by deserializer to help it extract the extent-desc table (as a vector) from some +// arbitrary point down the pickle. Parameter is a pointer to the first 4 words; the return value is +// 0 if the first two words do not look like CEDesc header; +// n otherwise (where 'n' is the number of 32-bit words to extract). +// +inline unsigned const_extent_hdr_check(uint32_t const *const hdrp) +{ + if (hdrp[0] != CONST_EXTENT_DESC_MAGIC) return 0; + const unsigned word0 = hdrp[1]; + const unsigned hdr_len16 = word0 >> 24u; // units of 16 bytes + const unsigned desc_len64 = word0 & 0xFFFFFFu; // units of 64 bytes + const unsigned n_extent = hdrp[2] & 0xFFFFFFu; + const unsigned n_mempool = hdrp[3] & 0xFFFFFFu; + // no. of words actually needed + const unsigned desc_words = 4 * (hdr_len16 + n_extent + n_mempool); + + // note, n_extent == n_mempool == 0 is allowed. + if (hdr_len16 == 0 || desc_len64 == 0 || n_extent > n_mempool || desc_words > desc_len64 * 16) { + return -1; + } + return desc_words; +} + +} // namespace hnnx + +#endif diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/constraints.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/constraints.h new file mode 100755 index 0000000000000..b30f7b8f5c871 --- /dev/null +++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/constraints.h @@ -0,0 +1,121 @@ +//============================================================================== +// +// Copyright (c) 2020 Qualcomm Technologies, Inc. +// All Rights Reserved. +// Confidential and Proprietary - Qualcomm Technologies, Inc. +// +//============================================================================== + +#ifndef CONSTRAINTS_H +#define CONSTRAINTS_H + +#include "interface_defs.h" +#include "op_def.h" + +#include +#include + +namespace constraint_lib { + +/** \defgroup OptConstraint Constraint Expressions for Optimization Rules + * \ingroup OptimizationFuncs + * + * @{ + */ +//! Find the chunksize of a given tensor type in a given dimension (a constant). +/// For instance, LAYOUT_CHUNKSIZE(QUint8CroutonTensor,3) gives size_t(32) +/// +#define LAYOUT_CHUNKSIZE(TYPENAME, IDX) (TYPENAME::layout.ChunkSizes[(IDX)]) + +// some convenience wrappers... + +//! 
IS_FLOAT16("operand") -> bool (true if operand has Float16 output) +#define IS_FLOAT16(X) EQ(DTYPE_OF(X), DType::Float16) + +//! IS_FLOAT32("operand") -> bool (true if operand has float output) +#define IS_FLOAT32(X) EQ(DTYPE_OF(X), DType::Float32) + +//! IS_FLOAT("operand") -> bool (alias of IS_FLOAT32) +#define IS_FLOAT(X) IS_FLOAT32(X) + +//! IS_QUINT8("operand") -> bool (true if operand has 'QUInt8' output) +#define IS_QUINT8(X) EQ(DTYPE_OF(X), DType::QUInt8) + +//! IS_QINT8("operand") -> bool (true if operand has 'QInt8' output) +#define IS_QINT8(X) EQ(DTYPE_OF(X), DType::QInt8) + +//! IS_QINT16("operand") -> bool (true if operand has 'QInt16' output) +#define IS_QINT16(X) EQ(DTYPE_OF(X), DType::QInt16) + +//! IS_QUINT16("operand") -> bool (true if operand has 'QUInt16' output) +#define IS_QUINT16(X) EQ(DTYPE_OF(X), DType::QUInt16) + +//! IS_QINT32("operand") -> bool (true if operand has 'QInt32' output) +#define IS_QINT32(X) EQ(DTYPE_OF(X), DType::QInt32) +//! IS_INT32("operand") -> bool (true if operand has 'Int32' output) +#define IS_INT32(X) EQ(DTYPE_OF(X), DType::Int32) + +//! IS_INT64("operand") -> bool (true if operand has 'Int64' output) +#define IS_INT64(X) EQ(DTYPE_OF(X), DType::Int64) + +//! IS_QUANT_TYPE("operand") -> bool (true if operand has 'Quantized' output) +#define IS_QUANT_TYPE(X) OR(IS_QUINT8(X), IS_QINT8(X), IS_QINT16(X), IS_QUINT16(X), IS_QINT32(X)) +//! IS_QUANT_SIGNED("operand") -> bool (true if operand has 'Signed Quantized' output) +#define IS_QUANT_SIGNED(X) OR(IS_QINT32(X), IS_QINT16(X), IS_QINT8(X)) +//! IS_SIGNED_SYMM("operand") -> bool (true if operand has 'Signed Quantized' output with offset == 0) +#define IS_SIGNED_SYMM(X) AND(IS_QUANT_SIGNED(X), EQ(ZERO_OFFSET_OF(X), 0)) + +// The problem with IS_SIGNED_SYMM is that it tends to get used as +// AND( IS_QINT8(X), IS_SIGNED_SYMM(X)) +// which expands to X.dtype==qint8 && ( (X.dtype ==qint32 || X.dtype == .. ) && X.zero_offs == 0) +// So, use IS_QINT8_SYMM(X) etc instead. + +//! IS_QINT8_SYMM("operand") -> bool (true if operand has QINT8 output with offset == 0) +#define IS_QINT8_SYMM(X) AND(IS_QINT8(X), EQ(ZERO_OFFSET_OF(X), 0)) +//! IS_QINT16_SYMM("operand") -> bool (true if operand has QINT16 output with offset == 0) +#define IS_QINT16_SYMM(X) AND(IS_QINT16(X), EQ(ZERO_OFFSET_OF(X), 0)) +//! IS_QINT32_SYMM("operand") -> bool (true if operand has QINT32 output with offset == 0) +#define IS_QINT32_SYMM(X) AND(IS_QINT32(X), EQ(ZERO_OFFSET_OF(X), 0)) + +//! IS_FULLY_CONNECT_WEIGHT("operand") -> bool (true if operand is QUInt8 or (QInt8 and symmetrically quantized)) +#define IS_FULLY_CONNECT_WEIGHT(X) OR(IS_QUINT8(X), IS_QINT8_SYMM(X)) + +//! IS_FLOAT16_BOTH("operand", "operand") -> bool (true if both operands are FP16 type) +#define IS_FLOAT16_BOTH(X, Y) AND(IS_FLOAT16(X), IS_FLOAT16(Y)) +//! IS_FLOAT16_ALL("operand", ...) -> bool (true if all operands are FP16 type) +#define IS_FLOAT16_ALL(...) IS_DTYPE_ALL(DType::Float16, __VA_ARGS__) +//! IS_FLOAT32_ALL("operand", ...) -> bool (true if all operands are FP32 type) +#define IS_FLOAT32_ALL(...) IS_DTYPE_ALL(DType::Float32, __VA_ARGS__) + +//! DIM_CHANNEL("operand") -> unsigned (extract depth dimension, #4) +#define DIM_CHANNEL(X) DIM_OF(X, 4) +//! DIM_DEPTH("operand") -> unsigned (extract depth dimension, #3) +#define DIM_DEPTH(X) DIM_OF(X, 3) +//! DIM_WIDTH("operand") -> unsigned (extract width dimension, #2) +#define DIM_WIDTH(X) DIM_OF(X, 2) +//! DIM_HEIGHT("operand") -> unsigned (extract height dimension, #1) +#define DIM_HEIGHT(X) DIM_OF(X, 1) +//! 
DIM_BATCHES("operand") -> unsigned (extract batches dimension, #0)
+#define DIM_BATCHES(X) DIM_OF(X, 0)
+
+//! DIM_NFILTS("operand") -> unsigned (extract 'output depth' dimension from filter weights, #3)
+#define DIM_NFILTS(X) DIM_OF(X, 3)
+//! DIM_FILTDEPTH("operand") -> unsigned (extract 'input depth' dimension from filter weights, #2)
+#define DIM_FILTDEPTH(X) DIM_OF(X, 2)
+//! DIM_FILTWIDTH("operand") -> unsigned (extract 'filter width' dimension from filter weights, #1)
+#define DIM_FILTWIDTH(X) DIM_OF(X, 1)
+//! DIM_FILTHEIGHT("operand") -> unsigned (extract 'filter height' dimension from filter weights, #0)
+#define DIM_FILTHEIGHT(X) DIM_OF(X, 0)
+
+#define MAX_SPARSE_ELEMENTS(X) DIM_OF(X, (MAX_DIMENSIONS - 1))
+
+//! IS_EMPTY_DIM("operand", dim) -> bool (true if size of dim is 0)
+#define IS_EMPTY_DIM(X, DIM) EQ(DIM_OF(X, DIM), 0)
+
+//! IS_EMPTY("operand") -> bool (true if size of all dims is 0)
+#define IS_EMPTY(X) AND(IS_EMPTY_DIM(X, 0), IS_EMPTY_DIM(X, 1), IS_EMPTY_DIM(X, 2), IS_EMPTY_DIM(X, 3))
+
+} // namespace constraint_lib
+/** @} */
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/conversions.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/conversions.h
new file mode 100755
index 0000000000000..4cb348c637953
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/conversions.h
@@ -0,0 +1,609 @@
+//==============================================================================
+//
+// Copyright (c) 2018 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef CONVERSIONS_H
+#define CONVERSIONS_H
+
+#include
+#include
+#include
+#include
+#include
+
+#include "builtin_intrinsics.h"
+
+#ifdef __hexagon__
+#include "hexagon_protos.h"
+#endif
+
+#include "float16.h"
+
+#if defined(__clang__)
+#define ATTR_NO_SANITIZE(CATEGORY) __attribute__((no_sanitize(CATEGORY)))
+#else
+#define ATTR_NO_SANITIZE(CATEGORY) /*empty */
+#endif
+
+namespace hnnx {
+
+namespace scast {
+
+// for a given floating type F, and an integer type TI,
+//    intrange_within_float<F, TI>::max()
+// generates the largest value representable in type F which will fit into TI without overflow.
+// in many cases this is F(std::numeric_limits<TI>::max()),
+// but there are exceptions when the mantissa of F is narrower than TI; in those cases we
+// want the representable value which is smaller than the integer's max value, not the nearest:
+//      F        TI
+//      Float16  int16     32752.0                 (0x7ff0)
+//      Float16  uint16    65504.0                 (0xffe0)
+//      float    int32     2147483520.0            (0x7fffff80)
+//      float    uint32    4294967040.0            (0xFFFFFF00)
+//      float    int64     9.223371487e18          (0x7fff_ff80_0000_0000)
+//      float    uint64    1.844674297e+19         (0xFFFF_FF00__0000_0000)
+//      double   int64     9223372036854774784.0   (0x7FFF_FFFF_FFFF_FC00)
+//      double   uint64    18446744073709549568.0  (0xFFFF_FFFF_FFFF_F800)
+//
+// All of the 'min' limits are zero or powers of 2, so those can be converted
+// directly from std::numeric_limits<TI>::min()
+//
+//
+template <typename TF, typename TI> struct intrange_within_float {
+};
+
+// LCOV_EXCL_START [SAFTYSWCCB-1736] constexprs resolved during compile time
+template <typename TI> struct intrange_within_float<Float16, TI> {
+    static_assert(std::numeric_limits<TI>::is_integer);
+    static inline constexpr Float16 max()
+    {
+        if constexpr (sizeof(TI) < 2) {
+            return Float16(std::numeric_limits<TI>::max());
+        } else if constexpr (sizeof(TI) == 2) {
+            return std::numeric_limits<TI>::is_signed ? 
Float16(32752.0f) : Float16(65504.0f); + } else { + return std::numeric_limits::is_signed ? Float16(-65504.0f) : Float16(65504.0f); + } + } + // 'min' value of integer range is always exactly representable + static inline constexpr Float16 min() { return Float16(std::numeric_limits::min()); } +}; + +template struct intrange_within_float { + static_assert(std::numeric_limits::is_integer); + static inline constexpr float max() + { + if constexpr (sizeof(TI) < 4) { + return float(std::numeric_limits::max()); + } else if constexpr (sizeof(TI) == 4) { + return std::numeric_limits::is_signed ? 2147483520.0f : 4294967040.0f; + } else { + static_assert(sizeof(TI) == 8); + return std::numeric_limits::is_signed ? 9.223371487e18f : 1.844674297e+19f; + } + } + // 'min' value of integer range is always exactly representable + static inline constexpr float min() { return float(std::numeric_limits::min()); } +}; + +template struct intrange_within_float { + static_assert(std::numeric_limits::is_integer); + static inline constexpr double max() + { + if constexpr (sizeof(TI) < 8) { + return double(std::numeric_limits::max()); + } else { + static_assert(sizeof(TI) == 8); + return std::numeric_limits::is_signed ? 9223372036854774784.0 : 18446744073709549568.0; + } + } + // 'min' value of integer range is always exactly representable + static inline constexpr float min() { return double(std::numeric_limits::min()); } +}; +// LCOV_EXCL_STOP + +template struct satcast_helper { + static_assert(std::numeric_limits::is_specialized && std::numeric_limits::is_specialized); + static inline TOUT constexpr op(TIN val) + { + if constexpr (!std::numeric_limits::is_integer) { // convert to a float + return TOUT(val); + } else { + constexpr bool OUTS = std::numeric_limits::is_signed; + if constexpr (std::numeric_limits::is_integer) { + // integer to integer. + // widening? or same width, same signedness? + constexpr bool INS = std::numeric_limits::is_signed; + if (sizeof(TOUT) > sizeof(TIN) || (sizeof(TOUT) == sizeof(TIN) && OUTS == INS)) { + // if the output is unsigned and the input < 0, return 0 + // otherwise it's a normal cast. + return (!OUTS && INS && val < 0) ? TOUT(0) : TOUT(val); + } else if (sizeof(TOUT) == sizeof(TIN)) { + if (!OUTS) { // same size, different signs + return (val < 0) ? (TOUT)0 : (TOUT)val; // signed->unsigned + } else { + constexpr TIN lim = std::numeric_limits::max(); + return (val > lim) ? (TOUT)lim : (TOUT)val; + } + } else { + // narrowing conversion + if (!OUTS) { + constexpr TIN m = std::numeric_limits::max(); + return (val < 0) ? TOUT(0) : (val > m) ? TOUT(m) : TOUT(val); + } else { + constexpr TIN mn = INS ? std::numeric_limits::min() : 0; + constexpr TIN mx = std::numeric_limits::max(); + return (val < mn) ? TOUT(mn) : (val > mx) ? 
TOUT(mx) : TOUT(val); + } + } + } else { // float to integer + if constexpr (sizeof(TOUT) <= sizeof(int32_t)) { + if constexpr (OUTS) { + constexpr TIN loval = intrange_within_float::min(); + constexpr TIN hival = intrange_within_float::max(); + int32_t const tmp = (int32_t)std::max(loval, std::min(hival, val)); + return satcast_helper::op(tmp); + } else { + constexpr TIN loval = 0.0; + constexpr TIN hival = intrange_within_float::max(); + uint32_t const tmp = (uint32_t)std::max(loval, std::min(hival, val)); + return satcast_helper::op(tmp); + } + } else { // 64-bit output assumed + constexpr TIN loval = intrange_within_float::min(); + constexpr TIN hival = intrange_within_float::max(); + return (TOUT)std::max(loval, std::min(hival, val)); + } + } + } + } +}; +// specialize for conversion to same +template struct satcast_helper { + static_assert(std::numeric_limits::is_specialized); + static inline TT constexpr op(TT val) { return val; } +}; + +#ifdef __hexagon__ + +// saturate to types <= int. +template struct q6_sat_int { +}; +template <> struct q6_sat_int { + static inline int op(int x) { return Q6_R_satb_R(x); } +}; +template <> struct q6_sat_int { + static inline int op(int x) { return Q6_R_satub_R(x); } +}; +template <> struct q6_sat_int { + static inline int op(int x) { return Q6_R_sath_R(x); } +}; +template <> struct q6_sat_int { + static inline int op(int x) { return Q6_R_satuh_R(x); } +}; + +// TODO: these should be done again for 'long' if long is also 32 bits. +#if 0 // NOTE: we can't really do this unless intrinsics are constexpr +template <> struct satcast_helper { + static inline uint8_t /*constexpr*/ op(int val) + { + return Q6_R_satub_R(val); + } +}; +template <> struct satcast_helper { + static inline int8_t /*constexpr*/ op(int val) { return Q6_R_satb_R(val); } +}; +template <> struct satcast_helper { + static inline uint16_t /*constexpr*/ op(int val) + { + return Q6_R_satuh_R(val); + } +}; +template <> struct satcast_helper { + static inline int16_t /*constexpr*/ op(int val) { return Q6_R_sath_R(val); } +}; +#endif + +#endif +} // end namespace scast + +} // namespace hnnx + +/** + * @brief saturate_cast( TIN val ) will work on any two numeric types; + * if the input is outside the numeric range of the output type, it + * will be range-limited. + * + * it works as follows: + * * if TOUT is a floating type, the operation is the same as the C++ cast. + * * if TOUT is integer and TIN is float, the input is first converted + * to one of int32,uint32, int64, uint64 ensuring that out-of-range values + * are clipped; and then converted to the output type as below (if it is smaller + * than 32 bits) (The 2-step conversion is intended to work well when things + * are specialized to support native hexagon ops). + * * Otherwise they are both integers. + * - If the output width is larger than the input (or if they are the same size + * and of the same signedness): + * * if the output is unsigned, and the input is < 0, the result is zero + * * otherwise the result is the same as a C++ cast (all values representable) + * - Otherwise, it is a saturating cast; values are limited to the range of TOUT. + */ +template inline constexpr TOUT saturate_cast(TIN val) +{ + return hnnx::scast::satcast_helper::op(val); +} + +/** + * @brief T saturate_round( float val ) + * round val to nearest int, and saturate to range of T. + * + * T must be an integer type, at most 32 bits. 
+ */ +// For general C platform, we need to clip the range before converting to int; +// for hexagon the conversions saturate. +// +#ifndef __hexagon__ +template inline TOUT saturate_round(float val) +{ + static_assert(sizeof(TOUT) <= 8 && std::numeric_limits::is_integer); + return saturate_cast(std::nearbyintf(val)); +} + +#else +template inline TOUT saturate_round(float val) +{ + static_assert(sizeof(TOUT) <= 8 && std::numeric_limits::is_integer); + if constexpr ((sizeof(TOUT) == 8) && !std::numeric_limits::is_signed) { + // convert to unsigned u64, rounding, saturating + return Q6_P_convert_sf2ud_R(val); + } else if constexpr ((sizeof(TOUT) == 8) && std::numeric_limits::is_signed) { + // convert to int64, rounding + return Q6_P_convert_sf2d_R(val); + } else if constexpr ((sizeof(TOUT) == 4) && !std::numeric_limits::is_signed) { + // convert to unsigned u32, rounding, saturating + return Q6_R_convert_sf2uw_R(val); + } else { + // convert to int32,rounding; + int const r = Q6_R_convert_sf2w_R(val); + if constexpr (sizeof(TOUT) < 4) return static_cast(hnnx::scast::q6_sat_int::op(r)); + return static_cast(r); // LCOV_EXCL_LINE [SAFTYSWCCB-1736] + } +} +#endif + +namespace hnnx { + +/** + * @brief 'proper' compare of any two integer types + * proper_gt( a, b) => a > b; + * E.g. if a is unsigned and b is signed, the operation checks to see if b is < 0; + * if so, the result is true; otherwise an unsigned compare is done: a > (unsigned)b + * + */ +namespace prpercmp { + +/** + * @brief if both A and B are either *int*, or smaller than int, + * then promote them both to int and compare them. + * + * otherwise, if TA is wider than TB, (or the same, with TA unsigned): + * promote b to TA, and then compare them. + * Exception, if TA is unsigned and TB is signed and b < 0; then a struct proper_cmp_helper { + static_assert(std::numeric_limits::is_integer && std::numeric_limits::is_integer); + static const bool ASIGNED = std::numeric_limits::is_signed; + static const bool BSIGNED = std::numeric_limits::is_signed; + + // compare by promoting both to int, when... + static const bool CMP_AS_INT = (sizeof(TA) < sizeof(int) || (sizeof(TA) == sizeof(int) && ASIGNED)) && + (sizeof(TB) < sizeof(int) || (sizeof(TB) == sizeof(int) && BSIGNED)); + // otherwise, compare by promoting B to A when ... + static const bool B_TO_A = sizeof(TA) > sizeof(TB) || (sizeof(TA) == sizeof(TB) && !ASIGNED); + // otherwise, compare by promoting A to B + + static inline bool constexpr eq(TA a, TB b) + { + if (CMP_AS_INT) { + return (int)a == (int)b; + } else if (B_TO_A) { + if (!ASIGNED && BSIGNED && b < 0) return false; + return a == (TA)b; + } else { + if (!BSIGNED && ASIGNED && a < 0) return false; + return (TB)a == b; + } + } + static inline bool constexpr lt(TA a, TB b) + { + if (CMP_AS_INT) { + return (int)a < (int)b; + } else if (B_TO_A) { + if (!ASIGNED && BSIGNED && b < 0) return false; // a < b always false if b<0 + return a < (TA)b; + } else { + if (!BSIGNED && ASIGNED && a < 0) return true; // a < b always true if a<0 + return (TB)a < b; + } + } +}; +/** + * @brief specialize for comparison to same type + */ +template struct proper_cmp_helper { + static_assert(std::numeric_limits::is_integer); + static inline bool constexpr eq(T a, T b) { return a == b; } + static inline bool constexpr lt(T a, T b) { return a < b; } +}; + +} // end namespace prpercmp + +} // namespace hnnx + +/** + * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value. 
+ * proper_eq(a,b) => a == b; + * + * E.g. if a is signed and <0, and b is unsigned, result will always be false. + * + */ + +template inline bool constexpr proper_eq(TA a, TB b) +{ + return hnnx::prpercmp::proper_cmp_helper::eq(a, b); +} +/** + * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value + * proper_ne(a,b) => !proper_eq(a,b); + */ +template inline bool constexpr proper_ne(TA a, TB b) +{ + return !hnnx::prpercmp::proper_cmp_helper::eq(a, b); +} +/** + * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value + * proper_lt(a,b) => a inline bool constexpr proper_lt(TA a, TB b) +{ + return hnnx::prpercmp::proper_cmp_helper::lt(a, b); +} +/** + * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value + * proper_ge(a,b) => a>=b; + */ +template inline bool constexpr proper_ge(TA a, TB b) +{ + return !hnnx::prpercmp::proper_cmp_helper::lt(a, b); +} +/** + * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value + * proper_gt(a,b) => a>b; + */ +template inline bool constexpr proper_gt(TA a, TB b) +{ + return hnnx::prpercmp::proper_cmp_helper::lt(b, a); +} +/** + * @brief 'proper' compare of any two integer types, respecting signedness and actual numeric value + * proper_le(a,b) => a<=b; + */ +template inline bool constexpr proper_le(TA a, TB b) +{ + return !hnnx::prpercmp::proper_cmp_helper::lt(b, a); +} +/** + * @brief x >= lo && x < limit, using proper compares + */ +template inline bool constexpr proper_inrange(TA x, TB lo, TC limit) +{ + return proper_ge(x, lo) && proper_lt(x, limit); +} + +/** + * @brief x >= lo && x <= hi, using proper compares + */ +template inline bool constexpr proper_inrange_closed(TA x, TB lo, TC hi) +{ + return proper_ge(x, lo) && proper_le(x, hi); +} + +/** + * @brief find the 'width' of an unsigned value (# of bits needed to contain it) + * this is floor( log2(x))+1 + * (and 0 when x = 0) + * + */ +inline int constexpr binary_bitwidth(unsigned x) +{ + return (x == 0) ? 0 : (sizeof(unsigned) * 8 - HEX_COUNT_LEADING_ZERO(x)); +} +/** + * @brief find the 'width' of an unsigned long value (# of bits needed to contain it) + * this is floor( log2(x))+1 + * (and 0 when x = 0) + * + */ +inline int constexpr binary_bitwidth(unsigned long x) +{ + return (x == 0) ? 0 : (sizeof(unsigned long) * 8 - HEX_COUNT_LEADING_ZERO_UL(x)); +} +/** + * @brief find the 'width' of an unsigned long long value (# of bits needed to contain it) + * this is floor( log2(x))+1 + * (and 0 when x = 0) + * + */ +inline int constexpr binary_bitwidth(unsigned long long x) +{ + return (x == 0) ? 
0 : (sizeof(unsigned long long) * 8 - HEX_COUNT_LEADING_ZERO_ULL(x)); +} +/** + * @brief saturating u32+u32 add + */ +inline uint32_t /*constexpr*/ addu32_sat(uint32_t a, uint32_t b) +{ + uint64_t const prod = (uint64_t)a + b; + return saturate_cast(prod); +} + +/** + * @brief saturating i32+i32 add + */ +inline int32_t /*constexpr*/ addi32_sat(int32_t a, int32_t b) +{ +#ifdef __hexagon__ + return Q6_R_add_RR_sat(a, b); +#else + int64_t prod = (int64_t)a + b; + return saturate_cast(prod); +#endif +} + +/** + * @brief saturating u32xu32 multiply + */ +inline uint32_t constexpr mulu32_sat(uint32_t a, uint32_t b) +{ + uint64_t const prod = (uint64_t)a * b; + return saturate_cast(prod); +} + +/** + * @brief saturating i32xi32 multiply + */ +inline int32_t constexpr muli32_sat(int32_t a, int32_t b) +{ + int64_t const prod = (int64_t)a * b; + return saturate_cast(prod); +} + +/** + * @brief saturating u64xu64 multiply + */ +inline uint64_t /*constexpr*/ mulu64_sat(uint64_t a, uint64_t b) +{ + uint64_t prod = 0; + if (HEX_MUL_OVERFLOW(a, b, &prod)) { + prod = std::numeric_limits::max(); + } + return prod; +} + +/** + * @brief saturating i64xi64 multiply + */ +inline int64_t /*constexpr*/ muli64_sat(int64_t a, int64_t b) +{ + int64_t prod = 0; + if (HEX_MUL_OVERFLOW(a, b, &prod)) { + prod = (int64_t(uint64_t(a) ^ uint64_t(b)) >= 0) ? std::numeric_limits::max() + : std::numeric_limits::min(); + } + return prod; +} +/** + * @brief add unsigned+unsigned->unsigned, escaping 'unsigned overflow' checks + */ +ATTR_NO_SANITIZE("unsigned-integer-overflow") +inline unsigned constexpr addu32_modular(unsigned a, unsigned b) +{ + return a + b; +} +/** + * @brief subtract unsigned-unsigned->unsigned, escaping 'unsigned overflow' checks + * For '-unsigned_var', use subu32_modular(0,unsigned_var) + */ +ATTR_NO_SANITIZE("unsigned-integer-overflow") +inline unsigned constexpr subu32_modular(unsigned a, unsigned b) +{ + return a - b; +} +/** + * @brief multiply unsigned*unsigned->unsigned, escaping 'unsigned overflow' checks + */ +ATTR_NO_SANITIZE("unsigned-integer-overflow") +inline unsigned constexpr mulu32_modular(unsigned a, unsigned b) +{ + return a * b; +} +/** + * @brief mul-add u32*u32+u32->u32, escaping 'unsigned overflow' checks + */ +ATTR_NO_SANITIZE("unsigned-integer-overflow") +inline unsigned constexpr muladdu32_modular(unsigned a, unsigned b, unsigned c) +{ + return a * b + c; +} + +/** + * @brief add u64+u64->u64, escaping 'unsigned overflow' checks + */ +ATTR_NO_SANITIZE("unsigned-integer-overflow") +inline uint64_t constexpr addu64_modular(uint64_t a, uint64_t b) +{ + return a + b; +} + +/** + * @brief subtract u64-u64->u64, escaping 'unsigned overflow' checks + */ +ATTR_NO_SANITIZE("unsigned-integer-overflow") +inline uint64_t constexpr subu64_modular(uint64_t a, uint64_t b) +{ + return a - b; +} +/** + * @brief mul u64*u64->u64, escaping 'unsigned overflow' checks + */ +ATTR_NO_SANITIZE("unsigned-integer-overflow") +inline uint64_t constexpr mulu64_modular(uint64_t a, uint64_t b) +{ + return a * b; +} + +/** + * @brief 'image' conversion from TIN to TOUT (which must be the same size) + * e.g. image_convert( 1.25f) -> 0x3fa00000 + */ + +template inline constexpr TOUT image_convert(TIN x) +{ + static_assert(sizeof(TOUT) == sizeof(TIN)); + static_assert(std::is_trivially_copyable_v); + static_assert(std::is_trivially_copyable_v); + static_assert(std::is_trivially_constructible_v); + TOUT out; + std::memcpy(&out, &x, sizeof(TOUT)); + return out; +} + +// round up A to a multiple of B. 
+/**
+ * @brief 'image' conversion from TIN to TOUT (which must be the same size)
+ * e.g. image_convert<unsigned>( 1.25f) -> 0x3fa00000
+ */
+
+template <typename TOUT, typename TIN> inline constexpr TOUT image_convert(TIN x)
+{
+    static_assert(sizeof(TOUT) == sizeof(TIN));
+    static_assert(std::is_trivially_copyable_v<TIN>);
+    static_assert(std::is_trivially_copyable_v<TOUT>);
+    static_assert(std::is_trivially_constructible_v<TOUT>);
+    TOUT out;
+    std::memcpy(&out, &x, sizeof(TOUT));
+    return out;
+}
+
+// round up A to a multiple of B.
+// b is expected to be > 0 even if signed.
+
+template <typename TD> inline constexpr size_t round_up(size_t a, TD b)
+{
+    static_assert(std::is_integral_v<TD>, "round_up can only apply to integer types");
+    // for b being a power of 2, this should compile as (a+(b-1)) &~(b-1)
+    return b * ((a + (b - 1)) / b);
+}
+// for int, b is expected to be > 0;
+// this will work for negative a, e.g. round_up(-53,10) -> -50
+template <typename TD> inline constexpr size_t round_up(int a, TD b)
+{
+    static_assert(std::is_integral_v<TD>, "round_up can only apply to integer types");
+    int const bi = b;
+    int const tmp = a + ((a > 0) ? (bi - 1) : 0);
+    return bi * (tmp / bi);
+}
+
+#endif /*CONVERSIONS_H*/
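A sketch of the two helpers above, under the same include assumption. Note the `int` overload of `round_up` declares a `size_t` return type, so a negative result must be converted back to `int` by the caller:

```cpp
#include <cassert>
#include "conversions.h" // assumption: the header under review

int main()
{
    // image_convert reinterprets the bit pattern: IEEE-754 1.25f is 0x3FA00000.
    assert(image_convert<unsigned>(1.25f) == 0x3fa00000u);
    // Round up to a multiple: 13 -> 16 for b = 8 (power of two folds to a mask).
    assert(round_up(size_t(13), 8) == 16);
    // The int overload truncates toward zero, so round_up(-53,10) yields -50,
    // delivered through the size_t return type.
    assert(int(round_up(-53, 10)) == -50);
    return 0;
}
```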
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost.h
new file mode 100755
index 0000000000000..8f0b21ccb86e5
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost.h
@@ -0,0 +1,38 @@
+//==============================================================================
+//
+// Copyright (c) 2020 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef COST_H
+#define COST_H 1
+
+// NOTE: WHATCOST may be something like SNAIL/128
+#define COST_OF(FUNC, WHATCOST) COST_OF_OP(typename DerivedType<(FUNC)>::type, WHATCOST)
+#define COST_OF_F(FUNC, WHATCOSTFN) COST_OF_OP_F(typename DerivedType<(FUNC)>::type, WHATCOSTFN)
+
+#ifdef PREPARE_DISABLED
+#define COST_OF_OP(OP, WHATCOST)
+#define COST_OF_OP_F(OP, WHATCOSTFN)
+#else
+#define COST_OF_OP(OP, WHATCOST)                                                                   \
+    template <> [[maybe_unused]] constexpr hnnx::cost_function_t hnnx::get_costf<OP>()             \
+    {                                                                                              \
+        return hnnx::cost_function_t(float(StandardCosts::WHATCOST));                              \
+    }
+
+#define COST_OF_OP_F(OP, WHATCOSTFN)                                                               \
+    template <>                                                                                    \
+    float hnnx::cost_function_t::cfunc<OP>(hnnx::cost_function_t const &, const Graph &graph_in,   \
+                                           const Op *op)                                           \
+    {                                                                                              \
+        return WHATCOSTFN(graph_in, op);                                                           \
+    }                                                                                              \
+    template <> [[maybe_unused]] constexpr hnnx::cost_function_t hnnx::get_costf<OP>()             \
+    {                                                                                              \
+        return hnnx::cost_function_t(hnnx::cost_function_t::cfunc<OP>, 1.0);                       \
+    }
+#endif
+
+#endif
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost_funcs.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost_funcs.h
new file mode 100755
index 0000000000000..286945b9b34b8
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/cost_funcs.h
@@ -0,0 +1,56 @@
+//=============================================================================
+//
+// Copyright (c) 2020 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//============================================================================
+
+#ifndef COST_FUNCS_H
+#define COST_FUNCS_H
+#include <string_view>
+#include <utility>
+#include "weak_linkage.h"
+#include "macros_attribute.h"
+PUSH_VISIBILITY(default)
+
+class Graph;
+class Op;
+
+namespace hnnx {
+
+class API_EXPORT cost_function_t {
+    using inner_func_t = float (*)(cost_function_t const &, const Graph &, Op const *);
+    inner_func_t funcp;
+    float val;
+
+  public:
+    cost_function_t(cost_function_t const &) = default;
+    cost_function_t &operator=(cost_function_t const &) = default;
+    constexpr explicit cost_function_t(float val_in) : funcp(simple_cost_function), val(val_in) {}
+    constexpr cost_function_t(inner_func_t f, float val_in) : funcp(f), val(val_in) {}
+    constexpr cost_function_t() noexcept : funcp(simple_cost_function), val(0.0f) {}
+
+    inline float operator()(const Graph &graph_in, Op const *op) const { return (*funcp)(*this, graph_in, op); }
+    static float simple_cost_function(cost_function_t const &self, const Graph &, Op const *)
+    {
+        return self.val;
+    } // just returns val;
+
+    float get_val() const { return val; }
+
+    // unreliable compare for two cost func: returns -1,0,1 if this cost
+    // is <,=,> than rhs cost, with the second result being true; or <0,false>
+    // if it can't tell.
+    std::pair<int, bool> compare(cost_function_t const &rhs) const;
+
+    template <typename OP> static float cfunc(cost_function_t const &, const Graph &, Op const *);
+};
+
+API_EXPORT cost_function_t cost_func_from_str(std::string_view);
+
+} // namespace hnnx
+
+POP_VISIBILITY()
+
+#endif
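A hedged usage sketch for `cost_function_t`: a constant-valued cost just stores a float, and `operator()` dispatches through `funcp` (here to `simple_cost_function`). `Graph` and `Op` are the SDK's types; only a reference and a pointer are needed:

```cpp
#include "cost_funcs.h" // assumption: the header under review

float query_cost(const Graph &g, const Op *op)
{
    // A constant-valued cost function: operator() just returns the stored value.
    hnnx::cost_function_t const fixed_cost(16.0f);
    return fixed_cost(g, op); // == 16.0f, via simple_cost_function
}
```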
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/crate.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/crate.h
new file mode 100755
index 0000000000000..494f51e40fa0f
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/crate.h
@@ -0,0 +1,471 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+/*
+ * crate.h
+ *
+ *  Created on: Aug 1, 2019
+ *      Author: smithg
+ */
+
+#ifndef CRATE_H_
+#define CRATE_H_
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "is_detected.h"
+#include "forward_classes.h"
+#include "macros_attribute.h"
+#include "weak_linkage.h"
+#include "size_align_code.h"
+
+PUSH_VISIBILITY(default)
+
+class Graph;
+class Tensor;
+
+/// @brief A 'Crate' allows construction of some number of different data types,
+/// contiguously packed into a few large memory blocks.
+///
+/// Example:
+///
+///    Crate crt;
+///    Thing *tp = crt.emplace<Thing>( ... ctor parms for Thing ... );
+///    AnotherThing *tp2 = crt.emplace<AnotherThing>( ... ctor parms for AnotherThing ... );
+///
+/// When the crate is cleared, all of the contained objects are destroyed in the reverse
+/// order. You cannot 'remove' a single entry:
+///
+///    crt.erase has been deprecated
+///
+/// (Even when it was available, erase would likely not free any memory; it would just call the dtor
+/// of the object, and make sure it doesn't get called later, when the Crate is cleared or destroyed.)
+///
+/// You can also emplace variable-sized arrays of trivially-destructible objects.
+///
+/// alloc_array does not initialize:
+///
+///    float *farr = crt.alloc_array<float>(n);
+///
+/// alloc_array_zero does zero-initializing:
+///
+///    int *iarr = crt.alloc_array_zero<int>(n);
+///
+/// If an allocation needs space larger than CHUNKBYTES, it will get its own chunk.
+///
+// Each record containing an object has a non-null 'dtor' field; if the object is trivially destructible,
+// this will be (dtor_funcp)1, and the object is not on the linked-list.
+//
+// note:
+// A constructor may emplace additional records in the crate recursively. Likewise,
+// it's OK if the dtors call erase() on other objects. If this happens during a 'clear',
+// the erase calls are ignored since the other objects are going to get dtor'd anyhow (if they have not
+// been already).
+// Important: if object A's constructor places B into the crate, then B will very likely get destroyed
+// first when the crate is cleared. Thus, A's destructor can't look at B (it can erase B, which is ignored
+// as described above).
+
+//
+// new 'raw' mode:
+//  - when the crate is in 'raw' mode, no destructors are registered. inserting an object
+//    increases 'alloc_count' in the chunk header, but does not increment 'nrec', nor
+//    does it increase Crate::m_records.
+//  - raw mode is entered by enable_raw_mode(size_needed):
+//    which does this in addition to enabling raw mode:
+//      - if there is no current chunk, or if the current chunk doesn't have room for 'size_needed' bytes,
+//        a new chunk is added which does.
+//      - enable_raw_mode(size_needed) returns a chunk handle.
+//
+// Internally, raw_mode causes add_record_slot() to do the same thing, but it only moves alloc_count, it does
+// not assign a slot index, and 'idx' is -1 in the returned struct.
+// All callers of add_record_slot() *must* check for raw mode (can be done by checking idx < 0), and then avoid
+// adding a dtor or doing '++m_records'.
+//
+// it's also possible to call .enable_raw_mode(), disable_raw_mode()
+// but .enable_raw_mode() does nothing if there isn't at least one chunk allocated.
+//
+
+namespace hnnx {
+
+//
+// This is used to statically determine whether a type T has a clear(Graph *)
+// method. This is used as an additional destructor which takes the Graph.
+//
+
+template <typename T> using clear_t = decltype(std::declval<T>().clear(std::declval<Graph *>()));
+
+template <typename T> constexpr bool has_clear = is_detected_v<clear_t, T>;
+
+class Deserz;
+class DCrate;
+
+class Crate {
+    API_EXPORT static constexpr size_t CHUNKBYTES = (1 << 16);
+    static_assert(CHUNKBYTES % 8 == 0 && CHUNKBYTES >= 128);
+    typedef void (*dtor_funcp)(Graph *graph_in, void *);
+    API_EXPORT static dtor_funcp DTOR_TRIVIAL() { return (dtor_funcp)1; }
+    API_EXPORT static dtor_funcp DTOR_IN_PROCESS() { return (dtor_funcp)2; }
+
+    //! A record in the index of a chunk
+    struct index_rec {
+        unsigned loc; ///< offset in bytes to the object
+        dtor_funcp
+                dtor; ///< pointer to dtor function (null if empty record; DTOR_TRIVIAL if the object has trivial dtor)
+    };
+    //! A chunk record in the crate.
+    ///
+    /// Each chunk is created as an array of uint64_t, via make_unique
+    /// The memory in a chunk has a chunkhdr, which is followed by:
+    ///
+    ///    [Objects][Objects][Objects]-->   free space   <--[Index records]
+    ///
+    /// 'alloc_count' is the next offset available to be allocated.
+    /// index records are entered in reverse order from the end. So, the last nrec*sizeof(index_rec)
+    /// bytes of the area, are the index.
+    ///
+    typedef std::unique_ptr<uint64_t[]> uptr_chunk_t;
+    struct chunkhdr;
+    API_EXPORT static chunkhdr *hdr_of(uptr_chunk_t &p) { return reinterpret_cast<chunkhdr *>(p.get()); }
+    API_EXPORT static chunkhdr const *hdr_of(uptr_chunk_t const &p)
+    {
+        return reinterpret_cast<chunkhdr const *>(p.get());
+    }
+    /// The chunkhdr is the first portion of the chunk, and is immediately followed
+    /// by data_len bytes, which is a multiple of 8.
+    struct API_EXPORT alignas(8) chunkhdr {
+        unsigned data_len;    ///< length of the data area following header, bytes (>=CHUNKBYTES).
+        unsigned nrec;        ///< records in use (including deleted ones)
+        unsigned alloc_count; ///< offset of first byte in 'free space'
+        // init to a given length (header not included)
+        void init(unsigned length)
+        {
+            data_len = length;
+            nrec = 0;
+            alloc_count = 0;
+        }
+        // reset (preserve data_len)
+        void init()
+        {
+            nrec = 0;
+            alloc_count = 0;
+        }
+        // pointer to 'offs' within data area
+        inline uint8_t *get_ptr(unsigned offs) { return (uint8_t *)(this + 1) + offs; }
+        // pointer to end of the allocation
+        inline uint8_t *get_end_ptr() { return (uint8_t *)(this + 1) + data_len; }
+        // amount of space remaining
+        inline size_t space_avail() const { return data_len - alloc_count - nrec * sizeof(index_rec); }
+        // get pointer to an index record.
+        // record 0 is the last (oldest) one.
+        index_rec *index_p(int idx) { return (index_rec *)get_end_ptr() - (idx + 1); }
+        static uptr_chunk_t allocate(unsigned len);
+    };
+    std::vector<uptr_chunk_t> m_chunks; ///< chunks with data
+    std::vector<uptr_chunk_t> m_free;   ///< chunks without
+    typedef std::vector<uptr_chunk_t>::iterator chunk_iter;
+
+    bool m_rawmode = false;
+    bool m_clearing = false; ///< set while clearing.
+    size_t m_allrecords = 0; ///< includes removed and 'padding' records
+    size_t m_records = 0;    ///< only actual, non-erased records.
+
+    //! Returned from add_record_slot (which is used to create a new record)
+    struct recposn {
+        chunkhdr *chunkp; ///< the chunk in which it was found
+        void *objp;       ///< pointer to the object
+        int idx;          ///< index within the chunk (= -1 if insert was done in raw mode)
+    };
+    API_EXPORT recposn add_record_slot(size_t bytes, size_t align);
+    API_EXPORT void recover_ctor_throw(recposn const &) noexcept;
+    API_EXPORT void install_dtor(recposn const &, dtor_funcp dtor_func);
+    API_EXPORT void move_to_free(chunk_iter chunk_to_free);
+
+  public:
+    class ChunkHandle {
+        friend class Crate;
+        chunkhdr *chunkp;
+
+      protected:
+        ChunkHandle(chunkhdr *cp) : chunkp(cp){};
+
+      public:
+        ChunkHandle() : chunkp(nullptr) {} // null handle may only be assigned-to
+        ChunkHandle(ChunkHandle const &) = default;
+        ChunkHandle &operator=(ChunkHandle const &) = default;
+        friend inline bool operator==(ChunkHandle const &a, ChunkHandle const &b) { return a.chunkp == b.chunkp; }
+        std::pair<void *, size_t> get_memory_extent() const
+        {
+            size_t const len = chunkp->get_ptr(chunkp->alloc_count) - (uint8_t *)chunkp;
+            return {chunkp, len};
+        }
+    };
+
+    API_EXPORT Crate(); ///< Construct a new Crate
+    Crate(Crate const &) = delete;
+    Crate &operator=(Crate const &) = delete;
+
+    // get the preload handle for the first chunk
+    ChunkHandle first_chunk_handle() const
+    {
+        return ChunkHandle(m_chunks.empty() ? nullptr : hdr_of(const_cast<Crate &>(*this).m_chunks.front()));
+    }
+    // get the preload handle for the most recent chunk
+    ChunkHandle last_chunk_handle() const
+    {
+        return ChunkHandle(m_chunks.empty() ? nullptr : hdr_of(const_cast<Crate &>(*this).m_chunks.back()));
+    }
+    // 'raw mode'
+    ChunkHandle enable_raw_mode(unsigned bytes_needed);
+    API_EXPORT void enable_raw_mode();
+    void disable_raw_mode() { m_rawmode = false; }
+    bool raw_mode() const { return m_rawmode; }
+
+    // Note that the destructor doesn't do anything. You have to call clear() manually.
+    API_EXPORT ~Crate();
+    //! The number of objects in the crate.
+    size_t size() const { return m_records; }
+    //! The number of chunks in use
+    size_t chunk_count() const { return m_chunks.size(); }
+    //! The amount of space left in the current chunk, approximately.
+    /// DO NOT CALL unless chunk_count() > 0
+    size_t current_chunk_space_remain() const { return hdr_of(this->m_chunks.back())->space_avail(); }
+    //! Delete all objects. Does not necessarily free all storage to the
+    /// system; but all retained storage is available for re-use in the crate.
+    /// Note that this is no longer called by the destructor - it must be called explicitly.
+    API_EXPORT void clear(Graph *graph_in);
+    // Special entry for deserializing in segments.
+    // If it is possible to allocate, in current raw-mode chunk, everything from offset 'start'
+    // up to but not including 'limit', this is done, and the base address of that region is returned.
+    // otherwise does nothing and returns null.
+    API_EXPORT void *allocate_bulk(size_t start, size_t limit);
+
+    //! Construct an object of type T into the crate, using the
+    /// parameters of any constructor of T. It is acceptable for the
+    /// constructor to call the emplace method to add other objects to
+    /// the crate.
+    template <typename T, typename... Args> API_HIDDEN T *emplace(Args &&...args)
+    {
+        recposn const pos = add_record_slot(sizeof(T), alignof(T));
+        // construct the object
+        if constexpr (std::is_nothrow_constructible<T, Args...>::value) {
+            new (pos.objp) T(std::forward<Args>(args)...);
+        } else {
+            try {
+                new (pos.objp) T(std::forward<Args>(args)...);
+            } catch (const std::exception &e) {
+                recover_ctor_throw(pos);
+                throw;
+            }
+        }
+        if (pos.idx >= 0) {
+            // register destructor
+            if constexpr (!std::is_trivially_destructible<T>::value) {
+                // Obtain a callable '~T()' function.
+                // this typically generates a jump, or a small inline; lambda can
+                // be cast to a function pointer since it has no state.
+                auto dtor_func = [](Graph *graph_in, void *obj) {
+                    if constexpr (has_clear<T>) {
+                        static_cast<T *>(obj)->clear(graph_in);
+                    }
+                    static_cast<T *>(obj)->~T();
+                };
+                install_dtor(pos, (dtor_funcp)dtor_func);
+            } else {
+                ++m_records; // note, install_dtor does this too.
+            }
+        }
+        return static_cast<T *>(pos.objp);
+    }
+
+    using deserialize_op_func = void *(*)(void *, Deserz &);
+    using deserialize_dtor_func = void (*)(Graph *, void *);
+
+    // Alternate interface to cut down on template instantiations:
+    // init_func is used to initialize the memory, and dtor_func
+    // is used to register the destructor. It's up to the user
+    // to provide the correct size and alignment.
+
+    API_EXPORT void *emplace_explicit(Deserz &dctx, deserialize_op_func init_func, deserialize_dtor_func dtor_func,
+                                      size_align_code_t size_al);
+
+    //! Allocate 'n' of type T in the crate.
+    /// Will initially be garbage; T must be trivially destructible (unless waived)
+    template <typename T, bool DTOR_OK = false> T *alloc_array(size_t n)
+    {
+        static_assert(DTOR_OK || std::is_trivially_destructible<T>::value);
+        if (n == 0) return nullptr;
+        recposn const pos = add_record_slot(sizeof(T) * n, alignof(T));
+        if (pos.idx >= 0) m_records++;
+        return static_cast<T *>(pos.objp);
+    }
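A usage sketch for `Crate` under the documented contract (`clear()` must be called explicitly; the destructor does nothing). `Point` is a hypothetical payload type:

```cpp
#include "crate.h" // assumption: the header under review

struct Point {
    float x, y;
    Point(float xi, float yi) : x(xi), y(yi) {}
};

void crate_demo(Graph *g)
{
    hnnx::Crate crt;
    // Objects are packed into the crate's current chunk:
    Point *const p = crt.emplace<Point>(1.0f, 2.0f);
    // Uninitialized array of trivially-destructible elements:
    float *const scratch = crt.alloc_array<float>(64);
    (void)p;
    (void)scratch;
    crt.clear(g); // dtors run in reverse order; ~Crate() does NOT do this
}
```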
+    //! Allocate 'n' of type T in the crate.
+    /// Will be zero-filled; T must be trivially destructible.
+    template <typename T> T *alloc_array_zero(size_t n)
+    {
+        T *const res = alloc_array<T>(n);
+        if (n != 0) ::memset(res, 0, sizeof(T) * n);
+        return res;
+    }
+    //! Allocate 'n' of type T in the crate.
+    /// Will be "value constructed"; in case of things like int and pointer,
+    /// this means they will be zeroed.
+    ///
+    /// T must be trivially destructible.
+    template <typename T> T *alloc_array_value(size_t n)
+    {
+        T *res = alloc_array<T>(n);
+        if (n != 0) std::uninitialized_value_construct_n(res, n);
+        return res;
+    }
+};
+
+/*
+ * EJP: This seems silly, but I don't know how to get visibility into Graph into a templated Tensor because of include hell.
+ */
+
+API_EXPORT Crate *graph_crate(Graph &graph_in);
+
+//
+// replacement for std::vector<T>, for use in ops;
+//
+// limited options for constructor:
+//   (1) copy, or move, from std::vector<T> - need Graph *;
+//   (2) create with a given size, null-initialized; - need Graph *;
+//   (3) create empty, and then fill in later
+//       using init( Graph* , std::vector<T> const &)
+//       or init( Graph* , std::vector<T> &&)
+//       or init( Graph *, size )
+//       or init( Graph *, T const *ptr, size );
+//       or init_move( Graph *, T *ptr, size );
+//
+// With option 3, it is assumed that the 'init' is done during the constructor of
+// a host object - this is needed during deserialize, for instance.
+// the 'len' is 32 bits so this type occupies 2 pointers, vs. 3 for std::vector.
+//
+// If 'T' has a destructor, the cratevec's destructor will invoke that on
+// each element of the vector, in reverse order.
+// when the 'move-from' mechanisms to init from 'std::vector<T> &&' are used,
+// the supplied vector will not be cleared; but its elements will all be
+// 'moved-from'.
+
+template <typename T> class cratevec {
+    T *m_ptr;
+    unsigned m_len;
+    using vec_t = std::vector<T>;
+    static constexpr bool need_dtor = !std::is_trivially_destructible<T>::value;
+
+  public:
+    using iterator = T *;
+    using const_iterator = T const *;
+    using value_type = T;
+    using size_type = size_t;
+    using difference_type = ptrdiff_t;
+    using reference = T &;
+    using const_reference = T const &;
+
+    cratevec() : m_ptr(nullptr), m_len(0) {}
+    cratevec(Graph *g, vec_t const &v) : cratevec()
+    {
+        if (!v.empty()) init(g, v.data(), v.size());
+    }
+    cratevec(Graph *g, vec_t &&v) : cratevec()
+    {
+        if (!v.empty()) init_move(g, v.data(), v.size());
+    }
+    cratevec(Graph *g, size_t n) : cratevec() { init(g, n); }
+    cratevec(cratevec const &) = delete;
+    cratevec(cratevec &&) = delete;
+    ~cratevec()
+    {
+        if constexpr (need_dtor) {
+            if (m_len > 0) {
+                T *const ptr0 = m_ptr;
+                T *ptr = ptr0 + m_len;
+                do {
+                    ptr--;
+                    ptr->~T();
+                } while (ptr > ptr0);
+            }
+        }
+    }
+
+    cratevec &operator=(cratevec const &) = delete;
+    cratevec &operator=(cratevec &&) = delete;
+
+    void init(Graph *g, T const *data, size_t n)
+    {
+        assert(m_len == 0);
+        if (n) {
+            m_ptr = graph_crate(*g)->alloc_array<T, true>(n);
+            std::uninitialized_copy_n(data, n, m_ptr);
+            m_len = n;
+        }
+    }
+    void init_move(Graph *g, T *data, size_t n)
+    {
+        assert(m_len == 0);
+        if (n) {
+            m_ptr = graph_crate(*g)->alloc_array<T, true>(n);
+            std::uninitialized_move_n(data, n, m_ptr);
+            m_len = n;
+        }
+    }
+    // these methods get used during deserialize, so allow it to pass crate in directly.
+    void init(hnnx::Crate *const crate_p, size_t const n)
+    {
+        assert(m_len == 0);
+        if (n) {
+            m_ptr = crate_p->alloc_array<T, true>(n);
+            std::uninitialized_value_construct_n(m_ptr, n);
+            m_len = n;
+        }
+    }
+    // The DCrate version is defined in dcrate_inlines.h
+    void init(hnnx::DCrate *crate_p, size_t n);
+
+    void init(Graph *const g, size_t const n) { init(graph_crate(*g), n); }
+    void init(Graph *const g, vec_t const &v) { init(g, v.data(), v.size()); }
+    void init(Graph *const g, vec_t &&v) { init_move(g, v.data(), v.size()); }
+
+    iterator begin() noexcept { return m_ptr; }
+    iterator end() noexcept { return m_ptr + m_len; }
+    const_iterator begin() const noexcept { return m_ptr; }
+    const_iterator end() const noexcept { return m_ptr + m_len; }
+    const_iterator cbegin() const noexcept { return m_ptr; }
+    const_iterator cend() const noexcept { return m_ptr + m_len; }
+    size_type size() const noexcept { return m_len; }
+    T *data() noexcept { return m_ptr; }
+    T const *data() const noexcept { return m_ptr; }
+    bool empty() const noexcept { return m_len == 0; }
+    reference operator[](size_type idx) { return m_ptr[idx]; }
+    const_reference operator[](size_type idx) const { return m_ptr[idx]; }
+    reference at(size_type idx)
+    {
+        if (idx >= m_len) throw std::range_error("cratevec");
+        return m_ptr[idx];
+    }
+    const_reference at(size_type idx) const { return const_cast<cratevec &>(*this).at(idx); }
+    reference front() { return m_ptr[0]; }
+    const_reference front() const { return m_ptr[0]; }
+    reference back() { return m_ptr[m_len - 1]; }
+    const_reference back() const { return m_ptr[m_len - 1]; }
+};
+
+} // namespace hnnx
+
+POP_VISIBILITY()
+
+#endif /* CRATE_H_ */
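A sketch of cratevec option (3) from the comment above: construct empty, then `init` from a `std::vector`. `Graph` and `graph_crate` are the SDK's:

```cpp
#include <vector>
#include "crate.h" // assumption: the header under review

void cratevec_demo(Graph *g)
{
    std::vector<int> const src{1, 2, 3};
    hnnx::cratevec<int> cv;  // option (3): create empty...
    cv.init(g, src);         // ...then fill from a std::vector
    int const total = cv[0] + cv.back(); // behaves like a fixed-size vector
    (void)total;
}
```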
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/dcrate_inlines.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/dcrate_inlines.h
new file mode 100755
index 0000000000000..a48e7bc909904
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/dcrate_inlines.h
@@ -0,0 +1,101 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef DCRATE_INLINES_H
+#define DCRATE_INLINES_H 1
+
+#include <cassert>
+#include <memory>
+#include <new>
+
+#include "macros_attribute.h"
+#include "deser_concurrent.h"
+#include "crate.h"
+
+namespace hnnx {
+
+// alloc 'amount' bytes with given alignment.
+inline void *DCrate::do_alloc(const size_t align, const size_t amount)
+{
+    size_t basep = size_t(nextp);
+    if (align > 4) {
+        basep = (basep + (align - 1)) & ~(align - 1);
+    }
+    size_t const next_base = basep + amount;
+    if (next_base > (size_t)limitp) return nullptr;
+    nextp = (void *)next_base; // update 'nextp' ...
+    return (void *)basep;
+}
+
+template <typename T> inline T *DCrate::alloc_array(const size_t n)
+{
+    if (nextp != nullptr) {
+        void *const allocp = do_alloc(alignof(T), sizeof(T) * n);
+        if (allocp) return (T *)allocp;
+    }
+    return cratep->alloc_array<T, true>(n);
+}
+
+template <typename T, typename... Args> inline T *DCrate::emplace(Args &&...args)
+{
+    if (nextp != nullptr) {
+        void *const allocp = do_alloc(alignof(T), sizeof(T));
+        if (allocp) {
+            new (allocp) T(std::forward<Args>(args)...);
+            return (T *)allocp;
+        }
+    }
+    return cratep->emplace<T>(std::forward<Args>(args)...);
+}
+
+template <>
+inline void *DCrate::emplace_explicit(Deserz &dctx, deserialize_op_func const init_func,
+                                      deserialize_dtor_func const dtor_func, size_align_code_t const size_al)
+{
+    if (nextp != nullptr) {
+        void *const allocp = do_alloc(size_al.align(), size_al.size());
+        if (allocp) {
+            init_func(allocp, dctx);
+            return allocp;
+        }
+    }
+    return cratep->emplace_explicit(dctx, init_func, dtor_func, size_al);
+}
+
+// this will be used in place of 'emplace' when the constructor parms
+// are just 'Deserz &'
+template <typename T> inline T *DCrate::emplace0(Deserz &dctx)
+{
+    deserialize_op_func const ctor = [](void *const ptr, Deserz &dctx) -> void * {
+        new (ptr) T(dctx);
+        return ptr;
+    };
+    if (nextp != nullptr) {
+        void *const allocp = do_alloc(alignof(T), sizeof(T));
+        if (allocp) {
+            (ctor)(allocp, dctx);
+            return (T *)allocp;
+        }
+    }
+    return (T *)cratep->emplace_explicit(dctx, ctor, nullptr, size_align_code_t::for_type<T>());
+}
+// init method of cratevec using 'DCrate' is defined here to avoid header inclusion madness.
+//
+template <typename T> inline void hnnx::cratevec<T>::init(hnnx::DCrate *crate_p, size_t n)
+{
+    assert(m_len == 0);
+    if (n) {
+        m_ptr = crate_p->alloc_array<T>(n);
+        std::uninitialized_value_construct_n(m_ptr, n);
+        m_len = n;
+    }
+}
+
+} // namespace hnnx
+
+#endif // DCRATE_INLINES_H
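The alignment step in `do_alloc`, isolated for clarity (it is applied only when `align > 4`, since the span is kept 4-aligned). The round-up idiom, assuming a power-of-two alignment:

```cpp
#include <cstddef>

constexpr size_t align_up(size_t p, size_t align)
{
    // Requires align to be a power of two.
    return (p + (align - 1)) & ~(align - 1);
}

static_assert(align_up(0x1005, 8) == 0x1008, "rounds up to next multiple");
static_assert(align_up(0x1008, 8) == 0x1008, "already aligned: unchanged");
```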
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent.h
new file mode 100755
index 0000000000000..16db21a082cf1
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent.h
@@ -0,0 +1,288 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef DESER_CONCURRENT_H
+#define DESER_CONCURRENT_H 1
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "deser_concurrent_defs.h"
+
+// this is intended to be included only in "deserialize.h"
+
+struct PreloadInfo;
+
+namespace hnnx {
+struct runlist_seg_descriptor;
+class Crate;
+class Deserz;
+class fixup_supplemental_recs;
+class InitTimeSchedule;
+
+// describes a 'span' of the deserialized data
+struct deser_segment_span {
+    void *base;
+    void *limit;
+};
+
+// This describes a partially-decoded segment; includes fixups.
+// This should stay small so we can place it inside Deserz, and std::move it
+// out (to keep the fixup list) when done with the segment.
+struct runlist_fixup_state {
+    unsigned segno = 0;
+    size_t *crate_begin = nullptr; // where the data starts in the crate
+    runlist_seg_descriptor *seg_desc = nullptr; // Corresponding 'runlist_seg_descriptor' for reference.
+    // The next three are copied from the runlist_auxdata_seg_desc
+    uint32_t base_tensor_index = 0; // first tensor index defined this segment
+    uint32_t base_blocktable_index = 0; // first blocktable index defined in this segment
+    uint32_t base_sharedobj_index = 0; // first 'shared_object' index defined in this segment
+    // fixup data
+    size_t *fixup_list_head = nullptr; // head of the 'fixup list', or null if none.
+    fixup_supplemental_recs *fixup_supplemental; // supplemental fixup list
+
+    runlist_fixup_state() = default;
+    ~runlist_fixup_state() = default;
+    runlist_fixup_state(runlist_fixup_state const &) = default;
+    // *Some* implementations of c++lib require this to have operator= (non-move)
+    // in order for std::vector containing it to be constructed via resize.
+    runlist_fixup_state &operator=(runlist_fixup_state const &) = default;
+    // the move-ctor and move-assign must leave the source with no fixup list,
+    // and segno = 0.
+    runlist_fixup_state(runlist_fixup_state &&from) { do_move_from(std::move(from)); }
+    runlist_fixup_state &operator=(runlist_fixup_state &&from)
+    {
+        do_move_from(std::move(from));
+        return *this;
+    }
+
+  private:
+    // this is used in move-constructor and move-assign; it will always leave 'from'
+    // with certain 'null' values to trap cases where we're using the wrong instance.
+    void do_move_from(runlist_fixup_state &&from)
+    {
+        segno = from.segno;
+        crate_begin = from.crate_begin;
+        seg_desc = from.seg_desc;
+        base_tensor_index = from.base_tensor_index;
+        base_blocktable_index = from.base_blocktable_index;
+        base_sharedobj_index = from.base_sharedobj_index;
+        fixup_list_head = from.fixup_list_head;
+        fixup_supplemental = from.fixup_supplemental;
+        from.segno = 0;
+        from.seg_desc = nullptr;
+        from.fixup_list_head = nullptr;
+    }
+};
+//
+// This contains 'supplemental' fixup records for a segment; there is one instance in each runlist_seg_descriptor,
+// and a pointer to it in the runlist_fixup_state. When the 'runlist_fixup_state' is moved in or out of the Deserz,
+// the pointer to this remains.
+// To avoid the overhead of vec_push_back, this has a static array into which values are recorded;
+// when this is full (or near full), all the records within are appended to the vector in a single operation.
+// At the end of the operation, any remaining records are appended to the vector, but only if the vector
+// is not empty (we can read the records out of the fixed array, if they all fit).
+//
+// The append() is not safe unless 'ensure_room_for' is checked first; you can e.g. do ensure_room_for(3)
+// ahead of doing up to 3 appends.
+// It is best to use a constant as parameter to ensure_room_for, i.e. ahead of code which may append
+// *up to* 4 values, use ensure_room_for(4); this simplifies the inline expansion of 'ensure_room_for',
+// and makes very little difference to performance compared to using the exact value.
+//
+class fixup_supplemental_recs {
+    static constexpr unsigned ARR_SIZE = 64;
+    unsigned num_in_arr = 0;
+    uint32_t fixed_arr[ARR_SIZE];
+    std::vector<uint32_t> var_arr;
+    unsigned n_vec = 0; // = var_arr.size()
+
+  public:
+    void clear();
+    unsigned constexpr size() const { return num_in_arr + n_vec; }
+    void reserve(unsigned const n) { var_arr.reserve(n); }
+    inline void ensure_room_for(unsigned const n)
+    {
+        assert(n <= ARR_SIZE);
+        if (num_in_arr > ARR_SIZE - n) flush_to_vec();
+    }
+    // append allowed only when preceded by 'ensure_room_for'
+    inline void append(uint32_t const val)
+    {
+        assert(num_in_arr < ARR_SIZE);
+        fixed_arr[num_in_arr++] = val;
+    }
+    // use instead of 'ensure_room_for(1); push_back(n)'
+    inline void push_back(uint32_t const val)
+    {
+        if (num_in_arr > ARR_SIZE - 1) flush_to_vec();
+        fixed_arr[num_in_arr++] = val;
+    }
+    // After all push_back() done, do a 'finish'
+    // and then get_limits() can be used to traverse the data.
+    void finish(); // flushes, but only if the vec is not empty.
+    std::pair<uint32_t const *, uint32_t const *> get_limits() const;
+
+  protected:
+    void flush_to_vec();
+};
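The `ensure_room_for`/`append` discipline described above, as a sketch; the count passed to `ensure_room_for` is a constant upper bound on the appends that follow:

```cpp
// Sketch, assuming deser_concurrent.h is included.
#include <cstdint>
#include "deser_concurrent.h"

void record_three(hnnx::fixup_supplemental_recs &recs, uint32_t a, uint32_t b, uint32_t c)
{
    recs.ensure_room_for(3); // constant bound: up to 3 appends may follow
    recs.append(a);
    recs.append(b);
    recs.append(c);
    // After all records are in: recs.finish(); then recs.get_limits() to traverse.
}
```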
+
+// An array of these (size N+1) is used to hold the
+// information used in deserializing each segment.
+// The [N+1] is partially used; some operations may use
+// e.g. arr[i+1].auxinfo.some_field to find out where something
+// ends for the current segment, using the start of the next segment;
+// so the [N-1] entry needs a successor.
+
+struct runlist_seg_descriptor {
+    runlist_auxdata_seg_desc auxinfo; // the data from the 'aux_data' record for this segment
+    runlist_fixup_state segfixup; // the deserialization state (moved in and out of Deserz as needed)
+    fixup_supplemental_recs fixup_supp; // fixup supplemental recs.
+    deser_segment_span span_to_deser = {};
+    // These are used to configure the last preload in each segment, which preloads a region
+    // which is either partially, or entirely, in the next segment. So, the first two entries
+    // below are actually set at the end of deserialization of the previous segment; the end_preload
+    // is set by the current segment deserialize.
+    // The information stored in [N] is for configuring
+    // the last preload in the last segment, with end_preload set to 'end of crate'; in this case
+    // start_preload could be <= the end of the crate, and then we don't configure it.
+    // likewise the information in [0] is only 'end_preload', which can be used to configure
+    // 'Graph::m_initial_preload' (it should go from start-of-crate to seg[0].end_preload).
+    // In some cases (hopefully, only in testing) we may have segments with no preloads in them,
+    // in which case null pointers will appear in some of these; the ChunkPreload ops need to be
+    // configured by getting info from adjacent segments.
+    PreloadInfo *prev_seg_final_preload{}; // points to the prev segment's final PreloadInfo
+    char *start_preload{}; // the preload start address for prev seg's final preload
+    char *end_preload{}; // end address for prev seg's final preload
+};
+
+// One instance of this is in Deserializer, called segments.
+// It is created 'empty', and populated when we encounter the valid
+// Aux Data record.
+//
+class DeserSegDescs {
+    unsigned n_segs = 0;
+    // points to an array of n_segs + 1, if n_segs > 0
+    std::unique_ptr<runlist_seg_descriptor[]> seg_arr;
+
+  public:
+    DeserSegDescs() = default;
+    ~DeserSegDescs() = default;
+    DeserSegDescs(DeserSegDescs const &) = delete;
+    DeserSegDescs(DeserSegDescs &&) = default;
+    DeserSegDescs &operator=(DeserSegDescs const &) = delete;
+    DeserSegDescs &operator=(DeserSegDescs &&) = default;
+
+    // these two are used to create the array
+    void set_size(unsigned const n); // used to create sized, empty array
+    runlist_seg_descriptor *data() { return seg_arr.get(); }
+
+    constexpr unsigned num_segs() const { return n_segs; }
+    constexpr bool is_active() const { return n_segs != 0; }
+    // note: 'i' may be 0 .. num_segs(); only can use when 'is_active'.
+    runlist_seg_descriptor &operator[](unsigned const i) { return seg_arr[i]; }
+    runlist_seg_descriptor const &operator[](unsigned const i) const { return seg_arr[i]; }
+
+    // We can add other data in here, to manage the concurrent deserialization.
+    unsigned n_threads = 0; // set when allocating the 'Deserz' array
+    std::vector<Deserz> deserz_arr; // sized as 'n_threads'.
+
+    // start-of-crate, rounded to a multiple of 32; Calculated before any multi-thread
+    // operations. Use to configure Graph::m_initial_preload.
+    void *crate_preload_start_boundary;
+    // end-of-crate, rounded up to multiple of 32. Calculated before any multi-thread
+    // operations. No 'ChunkPreloadOp' will exceed this.
+    void *crate_preload_final_boundary;
+
+    InitTimeSchedule *initSchedule;
+};
+
+// A 'DCrate' is a proxy object stored within Deserz.
+// It has some of the same methods as Crate; but if nextp is not null,
+// it will allocate into the space at 'nextp', limited by 'limitp'.
+// Otherwise it will use the Crate.
+// Most methods are defined as inlines in dcrate_inlines.h
+//
+class DCrate {
+    // these are either both null, or both non-null and 4-aligned.
+    void *nextp = nullptr;
+    void *limitp = nullptr;
+    Crate *cratep = nullptr;
+
+  public:
+    DCrate() {}
+    ~DCrate() {}
+    DCrate(DCrate const &) = default;
+    DCrate(DCrate &&) = default;
+    DCrate &operator=(DCrate const &) = default;
+    DCrate &operator=(DCrate &&) = default;
+    explicit DCrate(Crate &c) : cratep(&c) {}
+    void set_crate(Crate &c) { cratep = &c; }
+    Crate *crate() { return cratep; }
+    bool is_active() const { return nextp != nullptr; }
+
+    constexpr size_t bytes_remaining() const { return (char *)limitp - (char *)nextp; }
+    char *next_loc() { return (char *)nextp; }
+    std::pair<char *, char *> range_remain() { return {(char *)nextp, (char *)limitp}; }
+
+    void set_memory_range(void *base, unsigned len)
+    {
+        nextp = base;
+        limitp = (void *)((char *)base + len);
+    }
+    void remove_memory_range()
+    {
+        nextp = nullptr;
+        limitp = nullptr;
+    }
+
+    // Methods of Crate we want to support (See crate.h for more detail).
+    // Note that the constructors invoked in 'emplace' and 'emplace_explicit'
+    // can and will recursively call 'emplace' to construct their sub-objects.
+    template <typename T, typename... Args> T *emplace(Args &&...args);
+    // variant of 'emplace' which can use the 'emplace_explicit' call to avoid
+    // instantiating the constructor twice
+    template <typename T> T *emplace0(Deserz &dctx);
+    // (this is defined with 'template' args, only so it can be declared here without
+    // forward refs. All are pass-by-value. Only one specialization will be defined).
+    template <typename FI, typename FD, typename SA> void *emplace_explicit(Deserz &dctx, FI, FD, SA);
+    // array allocation, used to make all arrays in crate during deserialize.
+    template <typename T> T *alloc_array(size_t n);
+
+  private:
+    // reserve the specified data in the range, and return pointer to start; or
+    // return null if not possible.
+    void *do_alloc(size_t align, size_t amount);
+};
+
+// defines the encoding in the upper 3 bits of the last word of a 'multi-word' supplemental record
+// all must be 4..7, since a 0 in the msb indicates a 'short' record.
+
+constexpr unsigned SUPPFIXUP_CAT_tensor = 4;
+constexpr unsigned SUPPFIXUP_CAT_sharedobj = 5;
+constexpr unsigned SUPPFIXUP_CAT_blocktable = 6; // with indices packed in one word
+constexpr unsigned SUPPFIXUP_CAT_blocktable_full = 7; // .. in two words
+constexpr unsigned SUPPFIXUP_CAT_SHIFT = 29u;
+
+bool fixup_encode_for_blocktable(runlist_fixup_state &seginfo, uint32_t idx, uint32_t table_offs, void **ptrloc);
+
+// high-level operations in the 'deserialize by segments' code.
+
+GraphStatus do_multiseg_deser(Deserializer &dctx, size_t ref_deser_pos);
+GraphStatus segmentjob_deserialize_ops(Deserializer &dctx, unsigned segno, unsigned threadno);
+GraphStatus segmentjob_process_fixups(Deserializer &dctx, unsigned segno, unsigned threadno);
+GraphStatus segmentjob_compile_ops(Deserializer &dctx, unsigned segno, unsigned threadno);
+void resolve_chunk_preload_after_multiseg_deser(Deserializer &dctx);
+
+} // namespace hnnx
+
+#endif // DESER_CONCURRENT_H
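How `DCrate` degrades, as a sketch: while a memory range is installed, allocation is a pointer bump between `nextp` and `limitp`; otherwise every call forwards to the underlying `Crate`:

```cpp
#include <cstdint>
#include "dcrate_inlines.h" // assumption: pulls in the template definitions

void dcrate_demo(hnnx::DCrate &dc)
{
    // With set_memory_range() active this is a pointer bump inside [nextp, limitp);
    // when inactive, or when the span is exhausted, it falls through to Crate::alloc_array.
    uint32_t *const idxs = dc.alloc_array<uint32_t>(16);
    (void)idxs;
}
```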
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent_defs.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent_defs.h
new file mode 100755
index 0000000000000..3d72ed7d2de71
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deser_concurrent_defs.h
@@ -0,0 +1,97 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef DESER_CONCURRENT_DEFS_H
+#define DESER_CONCURRENT_DEFS_H 1
+
+#include <cstddef>
+#include <cstdint>
+
+namespace hnnx {
+
+// NOTE: this file contains defs for concurrent deserialize which are needed on both decode and prepare
+// side; mostly just the format of the Aux Data records.
+// Defs needed only on decode side are in 'deser_concurrent.h', which #includes this file.
+
+constexpr unsigned DesConcur_MIN_SEGMENTS = 8; // can't have less than this number.
+
+// This is the number of runlist slots in the runlist_auxdata_seg_desc format.
+// It must be >= the actual number. This number is coded into the start of the AuxData
+// payload. If the number gets bigger, the reader of the aux-data
+// record will need to be able to cope with the older, smaller value.
+
+constexpr unsigned DesConcur_MAX_RUNLISTS = 4;
+
+// The 'Aux Data' record describing the runlist partition has a payload formed of
+// a runlist_auxdata_header, followed immediately by N+1 of runlist_auxdata_seg_desc.
+// The number N is in the header; there may be additional words after, which can be
+// ignored.
+//
+// Aux Data header record.
+// The 'record_version' is reserved to flag changes in the format, so that
+// if it changes, new skel can understand old records.
+// Currently, it has this format; most changes will expand one of the fields,
+// so following this may be adequate to capture version changes; if it is not,
+// add flags in the upper bits.
+//    bits 31..13 : reserved, 0
+//    bit 12: set if crate sizes are calculated based on 'dynamic tensor' sizes
+//    bits 11..8  length of the header in uint32's
+//    bits 7..3   length of 'segment' record, in uint32's
+//    bits 2..0   .. value of DesConcur_MAX_RUNLISTS
+//
+struct runlist_auxdata_header {
+    unsigned record_version; // see above
+    unsigned numsegs : 16; // number of segments; >= 8, likely <= 64 but who knows
+    unsigned hdrflags : 16; // reserved for flags
+    unsigned runlist_offset; // see below
+};
+
+// 'runlist_offset' is the offset, in u32's units, from the 'num_in_tensors' word
+// to the 'n_ops_total' word. This is needed by 'weight share' processing in order to
+// adjust the deser_offset values to accommodate changes in the encoding length of pointers.
+
+// The N segments are described by an array of N+1 of runlist_auxdata_seg_desc;
+// segment i is defined by arr[i] (start) and arr[i+1] (end).
+// An exception is 'crate_seg_len' - this may be less than arr[i+1].crate_offset - arr[i].crate_offset
+// due to padding.
+// In the final record arr[N]:
+//    - crate_seg_len is not used (0)
+//    - The *_list_posn records are the total length of the runlists
+//    - the four 'base_*_index' values are all 1 greater than any index used in the graph
+//
+struct runlist_auxdata_seg_desc {
+    uint32_t deser_offset; // where the input (pickle) data begins - reference point is the start of 'Runlist' as
+                           //    defined in docs/pickle_format.md, i.e. the location of 'n_ops_total' word
+    uint32_t crate_offset; // offset in crate
+    uint32_t crate_seg_len; // crate length needed (not used in final entry)
+    uint32_t runlist_posn[DesConcur_MAX_RUNLISTS]; // where the segment starts in Op* runlist
+    uint32_t execlist_posn[DesConcur_MAX_RUNLISTS]; // where the segment starts in 'execlist'
+    uint32_t base_opseq_index; // first 'op_sequence_marker' index used in the segment.
+    uint32_t base_tensor_index; // first tensor index defined this segment
+    uint32_t base_blocktable_index; // first blocktable index defined in this segment
+    uint32_t base_sharedobj_index; // first 'shared_object' index defined in this segment
+};
+
+// Bit in the header version indicating crate sizes allow for 'dynamic shapes'.
+// NOTE: if that gets backed out later, leave this here but remove it from DesConcur_AUXDATA_REC_VERSION
+//
+constexpr unsigned DesConcur_AUXDATA_REC_VERSION_DYNSHAPE_SIZES = 4096;
+
+constexpr unsigned DesConcur_AUXDATA_REC_VERSION = // composed of:
+        ((sizeof(runlist_auxdata_header) / sizeof(uint32_t)) * 256 // header size
+         + (sizeof(runlist_auxdata_seg_desc) / sizeof(uint32_t)) * 8 // seg desc len
+         + DesConcur_MAX_RUNLISTS) |
+        DesConcur_AUXDATA_REC_VERSION_DYNSHAPE_SIZES;
+
+// values to be used to 'grow' old crate estimate to compensate for 'dyn shape' mismatch
+constexpr unsigned DesConcur_CrateGrowPerTensor = 2; // number of words per 'tensor'
+constexpr unsigned DesConcur_CrateGrowPerShared = 2; // number of words per 'shared object'
+
+} // namespace hnnx
+
+#endif // DESER_CONCURRENT_DEFS_H
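Working out `DesConcur_AUXDATA_REC_VERSION` from the definitions above (both structs are pure `uint32_t` payloads, so the `sizeof` arithmetic is exact): the header is 3 words, the segment descriptor is 15 words:

```cpp
#include "deser_concurrent_defs.h" // assumption: the header under review

// header  = 3 u32 (version, packed bitfields, runlist_offset) -> 3 * 256 = 768
// segdesc = 15 u32 (3 scalars + 4 runlist + 4 execlist + 4 base_* indices) -> 15 * 8 = 120
// + DesConcur_MAX_RUNLISTS (4) = 892; OR'd with the dyn-shape bit (4096) -> 4988
static_assert(hnnx::DesConcur_AUXDATA_REC_VERSION == ((768 + 120 + 4) | 4096),
              "aux-data record version arithmetic");
```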
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserialize_tensors.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserialize_tensors.h
new file mode 100755
index 0000000000000..43f14039fd1ad
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserialize_tensors.h
@@ -0,0 +1,68 @@
+//==============================================================================
+//
+// Copyright (c) 2021-2023 Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef DESERIALIZE_TENSORS_H
+#define DESERIALIZE_TENSORS_H 1
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#include "limits.h"
+#include "log.h"
+
+#include "forward_classes.h"
+#include "serdes_tensors.h"
+
+namespace hnnx {
+
+// see comment in serdes_tensors.h for overview of how this works.
+
+class Deserializer;
+
+class DeserTensorConn : public SerTensorConnDefs {
+    typedef unsigned tensor_idx;
+    typedef Tensor const *ptr_type;
+
+    // this collects all of the tensor_def we have seen. index is seq_index-1.
+    std::vector<ptr_type> defined_tensors;
+
+  public:
+    DeserTensorConn() {}
+    // process a tensor definition
+    void tensor_def(Deserz &, ptr_type);
+    // process n tensor refs.
+    void tensor_refs(Deserz &, ptr_type *ptrs, unsigned num);
+    // process a tensor ref
+    void tensor_ref(Deserz &dctx, ptr_type &ptr) { tensor_refs(dctx, &ptr, 1); }
+
+    // TODO: remove these two, we don't use them, and should not.
+    // read an identity (for use in subsequent need_fixup)
+    tensor_idx read_identity(Deserz &);
+    // apply the identity to 'fix' a tensor pointer (usually now, sometimes later)
+    void need_fixup(tensor_idx ident, ptr_type *dst);
+
+    // 'reserve' the defined tensors to avoid allocation overhead...
+    inline void reserve_tensors(const size_t n) { defined_tensors.reserve(n); }
+    // resize the 'defined tensors' table to its full capacity (specified).
+    // Used only in multi-thread deserialize, prior to deserializing the runlist.
+    inline void resize_tensordef_table(const size_t n) { defined_tensors.resize(n); }
+
+    // this is for use by 'reference fixup' code, in concurrent deserialize.
+    std::vector<ptr_type> const &get_defined_tensors() const { return defined_tensors; }
+
+  protected:
+    tensor_idx read_identity_inline(Deserz &);
+    void apply_fixup_inline(tensor_idx idx, ptr_type *dst);
+};
+
+} // namespace hnnx
+
+#endif // DESERIALIZE_TENSORS_H
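A sketch of the def/ref discipline `DeserTensorConn` implements (see serdes_tensors.h for the real protocol): inputs are resolved as references to earlier definitions, and each output is appended as a new definition:

```cpp
#include "deserialize_tensors.h" // assumption: the header under review

// Hypothetical helper: deserialize one op's tensor connectivity.
void deser_op_io(hnnx::DeserTensorConn &conn, hnnx::Deserz &dctx,
                 Tensor const *out, Tensor const **in, unsigned n_in)
{
    conn.tensor_refs(dctx, in, n_in); // inputs: resolve against defined_tensors
    conn.tensor_def(dctx, out);       // output: recorded at slot seq_index - 1
}
```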
diff --git a/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserializer.h b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserializer.h
new file mode 100755
index 0000000000000..7312ae8bdd948
--- /dev/null
+++ b/prebuilts/QNN_SDK/qairt/2.34.0.250424/include/QNN/HTP/core/deserializer.h
@@ -0,0 +1,761 @@
+//==============================================================================
+//
+// Copyright (c) Qualcomm Technologies, Inc.
+// All Rights Reserved.
+// Confidential and Proprietary - Qualcomm Technologies, Inc.
+//
+//==============================================================================
+
+#ifndef DESERIALIZER_H
+#define DESERIALIZER_H 1
+
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+#include "limits.h"
+#include "dtype.h"
+#include "log.h"
+#include "allocator.h"
+#include "op_extra_info.h"
+
+#include "serialize_defs.h"
+#include "forward_classes.h"
+#include "deserialize_tensors.h"
+#include "macros_attribute.h"
+#include "const_extent_descriptor.h"
+#include "weak_linkage.h"
+#include "size_align_code.h"
+#include "deser_concurrent.h"
+#include "hexagon_nn_types.h"
+
+namespace hnnx {
+class DMA_Manager;
+class Crate;
+/**
+ * @brief \ref Serializer and \ref Deserializer modules that provide
+ * a mechanism to flatten (serialize) and reconstruct (deserialize)
+ * primitive and user-defined data types. The initial objective
+ * was to create an in-memory representation of the optimized
+ * \ref Graph on x86 which can then be reconstructed and executed on
+ * a qdsp target, essentially, a means to Graph caching.
+ *
+ */
+using tensor_deserializer_fn = uptr_Tensor (*)(Deserz &);
+
+using deserialize_op_func = void *(*)(void *, Deserz &); // Allocation function
+using deserialize_dtor_func = void (*)(Graph *, void *); // Deallocation function
+class SimpleOpBase;
+using deserialize_make_unique = std::unique_ptr<SimpleOpBase> (*)();
+
+struct op_deserializer_fn {
+    op_deserializer_fn(deserialize_op_func init_func_in, const size_align_code_t sizeal_in)
+        : init_func(init_func_in), size_align_code(sizeal_in)
+    {
+    }
+    op_deserializer_fn(deserialize_op_func init_func_in, deserialize_dtor_func dtor_func_in,
+                       const size_align_code_t sizeal_in)
+        : dtor_func(dtor_func_in), init_func(init_func_in), size_align_code(sizeal_in){};
+    op_deserializer_fn(const op_deserializer_fn &) = default;
+    op_deserializer_fn(op_deserializer_fn &&) = default;
+    op_deserializer_fn &operator=(const op_deserializer_fn &) = delete;
+    deserialize_dtor_func dtor_func = nullptr;
+    deserialize_op_func init_func = nullptr;
+    const size_align_code_t size_align_code{};
+    inline constexpr size_t get_size() const { return size_align_code.size(); }
+    inline constexpr size_t get_align() const { return size_align_code.align(); }
+};
+
+// here's a quick and dirty way to make these maps go faster: compare string_view starting with len;
+// and if the len is the same, then compare the middle character, and if that's the same,
+// use memcmp. This avoids getting slowed down by a lot of long common prefixes in the type names.
+// and we don't care about the weird ordering it generates.
+//
+struct trick_stringview_lt {
+    bool operator()(std::string_view const &a, std::string_view const &b) const
+    {
+        unsigned const na = a.size();
+        unsigned const nb = b.size();
+        if (na != nb) return na < nb;
+        char const *const pa = a.data();
+        char const *const pb = b.data();
+        if (pa == pb || na == 0) return false; // pa==pb is a common case.
+        unsigned const char_a = pa[na >> 1];
+        unsigned const char_b = pb[na >> 1];
+        if (char_a != char_b) return char_a < char_b;
+        return ::memcmp(pa, pb, na) < 0;
+    }
+};
+
+using op_deserializer_map_t = std::map<std::string_view, op_deserializer_fn, trick_stringview_lt>;
+using tensor_deserializer_map_t = std::map<std::string_view, tensor_deserializer_fn, trick_stringview_lt>;
+using cexdesc_deserializer_map = std::map;
+
+using const_extent_t = std::pair;
+using weight_buf_deserializer_map = std::map;
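The comparator gives a valid strict weak ordering, just not a lexicographic one; a standalone sanity check of the properties described in its comment:

```cpp
#include <cassert>
#include "deserializer.h" // assumption: the header under review

void trick_lt_demo()
{
    hnnx::trick_stringview_lt const lt;
    // Shorter strings always order first, regardless of content:
    assert(lt("zz", "aaa"));
    // Equal length: the middle character decides before any memcmp:
    assert(lt("aba", "aca"));
    // Equal views compare not-less in both directions (strict weak ordering):
    assert(!lt("abc", "abc"));
}
```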
+
+/**
+ * @brief Deserializer class to reverse the serialization
+ * process and reconstruct the data for specific types
+ *
+ */
+class Deserz : public DeSerError {
+    friend class Deserializer; // weirdly, sometimes a derived class needs to be a friend.
+    friend class DeserTensorConn;
+
+  protected:
+    Deserz(Deserializer *full_deser, char const *p, size_t n, Graph *g = nullptr);
+
+  public:
+    // I want to make this protected, but can't.
+    // Even code which has access to a protected copy_ctor
+    // of foo can't invoke .resize(n, foo_inst) on a std::vector. This
+    // seems like a defect in C++. Applies to various 'emplace' methods too;
+    // the 'emplace' can only ever use public ctors.
+    Deserz(Deserz const &) = default;
+
+  public:
+    virtual ~Deserz(); // please keep this as first virtual method declared.
+
+    // These three ONLY TO BE USED when setting up a Deserz to start processing a segment.
+    void setup_source_span(deser_segment_span const &);
+    void setup_dcrate_out(void *base, size_t len);
+    void setup_next_tensor_index(unsigned const idx) { next_tensordef_index = idx; }
+
+    typedef uint32_t object_identity_type;
+
+    // Note, various accessor methods are defined as inlines below 'class Deserializer'.
+    // true if this Deserz is really an instance of Deserializer.
+    constexpr bool is_base_deser() const;
+
+    using op_deserialize_fn_list_t = std::vector<op_deserializer_fn>;
+    using tensor_deserialize_fn_list_t = std::vector<tensor_deserializer_fn>;
+
+    op_deserialize_fn_list_t &get_op_deserialize_fn_list();
+    tensor_deserialize_fn_list_t &get_tensor_deserialize_fn_list();
+    std::vector &get_blocktable_link_table();
+    // when deserializing an op:
+    //   - call deserialize_tensor_ref (or _refs) on all the input tensor pointers
+    //   - pass all output tensor addresses to deserialize_tensor_def
+    // Sequence must match serialization; note that the deserialize-ctor of Tensor
+    // calls deserialize_tensor_def on itself; so there is no need to call it elsewhere,
+    // except for specialized types which are constructed otherwise during depickle (e.g.,
+    // types embedded in the Op).
+    //
+    // Some ops have multiple copies of some input tensor pointers; for these, it's possible
+    // to serialize just one reference, and then deserialize it using
+    //     auto id = deserialize_object_identity()   // <- corresponds to serialize_tensor_ref
+    //     need_tensor_fixup( id, &first_tensor_pointer);
+    //     (other deserialize activity can happen here)
+    //     need_tensor_fixup( id, &second_tensor_pointer);
+
+    void deserialize_tensor_def(Tensor const *tensor_ptr);
+    void deserialize_tensor_ref(Tensor const *&where);
+    void deserialize_tensor_refs(Tensor const **ptrs, unsigned n);
+    template <typename T> void deserialize_tensor_ref(T const *&where);
+    template <typename T> void deserialize_tensor_refs(T const **ptrs, unsigned n);
+    object_identity_type deserialize_object_identity();
+    void need_tensor_fixup(object_identity_type oid, Tensor const **where);
+
+    Graph &graph() const { return *graph_ptr; }
+    Crate *crate() { return d_crate.crate(); }
+    DCrate *dcrate() { return &d_crate; }
+    DeserSegDescs const &get_segments() const; // gets ref to associated 'segments' object
+    op_deserializer_map_t const &get_op_deser_map() const { return *op_deserializer_map; }
+
+    bool is_aligned_const_format() const;
+    bool has_pending_tensor_updates();
+
+    bool is_shared_dynamic_tensor_shape_format() const;
+
+    fa::RuntimeAllocator *allocator;
+    DCrate d_crate; // contains a crate pointer
+
+  protected:
+    // hoist pointers to these maps into Deserializer to avoid static lock overhead
+    op_deserializer_map_t const *op_deserializer_map;
+    tensor_deserializer_map_t const *tensor_deserializer_map;
+    Graph *graph_ptr{};
+    Deserializer *full_deser;
+
+    char const *bufstart; // start of current buffer
+    char const *bufend; // first byte we can't read
+    char const *bufp; // next to read
+    char const *buf_limit; // <= bufend; where 'fill_buffer' needs to be called.
+    size_t bytes_filled; // bytes previously filled
+
+    uint32_t op_flags;
+    OpExtraInfo op_extra_info;
+
+    unsigned next_tensordef_index = 1; // belongs to 'tensorconn' but needs to be in Deserz.
+    // 'format version'. Currently only ones used are 0 = classic, 1 = July/2023
+    // Only access through methods like .classic_format();
+    // This is changed to non-zero value based on seeing certain Aux Data records
+    // (which must appear before the allocator).
+    int format_version = 0;
+
+    // this is used in multi-thread decoding. It is important that
+    // it remains null-constructed if the object is really a base of Deserializer;
+    // it is only used in 'segment' Deserz instances.
+    runlist_fixup_state seg_fixup_state{};
+
+    /**
+     * @brief throws an error since deserializer detected
+     * deserialization on insufficient bytes i.e. an underflow
+     *
+     */
+    API_EXPORT virtual char const *fill_buffer(); // called for underflow on short operation
+
+    /**
+     * @brief Deserialize data of specified length and write into
+     * buffer provided by caller
+     *
+     * @param[out] p buffer to write to
+     * @param[in] len length of the \ref bufp to read from
+     * @param[in] align if true, skip input bytes to a boundary of 4
+     */
+    API_EXPORT virtual void deserialize_fread(void *p, size_t len, bool align);
+
+    /**
+     * @brief Get current position of buffer from which next data will be read
+     *
+     * @return size_t offset from buffer start
+     */
+    size_t buffer_offset() const { return bufp - bufstart; }
+    /**
+     * @brief Available buffer size remaining for deserialization
+     *
+     * @return size_t remaining bytes size
+     */
+    size_t buffer_remain() const { return bufend - bufp; }
+
+    /**
+     * @brief deserialize buffer for type T
+     *
+     * @retval T returns the deserialized value of type T
+     *
+     * Note: This is the templated API called by deserialize_T() functions
+     *
+     * Note: Cannot be used for more than 4 bytes, there is a specialized version to read u64.
+     */
+    template <typename T> T simple_deserialize()
+    {
+        static_assert(sizeof(T) <= 4, "can only read sizeof(T) <= 4");
+        constexpr size_t W = 4;
+        char const *curr_p = bufp;
+        if (curr_p >= buf_limit) {
+            curr_p = fill_buffer();
+        }
+        T const val = *(T const *)(curr_p);
+        bufp = curr_p + W;
+        return val;
+    }
+    // see comment above deserialize_shared_obj.
+    API_EXPORT std::pair<void const *, void const **> deserialize_shared_obj_func(void const **ptrloc);
+    API_EXPORT uint64_t deser_u64_slowpath();
+    void initial_l2fetch(); // called only from ctor
+
+  public:
+    inline constexpr bool classic_format() const { return format_version == 0; }
+    /**
+     * @brief deserialize data of type which calls simple_deserialize
+     *
+     * @param val data to deserialize
+     *
+     * Note: the below are the only types supported for deserialize_type
+     */
+    API_EXPORT uint64_t deserialize_uint64(); // inline later
+    inline float deserialize_float() { return simple_deserialize<float>(); }
+    inline uint32_t deserialize_uint32() { return simple_deserialize<uint32_t>(); }
+    inline NN_INT32_T deserialize_int32() { return simple_deserialize<NN_INT32_T>(); }
+    inline int16_t deserialize_int16() { return simple_deserialize<int16_t>(); }
+    inline uint16_t deserialize_uint16() { return simple_deserialize<uint16_t>(); }
+    inline int8_t deserialize_int8() { return simple_deserialize<int8_t>(); }
+    inline uint8_t deserialize_uint8() { return simple_deserialize<uint8_t>(); }
+
+    inline uint64_t deserialize_namesig() { return deserialize_uint64(); }
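One consequence of `simple_deserialize` worth noting: `W` is fixed at 4, so every scalar occupies a full 4-byte slot on the wire regardless of `sizeof(T)`. Sketch:

```cpp
#include <cstdint>
#include "deserializer.h" // assumption: the header under review

void read_two(hnnx::Deserz &dctx)
{
    // A uint8_t followed by a uint16_t consumes 8 bytes of pickle, not 3:
    uint8_t const a = dctx.deserialize_uint8();   // cursor advances by 4
    uint16_t const b = dctx.deserialize_uint16(); // cursor advances by 4
    (void)a;
    (void)b;
}
```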
+
+    // note, this is defined as an inline in deserializer.cc and not available elsewhere
+    tensor_deserializer_fn deserialize_tensor_identification(unsigned tensor_class_index);
+
+    // deserialize string
+    // **NOTE** will throw a runtime error if called on a Deserz which is not really a Deserializer.
+    API_EXPORT std::string_view deserialize_str();
+
+    uint32_t get_op_flags() const { return op_flags; };
+    void clear_op_flags() { op_flags = 0; };
+    void set_op_flags(uint32_t f) { op_flags = f; };
+
+    const OpExtraInfo &get_op_extra_info() const { return op_extra_info; };
+    void clear_extra_info() { op_extra_info.clear(); };
+    void set_op_extra_info(OpExtraInfo in_op_extra_info) { op_extra_info = in_op_extra_info; };
+
+    /**
+     * @brief deserialize buffer for specified size
+     *
+     * @param[in] alloc_size number of bytes to read from \ref bufp
+     * @param[out] ptr destination buffer for the read bytes
+     * @return size_t number of bytes actually read
+     */
+    API_EXPORT size_t deserialize_buf(size_t alloc_size, void *ptr);
+    /**
+     * @brief similar to deserialize_buf but first deserialize a
+     * uint32_t size of bytes that should match the alloc_size
+     *
+     * @param[in] alloc_size number of bytes to read from \ref bufp
+     * @param[out] ptr destination buffer for the read bytes
+     * @return size_t number of bytes actually read
+     */
+    API_EXPORT size_t deserialize_buf_withlen(size_t alloc_size, void *ptr);
+    // deserialize a pointer as 64 bits
+    inline void *deserialize_ptr() { return (void *)size_t(deserialize_uint64()); }
+
+    template <typename T> T deserialize_type();
+
+    template <typename T, size_t N> std::array<T, N> deserialize_array();
+
+    /**
+     * @brief convenience wrappers for deserialize functions that
+     * take in different number of arguments of uint32_t type
+     *
+     * @return std::tuple (first, second) uint32_t data deserialized
+     */
+    // convenience wrappers (to reduce inlined code size w/o much loss of speed)
+    API_EXPORT std::tuple<uint32_t, uint32_t> deserialize_uint32_x2();
+    API_EXPORT std::tuple<uint32_t, uint32_t, uint32_t> deserialize_uint32_x3();
+    API_EXPORT std::tuple<uint32_t, uint32_t, uint32_t, uint32_t> deserialize_uint32_x4();
+
+    API_EXPORT void deserialize_uint32_arr(uint32_t *p, size_t N);
+
+    // to reduce code size in the templates, we can deserialize arrays of
+    // N uint32 to sizet
+    API_EXPORT void deserialize_uint32_arr_sizet(size_t *p, size_t N);
+
+    /**
+     * @brief deserialize array containing uint32_t type data
+     *
+     * @tparam N size of the array
+     * @return std::array array containing the deserialized values
+     */
+    template <size_t N> std::array<size_t, N> deserialize_uint32_array_sizet()
+    {
+        std::array<size_t, N> res;
+        deserialize_uint32_arr_sizet(&res[0], N);
+        return res;
+    }
+
+    //
+    // This is used for shared objects like Shape and Interface.
+    // it deserializes the index, and decides if it's the first instance.
+    //  - must always pass the address which needs to point to it; though it
+    //    will not be set by this function.
+    //  - if retval.second is null, then the object was previously deserialized,
+    //    and retval.first is the pointer to it.
+    //  - otherwise, caller must deserialize the instance, and store the pointer
+    //    at *retval.second. retval.first will be null in this case.
+    // In scenarios where delayed resolution is used, the return may be {token,null}
+    // where 'token' is actually a delayed resolution token.
+    //
+    template <typename T>
+    std::pair<T const *, T const **> // see above
+    deserialize_shared_obj(T const **const loc)
+    {
+        auto const res = deserialize_shared_obj_func((void const **)loc);
+        return {(T const *)res.first, (T const **)res.second};
+    }
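The two-phase protocol from the comment above, as a sketch; `ShapeT` stands in for a shared type (e.g. Shape) that is assumed to be constructible from `Deserz &`, and `emplace0` is the DCrate helper defined in dcrate_inlines.h:

```cpp
#include "deserializer.h" // assumption: the header under review

template <typename ShapeT>
ShapeT const *read_shared(hnnx::Deserz &dctx, ShapeT const **loc)
{
    auto const res = dctx.deserialize_shared_obj<ShapeT>(loc);
    if (res.second == nullptr) return res.first; // seen before: pointer is ready
    // First occurrence: deserialize the payload and publish it for later refs.
    ShapeT const *const obj = dctx.dcrate()->template emplace0<ShapeT>(dctx);
    *res.second = obj;
    return obj;
}
```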
+
+    // Increment the current read position of the internal buffer without reading anything.
+    void deserialize_skip_words(size_t nwords);
+
+    // Apply the 'pointer fixups' contained within seg_info. This can
+    // be called with 'this' being any Deserz or Deserializer associated
+    // with the operation (it is only used to access tables in Deserializer).
+    // This can only be done on a given segment when all previous segments have
+    // been deserialized; so if we have one Deserz per thread, we need
+    // to 'move' the seg_info object out of it after completing the segment,
+    // and use it later to do the fixups.
+    // Returns true if ok, false if failed.
+    // Will leave the fixup list empty on success.
+    bool apply_segment_fixups(runlist_fixup_state &seg_info) const;
+
+    // Methods to move the 'seg_fixup_state' object in or out.
+    void install_seg_fixup_state(runlist_fixup_state &&src) { seg_fixup_state = std::move(src); }
+    runlist_fixup_state extract_seg_fixup_state() { return std::move(seg_fixup_state); }
+    void extract_seg_fixup_state_to(runlist_fixup_state &dest) { dest = std::move(seg_fixup_state); }
+
+    // and a read-only accessor
+    runlist_fixup_state const &fixup_state() const { return seg_fixup_state; }
+
+    // for Tensor::deserialize_blocktable
+    inline bool fixup_encode_for_blocktable(uint32_t const idx, uint32_t const table_offs, void **const ptrloc)
+    {
+        return hnnx::fixup_encode_for_blocktable(seg_fixup_state, idx, table_offs, ptrloc);
+    }
+};
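+
+// Sketch of the per-thread segment flow described above (everything other than the
+// Deserz methods is hypothetical): each worker moves its fixup state out after
+// finishing its segment, and applies it once all earlier segments are complete.
+//
+//     runlist_fixup_state st = worker_dctx.extract_seg_fixup_state();
+//     wait_until_previous_segments_done();   // ordering constraint noted above
+//     if (!worker_dctx.apply_segment_fixups(st)) {
+//         report_error("segment fixups failed");
+//     }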
+
+/////////////////
+
+class Deserializer : public Deserz {
+    friend class Deserz;
+
+  public:
+    /**
+     * @brief Construct a new Deserializer object
+     *
+     * @param[in] p buffer that needs to be deserialized
+     * @param[in] n length of the buffer
+     * @param[in] g pointer to the Graph object to deserialize (usually null, since the object
+     *            is being passed to the Graph::Graph ctor to deserialize; that ctor
+     *            must immediately call dctx.set_graph(*this) )
+     */
+    API_EXPORT Deserializer(char const *p, size_t n, Graph *g = nullptr);
+    API_EXPORT virtual ~Deserializer(); // please keep this as first virtual method declared.
+
+    void set_graph(Graph &g);
+
+    inline void deserialize_tensor_def(Tensor const *tensor_ptr) { tensorconn.tensor_def(*this, tensor_ptr); }
+    inline void deserialize_tensor_ref(Tensor const *&where) { tensorconn.tensor_ref(*this, where); }
+    inline void deserialize_tensor_refs(Tensor const **ptrs, unsigned n) { tensorconn.tensor_refs(*this, ptrs, n); }
+    inline void deserialize_pred_conditions(std::vector<bool> &pred_cond_list)
+    {
+        // get the number of items in the vector
+        uint32_t const num_of_objects = deserialize_uint32();
+        assert(num_of_objects <= UINT32_MAX);
+        if (num_of_objects > 0) {
+            pred_cond_list.resize(num_of_objects);
+
+            // TODO: remove this once we know how to update it at runtime.
+            // Currently setting it to true.
+            pred_cond_list.at(0) = 1;
+        }
+    }
+    template <typename T> inline void deserialize_tensor_ref(T const *&where)
+    {
+        static_assert(std::is_base_of<Tensor, T>::value);
+        tensorconn.tensor_ref(*this, *(Tensor const **)&where);
+    }
+    template <typename T> void deserialize_tensor_refs(T const **ptrs, unsigned n)
+    {
+        static_assert(std::is_base_of<Tensor, T>::value);
+        tensorconn.tensor_refs(*this, (Tensor const **)ptrs, n);
+    }
+    inline object_identity_type deserialize_object_identity() { return tensorconn.read_identity(*this); }
+
+    inline void need_tensor_fixup(object_identity_type oid, Tensor const **where) { tensorconn.need_fixup(oid, where); }
+    inline void resolve_fixups()
+    {
+        [[maybe_unused]] const object_identity_type newval = tensorconn.read_identity(*this);
+        assert(newval == 0);
+    }
+
+    constexpr bool is_aligned_const_format() const { return aligned_const_format_flag; }
+    void set_aligned_const_format(const bool v = true) { aligned_const_format_flag = v; }
+
+    constexpr bool is_shared_dynamic_tensor_shape_format() const { return shared_dynamic_tensor_shape; }
+    void set_shared_dynamic_tensor_shape_format(const bool v = true) { shared_dynamic_tensor_shape = v; }
+
+    PUSH_WARNING()
+    DISABLE_WARNING("-Wcast-qual", MSVC_NO_EQUIV)
+    // valid when the entire pickle, in const_extent format, is loaded as a single, persistent dma buffer
+    inline unsigned char *get_weight_pointer() { return ((unsigned char *)bufstart) + (4 * pickle_len_words); }
+    POP_WARNING()
+    inline size_t get_weight_size() { return (bufend - bufstart) - (4 * pickle_len_words); }
+
+    inline op_deserialize_fn_list_t &get_op_deserialize_fn_list() { return op_deserialize_fn_list; }
+    inline tensor_deserialize_fn_list_t &get_tensor_deserialize_fn_list() { return tensor_deserialize_fn_list; }
+
+    // Next 4 methods are used to support 'deserialize_by_segments'.
+    // 'get_forward_span' returns a 'deser_segment_span' (pair of pointers) for a region of deserialized data
+    // from 'ref + start' up to 'ref + end', where start and end (0 <= start < end) are byte offsets
+    // relative to some position 'ref' in the deserialized data, and 'ref' is the value which bytes_consumed()
+    // returned at that reference point. All should be multiples of 4.
+    deser_segment_span get_forward_span(size_t ref, size_t start, size_t end);
+    // used to get a reference point ('ref') for get_forward_span
+    size_t bytes_consumed() const { return bufp - bufstart; }
+    // used to skip past the last 'get_forward_span' we did
+    void skip_to_after_span(deser_segment_span const &);
+    // resize tables (tensor, shared_obj, linktable) according to info in final_segdesc
+    void resize_object_tables(runlist_auxdata_seg_desc const &final_desc);
+
+    uint32_t crate_size_according_to_segments() const;
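+
+    // Sketch of the forward-span protocol above (hypothetical driver; 'seg_start'
+    // and 'seg_end' would come from the segment descriptors):
+    //
+    //     size_t const ref = dctx.bytes_consumed();                  // reference point
+    //     deser_segment_span const sp = dctx.get_forward_span(ref, seg_start, seg_end);
+    //     deserialize_one_segment(sp);                               // consume [start, end)
+    //     dctx.skip_to_after_span(sp);                               // advance the main cursor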
+
+  protected:
+    std::vector<void const *> objindex; // index of pointers to shape, etc.
+    // the state of the 'tensor connectivity' deserialize engine.
+    DeserTensorConn tensorconn;
+    bool aligned_const_format_flag = false;
+    bool shared_dynamic_tensor_shape = false;
+
+    // this is used in 'deserialize_str', so it ideally should be in Deserz; but
+    // it's pretty large; so, put it here and forbid calling deserialize_str
+    // on a Deserz which is not really a Deserializer. We only use it to decode
+    // 'classic' pickles, so this is ok.
+    char name_buf[4096]; // used for string view
+
+    // do the reference fixups on a segment. Return true if OK.
+    // See Deserz::apply_segment_fixups for the public API.
+    static bool do_segment_fixups(runlist_fixup_state &seginfo, Deserz const &dctx0);
+
+  public:
+    inline constexpr bool classic_format() const { return format_version == 0; }
+    inline void set_format_2307() { format_version = 1; }
+
+    // This is called when a 'class index' Aux Data is encountered.
+    // It must deserialize exactly the indicated number of payload words.
+    // is_tensor = false for "Co" (op class index), and true for "Ct" (tensor class index)
+    API_EXPORT void auxdata_class_index(unsigned payload_words, bool is_tensor);
+    //
+    // called when an 'Nt' Aux data is encountered, which provides some array sizes for the
+    // deserialization.
+    // It must deserialize exactly the indicated number of payload words.
+    API_EXPORT void auxdata_temparr_sizes(unsigned payload_words);
+    // Called when an 'AuxTag_deserializeSegments' is encountered. If it likes
+    // the record, it will set up the 'segments' object.
+    API_EXPORT void auxdata_deserialize_segments(unsigned payload_words);
+
+    // called when a 'KS' Aux data is encountered, which provides a const_extent_descriptor.
+    // It must deserialize exactly the indicated number of payload words.
+    API_EXPORT int auxdata_read_const_extent_descriptor(const unsigned payload_words);
+    // helper for the above. payload_words is the length WITH PADDING
+    API_EXPORT int extract_const_extent_name(const unsigned payload_words, std::string &retVal);
+
+    // Extract a std::vector containing the 'const extent descriptor' table,
+    // from a given offset (in units of 32-bit words) relative to the start of the pickle,
+    // or from a separate pointer (if a separate buffer for the weights was passed in).
+    // This does not affect the current position.
+    // If there is a problem, it returns an empty vector; caller *must* check and report.
+    // This uses hnnx::const_extent_hdr_check to understand how much it should read,
+    // and to do a basic check.
+    API_EXPORT std::vector<uint32_t> extract_const_extent_table(size_t posn_in_words);
+    std::vector<uint32_t> extract_const_extent_table(hexagon_nn_wide_address_const_t weight_data,
+                                                     const size_t weight_size);
+    // given a destination char pointer, prefilled with nulls, fills it in with the name of the const_extent;
+    // caller must provide a destination of sufficient length
+    std::string name_from_weight_data(hexagon_nn_wide_address_const_t weight_data, const uint32_t weight_length);
+    // helper func for the above; indicates failure if the name is not present.
+    std::string get_name(hexagon_nn_wide_address_const_t weight_data, const uint32_t weight_length);
+    // given a set of weight_data buffers, stores them all in the appropriate map
+    void store_named_weight_bufs(const hexagon_nn_wide_address_const_t *const buffers, const uint64_t *const lengths,
+                                 const unsigned num_buffers);
+    //
+    // copy 'len' bytes of data at offset offs_bytes in the pickle into location dstp.
+    // returns true if it's possible. You can optionally pass a DMA_Manager to have it queued.
+    // offs_bytes is defined as uint64_t to support possible 'far' data on hexagon.
+    API_EXPORT bool extract_const_extent_data(uint64_t offs_bytes, size_t len, void *dstp, DMA_Manager *dma = nullptr);
+    // same, using an external const_extent
+    bool extract_const_extent_data(uint64_t offs_bytes, size_t len, void *dstp,
+                                   hexagon_nn_wide_address_const_t weight_data, const size_t weight_length);
+
+    // This extracts the 'objindex', when it is needed e.g. to 'patch' interfaces.
+    // Must be done only after deserializing, and can only be done once.
+    std::vector<void const *> extract_objindex() { return std::move(objindex); }
+
+    DeserSegDescs segments; // array of runlist_seg_descriptor, empty if not doing multiseg.
+
+    // this is used to pass the offset of the const-extent-descriptor (recorded as pickle_len)
+    // to the alloc->deserialize.
+    size_t pickle_len_words;
+
+    // OPTIONAL maps from weight buffer names to the descriptors and the buffers, respectively
+    cexdesc_deserializer_map named_cexdescs;
+    weight_buf_deserializer_map named_weight_bufs;
+
+    void *uncached_ptr;
+    uint32_t uncached_len;
+
+    std::vector<op_deserializer_fn> op_deserialize_fn_list;
+    std::vector<tensor_deserializer_fn> tensor_deserialize_fn_list;
+
+    // used to 'link' shared blocktables during deser.
+    std::vector<void *> blocktable_link_table;
+};
+
+/////////////////
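+
+// Sketch of pulling weight bytes out of a const-extent pickle (hypothetical caller;
+// 'offs_bytes'/'len' would come from the const extent descriptor table):
+//
+//     std::vector<unsigned char> dst(len);
+//     if (!deser.extract_const_extent_data(offs_bytes, len, dst.data())) {
+//         // offset/length out of range: caller must report the failure
+//     }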
+
+// true if this Deserz is really an instance of Deserializer.
+inline constexpr bool Deserz::is_base_deser() const
+{
+    return static_cast<Deserz const *>(full_deser) == this;
+}
+
+inline bool Deserz::is_aligned_const_format() const
+{
+    return full_deser->aligned_const_format_flag;
+}
+inline bool Deserz::is_shared_dynamic_tensor_shape_format() const
+{
+    return full_deser->shared_dynamic_tensor_shape;
+}
+inline Deserz::op_deserialize_fn_list_t &Deserz::get_op_deserialize_fn_list()
+{
+    return full_deser->op_deserialize_fn_list;
+}
+inline Deserz::tensor_deserialize_fn_list_t &Deserz::get_tensor_deserialize_fn_list()
+{
+    return full_deser->tensor_deserialize_fn_list;
+}
+inline std::vector<void *> &Deserz::get_blocktable_link_table()
+{
+    return full_deser->blocktable_link_table;
+}
+// For these in Deserz, we must call the corresponding methods on the
+// tensorconn in 'full_deser', but must pass 'this' as the first parameter.
+inline void Deserz::deserialize_tensor_def(Tensor const *const tensor_ptr)
+{
+    full_deser->tensorconn.tensor_def(*this, tensor_ptr);
+}
+inline void Deserz::deserialize_tensor_ref(Tensor const *&where)
+{
+    full_deser->tensorconn.tensor_ref(*this, where);
+}
+inline void Deserz::deserialize_tensor_refs(Tensor const **const ptrs, const unsigned n)
+{
+    full_deser->tensorconn.tensor_refs(*this, ptrs, n);
+}
+inline DeserSegDescs const &Deserz::get_segments() const
+{
+    return full_deser->segments;
+}
+
+// unaligned read of 64 bits (two 32-bit aligned reads)
+template <> inline uint64_t Deserz::simple_deserialize<uint64_t>()
+{
+    char const *const curr_p = bufp;
+    if (curr_p + 8u > buf_limit) {
+        return deser_u64_slowpath();
+    }
+    uint32_t const *const p = (uint32_t const *)(curr_p);
+    bufp = curr_p + 8u;
+    return p[0] + ((uint64_t)p[1] << 32);
+}
+inline uint64_t Deserz::deserialize_uint64()
+{
+    return simple_deserialize<uint64_t>();
+}
+
+template <> inline uint64_t Deserz::deserialize_type<uint64_t>()
+{
+    return deserialize_uint64();
+}
+template <> inline float Deserz::deserialize_type<float>()
+{
+    return deserialize_float();
+}
+// sometimes uint32_t is unsigned long, sometimes it's unsigned;
+// sometimes unsigned long is uint64. Hopefully this covers it all.
+#if ULONG_MAX == UINT_MAX
+template <> inline unsigned long Deserz::deserialize_type<unsigned long>()
+{
+    return deserialize_uint32();
+}
+template <> inline long Deserz::deserialize_type<long>()
+{
+    return deserialize_int32();
+}
+#endif
+template <> inline unsigned Deserz::deserialize_type<unsigned>()
+{
+    return deserialize_uint32();
+}
+template <> inline int Deserz::deserialize_type<int>()
+{
+    return deserialize_int32();
+}
+template <> inline int16_t Deserz::deserialize_type<int16_t>()
+{
+    return deserialize_int16();
+}
+template <> inline uint16_t Deserz::deserialize_type<uint16_t>()
+{
+    return deserialize_uint16();
+}
+template <> inline int8_t Deserz::deserialize_type<int8_t>()
+{
+    return deserialize_int8();
+}
+template <> inline uint8_t Deserz::deserialize_type<uint8_t>()
+{
+    return deserialize_uint8();
+}
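+
+// The deserialize_type<T> specializations above let templated code read any supported
+// scalar uniformly. A minimal sketch (hypothetical helper, not part of this header):
+//
+//     template <typename T> std::vector<T> deserialize_n(Deserz &dctx, size_t const n)
+//     {
+//         std::vector<T> v(n);
+//         for (size_t i = 0; i < n; ++i)
+//             v[i] = dctx.deserialize_type<T>();
+//         return v;
+//     }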
+
+// assert( dctx.deserialize_uint32() == SOME_CONST );
+// ... is not safe, since if you turn off asserts, it will no longer read the 4 bytes.
+// This macro allows that idiom to work: the read always happens, and only the check
+// compiles away.
+#define DESERIALIZE_ASSERT_UINT32(DCTX, VAL) \
+    do { \
+        uint32_t const tmp [[gnu::unused]] = (DCTX).deserialize_uint32(); \
+        assert(tmp == (VAL)); \
+    } while (0)
+
+#include "weak_linkage.h"
+PUSH_VISIBILITY(default)
+
+/**
+ * @brief Register the deserialization function for each \ref Op.
+ * TypicalOp and VariadicOp derived classes are instantiated via
+ * templates, hence the need to create a map of deserialize functions
+ * for each Op when they are generated at library initialization.
+ *
+ * @param[in] tinf Op type_info that is used to key the map
+ * @param[in] fn deserialize function
+ */
+API_EXPORT void deserialize_op_register(std::type_info const *tinf, const std::string_view type_tag,
+                                        const op_deserializer_fn &fn, bool is_external = false);
+/**
+ * @brief Register the deserialization function for each \ref Tensor.
+ * Since \ref Tensor derived classes are instantiated via templates, there
+ * is a need to create a map of deserialize functions for each Tensor at runtime.
+ *
+ * @param[in] type_tag Tensor type tag that is used to key the map
+ * @param[in] fn deserialize function
+ */
+API_FUNC_EXPORT void deserialize_tensor_register(std::type_info const &tinf, const char *type_tag,
+                                                 tensor_deserializer_fn fn);
+
+POP_VISIBILITY()
+
+// this is fully defined in serialize_register.h
+template <typename T> struct deserialize_tensor_using_constructor;
+
+// this is fully defined in serialize_register.h
+template <typename T> struct alloc_func_for_op;
+template <typename T> struct dealloc_func_for_op;
+
+//////////////////////
+// Forward decls of things defined in template_help.h
+//
+// contains_type< tuple<a, b, c...>, x >::value: true if x is in a, b, c...
+// no 'remove ref' etc. is done.
+template <typename TUP, typename X> struct contains_type;
+template <typename TUP, typename X> struct not_contains_type;
+template